server : do not speculate during prompt processing

ggml-ci
2026-04-16 16:27:32 +03:00 · 2024-12-03 10:58:43 +02:00
280 changed files with 34665 additions and 32895 deletions
--- a/.devops/cpu.Dockerfile
+++ b/.devops/cpu.Dockerfile
@@ -1,81 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-
-FROM ubuntu:$UBUNTU_VERSION AS build
-
-RUN apt-get update && \
-    apt-get install -y build-essential git cmake libcurl4-openssl-dev
-
-WORKDIR /app
-
-COPY . .
-
-RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \
-    cmake --build build -j $(nproc)
-
-RUN mkdir -p /app/lib && \
-    find build -name "*.so" -exec cp {} /app/lib \;
-
-RUN mkdir -p /app/full \
-    && cp build/bin/* /app/full \
-    && cp *.py /app/full \
-    && cp -r gguf-py /app/full \
-    && cp -r requirements /app/full \
-    && cp requirements.txt /app/full \
-    && cp .devops/tools.sh /app/full/tools.sh
-
-## Base image
-FROM ubuntu:$UBUNTU_VERSION AS base
-
-RUN apt-get update \
-    && apt-get install -y libgomp1 curl\
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-COPY --from=build /app/lib/ /app
-
-### Full
-FROM base AS full
-
-COPY --from=build /app/full /app
-
-WORKDIR /app
-
-RUN apt-get update \
-    && apt-get install -y \
-    git \
-    python3 \
-    python3-pip \
-    && pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt \
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-ENTRYPOINT ["/app/tools.sh"]
-
-### Light, CLI only
-FROM base AS light
-
-COPY --from=build /app/full/llama-cli /app
-
-WORKDIR /app
-
-ENTRYPOINT [ "/app/llama-cli" ]
-
-### Server, Server only
-FROM base AS server
-
-ENV LLAMA_ARG_HOST=0.0.0.0
-
-COPY --from=build /app/full/llama-server /app
-
-WORKDIR /app
-
-HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-
-ENTRYPOINT [ "/app/llama-server" ]
--- a/.devops/cuda.Dockerfile
+++ b/.devops/cuda.Dockerfile
@@ -1,94 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-# This needs to generally match the container host's environment.
-ARG CUDA_VERSION=12.6.0
-# Target the CUDA build image
-ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
-
-ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
-
-FROM ${BASE_CUDA_DEV_CONTAINER} AS build
-
-# CUDA architecture to build for (defaults to all supported archs)
-ARG CUDA_DOCKER_ARCH=default
-
-RUN apt-get update && \
-    apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
-
-WORKDIR /app
-
-COPY . .
-
-RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
-    export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
-    fi && \
-    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-    cmake --build build --config Release -j$(nproc)
-
-RUN mkdir -p /app/lib && \
-    find build -name "*.so" -exec cp {} /app/lib \;
-
-RUN mkdir -p /app/full \
-    && cp build/bin/* /app/full \
-    && cp *.py /app/full \
-    && cp -r gguf-py /app/full \
-    && cp -r requirements /app/full \
-    && cp requirements.txt /app/full \
-    && cp .devops/tools.sh /app/full/tools.sh
-
-## Base image
-FROM ${BASE_CUDA_RUN_CONTAINER} AS base
-
-RUN apt-get update \
-    && apt-get install -y libgomp1 curl\
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-COPY --from=build /app/lib/ /app
-
-### Full
-FROM base AS full
-
-COPY --from=build /app/full /app
-
-WORKDIR /app
-
-RUN apt-get update \
-    && apt-get install -y \
-    git \
-    python3 \
-    python3-pip \
-    && pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt \
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-
-ENTRYPOINT ["/app/tools.sh"]
-
-### Light, CLI only
-FROM base AS light
-
-COPY --from=build /app/full/llama-cli /app
-
-WORKDIR /app
-
-ENTRYPOINT [ "/app/llama-cli" ]
-
-### Server, Server only
-FROM base AS server
-
-ENV LLAMA_ARG_HOST=0.0.0.0
-
-COPY --from=build /app/full/llama-server /app
-
-WORKDIR /app
-
-HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-
-ENTRYPOINT [ "/app/llama-server" ]
--- a/.devops/full-cuda.Dockerfile
+++ b/.devops/full-cuda.Dockerfile
@@ -0,0 +1,33 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG CUDA_VERSION=12.6.0
+# Target the CUDA build image
+ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_CUDA_DEV_CONTAINER} AS build
+
+# CUDA architecture to build for (defaults to all supported archs)
+ARG CUDA_DOCKER_ARCH=default
+
+RUN apt-get update && \
+    apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
+
+COPY requirements.txt   requirements.txt
+COPY requirements       requirements
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+# Use the default CUDA archs if not specified
+RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
+        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release -j$(nproc) && \
+    cp build/bin/* .
+
+ENTRYPOINT ["/app/.devops/tools.sh"]
--- a/.devops/full-musa.Dockerfile
+++ b/.devops/full-musa.Dockerfile
@@ -0,0 +1,33 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG MUSA_VERSION=rc3.1.0
+# Target the MUSA build image
+ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_MUSA_DEV_CONTAINER} AS build
+
+# MUSA architecture to build for (defaults to all supported archs)
+ARG MUSA_DOCKER_ARCH=default
+
+RUN apt-get update && \
+    apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
+
+COPY requirements.txt   requirements.txt
+COPY requirements       requirements
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+# Use the default MUSA archs if not specified
+RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
+        export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release -j$(nproc) && \
+    cp build/bin/* .
+
+ENTRYPOINT ["/app/.devops/tools.sh"]
--- a/.devops/full-rocm.Dockerfile
+++ b/.devops/full-rocm.Dockerfile
@@ -0,0 +1,50 @@
+ARG UBUNTU_VERSION=22.04
+
+# This needs to generally match the container host's environment.
+ARG ROCM_VERSION=5.6
+
+# Target the ROCm build image
+ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
+
+FROM ${BASE_ROCM_DEV_CONTAINER} AS build
+
+# Unless otherwise specified, we make a fat build.
+# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
+# This is mostly tied to rocBLAS supported archs.
+ARG ROCM_DOCKER_ARCH="\
+    gfx803 \
+    gfx900 \
+    gfx906 \
+    gfx908 \
+    gfx90a \
+    gfx1010 \
+    gfx1030 \
+    gfx1100 \
+    gfx1101 \
+    gfx1102"
+
+COPY requirements.txt   requirements.txt
+COPY requirements       requirements
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+# Set the AMD GPU target architectures
+ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
+# Enable ROCm
+ENV GGML_HIPBLAS=1
+ENV CC=/opt/rocm/llvm/bin/clang
+ENV CXX=/opt/rocm/llvm/bin/clang++
+
+# Enable cURL
+ENV LLAMA_CURL=1
+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev
+
+RUN make -j$(nproc)
+
+ENTRYPOINT ["/app/.devops/tools.sh"]
--- a/.devops/full.Dockerfile
+++ b/.devops/full.Dockerfile
@@ -0,0 +1,25 @@
+ARG UBUNTU_VERSION=22.04
+
+FROM ubuntu:$UBUNTU_VERSION AS build
+
+RUN apt-get update && \
+    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
+
+COPY requirements.txt   requirements.txt
+COPY requirements       requirements
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+ENV LLAMA_CURL=1
+
+
+RUN make -j$(nproc)
+
+ENV LC_ALL=C.utf8
+
+ENTRYPOINT ["/app/.devops/tools.sh"]
--- a/.devops/intel.Dockerfile
+++ b/.devops/intel.Dockerfile
@@ -1,91 +0,0 @@
-ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04
-
-## Build Image
-
-FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
-
-ARG GGML_SYCL_F16=OFF
-RUN apt-get update && \
-    apt-get install -y git libcurl4-openssl-dev
-
-WORKDIR /app
-
-COPY . .
-
-RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
-        echo "GGML_SYCL_F16 is set" \
-        && export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
-    fi && \
-    echo "Building with dynamic libs" && \
-    cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
-    cmake --build build --config Release -j$(nproc)
-
-RUN mkdir -p /app/lib && \
-    find build -name "*.so" -exec cp {} /app/lib \;
-
-RUN mkdir -p /app/full \
-    && cp build/bin/* /app/full \
-    && cp *.py /app/full \
-    && cp -r gguf-py /app/full \
-    && cp -r requirements /app/full \
-    && cp requirements.txt /app/full \
-    && cp .devops/tools.sh /app/full/tools.sh
-
-FROM intel/oneapi-basekit:$ONEAPI_VERSION AS base
-
-RUN apt-get update \
-    && apt-get install -y libgomp1 curl\
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-### Full
-FROM base AS full
-
-COPY --from=build /app/lib/ /app
-COPY --from=build /app/full /app
-
-WORKDIR /app
-
-RUN apt-get update \
-    && apt-get install -y \
-    git \
-    python3 \
-    python3-pip \
-    && pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt \
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-
-ENTRYPOINT ["/app/tools.sh"]
-
-### Light, CLI only
-FROM base AS light
-
-COPY --from=build /app/lib/ /app
-COPY --from=build /app/full/llama-cli /app
-
-WORKDIR /app
-
-ENTRYPOINT [ "/app/llama-cli" ]
-
-### Server, Server only
-FROM base AS server
-
-ENV LLAMA_ARG_HOST=0.0.0.0
-
-COPY --from=build /app/lib/ /app
-COPY --from=build /app/full/llama-server /app
-
-WORKDIR /app
-
-HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-
-ENTRYPOINT [ "/app/llama-server" ]
-
--- a/.devops/llama-cli-cuda.Dockerfile
+++ b/.devops/llama-cli-cuda.Dockerfile
@@ -0,0 +1,38 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG CUDA_VERSION=12.6.0
+# Target the CUDA build image
+ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+# Target the CUDA runtime image
+ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_CUDA_DEV_CONTAINER} AS build
+
+# CUDA architecture to build for (defaults to all supported archs)
+ARG CUDA_DOCKER_ARCH=default
+
+RUN apt-get update && \
+    apt-get install -y build-essential git cmake
+
+WORKDIR /app
+
+COPY . .
+
+# Use the default CUDA archs if not specified
+RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
+        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release --target llama-cli -j$(nproc) && \
+    mkdir -p /app/lib && \
+    find build -name "*.so" -exec cp {} /app/lib \;
+
+FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
+
+RUN apt-get update && \
+    apt-get install -y libgomp1
+
+COPY --from=build /app/lib/ /
+COPY --from=build /app/build/bin/llama-cli /
+
+ENTRYPOINT [ "/llama-cli" ]
--- a/.devops/llama-cli-intel.Dockerfile
+++ b/.devops/llama-cli-intel.Dockerfile
@@ -0,0 +1,28 @@
+ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04
+
+FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
+
+ARG GGML_SYCL_F16=OFF
+RUN apt-get update && \
+    apt-get install -y git
+
+WORKDIR /app
+
+COPY . .
+
+RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
+        echo "GGML_SYCL_F16 is set" && \
+        export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
+    fi && \
+    echo "Building with static libs" && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \
+    ${OPT_SYCL_F16} -DBUILD_SHARED_LIBS=OFF && \
+    cmake --build build --config Release --target llama-cli
+
+FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
+
+COPY --from=build /app/build/bin/llama-cli /llama-cli
+
+ENV LC_ALL=C.utf8
+
+ENTRYPOINT [ "/llama-cli" ]
--- a/.devops/llama-cli-musa.Dockerfile
+++ b/.devops/llama-cli-musa.Dockerfile
@@ -0,0 +1,38 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG MUSA_VERSION=rc3.1.0
+# Target the MUSA build image
+ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+# Target the MUSA runtime image
+ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_MUSA_DEV_CONTAINER} AS build
+
+# MUSA architecture to build for (defaults to all supported archs)
+ARG MUSA_DOCKER_ARCH=default
+
+RUN apt-get update && \
+    apt-get install -y build-essential git cmake
+
+WORKDIR /app
+
+COPY . .
+
+# Use the default MUSA archs if not specified
+RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
+        export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release --target llama-cli -j$(nproc) && \
+    mkdir -p /app/lib && \
+    find build -name "*.so" -exec cp {} /app/lib \;
+
+FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime
+
+RUN apt-get update && \
+    apt-get install -y libgomp1
+
+COPY --from=build /app/lib/ /
+COPY --from=build /app/build/bin/llama-cli /llama-cli
+
+ENTRYPOINT [ "/llama-cli" ]
--- a/.devops/llama-cli-rocm.Dockerfile
+++ b/.devops/llama-cli-rocm.Dockerfile
@@ -0,0 +1,45 @@
+ARG UBUNTU_VERSION=22.04
+
+# This needs to generally match the container host's environment.
+ARG ROCM_VERSION=5.6
+
+# Target the ROCm build image
+ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
+
+FROM ${BASE_ROCM_DEV_CONTAINER} AS build
+
+# Unless otherwise specified, we make a fat build.
+# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
+# This is mostly tied to rocBLAS supported archs.
+ARG ROCM_DOCKER_ARCH="\
+    gfx803 \
+    gfx900 \
+    gfx906 \
+    gfx908 \
+    gfx90a \
+    gfx1010 \
+    gfx1030 \
+    gfx1100 \
+    gfx1101 \
+    gfx1102"
+
+COPY requirements.txt   requirements.txt
+COPY requirements       requirements
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+# Set the AMD GPU target architectures
+ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
+# Enable ROCm
+ENV GGML_HIPBLAS=1
+ENV CC=/opt/rocm/llvm/bin/clang
+ENV CXX=/opt/rocm/llvm/bin/clang++
+
+RUN make -j$(nproc) llama-cli
+
+ENTRYPOINT [ "/app/llama-cli" ]
--- a/.devops/llama-cli-vulkan.Dockerfile
+++ b/.devops/llama-cli-vulkan.Dockerfile
@@ -0,0 +1,27 @@
+ARG UBUNTU_VERSION=jammy
+
+FROM ubuntu:$UBUNTU_VERSION AS build
+
+# Install build tools
+RUN apt update && apt install -y git build-essential cmake wget libgomp1
+
+# Install Vulkan SDK
+RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
+    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
+    apt update -y && \
+    apt-get install -y vulkan-sdk
+
+# Build it
+WORKDIR /app
+COPY . .
+RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 && \
+    cmake --build build --config Release --target llama-cli
+
+# Clean up
+WORKDIR /
+RUN cp /app/build/bin/llama-cli /llama-cli && \
+    rm -rf /app
+
+ENV LC_ALL=C.utf8
+
+ENTRYPOINT [ "/llama-cli" ]
--- a/.devops/llama-cli.Dockerfile
+++ b/.devops/llama-cli.Dockerfile
@@ -0,0 +1,23 @@
+ARG UBUNTU_VERSION=22.04
+
+FROM ubuntu:$UBUNTU_VERSION AS build
+
+RUN apt-get update && \
+    apt-get install -y build-essential git
+
+WORKDIR /app
+
+COPY . .
+
+RUN make -j$(nproc) llama-cli
+
+FROM ubuntu:$UBUNTU_VERSION AS runtime
+
+RUN apt-get update && \
+    apt-get install -y libgomp1
+
+COPY --from=build /app/llama-cli /llama-cli
+
+ENV LC_ALL=C.utf8
+
+ENTRYPOINT [ "/llama-cli" ]
--- a/.devops/llama-server-cuda.Dockerfile
+++ b/.devops/llama-server-cuda.Dockerfile
@@ -0,0 +1,43 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG CUDA_VERSION=12.6.0
+# Target the CUDA build image
+ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+# Target the CUDA runtime image
+ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_CUDA_DEV_CONTAINER} AS build
+
+# CUDA architecture to build for (defaults to all supported archs)
+ARG CUDA_DOCKER_ARCH=default
+
+RUN apt-get update && \
+    apt-get install -y build-essential git cmake libcurl4-openssl-dev
+
+WORKDIR /app
+
+COPY . .
+
+# Use the default CUDA archs if not specified
+RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
+        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release --target llama-server -j$(nproc) && \
+    mkdir -p /app/lib && \
+    find build -name "*.so" -exec cp {} /app/lib \;
+
+FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
+
+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev libgomp1 curl
+
+COPY --from=build /app/lib/ /
+COPY --from=build /app/build/bin/llama-server /llama-server
+
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/llama-server" ]
--- a/.devops/llama-server-intel.Dockerfile
+++ b/.devops/llama-server-intel.Dockerfile
@@ -0,0 +1,34 @@
+ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04
+
+FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
+
+ARG GGML_SYCL_F16=OFF
+RUN apt-get update && \
+    apt-get install -y git libcurl4-openssl-dev
+
+WORKDIR /app
+
+COPY . .
+
+RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
+        echo "GGML_SYCL_F16 is set" && \
+        export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
+    fi && \
+    echo "Building with dynamic libs" && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
+    cmake --build build --config Release --target llama-server
+
+FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
+
+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev curl
+
+COPY --from=build /app/build/bin/llama-server /llama-server
+
+ENV LC_ALL=C.utf8
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/llama-server" ]
--- a/.devops/llama-server-musa.Dockerfile
+++ b/.devops/llama-server-musa.Dockerfile
@@ -0,0 +1,43 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG MUSA_VERSION=rc3.1.0
+# Target the MUSA build image
+ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+# Target the MUSA runtime image
+ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_MUSA_DEV_CONTAINER} AS build
+
+# MUSA architecture to build for (defaults to all supported archs)
+ARG MUSA_DOCKER_ARCH=default
+
+RUN apt-get update && \
+    apt-get install -y build-essential git cmake libcurl4-openssl-dev
+
+WORKDIR /app
+
+COPY . .
+
+# Use the default MUSA archs if not specified
+RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
+        export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release --target llama-server -j$(nproc) && \
+    mkdir -p /app/lib && \
+    find build -name "*.so" -exec cp {} /app/lib \;
+
+FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime
+
+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev libgomp1 curl
+
+COPY --from=build /app/lib/ /
+COPY --from=build /app/build/bin/llama-server /llama-server
+
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/llama-server" ]
--- a/.devops/llama-server-rocm.Dockerfile
+++ b/.devops/llama-server-rocm.Dockerfile
@@ -0,0 +1,54 @@
+ARG UBUNTU_VERSION=22.04
+
+# This needs to generally match the container host's environment.
+ARG ROCM_VERSION=5.6
+
+# Target the ROCm build image
+ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
+
+FROM ${BASE_ROCM_DEV_CONTAINER} AS build
+
+# Unless otherwise specified, we make a fat build.
+# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
+# This is mostly tied to rocBLAS supported archs.
+ARG ROCM_DOCKER_ARCH="\
+    gfx803 \
+    gfx900 \
+    gfx906 \
+    gfx908 \
+    gfx90a \
+    gfx1010 \
+    gfx1030 \
+    gfx1100 \
+    gfx1101 \
+    gfx1102"
+
+COPY requirements.txt   requirements.txt
+COPY requirements       requirements
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+# Set the AMD GPU target architectures
+ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
+# Enable ROCm
+ENV GGML_HIPBLAS=1
+ENV CC=/opt/rocm/llvm/bin/clang
+ENV CXX=/opt/rocm/llvm/bin/clang++
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+# Enable cURL
+ENV LLAMA_CURL=1
+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev curl
+
+RUN make -j$(nproc) llama-server
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/app/llama-server" ]
--- a/.devops/llama-server-vulkan.Dockerfile
+++ b/.devops/llama-server-vulkan.Dockerfile
@@ -0,0 +1,31 @@
+ARG UBUNTU_VERSION=jammy
+
+FROM ubuntu:$UBUNTU_VERSION AS build
+
+# Install build tools
+RUN apt update && apt install -y git build-essential cmake wget
+
+# Install Vulkan SDK and cURL
+RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
+    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
+    apt update -y && \
+    apt-get install -y vulkan-sdk libcurl4-openssl-dev curl
+
+# Build it
+WORKDIR /app
+COPY . .
+RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
+    cmake --build build --config Release --target llama-server
+
+# Clean up
+WORKDIR /
+RUN cp /app/build/bin/llama-server /llama-server && \
+    rm -rf /app
+
+ENV LC_ALL=C.utf8
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/llama-server" ]
--- a/.devops/llama-server.Dockerfile
+++ b/.devops/llama-server.Dockerfile
@@ -0,0 +1,41 @@
+ARG UBUNTU_VERSION=22.04
+
+FROM ubuntu:$UBUNTU_VERSION AS build
+
+RUN apt-get update && \
+    apt-get install -y build-essential git cmake libcurl4-openssl-dev
+
+WORKDIR /app
+
+COPY . .
+
+
+RUN \
+    # Build multiple versions of the CPU backend
+    scripts/build-cpu.sh avx         -DGGML_AVX=ON -DGGML_AVX2=OFF && \
+    scripts/build-cpu.sh avx2        -DGGML_AVX=ON -DGGML_AVX2=ON && \
+    scripts/build-cpu.sh avx512      -DGGML_AVX=ON -DGGML_AVX2=ON -DGGML_AVX512=ON && \
+    scripts/build-cpu.sh amx         -DGGML_AVX=ON -DGGML_AVX2=ON -DGGML_AVX512=ON -DGGML_AVX_VNNI=ON -DGGML_AVX512_VNNI=ON -DGGML_AMX_TILE=ON -DGGML_AMX_INT8=ON && \
+    # Build llama-server
+    cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \
+    cmake --build build --target llama-server -j $(nproc) && \
+    # Copy the built libraries to /app/lib
+    mkdir -p /app/lib && \
+    mv libggml-cpu* /app/lib/ && \
+    find build -name "*.so" -exec cp {} /app/lib/ \;
+
+FROM ubuntu:$UBUNTU_VERSION AS runtime
+
+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev libgomp1 curl
+
+COPY --from=build /app/build/bin/llama-server /llama-server
+COPY --from=build /app/lib/ /
+
+ENV LC_ALL=C.utf8
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/llama-server" ]
--- a/.devops/musa.Dockerfile
+++ b/.devops/musa.Dockerfile
@@ -1,108 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-# This needs to generally match the container host's environment.
-ARG MUSA_VERSION=rc3.1.0
-# Target the MUSA build image
-ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
-
-ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
-
-FROM ${BASE_MUSA_DEV_CONTAINER} AS build
-
-# MUSA architecture to build for (defaults to all supported archs)
-ARG MUSA_DOCKER_ARCH=default
-
-RUN apt-get update && \
-    apt-get install -y \
-    build-essential \
-    cmake \
-    python3 \
-    python3-pip \
-    git \
-    libcurl4-openssl-dev \
-    libgomp1
-
-COPY requirements.txt   requirements.txt
-COPY requirements       requirements
-
-RUN pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt
-
-WORKDIR /app
-
-COPY . .
-
-# Use the default MUSA archs if not specified
-RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
-        export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
-    fi && \
-    cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-    cmake --build build --config Release -j$(nproc)
-
-RUN mkdir -p /app/lib && \
-    find build -name "*.so" -exec cp {} /app/lib \;
-
-RUN mkdir -p /app/full \
-    && cp build/bin/* /app/full \
-    && cp *.py /app/full \
-    && cp -r gguf-py /app/full \
-    && cp -r requirements /app/full \
-    && cp requirements.txt /app/full \
-    && cp .devops/tools.sh /app/full/tools.sh
-
-## Base image
-FROM ${BASE_MUSA_RUN_CONTAINER} AS base
-
-RUN apt-get update \
-    && apt-get install -y libgomp1 curl\
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-COPY --from=build /app/lib/ /app
-
-### Full
-FROM base AS full
-
-COPY --from=build /app/full /app
-
-WORKDIR /app
-
-RUN apt-get update \
-    && apt-get install -y \
-    git \
-    python3 \
-    python3-pip \
-    && pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt \
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-
-ENTRYPOINT ["/app/tools.sh"]
-
-### Light, CLI only
-FROM base AS light
-
-COPY --from=build /app/full/llama-cli /app
-
-WORKDIR /app
-
-ENTRYPOINT [ "/app/llama-cli" ]
-
-### Server, Server only
-FROM base AS server
-
-ENV LLAMA_ARG_HOST=0.0.0.0
-
-COPY --from=build /app/full/llama-server /app
-
-WORKDIR /app
-
-HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-
-ENTRYPOINT [ "/app/llama-server" ]
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -31,7 +31,6 @@
  # Increases the runtime closure size by ~700M
  useMpi ? false,
  useRocm ? config.rocmSupport,
-  rocmGpuTargets ? builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets,
  enableCurl ? true,
  useVulkan ? false,
  llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
@@ -189,7 +188,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
    ]
    ++ optionals useRocm [
      (cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
-      (cmakeFeature "CMAKE_HIP_ARCHITECTURES" rocmGpuTargets)
+      (cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets))
    ]
    ++ optionals useMetalKit [
      (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
--- a/.devops/rocm.Dockerfile
+++ b/.devops/rocm.Dockerfile
@@ -1,113 +0,0 @@
-ARG UBUNTU_VERSION=24.04
-
-# This needs to generally match the container host's environment.
-ARG ROCM_VERSION=6.3
-ARG AMDGPU_VERSION=6.3
-
-# Target the CUDA build image
-ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
-
-### Build image
-FROM ${BASE_ROCM_DEV_CONTAINER} AS build
-
-# Unless otherwise specified, we make a fat build.
-# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
-# This is mostly tied to rocBLAS supported archs.
-# gfx803, gfx900, gfx1032, gfx1101, gfx1102,not officialy supported
-# gfx906 is deprecated
-#check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.2.4/reference/system-requirements.html
-
-#ARG ROCM_DOCKER_ARCH='gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102'
-ARG ROCM_DOCKER_ARCH=gfx1100
-
-# Set nvcc architectured
-ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
-# Enable ROCm
-# ENV CC=/opt/rocm/llvm/bin/clang
-# ENV CXX=/opt/rocm/llvm/bin/clang++
-
-RUN apt-get update \
-    && apt-get install -y \
-    build-essential \
-    cmake \
-    git \
-    libcurl4-openssl-dev \
-    curl \
-    libgomp1
-
-WORKDIR /app
-
-COPY . .
-
-RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
-    cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON \
-    && cmake --build build --config Release -j$(nproc)
-
-RUN mkdir -p /app/lib \
-    && find build -name "*.so" -exec cp {} /app/lib \;
-
-RUN mkdir -p /app/full \
-    && cp build/bin/* /app/full \
-    && cp *.py /app/full \
-    && cp -r gguf-py /app/full \
-    && cp -r requirements /app/full \
-    && cp requirements.txt /app/full \
-    && cp .devops/tools.sh /app/full/tools.sh
-
-## Base image
-FROM ${BASE_ROCM_DEV_CONTAINER} AS base
-
-RUN apt-get update \
-    && apt-get install -y libgomp1 curl\
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-COPY --from=build /app/lib/ /app
-
-### Full
-FROM base AS full
-
-COPY --from=build /app/full /app
-
-WORKDIR /app
-
-RUN apt-get update \
-    && apt-get install -y \
-    git \
-    python3-pip \
-    python3 \
-    python3-wheel\
-    && pip install --break-system-packages --upgrade setuptools \
-    && pip install --break-system-packages -r requirements.txt \
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-ENTRYPOINT ["/app/tools.sh"]
-
-### Light, CLI only
-FROM base AS light
-
-COPY --from=build /app/full/llama-cli /app
-
-WORKDIR /app
-
-ENTRYPOINT [ "/app/llama-cli" ]
-
-### Server, Server only
-FROM base AS server
-
-ENV LLAMA_ARG_HOST=0.0.0.0
-
-COPY --from=build /app/full/llama-server /app
-
-WORKDIR /app
-
-HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-
-ENTRYPOINT [ "/app/llama-server" ]
--- a/.devops/tools.sh
+++ b/.devops/tools.sh
@@ -8,11 +8,11 @@ arg1="$1"
 shift

 if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
-    exec python3 ./convert_hf_to_gguf.py "$@"
+    python3 ./convert_hf_to_gguf.py "$@"
 elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
-    exec ./llama-quantize "$@"
+    ./llama-quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
-    exec ./llama-cli "$@"
+    ./llama-cli "$@"
 elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
    echo "Converting PTH to GGML..."
    for i in `ls $1/$2/ggml-model-f16.bin*`; do
@@ -20,11 +20,11 @@ elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
            echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
        else
            echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
-            exec ./llama-quantize "$i" "${i/f16/q4_0}" q4_0
+            ./llama-quantize "$i" "${i/f16/q4_0}" q4_0
        fi
    done
 elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
-    exec ./llama-server "$@"
+    ./llama-server "$@"
 else
    echo "Unknown command: $arg1"
    echo "Available commands: "
--- a/.devops/vulkan.Dockerfile
+++ b/.devops/vulkan.Dockerfile
@@ -1,88 +0,0 @@
-ARG UBUNTU_VERSION=jammy
-
-FROM ubuntu:$UBUNTU_VERSION AS build
-
-# Install build tools
-RUN apt update && apt install -y git build-essential cmake wget
-
-# Install Vulkan SDK and cURL
-RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
-    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
-    apt update -y && \
-    apt-get install -y vulkan-sdk libcurl4-openssl-dev curl
-
-# Build it
-WORKDIR /app
-
-COPY . .
-
-RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
-    cmake --build build --config Release -j$(nproc)
-
-RUN mkdir -p /app/lib && \
-    find build -name "*.so" -exec cp {} /app/lib \;
-
-RUN mkdir -p /app/full \
-    && cp build/bin/* /app/full \
-    && cp *.py /app/full \
-    && cp -r gguf-py /app/full \
-    && cp -r requirements /app/full \
-    && cp requirements.txt /app/full \
-    && cp .devops/tools.sh /app/full/tools.sh
-
-## Base image
-FROM ubuntu:$UBUNTU_VERSION AS base
-
-RUN apt-get update \
-    && apt-get install -y libgomp1 curl\
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-COPY --from=build /app/lib/ /app
-
-### Full
-FROM base AS full
-
-COPY --from=build /app/full /app
-
-WORKDIR /app
-
-RUN apt-get update \
-    && apt-get install -y \
-    git \
-    python3 \
-    python3-pip \
-    && pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt \
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-ENTRYPOINT ["/app/tools.sh"]
-
-### Light, CLI only
-FROM base AS light
-
-COPY --from=build /app/full/llama-cli /app
-
-WORKDIR /app
-
-ENTRYPOINT [ "/app/llama-cli" ]
-
-### Server, Server only
-FROM base AS server
-
-ENV LLAMA_ARG_HOST=0.0.0.0
-
-COPY --from=build /app/full/llama-server /app
-
-WORKDIR /app
-
-HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-
-ENTRYPOINT [ "/app/llama-server" ]
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -317,7 +317,7 @@ jobs:
          wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
          sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
          sudo apt-get update -y
-          sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk
+          sudo apt-get install -y build-essential vulkan-sdk

      - name: Build
        id: cmake_build
@@ -327,12 +327,6 @@ jobs:
          cmake -DGGML_VULKAN=ON ..
          cmake --build . --config Release -j $(nproc)

-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose --timeout 900
-
  ubuntu-22-cmake-hip:
    runs-on: ubuntu-22.04
    container: rocm/dev-ubuntu-22.04:6.0.2
@@ -558,44 +552,35 @@ jobs:
            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO

-  macOS-latest-swift:
-    runs-on: macos-latest
-
-    strategy:
-      matrix:
-        destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS']
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-
-      - name: Build llama.cpp with CMake
-        id: cmake_build
-        run: |
-          sysctl -a
-          mkdir build
-          cd build
-          cmake -G Xcode .. \
-            -DGGML_METAL_USE_BF16=ON \
-            -DGGML_METAL_EMBED_LIBRARY=ON \
-            -DLLAMA_BUILD_EXAMPLES=OFF \
-            -DLLAMA_BUILD_TESTS=OFF \
-            -DLLAMA_BUILD_SERVER=OFF \
-            -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"
-          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
-          sudo cmake --install . --config Release
-
-      - name: xcodebuild for swift package
-        id: xcodebuild
-        run: |
-          xcodebuild -scheme llama-Package -destination "${{ matrix.destination }}"
+# TODO: temporarily disabled; see the following PR for a possible re-enable:
+#       https://github.com/ggerganov/llama.cpp/pull/10525
+#  macOS-latest-swift:
+#    runs-on: macos-latest
+#
+#    strategy:
+#      matrix:
+#        destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS']
+#
+#    steps:
+#      - name: Clone
+#        id: checkout
+#        uses: actions/checkout@v4
+#
+#      - name: Dependencies
+#        id: depends
+#        continue-on-error: true
+#        run: |
+#          brew update
+#
+#      - name: xcodebuild for swift package
+#        id: xcodebuild
+#        run: |
+#          xcodebuild -scheme llama -destination "${{ matrix.destination }}"
+#
+#      - name: Build Swift Example
+#        id: make_build_swift_example
+#        run: |
+#            make swift

  windows-msys2:
    runs-on: windows-latest
@@ -668,8 +653,6 @@ jobs:
            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
          - build: 'msvc-arm64'
            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
-          - build: 'llvm-arm64-opencl-adreno'
-            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'

    steps:
      - name: Clone
@@ -711,28 +694,6 @@ jobs:
        run: |
          choco install ninja

-      - name: Install OpenCL Headers and Libs
-        id: install_opencl
-        if: ${{ matrix.build == 'llvm-arm64-opencl-adreno' }}
-        run: |
-          git clone https://github.com/KhronosGroup/OpenCL-Headers
-          cd OpenCL-Headers
-          mkdir build && cd build
-          cmake .. `
-            -DBUILD_TESTING=OFF `
-            -DOPENCL_HEADERS_BUILD_TESTING=OFF `
-            -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
-            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
-          cmake --build . --target install
-          git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
-          cd OpenCL-ICD-Loader
-          mkdir build-arm64-release && cd build-arm64-release
-          cmake .. `
-            -A arm64 `
-            -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
-            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
-          cmake --build . --target install --config release
-
      - name: Build
        id: cmake_build
        run: |
@@ -762,7 +723,7 @@ jobs:
      - name: Test
        id: cmake_test
        # not all machines have native AVX-512
-        if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'llvm-arm64-opencl-adreno' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
+        if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
        run: |
          cd build
          ctest -L main -C Release --verbose --timeout 900
@@ -1143,29 +1104,6 @@ jobs:
      - name: Checkout code
        uses: actions/checkout@v4

-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          mkdir build
-          cd build
-          cmake -G Xcode .. \
-            -DGGML_METAL_USE_BF16=ON \
-            -DGGML_METAL_EMBED_LIBRARY=ON \
-            -DLLAMA_BUILD_EXAMPLES=OFF \
-            -DLLAMA_BUILD_TESTS=OFF \
-            -DLLAMA_BUILD_SERVER=OFF \
-            -DCMAKE_SYSTEM_NAME=iOS \
-            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
-            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
-          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
-          sudo cmake --install . --config Release
-
-      - name: xcodebuild for swift package
-        id: xcodebuild
-        run: |
-          xcodebuild -scheme llama-Package -destination 'generic/platform=iOS'
-
      - name: Build Xcode project
        run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build

@@ -1193,6 +1131,23 @@ jobs:

          ./gradlew build --no-daemon

+#  freeBSD-latest:
+#    runs-on: macos-12
+#    steps:
+#    - name: Clone
+#      uses: actions/checkout@v4
+#
+#    - name: Build
+#      uses: cross-platform-actions/action@v0.19.0
+#      with:
+#        operating_system: freebsd
+#        version: '13.2'
+#        hypervisor: 'qemu'
+#        run: |
+#            sudo pkg update
+#            sudo pkg install -y gmake automake autoconf pkgconf llvm15 openblas
+#            gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j `sysctl -n hw.ncpu`
+
  release:
    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}

--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -34,14 +34,21 @@ jobs:
    strategy:
      matrix:
        config:
-          # Multi-stage build
-          - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: false}
-          - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
-          - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
-          - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
-          - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
+          - { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          - { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "light-musa", dockerfile: ".devops/llama-cli-musa.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "server-musa", dockerfile: ".devops/llama-server-musa.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "full-musa", dockerfile: ".devops/full-musa.Dockerfile", platforms: "linux/amd64" }
          # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
-          #- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: true }
+          #- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          #- { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          #- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          - { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" }
    steps:
      - name: Check out the repo
        uses: actions/checkout@v4
@@ -49,10 +56,10 @@ jobs:
          fetch-depth: 0 # preserve git history, so we can determine the build number

      - name: Set up QEMU
-        uses: docker/setup-qemu-action@v3
+        uses: docker/setup-qemu-action@v2

      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@v2

      - name: Log in to Docker Hub
        uses: docker/login-action@v2
@@ -72,34 +79,25 @@ jobs:

          # determine tag name postfix (build number, commit hash)
          if [[ "${{ env.GITHUB_BRANCH_NAME }}" == "master" ]]; then
-            TAG_POSTFIX="-b${BUILD_NUMBER}"
+            TAG_POSTFIX="b${BUILD_NUMBER}"
          else
            SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-')
-            TAG_POSTFIX="-${SAFE_NAME}-${SHORT_HASH}"
+            TAG_POSTFIX="${SAFE_NAME}-${SHORT_HASH}"
          fi
+
          # list all tags possible
-          if [[ "${{ matrix.config.tag }}" == "cpu" ]]; then
-              TYPE=""
-          else
-              TYPE="-${{ matrix.config.tag }}"
-          fi
-          PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:"
-          FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}${TAG_POSTFIX}"
-          LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}${TAG_POSTFIX}"
-          SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}${TAG_POSTFIX}"
-          echo "full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT
-          echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT
-          echo "server_output_tags=$SERVERTAGS" >> $GITHUB_OUTPUT
-          echo "full_output_tags=$FULLTAGS"  # print out for debugging
-          echo "light_output_tags=$LIGHTTAGS"  # print out for debugging
-          echo "server_output_tags=$SERVERTAGS"  # print out for debugging
+          TAGS=""
+          TAGS="${TAGS}ghcr.io/${REPO_OWNER}/${REPO_NAME}:${{ matrix.config.tag }},"
+          TAGS="${TAGS}ghcr.io/${REPO_OWNER}/${REPO_NAME}:${{ matrix.config.tag }}-${TAG_POSTFIX}"
+
+          echo "output_tags=$TAGS" >> $GITHUB_OUTPUT
+          echo "output_tags=$TAGS"  # print out for debugging
        env:
          GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'

      # https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example
      - name: Free Disk Space (Ubuntu)
-        if: ${{ matrix.config.free_disk_space == true }}
        uses: jlumbroso/free-disk-space@main
        with:
          # this might remove tools that are actually needed,
@@ -115,59 +113,13 @@ jobs:
          docker-images: true
          swap-storage: true

-      - name: Build and push Full Docker image (tagged + versioned)
-        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.full == true }}
+      - name: Build and push Docker image (tagged + versioned)
+        if: ${{ github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
        uses: docker/build-push-action@v6
        with:
          context: .
          push: true
          platforms: ${{ matrix.config.platforms }}
          # tag list is generated from step above
-          tags: ${{ steps.tag.outputs.full_output_tags }}
+          tags: ${{ steps.tag.outputs.output_tags }}
          file: ${{ matrix.config.dockerfile }}
-          target: full
-          provenance: false
-          # using github experimental cache
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
-          # return to this if the experimental github cache is having issues
-          #cache-to: type=local,dest=/tmp/.buildx-cache
-          #cache-from: type=local,src=/tmp/.buildx-cache
-
-      - name: Build and push Light Docker image (tagged + versioned)
-        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.light == true }}
-        uses: docker/build-push-action@v6
-        with:
-          context: .
-          push: true
-          platforms: ${{ matrix.config.platforms }}
-          # tag list is generated from step above
-          tags: ${{ steps.tag.outputs.light_output_tags }}
-          file: ${{ matrix.config.dockerfile }}
-          target: light
-          provenance: false
-          # using github experimental cache
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
-          # return to this if the experimental github cache is having issues
-          #cache-to: type=local,dest=/tmp/.buildx-cache
-          #cache-from: type=local,src=/tmp/.buildx-cache
-
-      - name: Build and push Server Docker image (tagged + versioned)
-        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.server == true }}
-        uses: docker/build-push-action@v6
-        with:
-          context: .
-          push: true
-          platforms: ${{ matrix.config.platforms }}
-          # tag list is generated from step above
-          tags: ${{ steps.tag.outputs.server_output_tags }}
-          file: ${{ matrix.config.dockerfile }}
-          target: server
-          provenance: false
-          # using github experimental cache
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
-          # return to this if the experimental github cache is having issues
-          #cache-to: type=local,dest=/tmp/.buildx-cache
-          #cache-from: type=local,src=/tmp/.buildx-cache
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -76,26 +76,20 @@ jobs:
        run: |
          pip install -r examples/server/tests/requirements.txt

-      # Setup nodejs (to be used for verifying bundled index.html)
-      - uses: actions/setup-node@v4
-        with:
-          node-version: '22.11.0'
-
-      - name: Verify bundled index.html
-        id: verify_server_index_html
+      - name: Verify server deps
+        id: verify_server_deps
        run: |
          git config --global --add safe.directory $(realpath .)
-          cd examples/server/webui
+          cd examples/server
+          git ls-files --others --modified
          git status
-          npm ci
-          npm run build
+          ./deps.sh
          git status
-          modified_files="$(git status -s)"
-          echo "Modified files: ${modified_files}"
-          if [ -n "${modified_files}" ]; then
-            echo "Repository is dirty or server/webui is not built as expected"
-            echo "Hint: You may need to follow Web UI build guide in server/README.md"
-            echo "${modified_files}"
+          not_ignored_files="$(git ls-files --others --modified)"
+          echo "Modified files: ${not_ignored_files}"
+          if [ -n "${not_ignored_files}" ]; then
+            echo "Repository is dirty or server deps are not built as expected"
+            echo "${not_ignored_files}"
            exit 1
          fi

--- a/.gitignore
+++ b/.gitignore
@@ -104,10 +104,6 @@ examples/server/*.mjs.hpp
 !examples/sycl/*.bat
 !examples/sycl/*.sh

-# Server Web UI temporary files
-node_modules
-examples/server/webui/dist
-
 # Python

 /.venv
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -46,9 +46,11 @@ if (WIN32)
    add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
 endif()

-if (MSVC)
-    add_compile_options("$<$<COMPILE_LANGUAGE:C>:/utf-8>")
-    add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/utf-8>")
+if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
+    add_compile_options("$<$<COMPILE_LANGUAGE:C>:/source-charset:utf-8>")
+    add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/source-charset:utf-8>")
+    add_compile_options("$<$<COMPILE_LANGUAGE:C>:/execution-charset:utf-8>")
+    add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/execution-charset:utf-8>")
 endif()

 #
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -31,13 +31,6 @@
    { "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16":    "ON" } },
    { "name": "vulkan",   "hidden": true, "cacheVariables": { "GGML_VULKAN":      "ON" } },

-    {
-        "name": "x64-windows-llvm", "hidden": true,
-        "cacheVariables": {
-            "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/x64-windows-llvm.cmake"
-        }
-    },
-
    {
        "name": "arm64-windows-msvc", "hidden": true,
        "architecture": { "value": "arm64",    "strategy": "external" },
@@ -77,11 +70,6 @@
    { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc",  "reldbg" ] },
    { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc",  "reldbg", "static" ] },

-    { "name": "x64-windows-llvm-debug", "inherits": [ "base", "x64-windows-llvm", "debug" ] },
-    { "name": "x64-windows-llvm-release", "inherits": [ "base", "x64-windows-llvm", "release" ] },
-    { "name": "x64-windows-llvm-reldbg", "inherits": [ "base", "x64-windows-llvm", "reldbg" ] },
-    { "name": "x64-windows-llvm+static-release", "inherits": [ "base", "x64-windows-llvm", "reldbg", "static" ] },
-
    { "name": "x64-windows-msvc-debug", "inherits": [ "base", "debug" ] },
    { "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] },
    { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -1,5 +1,3 @@
 # collaborators can optionally add themselves here to indicate their availability for reviewing related PRs

-/ci/ @ggerganov
-/.devops/ @ngxson
-/examples/server/ @ngxson
+ci/ @ggerganov
--- a/Makefile
+++ b/Makefile
@@ -22,7 +22,6 @@ BUILD_TARGETS = \
 	llama-infill \
 	llama-llava-cli \
 	llama-minicpmv-cli\
-	llama-qwen2vl-cli\
 	llama-lookahead \
 	llama-lookup \
 	llama-lookup-create \
@@ -446,10 +445,6 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 	MK_CFLAGS     += -march=native -mtune=native
 	HOST_CXXFLAGS += -march=native -mtune=native

-	# Usage AMX build test
-	#MK_CFLAGS     += -march=graniterapids -mtune=graniterapids
-	#HOST_CXXFLAGS += -march=graniterapids -mtune=graniterapids
-
 	# Usage AVX-only
 	#MK_CFLAGS   += -mfma -mf16c -mavx
 	#MK_CXXFLAGS += -mfma -mf16c -mavx
@@ -953,6 +948,7 @@ DIR_COMMON = common

 OBJ_GGML = \
 	$(DIR_GGML)/src/ggml.o \
+	$(DIR_GGML)/src/ggml-aarch64.o \
 	$(DIR_GGML)/src/ggml-alloc.o \
 	$(DIR_GGML)/src/ggml-backend.o \
 	$(DIR_GGML)/src/ggml-backend-reg.o \
@@ -960,11 +956,9 @@ OBJ_GGML = \
 	$(DIR_GGML)/src/ggml-quants.o \
 	$(DIR_GGML)/src/ggml-threading.o \
 	$(DIR_GGML)/src/ggml-cpu/ggml-cpu.o \
-	$(DIR_GGML)/src/ggml-cpu/ggml-cpu_cpp.o \
+	$(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o \
 	$(DIR_GGML)/src/ggml-cpu/ggml-cpu-aarch64.o \
-	$(DIR_GGML)/src/ggml-cpu/ggml-cpu-hbm.o \
 	$(DIR_GGML)/src/ggml-cpu/ggml-cpu-quants.o \
-	$(DIR_GGML)/src/ggml-cpu/ggml-cpu-traits.o \
 	$(OBJ_GGML_EXT)

 OBJ_LLAMA = \
@@ -1104,10 +1098,17 @@ DEP_FILES = $(OBJ_GGML:.o=.d) $(OBJ_LLAMA:.o=.d) $(OBJ_COMMON:.o=.d)
 # Default target
 all: $(BUILD_TARGETS)

-# force c++ build for source file that have same name as c file
 # Note: need this exception because `ggml-cpu.c` and `ggml-cpu.cpp` both produce the same obj/dep files
-$(DIR_GGML)/%_cpp.o: $(DIR_GGML)/%.cpp
-	$(CXX) $(CXXFLAGS) -MMD -c $< -o $@
+#       g++ -M -I ./ggml/include/ -I ./ggml/src ggml/src/ggml-cpu/ggml-cpu.cpp | grep ggml
+$(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o: \
+	ggml/src/ggml-cpu/ggml-cpu.cpp \
+	ggml/include/ggml-backend.h \
+	ggml/include/ggml.h \
+	ggml/include/ggml-alloc.h \
+	ggml/src/ggml-backend-impl.h \
+	ggml/include/ggml-cpu.h \
+	ggml/src/ggml-impl.h
+	$(CXX) $(CXXFLAGS)   -c $< -o $@

 # Rules for building object files
 $(DIR_GGML)/%.o: $(DIR_GGML)/%.c
@@ -1144,15 +1145,8 @@ $(LIB_COMMON_S): $(OBJ_COMMON)
 # Include dependency files
 -include $(DEP_FILES)

-# Clean generated server assets
-clean-server-assets:
-	find examples/server -type f -name "*.js.hpp"   -delete
-	find examples/server -type f -name "*.mjs.hpp"  -delete
-	find examples/server -type f -name "*.css.hpp"  -delete
-	find examples/server -type f -name "*.html.hpp" -delete
-
 # Clean rule
-clean: clean-server-assets
+clean:
 	rm -vrf $(BUILD_TARGETS) $(TEST_TARGETS)
 	rm -rvf *.a *.dll *.so *.dot
 	find ggml src common tests examples pocs -type f -name "*.o" -delete
@@ -1360,14 +1354,20 @@ llama-server: \
 	examples/server/utils.hpp \
 	examples/server/httplib.h \
 	examples/server/index.html.hpp \
+	examples/server/completion.js.hpp \
 	examples/server/loading.html.hpp \
+	examples/server/deps_daisyui.min.css.hpp \
+	examples/server/deps_markdown-it.js.hpp \
+	examples/server/deps_tailwindcss.js.hpp \
+	examples/server/deps_vue.esm-browser.js.hpp \
 	common/json.hpp \
+	common/stb_image.h \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)

 # Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
-examples/server/%.hpp: examples/server/public/% FORCE Makefile
+examples/server/%.hpp: examples/server/public/% Makefile
 	@( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \
 		echo "unsigned char $${NAME}[] = {" && \
 		cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \
@@ -1405,14 +1405,6 @@ llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual

-llama-qwen2vl-cli: examples/llava/qwen2vl-cli.cpp \
-	examples/llava/llava.cpp \
-	examples/llava/llava.h \
-	examples/llava/clip.cpp \
-	examples/llava/clip.h \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
-
 ifeq ($(UNAME_S),Darwin)
 swift: examples/batched.swift
 	(cd examples/batched.swift; make build)
@@ -1550,7 +1542,7 @@ llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
 # Deprecated binaries that we want to keep around long enough for people to migrate to the new filenames, then these can be removed.
 #
 # Mark legacy binary targets as .PHONY so that they are always checked.
-.PHONY: FORCE main quantize perplexity embedding server
+.PHONY: main quantize perplexity embedding server

 # Define the object file target
 examples/deprecation-warning/deprecation-warning.o: examples/deprecation-warning/deprecation-warning.cpp
--- a/Package.swift
+++ b/Package.swift
@@ -2,6 +2,59 @@

 import PackageDescription

+var sources = [
+    "src/llama.cpp",
+    "src/llama-vocab.cpp",
+    "src/llama-grammar.cpp",
+    "src/llama-sampling.cpp",
+    "src/unicode.cpp",
+    "src/unicode-data.cpp",
+    "ggml/src/ggml.c",
+    "ggml/src/ggml-aarch64.c",
+    "ggml/src/ggml-alloc.c",
+    "ggml/src/ggml-backend.cpp",
+    "ggml/src/ggml-backend-reg.cpp",
+    "ggml/src/ggml-cpu/ggml-cpu.c",
+    "ggml/src/ggml-cpu/ggml-cpu.cpp",
+    "ggml/src/ggml-cpu/ggml-cpu-aarch64.c",
+    "ggml/src/ggml-cpu/ggml-cpu-quants.c",
+    "ggml/src/ggml-threading.cpp",
+    "ggml/src/ggml-quants.c",
+]
+
+var resources: [Resource] = []
+var linkerSettings: [LinkerSetting] = []
+var cSettings: [CSetting] =  [
+    .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
+    .unsafeFlags(["-fno-objc-arc"]),
+    .headerSearchPath("ggml/src"),
+    .headerSearchPath("ggml/src/ggml-cpu"),
+    // NOTE: NEW_LAPACK will require iOS version 16.4+
+    // We should consider adding this in the future when we drop support for iOS 14
+    // (ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
+    // .define("ACCELERATE_NEW_LAPACK"),
+    // .define("ACCELERATE_LAPACK_ILP64")
+    .define("GGML_USE_CPU"),
+]
+
+
+#if canImport(Darwin)
+sources.append("ggml/src/ggml-common.h")
+sources.append("ggml/src/ggml-metal/ggml-metal.m")
+resources.append(.process("ggml/src/ggml-metal/ggml-metal.metal"))
+linkerSettings.append(.linkedFramework("Accelerate"))
+cSettings.append(
+    contentsOf: [
+        .define("GGML_USE_ACCELERATE"),
+        .define("GGML_USE_METAL"),
+    ]
+)
+#endif
+
+#if os(Linux)
+    cSettings.append(.define("_GNU_SOURCE"))
+#endif
+
 let package = Package(
    name: "llama",
    platforms: [
@@ -14,6 +67,26 @@ let package = Package(
        .library(name: "llama", targets: ["llama"]),
    ],
    targets: [
-        .systemLibrary(name: "llama", pkgConfig: "llama"),
-    ]
+        .target(
+            name: "llama",
+            path: ".",
+            exclude: [
+               "build",
+               "cmake",
+               "examples",
+               "scripts",
+               "models",
+               "tests",
+               "CMakeLists.txt",
+               "Makefile",
+               "ggml/src/ggml-metal-embed.metal"
+            ],
+            sources: sources,
+            resources: resources,
+            publicHeadersPath: "spm-headers",
+            cSettings: cSettings,
+            linkerSettings: linkerSettings
+        )
+    ],
+    cxxLanguageStandard: .cxx17
 )
--- a/README.md
+++ b/README.md
@@ -98,7 +98,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat)
 - [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a)
 - [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM)
- [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)

 #### Multimodal

@@ -111,7 +110,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
 - [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
 - [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)
- [x] [Qwen2-VL](https://huggingface.co/collections/Qwen/qwen2-vl-66cee7455501d7126940800d)

 </details>

@@ -221,7 +219,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU |
 | [MUSA](docs/build.md#musa) | Moore Threads MTT GPU |
 | [CUDA](docs/build.md#cuda) | Nvidia GPU |
-| [HIP](docs/build.md#hip) | AMD GPU |
+| [hipBLAS](docs/build.md#hipblas) | AMD GPU |
 | [Vulkan](docs/build.md#vulkan) | GPU |
 | [CANN](docs/build.md#cann) | Ascend NPU |

@@ -414,7 +412,7 @@ To learn more about model quantization, [read this documentation](examples/quant
 [^1]: [examples/perplexity/README.md](examples/perplexity/README.md)
 [^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)

-## [`llama-bench`](examples/llama-bench)
+## [`llama-bench`](examples/llama-bench)

 #### Benchmark the performance of the inference for various parameters.

@@ -435,20 +433,6 @@ To learn more about model quantization, [read this documentation](examples/quant

    </details>

-## [`llama-run`](examples/run)
-
-#### A comprehensive example for running `llama.cpp` models. Useful for inferencing. Used with RamaLama [^3].
-
- <details>
-    <summary>Run a model with a specific prompt (by default it's pulled from Ollama registry)</summary>
-
-    ```bash
-    llama-run granite-code
-    ```
-
-    </details>
-
-[^3]: [RamaLama](https://github.com/containers/ramalama)

 ## [`llama-simple`](examples/simple)

--- a/Sources/llama/llama.h
+++ b/Sources/llama/llama.h
@@ -1,4 +0,0 @@
-#pragma once
-
-#include <llama.h>
-
--- a/Sources/llama/module.modulemap
+++ b/Sources/llama/module.modulemap
@@ -1,5 +0,0 @@
-module llama [system] {
-    header "llama.h"
-    link "llama"
-    export *
-}
--- a/cmake/llama.pc.in
+++ b/cmake/llama.pc.in
@@ -6,5 +6,5 @@ includedir=${prefix}/include
 Name: llama
 Description: Port of Facebook's LLaMA model in C/C++
 Version: @PROJECT_VERSION@
-Libs: -L${libdir} -lggml  -lggml-base -lllama
+Libs: -L${libdir} -lllama
 Cflags: -I${includedir}
--- a/cmake/x64-windows-llvm.cmake
+++ b/cmake/x64-windows-llvm.cmake
@@ -1,11 +0,0 @@
-set( CMAKE_SYSTEM_NAME Windows )
-set( CMAKE_SYSTEM_PROCESSOR x86_64 )
-
-set( CMAKE_C_COMPILER    clang )
-set( CMAKE_CXX_COMPILER  clang++ )
-
-set( arch_c_flags "-march=native" )
-
-set( CMAKE_C_FLAGS_INIT   "${arch_c_flags}" )
-set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags}" )
-
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -81,7 +81,7 @@ set(LLAMA_COMMON_EXTRA_LIBS build_info)
 # Use curl to download model url
 if (LLAMA_CURL)
    find_package(CURL REQUIRED)
-    target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
+    add_definitions(-DLLAMA_USE_CURL)
    include_directories(${CURL_INCLUDE_DIRS})
    find_library(CURL_LIBRARY curl REQUIRED)
    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -119,65 +119,32 @@ std::string common_arg::to_string() {
 // utils
 //

-static void common_params_handle_model_default(
-        std::string & model,
-        std::string & model_url,
-        std::string & hf_repo,
-        std::string & hf_file) {
-    if (!hf_repo.empty()) {
+static void common_params_handle_model_default(common_params & params) {
+    if (!params.hf_repo.empty()) {
        // short-hand to avoid specifying --hf-file -> default it to --model
-        if (hf_file.empty()) {
-            if (model.empty()) {
+        if (params.hf_file.empty()) {
+            if (params.model.empty()) {
                throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n");
            }
-            hf_file = model;
-        } else if (model.empty()) {
+            params.hf_file = params.model;
+        } else if (params.model.empty()) {
            // this is to avoid different repo having same file name, or same file name in different subdirs
-            std::string filename = hf_repo + "_" + hf_file;
+            std::string filename = params.hf_repo + "_" + params.hf_file;
            // to make sure we don't have any slashes in the filename
            string_replace_all(filename, "/", "_");
-            model = fs_get_cache_file(filename);
+            params.model = fs_get_cache_file(filename);
        }
-    } else if (!model_url.empty()) {
-        if (model.empty()) {
-            auto f = string_split<std::string>(model_url, '#').front();
+    } else if (!params.model_url.empty()) {
+        if (params.model.empty()) {
+            auto f = string_split<std::string>(params.model_url, '#').front();
            f = string_split<std::string>(f, '?').front();
-            model = fs_get_cache_file(string_split<std::string>(f, '/').back());
+            params.model = fs_get_cache_file(string_split<std::string>(f, '/').back());
        }
-    } else if (model.empty()) {
-        model = DEFAULT_MODEL_PATH;
+    } else if (params.model.empty()) {
+        params.model = DEFAULT_MODEL_PATH;
    }
 }

-const std::vector<ggml_type> kv_cache_types = {
-    GGML_TYPE_F32,
-    GGML_TYPE_F16,
-    GGML_TYPE_BF16,
-    GGML_TYPE_Q8_0,
-    GGML_TYPE_Q4_0,
-    GGML_TYPE_Q4_1,
-    GGML_TYPE_IQ4_NL,
-    GGML_TYPE_Q5_0,
-    GGML_TYPE_Q5_1,
-};
-
-static ggml_type kv_cache_type_from_str(const std::string & s) {
-    for (const auto & type : kv_cache_types) {
-        if (ggml_type_name(type) == s) {
-            return type;
-        }
-    }
-    throw std::runtime_error("Unsupported cache type: " + s);
-}
-
-static std::string get_all_kv_cache_types() {
-    std::ostringstream msg;
-    for (const auto & type : kv_cache_types) {
-        msg << ggml_type_name(type) << (&type == &kv_cache_types.back() ? "" : ", ");
-    }
-    return msg.str();
-}
-
 //
 // CLI argument parsing functions
 //
@@ -280,9 +247,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
        throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
    }

-    // TODO: refactor model params in a common struct
-    common_params_handle_model_default(params.model,         params.model_url,         params.hf_repo,         params.hf_file);
-    common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file);
+    common_params_handle_model_default(params);

    if (params.escape) {
        string_process_escapes(params.prompt);
@@ -626,7 +591,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params) {
            params.ctx_shift = false;
        }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
    add_opt(common_arg(
        {"--chunks"}, "N",
        string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
@@ -821,7 +786,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params) {
            params.warmup = false;
        }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
    add_opt(common_arg(
        {"--spm-infill"},
        string_format(
@@ -848,7 +813,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_sparam());
    add_opt(common_arg(
-        {"--sampling-seq", "--sampler-seq"}, "SEQUENCE",
+        {"--sampling-seq"}, "SEQUENCE",
        string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
        [](common_params & params, const std::string & value) {
            params.sampling.samplers = common_sampler_types_from_chars(value);
@@ -861,6 +826,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.sampling.ignore_eos = true;
        }
    ).set_sparam());
+    add_opt(common_arg(
+        {"--penalize-nl"},
+        string_format("penalize newline tokens (default: %s)", params.sampling.penalize_nl ? "true" : "false"),
+        [](common_params & params) {
+            params.sampling.penalize_nl = true;
+        }
+    ).set_sparam());
    add_opt(common_arg(
        {"--temp"}, "N",
        string_format("temperature (default: %.1f)", (double)params.sampling.temp),
@@ -915,9 +887,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        {"--repeat-last-n"}, "N",
        string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sampling.penalty_last_n),
        [](common_params & params, int value) {
-            if (value < -1) {
-                throw std::runtime_error(string_format("error: invalid repeat-last-n = %d\n", value));
-            }
            params.sampling.penalty_last_n = value;
            params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n);
        }
@@ -972,9 +941,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        {"--dry-penalty-last-n"}, "N",
        string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sampling.dry_penalty_last_n),
        [](common_params & params, int value) {
-            if (value < -1) {
-                throw std::runtime_error(string_format("error: invalid dry-penalty-last-n = %d\n", value));
-            }
            params.sampling.dry_penalty_last_n = value;
        }
    ).set_sparam());
@@ -1208,28 +1174,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
    add_opt(common_arg(
        {"-ctk", "--cache-type-k"}, "TYPE",
-        string_format(
-            "KV cache data type for K\n"
-            "allowed values: %s\n"
-            "(default: %s)",
-            get_all_kv_cache_types().c_str(),
-            ggml_type_name(params.cache_type_k)
-        ),
+        string_format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()),
        [](common_params & params, const std::string & value) {
-            params.cache_type_k = kv_cache_type_from_str(value);
+            // TODO: get the type right here
+            params.cache_type_k = value;
        }
    ).set_env("LLAMA_ARG_CACHE_TYPE_K"));
    add_opt(common_arg(
        {"-ctv", "--cache-type-v"}, "TYPE",
-        string_format(
-            "KV cache data type for V\n"
-            "allowed values: %s\n"
-            "(default: %s)",
-            get_all_kv_cache_types().c_str(),
-            ggml_type_name(params.cache_type_v)
-        ),
+        string_format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()),
        [](common_params & params, const std::string & value) {
-            params.cache_type_v = kv_cache_type_from_str(value);
+            // TODO: get the type right here
+            params.cache_type_v = value;
        }
    ).set_env("LLAMA_ARG_CACHE_TYPE_V"));
    add_opt(common_arg(
@@ -1587,20 +1543,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.hf_file = value;
        }
    ).set_env("LLAMA_ARG_HF_FILE"));
-    add_opt(common_arg(
-        {"-hfrv", "--hf-repo-v"}, "REPO",
-        "Hugging Face model repository for the vocoder model (default: unused)",
-        [](common_params & params, const std::string & value) {
-            params.vocoder.hf_repo = value;
-        }
-    ).set_env("LLAMA_ARG_HF_REPO_V"));
-    add_opt(common_arg(
-        {"-hffv", "--hf-file-v"}, "FILE",
-        "Hugging Face model file for the vocoder model (default: unused)",
-        [](common_params & params, const std::string & value) {
-            params.vocoder.hf_file = value;
-        }
-    ).set_env("LLAMA_ARG_HF_FILE_V"));
    add_opt(common_arg(
        {"-hft", "--hf-token"}, "TOKEN",
        "Hugging Face access token (default: value from HF_TOKEN environment variable)",
@@ -1769,13 +1711,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.public_path = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
-    add_opt(common_arg(
-        {"--no-webui"},
-        string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
-        [](common_params & params) {
-            params.webui = false;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_WEBUI"));
    add_opt(common_arg(
        {"--embedding", "--embeddings"},
        string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
@@ -2141,35 +2076,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, int value) {
            params.speculative.n_max = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"--draft-min", "--draft-n-min"}, "N",
        string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
        [](common_params & params, int value) {
            params.speculative.n_min = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"--draft-p-split"}, "P",
        string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
        [](common_params & params, const std::string & value) {
            params.speculative.p_split = std::stof(value);
        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_DRAFT_P_SPLIT"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
    add_opt(common_arg(
        {"--draft-p-min"}, "P",
        string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min),
        [](common_params & params, const std::string & value) {
            params.speculative.p_min = std::stof(value);
        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"-cd", "--ctx-size-draft"}, "N",
        string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
        [](common_params & params, int value) {
            params.speculative.n_ctx = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"-devd", "--device-draft"}, "<dev1,dev2,..>",
        "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
@@ -2189,34 +2124,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
            }
        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"-md", "--model-draft"}, "FNAME",
        "draft model for speculative decoding (default: unused)",
        [](common_params & params, const std::string & value) {
            params.speculative.model = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
-
-    add_opt(common_arg(
-        {"-mv", "--model-vocoder"}, "FNAME",
-        "vocoder model for audio generation (default: unused)",
-        [](common_params & params, const std::string & value) {
-            params.vocoder.model = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
-
-    // model-specific
-    add_opt(common_arg(
-        {"--tts-oute-default"},
-        string_format("use default OuteTTS models (note: can download weights from the internet)"),
-        [](common_params & params) {
-            params.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF";
-            params.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf";
-            params.vocoder.hf_repo = "ggml-org/WavTokenizer";
-            params.vocoder.hf_file = "WavTokenizer-Large-75-F16.gguf";
-        }
-    ).set_examples({LLAMA_EXAMPLE_TTS}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));

    return ctx_arg;
 }
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -940,25 +940,6 @@ struct common_init_result common_init_from_params(common_params & params) {
        params.sampling.ignore_eos = false;
    }

-    if (params.sampling.ignore_eos) {
-        for (llama_token i = 0; i < llama_n_vocab(model); i++) {
-            if (llama_token_is_eog(model, i)) {
-                LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
-                params.sampling.logit_bias.push_back({i, -INFINITY});
-            }
-        }
-    }
-
-    if (params.sampling.penalty_last_n == -1) {
-        LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
-        params.sampling.penalty_last_n = llama_n_ctx(lctx);
-    }
-
-    if (params.sampling.dry_penalty_last_n == -1) {
-        LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
-        params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
-    }
-
    if (params.warmup) {
        LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);

@@ -1034,6 +1015,38 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
    return mparams;
 }

+static ggml_type kv_cache_type_from_str(const std::string & s) {
+    if (s == "f32") {
+        return GGML_TYPE_F32;
+    }
+    if (s == "f16") {
+        return GGML_TYPE_F16;
+    }
+    if (s == "bf16") {
+        return GGML_TYPE_BF16;
+    }
+    if (s == "q8_0") {
+        return GGML_TYPE_Q8_0;
+    }
+    if (s == "q4_0") {
+        return GGML_TYPE_Q4_0;
+    }
+    if (s == "q4_1") {
+        return GGML_TYPE_Q4_1;
+    }
+    if (s == "iq4_nl") {
+        return GGML_TYPE_IQ4_NL;
+    }
+    if (s == "q5_0") {
+        return GGML_TYPE_Q5_0;
+    }
+    if (s == "q5_1") {
+        return GGML_TYPE_Q5_1;
+    }
+
+    throw std::runtime_error("Unsupported cache type: " + s);
+}
+
 struct llama_context_params common_context_params_to_llama(const common_params & params) {
    auto cparams = llama_context_default_params();

@@ -1068,8 +1081,8 @@ struct llama_context_params common_context_params_to_llama(const common_params &
        cparams.pooling_type  = LLAMA_POOLING_TYPE_RANK;
    }

-    cparams.type_k = params.cache_type_k;
-    cparams.type_v = params.cache_type_v;
+    cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
+    cparams.type_v = kv_cache_type_from_str(params.cache_type_v);

    return cparams;
 }
@@ -1095,7 +1108,13 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
 #define CURL_MAX_RETRY 3
 #define CURL_RETRY_DELAY_SECONDS 2

-static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
+
+static bool starts_with(const std::string & str, const std::string & prefix) {
+    // While we wait for C++20's std::string::starts_with...
+    return str.rfind(prefix, 0) == 0;
+}
+
+static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
    int remaining_attempts = max_attempts;

    while (remaining_attempts > 0) {
@@ -1119,6 +1138,7 @@ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int ma
 }

 static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
+
    // Initialize libcurl
    std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
    if (!curl) {
@@ -1191,13 +1211,11 @@ static bool common_download_file(const std::string & url, const std::string & pa
        std::string etag;
        std::string last_modified;
    };
-
    common_load_model_from_url_headers headers;
-
    {
        typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
        auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
-            common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
+            common_load_model_from_url_headers *headers = (common_load_model_from_url_headers *) userdata;

            static std::regex header_regex("([^:]+): (.*)\r\n");
            static std::regex etag_regex("ETag", std::regex_constants::icase);
@@ -1781,9 +1799,7 @@ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm)
            break;
        case 0: // max absolute
            for (int i = 0; i < n; i++) {
-                if (sum < std::abs(inp[i])) {
-                    sum = std::abs(inp[i]);
-                }
+                if (sum < std::abs(inp[i])) sum = std::abs(inp[i]);
            }
            sum /= 32760.0; // make an int16 range
            break;
--- a/common/common.h
+++ b/common/common.h
@@ -37,9 +37,9 @@ using llama_tokens = std::vector<llama_token>;

 // build info
 extern int LLAMA_BUILD_NUMBER;
-extern const char * LLAMA_COMMIT;
-extern const char * LLAMA_COMPILER;
-extern const char * LLAMA_BUILD_TARGET;
+extern char const * LLAMA_COMMIT;
+extern char const * LLAMA_COMPILER;
+extern char const * LLAMA_BUILD_TARGET;

 struct common_control_vector_load_info;

@@ -80,7 +80,6 @@ enum llama_example {
    LLAMA_EXAMPLE_LLAVA,
    LLAMA_EXAMPLE_LOOKUP,
    LLAMA_EXAMPLE_PARALLEL,
-    LLAMA_EXAMPLE_TTS,

    LLAMA_EXAMPLE_COUNT,
 };
@@ -96,7 +95,6 @@ enum common_sampler_type {
    COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
    COMMON_SAMPLER_TYPE_XTC         = 8,
    COMMON_SAMPLER_TYPE_INFILL      = 9,
-    COMMON_SAMPLER_TYPE_PENALTIES   = 10,
 };

 // dimensionality reduction methods, used by cvector-generator
@@ -132,6 +130,7 @@ struct common_params_sampling {
    int32_t mirostat           = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
    float   mirostat_tau       = 5.00f; // target entropy
    float   mirostat_eta       = 0.10f; // learning rate
+    bool    penalize_nl        = false; // consider newlines as a repeatable token
    bool    ignore_eos         = false;
    bool    no_perf            = false; // disable performance metrics
    bool    timing_per_token   = false;
@@ -140,7 +139,6 @@ struct common_params_sampling {


    std::vector<enum common_sampler_type> samplers = {
-        COMMON_SAMPLER_TYPE_PENALTIES,
        COMMON_SAMPLER_TYPE_DRY,
        COMMON_SAMPLER_TYPE_TOP_K,
        COMMON_SAMPLER_TYPE_TYPICAL_P,
@@ -160,7 +158,6 @@ struct common_params_sampling {

 struct common_params_speculative {
    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
-
    int32_t n_ctx        =     0; // draft context size
    int32_t n_max        =    16; // maximum number of tokens to draft during speculative decoding
    int32_t n_min        =     5; // minimum number of draft tokens to use for speculative decoding
@@ -174,14 +171,6 @@ struct common_params_speculative {
    std::string model = ""; // draft model for speculative decoding                          // NOLINT
 };

-struct common_params_vocoder {
-    std::string hf_repo = ""; // HF repo                                                     // NOLINT
-    std::string hf_file = ""; // HF file                                                     // NOLINT
-
-    std::string model     = ""; // model path                                                // NOLINT
-    std::string model_url = ""; // model url to download                                     // NOLINT
-};
-
 struct common_params {
    int32_t n_predict             =    -1; // new tokens to predict
    int32_t n_ctx                 =  4096; // context size
@@ -204,13 +193,11 @@ struct common_params {
    float   defrag_thold          =  0.1f; // KV cache defragmentation threshold

    // offload params
-    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
-
-    int32_t n_gpu_layers      = -1;  // number of layers to store in VRAM (-1 - use default)
-    int32_t main_gpu          = 0;   // the GPU that is used for scratch and small tensors
-    float   tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
-
-    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
+    std::vector<ggml_backend_dev_t> devices;         // devices to use for offloading
+    int32_t n_gpu_layers                    =    -1; // number of layers to store in VRAM (-1 - use default)
+    int32_t main_gpu                        =     0; // the GPU that is used for scratch and small tensors
+    float   tensor_split[128]               =   {0}; // how split tensors should be distributed across GPUs
+    enum llama_split_mode        split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs

    struct cpu_params cpuparams;
    struct cpu_params cpuparams_batch;
@@ -224,12 +211,11 @@ struct common_params {
    enum llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
    enum llama_attention_type    attention_type    = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings

-    struct common_params_sampling    sampling;
+    struct common_params_sampling sampling;
    struct common_params_speculative speculative;
-    struct common_params_vocoder     vocoder;

    std::string model                = ""; // model path                                                    // NOLINT
-    std::string model_alias          = ""; // model alias                                                   // NOLINT
+    std::string model_alias          = "unknown"; // model alias                                            // NOLINT
    std::string model_url            = ""; // model url to download                                         // NOLINT
    std::string hf_token             = ""; // HF token                                                      // NOLINT
    std::string hf_repo              = ""; // HF repo                                                       // NOLINT
@@ -300,8 +286,8 @@ struct common_params {
    bool warmup            = true;  // warmup run
    bool check_tensors     = false; // validate tensor data

-    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
-    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
+    std::string cache_type_k = "f16"; // KV cache data type for the K
+    std::string cache_type_v = "f16"; // KV cache data type for the V

    // multimodal models (see examples/llava)
    std::string mmproj = "";        // path to multimodal projector                                         // NOLINT
@@ -451,11 +437,6 @@ std::vector<std::string> string_split<std::string>(const std::string & input, ch
    return parts;
 }

-static bool string_starts_with(const std::string & str,
-                               const std::string & prefix) {  // While we wait for C++20's std::string::starts_with...
-    return str.rfind(prefix, 0) == 0;
-}
-
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);

@@ -607,8 +588,7 @@ void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_si
 // Embedding utils
 //

-// TODO: repace embd_norm with an enum
-void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);
+void common_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);

 float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);

--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -161,20 +161,32 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                params.logit_bias.size(),
                params.logit_bias.data()));

+    llama_sampler_chain_add(result->chain,
+            llama_sampler_init_penalties(
+                llama_n_vocab  (model),
+                llama_token_eos(model),
+                llama_token_nl (model),
+                params.penalty_last_n,
+                params.penalty_repeat,
+                params.penalty_freq,
+                params.penalty_present,
+                params.penalize_nl,
+                params.ignore_eos));
+
    if (params.mirostat == 0) {
        for (const auto & cnstr : params.samplers) {
            switch (cnstr) {
-                case COMMON_SAMPLER_TYPE_DRY:
+                    case COMMON_SAMPLER_TYPE_DRY:
                    {
-                        std::vector<const char *> c_breakers;
+                        std::vector<const char*> c_breakers;
                        c_breakers.reserve(params.dry_sequence_breakers.size());
-                        for (const auto & str : params.dry_sequence_breakers) {
+                        for (const auto& str : params.dry_sequence_breakers) {
                            c_breakers.push_back(str.c_str());
                        }

                        llama_sampler_chain_add(result->chain, llama_sampler_init_dry      (model, params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
                    }
-                    break;
+                        break;
                case COMMON_SAMPLER_TYPE_TOP_K:
                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_k    (params.top_k));
                    break;
@@ -196,9 +208,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                case COMMON_SAMPLER_TYPE_INFILL:
                    llama_sampler_chain_add(result->chain, llama_sampler_init_infill   (model));
                    break;
-                case COMMON_SAMPLER_TYPE_PENALTIES:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
-                    break;
                default:
                    GGML_ASSERT(false && "unknown sampler type");
            }
@@ -406,7 +415,6 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
        case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
        case COMMON_SAMPLER_TYPE_XTC:         return 'x';
        case COMMON_SAMPLER_TYPE_INFILL:      return 'i';
-        case COMMON_SAMPLER_TYPE_PENALTIES:   return 'e';
        default : return '?';
    }
 }
@@ -421,7 +429,6 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
        case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
        case COMMON_SAMPLER_TYPE_XTC:         return "xtc";
        case COMMON_SAMPLER_TYPE_INFILL:      return "infill";
-        case COMMON_SAMPLER_TYPE_PENALTIES:   return "penalties";
        default : return "";
    }
 }
@@ -436,7 +443,6 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
        { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
        { "xtc",         COMMON_SAMPLER_TYPE_XTC },
        { "infill",      COMMON_SAMPLER_TYPE_INFILL },
-        { "penalties",   COMMON_SAMPLER_TYPE_PENALTIES },
    };

    // since samplers names are written multiple ways
@@ -483,7 +489,6 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC),         COMMON_SAMPLER_TYPE_XTC },
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL),      COMMON_SAMPLER_TYPE_INFILL },
-        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_PENALTIES),   COMMON_SAMPLER_TYPE_PENALTIES },
    };

    std::vector<common_sampler_type> samplers;
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -62,10 +62,6 @@ struct common_speculative * common_speculative_init(
 }

 void common_speculative_free(struct common_speculative * spec) {
-    if (spec == nullptr) {
-        return;
-    }
-
    common_sampler_free(spec->smpl);

    llama_batch_free(spec->batch);
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -221,17 +221,17 @@ class Model:
            self.gguf_writer.add_context_length(n_ctx)
            logger.info(f"gguf: context length = {n_ctx}")

-        if (n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True)) is not None:
-            self.gguf_writer.add_embedding_length(n_embd)
-            logger.info(f"gguf: embedding length = {n_embd}")
+        n_embd = self.find_hparam(["hidden_size", "n_embd"])
+        self.gguf_writer.add_embedding_length(n_embd)
+        logger.info(f"gguf: embedding length = {n_embd}")

        if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None:
            self.gguf_writer.add_feed_forward_length(n_ff)
            logger.info(f"gguf: feed forward length = {n_ff}")

-        if (n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True)) is not None:
-            self.gguf_writer.add_head_count(n_head)
-            logger.info(f"gguf: head count = {n_head}")
+        n_head = self.find_hparam(["num_attention_heads", "n_head"])
+        self.gguf_writer.add_head_count(n_head)
+        logger.info(f"gguf: head count = {n_head}")

        if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
            self.gguf_writer.add_head_count_kv(n_head_kv)
@@ -296,9 +296,7 @@ class Model:
                    break

            for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)):
-                # TODO: why do we squeeze here?
-                # data = data_torch.squeeze().numpy()
-                data = data_torch.numpy()
+                data = data_torch.squeeze().numpy()

                # if data ends up empty, it means data_torch was a scalar tensor -> restore
                if len(data.shape) == 0:
@@ -326,8 +324,6 @@ class Model:
                            gguf.MODEL_TENSOR.TIME_MIX_W2,
                            gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1,
                            gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2,
-                            gguf.MODEL_TENSOR.POSNET_NORM1,
-                            gguf.MODEL_TENSOR.POSNET_NORM2,
                        )
                    )
                    or not new_name.endswith(".weight")
@@ -529,19 +525,9 @@ class Model:
            else:
                token: str = reverse_vocab[i]
                if token in added_vocab:
-                    # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
-                    # To avoid unexpected issues - we make sure to normalize non-normalized tokens
-                    if not tokenizer.added_tokens_decoder[i].normalized:
-                        previous_token = token
-                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
-                        if previous_token != token:
-                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
-
                    if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
                        toktypes.append(gguf.TokenType.CONTROL)
                    else:
-                        # NOTE: this was added for Gemma.
-                        # Encoding and decoding the tokens above isn't sufficient for this case.
                        token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")  # pre-normalize user-defined spaces
                        toktypes.append(gguf.TokenType.USER_DEFINED)
                else:
@@ -585,9 +571,6 @@ class Model:
        if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
            # ref: https://huggingface.co/tiiuae/falcon-7b
            res = "falcon"
-        if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
-            # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base
-            res = "falcon3"
        if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
            # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
            res = "bert-bge"
@@ -675,18 +658,6 @@ class Model:
        if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450":
            # ref: https://huggingface.co/facebook/chameleon-7b
            res = "chameleon"
-        if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
-            # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
-            res = "minerva-7b"
-        if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65":
-            # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base
-            res = "roberta-bpe"
-        if chkhsh == "ad851be1dba641f2e3711822f816db2c265f788b37c63b4e1aeacb9ee92de8eb":
-            # ref: https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct
-            res = "gigachat"
-        if chkhsh == "d4c8f286ea6b520b3d495c4455483cfa2302c0cfcd4be05d781b6a8a0a7cdaf1":
-            # ref: https://huggingface.co/Infinigence/Megrez-3B-Instruct
-            res = "megrez"

        if res is None:
            logger.warning("\n")
@@ -709,9 +680,6 @@ class Model:
        return res
        # Marker: End get_vocab_base_pre

-    def _set_vocab_none(self) -> None:
-        self.gguf_writer.add_tokenizer_model("none")
-
    def _set_vocab_gpt2(self) -> None:
        tokens, toktypes, tokpre = self.get_vocab_base()
        self.gguf_writer.add_tokenizer_model("gpt2")
@@ -1695,184 +1663,6 @@ class LlamaModel(Model):
                raise ValueError(f"Unprocessed experts: {experts}")


-@Model.register("DeciLMForCausalLM")
-class DeciModel(Model):
-    model_arch = gguf.MODEL_ARCH.DECI
-
-    @staticmethod
-    def _ffn_mult_to_intermediate_size(ffn_mult: float, n_embd: int) -> int:
-        # DeciLM-specific code
-        intermediate_size = int(2 * ffn_mult * n_embd / 3)
-        return DeciModel._find_multiple(intermediate_size, 256)
-
-    @staticmethod
-    def _find_multiple(n: int, k: int) -> int:
-        # DeciLM-specific code
-        if n % k == 0:
-            return n
-        return n + k - (n % k)
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-        if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B
-            _block_configs: list[dict[str,Any]] = self.hparams["block_configs"]
-            assert self.block_count == len(_block_configs)
-            self._num_kv_heads = list()
-            self._num_heads = list()
-            _ffn_multipliers = list()
-            # ***linear attention layer***
-            # if n_heads_in_group is None and replace_with_linear is True
-            # then _num_kv_heads[il] is 0 and _num_heads[il] is num_attention_heads
-            # ***attention-free layer***
-            # if n_heads_in_group is None and replace_with_linear is False
-            # then _num_kv_heads[il] is 0 and _num_heads[il] is 0
-            # ***normal attention-layer***
-            # if n_heads_in_group is not None, then
-            # _num_kv_heads[il] is num_attention_head // n_heads_in_group and
-            # _num_heads[il] is num_attention_head
-            for il in range(len(_block_configs)):
-                if _block_configs[il]["attention"]["n_heads_in_group"] is None:
-                    if _block_configs[il]["attention"]["replace_with_linear"] is True:
-                        self._num_kv_heads.append(0)
-                        self._num_heads.append(self.hparams["num_attention_heads"])
-                    else:
-                        self._num_kv_heads.append(0)
-                        self._num_heads.append(0)
-                else:
-                    self._num_kv_heads.append(self.hparams["num_attention_heads"] // _block_configs[il]["attention"]["n_heads_in_group"])
-                    self._num_heads.append(self.hparams["num_attention_heads"])
-                _ffn_multipliers.append(_block_configs[il]["ffn"]["ffn_mult"])
-            assert self.block_count == len(self._num_kv_heads)
-            assert self.block_count == len(self._num_heads)
-            assert self.block_count == len(_ffn_multipliers)
-            assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int)
-            assert isinstance(self._num_heads, list) and isinstance(self._num_heads[0], int)
-            assert isinstance(_ffn_multipliers, list) and isinstance(_ffn_multipliers[0], float)
-            self._ffn_dims: list[int] = [
-                DeciModel._ffn_mult_to_intermediate_size(multiplier, self.hparams["hidden_size"])
-                for multiplier in _ffn_multipliers
-            ]
-
-    def set_vocab(self):
-        # Please change tokenizer_config.json of Llama-3_1-Nemotron-51B's
-        # eos_token from '|eot_id|' to '|end_of_text|'
-        if self.hparams.get("vocab_size", 128256) == 128256:
-            tokens, toktypes, tokpre = self.get_vocab_base()
-            self.gguf_writer.add_tokenizer_model("gpt2")
-            self.gguf_writer.add_tokenizer_pre(tokpre)
-            self.gguf_writer.add_token_list(tokens)
-            self.gguf_writer.add_token_types(toktypes)
-
-            special_vocab = gguf.SpecialVocab(
-                self.dir_model, load_merges=True,
-                special_token_types = ['bos', 'eos', 'eom', 'eot']
-            )
-            special_vocab._set_special_token("bos", 128000)
-            special_vocab._set_special_token("eos", 128001)
-            special_vocab._set_special_token("eom", 128008)
-            special_vocab._set_special_token("eot", 128009)
-            special_vocab.add_to_gguf(self.gguf_writer)
-        else:
-            # DeciLM-7B
-            self._set_vocab_llama_hf()
-#            self._set_vocab_gpt2()
-
-    def set_gguf_parameters(self):
-        if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B
-            assert self.block_count == len(self._num_kv_heads)
-            assert self.block_count == len(self._num_heads)
-            assert self.block_count == len(self._ffn_dims)
-            self.gguf_writer.add_head_count_kv(self._num_kv_heads)
-            self.gguf_writer.add_head_count(self._num_heads)
-            self.gguf_writer.add_feed_forward_length(self._ffn_dims)
-            self.gguf_writer.add_block_count(self.block_count)
-            self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
-            self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
-            self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
-            self.gguf_writer.add_key_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
-            self.gguf_writer.add_value_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
-            self.gguf_writer.add_file_type(self.ftype)
-        else: # DeciLM-7B
-            super().set_gguf_parameters()
-            if "num_key_value_heads_per_layer" in self.hparams: # DeciLM-7B
-                self._num_kv_heads: list[int] = self.hparams["num_key_value_heads_per_layer"]
-                assert self.block_count == len(self._num_kv_heads)
-                self.gguf_writer.add_head_count_kv(self._num_kv_heads)
-        hparams = self.hparams
-        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
-
-        if "head_dim" in hparams:
-            rope_dim = hparams["head_dim"]
-        else:
-            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
-        self.gguf_writer.add_rope_dimension_count(rope_dim)
-
-        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
-            if self.hparams["rope_scaling"].get("type") == "linear":
-                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
-
-    @staticmethod
-    def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
-        if n_head_kv is not None and n_head != n_head_kv:
-            n_head = n_head_kv
-        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
-                .swapaxes(1, 2)
-                .reshape(weights.shape))
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        n_head = self.hparams["num_attention_heads"]
-        if bid is not None:
-            if "num_key_value_heads_per_layer" in self.hparams:
-                n_kv_head = self.hparams["num_key_value_heads_per_layer"][bid]
-            elif "block_configs" in self.hparams:
-                n_kv_head = self._num_kv_heads[bid]
-                n_head = self._num_heads[bid]
-            else:
-                n_kv_head = self.hparams.get("num_key_value_heads")
-        else:
-            n_kv_head = self.hparams.get("num_key_value_heads")
-
-        if name.endswith(("q_proj.weight", "q_proj.bias")):
-            data_torch = DeciModel.permute(data_torch, n_head, n_head)
-        if name.endswith(("k_proj.weight", "k_proj.bias")):
-            data_torch = DeciModel.permute(data_torch, n_head, n_kv_head)
-        return [(self.map_tensor_name(name), data_torch)]
-
-    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
-        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
-            if rope_scaling.get("rope_type", '').lower() == "llama3":
-                base = self.hparams.get("rope_theta", 10000.0)
-                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
-                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
-
-                factor = rope_scaling.get("factor", 8.0)
-                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
-                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
-                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
-
-                low_freq_wavelen = old_context_len / low_freq_factor
-                high_freq_wavelen = old_context_len / high_freq_factor
-                assert low_freq_wavelen != high_freq_wavelen
-
-                rope_factors = []
-                for freq in freqs:
-                    wavelen = 2 * math.pi / freq
-                    if wavelen < high_freq_wavelen:
-                        rope_factors.append(1)
-                    elif wavelen > low_freq_wavelen:
-                        rope_factors.append(factor)
-                    else:
-                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
-                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))
-
-                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
-
-    def prepare_tensors(self):
-        super().prepare_tensors()
-
-
@Model.register("BitnetForCausalLM")
 class BitnetModel(Model):
    model_arch = gguf.MODEL_ARCH.BITNET
@@ -2041,40 +1831,29 @@ class MiniCPMModel(Model):
    model_arch = gguf.MODEL_ARCH.MINICPM

    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        embedding_scale = float(self.hparams["scale_emb"])
-        self.gguf_writer.add_embedding_scale(embedding_scale)
-        logger.info(f"gguf: (minicpm) embedding_scale = {embedding_scale}")
-        residual_scale = self.hparams["scale_depth"] / self.hparams["num_hidden_layers"] ** 0.5
-        self.gguf_writer.add_residual_scale(residual_scale)
-        logger.info(f"gguf: (minicpm) residual_scale = {residual_scale}")
-        logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"]
-        self.gguf_writer.add_logit_scale(logit_scale)
-        logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}")
-        if self.hparams.get("rope_scaling") is not None:
-            if self.hparams["rope_scaling"].get("type") == "longrope":
-                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE)
-                logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}")
-
-    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
-        rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
-
-        rope_scaling = self.find_hparam(['rope_scaling'], True)
-        if rope_scaling is not None:
-            long_factors = rope_scaling.get('long_factor', None)
-            short_factors = rope_scaling.get('short_factor', None)
-
-            if long_factors is None or short_factors is None:
-                raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
-
-            if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
-                raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
-
-            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
-            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
+        block_count = self.hparams["num_hidden_layers"]
+        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
+        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
+        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
+        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+        self.gguf_writer.add_file_type(self.ftype)

    def set_vocab(self):
-        self._set_vocab_sentencepiece()
+        self._set_vocab_llama_hf()
+
+    def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
+        if n_kv_head is not None and n_head != n_kv_head:
+            n_head //= n_kv_head
+
+        return (
+            weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+            .swapaxes(1, 2)
+            .reshape(weights.shape)
+        )

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        del bid  # unused
@@ -2084,9 +1863,9 @@ class MiniCPMModel(Model):

        # HF models permute some of the tensors, so we need to undo that
        if name.endswith(("q_proj.weight")):
-            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+            data_torch = self._reverse_hf_permute(data_torch, n_head, n_head)
        if name.endswith(("k_proj.weight")):
-            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
+            data_torch = self._reverse_hf_permute(data_torch, n_head, n_kv_head)

        return [(self.map_tensor_name(name), data_torch)]

@@ -2196,75 +1975,6 @@ class Qwen2Model(Model):
        except FileNotFoundError:
            self._set_vocab_gpt2()

-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
-            if self.hparams["rope_scaling"].get("type") == "yarn":
-                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
-                self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
-
-
-@Model.register("Qwen2VLForConditionalGeneration")
-class Qwen2VLModel(Model):
-    model_arch = gguf.MODEL_ARCH.QWEN2VL
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        mrope_section = self.hparams["rope_scaling"]["mrope_section"]
-        mrope_section += [0] * max(0, 4 - len(mrope_section))
-        self.gguf_writer.add_rope_dimension_sections(mrope_section)
-
-    def set_vocab(self):
-        try:
-            self._set_vocab_sentencepiece()
-        except FileNotFoundError:
-            self._set_vocab_gpt2()
-
-    def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
-        for name, data in super().get_tensors():
-            if name.startswith("visual."):
-                continue
-            yield name, data
-
-
-@Model.register("WavTokenizerDec")
-class WavTokenizerDecModel(Model):
-    model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        del bid  # unused
-
-        if \
-                name.endswith("codebook.cluster_size") or \
-                name.endswith("codebook.embed_avg") or \
-                name.endswith("codebook.inited"):
-            logger.debug(f"Skipping {name!r}")
-            return []
-
-        logger.info(f"{self.map_tensor_name(name)} -> {data_torch.shape}")
-
-        return [(self.map_tensor_name(name), data_torch)]
-
-    def set_vocab(self):
-        self._set_vocab_none()
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        self.gguf_writer.add_vocab_size         (self.hparams["vocab_size"])
-        self.gguf_writer.add_features_length    (self.hparams["n_embd_features"])
-        self.gguf_writer.add_feed_forward_length(self.hparams["n_ff"])
-        self.gguf_writer.add_group_norm_eps     (self.hparams["group_norm_epsilon"])
-        self.gguf_writer.add_group_norm_groups  (self.hparams["group_norm_groups"])
-
-        self.gguf_writer.add_posnet_embedding_length(self.hparams["posnet"]["n_embd"])
-        self.gguf_writer.add_posnet_block_count     (self.hparams["posnet"]["n_layer"])
-
-        self.gguf_writer.add_convnext_embedding_length(self.hparams["convnext"]["n_embd"])
-        self.gguf_writer.add_convnext_block_count     (self.hparams["convnext"]["n_layer"])
-
-        self.gguf_writer.add_causal_attention(False)
-

@Model.register("Qwen2MoeForCausalLM")
 class Qwen2MoeModel(Model):
@@ -2394,15 +2104,6 @@ class Phi3MiniModel(Model):
    model_arch = gguf.MODEL_ARCH.PHI3

    def set_vocab(self):
-        # Phi-4 model uses GPT2Tokenizer
-        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
-        if tokenizer_config_file.is_file():
-            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
-                tokenizer_config_json = json.load(f)
-                tokenizer_class = tokenizer_config_json['tokenizer_class']
-                if tokenizer_class == 'GPT2Tokenizer':
-                    return self._set_vocab_gpt2()
-
        from sentencepiece import SentencePieceProcessor

        tokenizer_path = self.dir_model / 'tokenizer.model'
@@ -2519,11 +2220,7 @@ class Phi3MiniModel(Model):
        self.gguf_writer.add_rope_dimension_count(rope_dims)
        self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
        self.gguf_writer.add_file_type(self.ftype)
-        sliding_window = self.hparams.get("sliding_window")
-        # use zero value of sliding_window to distinguish Phi-4 from other PHI3 models
-        if sliding_window is None:
-            sliding_window = 0
-        self.gguf_writer.add_sliding_window(sliding_window)
+        self.gguf_writer.add_sliding_window(self.find_hparam(["sliding_window"]))

    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
        n_embd = self.find_hparam(["hidden_size", "n_embd"])
@@ -2822,7 +2519,7 @@ class InternLM2Model(Model):
            return [(self.map_tensor_name(name), data_torch)]


-@Model.register("BertModel", "BertForMaskedLM", "CamembertModel")
+@Model.register("BertModel", "CamembertModel")
 class BertModel(Model):
    model_arch = gguf.MODEL_ARCH.BERT

@@ -2863,8 +2560,7 @@ class BertModel(Model):

        # we need this to validate the size of the token_type embeddings
        # though currently we are passing all zeros to the token_type embeddings
-        # "Sequence A" or "Sequence B"
-        self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
+        self.gguf_writer.add_token_type_count(2)  # "Sequence A" or "Sequence B"

        # convert to phantom space vocab
        def phantom(tok):
@@ -2888,73 +2584,13 @@ class BertModel(Model):
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        del bid  # unused

-        if name.startswith("bert."):
-            name = name[5:]
-
-        if name.endswith(".gamma"):
-            name = name[:-6] + ".weight"
-
-        if name.endswith(".beta"):
-            name = name[:-5] + ".bias"
-
        # we are only using BERT for embeddings so we don't need the pooling layer
        if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"):
            return [] # we don't need these

-        if name.startswith("cls.predictions"):
-            return []
-
-        if name.startswith("cls.seq_relationship"):
-            return []
-
        return [(self.map_tensor_name(name), data_torch)]


-@Model.register("RobertaModel")
-class RobertaModel(BertModel):
-    model_arch = gguf.MODEL_ARCH.BERT
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-        # we need the pad_token_id to know how to chop down position_embd matrix
-        if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
-            self._position_offset = 1 + pad_token_id
-            if "max_position_embeddings" in self.hparams:
-                self.hparams["max_position_embeddings"] -= self._position_offset
-        else:
-            self._position_offset = None
-
-    def set_vocab(self):
-        """Support BPE tokenizers for roberta models"""
-        bpe_tok_path = self.dir_model / "tokenizer.json"
-        if bpe_tok_path.exists():
-            self._set_vocab_gpt2()
-            self.gguf_writer.add_add_bos_token(True)
-            self.gguf_writer.add_add_eos_token(True)
-
-            # we need this to validate the size of the token_type embeddings
-            # though currently we are passing all zeros to the token_type embeddings
-            # "Sequence A" or "Sequence B"
-            self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
-
-        else:
-            return super().set_vocab()
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        # if name starts with "roberta.", remove the prefix
-        # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
-        if name.startswith("roberta."):
-            name = name[8:]
-
-        # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
-        if name == "embeddings.position_embeddings.weight":
-            if self._position_offset is not None:
-                data_torch = data_torch[self._position_offset:,:]
-
-        return super().modify_tensors(data_torch, name, bid)
-
-
@Model.register("NomicBertModel")
 class NomicBertModel(BertModel):
    model_arch = gguf.MODEL_ARCH.NOMIC_BERT
@@ -3274,9 +2910,6 @@ class Rwkv6Model(Model):
        if new_name.endswith("time_mix_w2.weight"):
            data_torch = data_torch.permute(0, 2, 1)

-        if new_name.endswith("time_mix_decay.weight") or "lerp" in new_name:
-            data_torch = data_torch.squeeze()
-
        rescale_every_n_layers = self.hparams["rescale_every"]
        if rescale_every_n_layers > 0:
            if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"):
@@ -3745,97 +3378,6 @@ class ArcticModel(Model):
                raise ValueError(f"Unprocessed experts: {experts}")


-@Model.register("DeepseekForCausalLM")
-class DeepseekModel(Model):
-    model_arch = gguf.MODEL_ARCH.DEEPSEEK
-
-    def set_vocab(self):
-        try:
-            self._set_vocab_sentencepiece()
-        except FileNotFoundError:
-            self._set_vocab_gpt2()
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        hparams = self.hparams
-        if "head_dim" in hparams:
-            rope_dim = hparams["head_dim"]
-        else:
-            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
-
-        self.gguf_writer.add_rope_dimension_count(rope_dim)
-        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
-        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
-        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
-        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
-        self.gguf_writer.add_expert_weights_scale(1.0)
-        self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
-        self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
-
-    _experts: list[dict[str, Tensor]] | None = None
-
-    @staticmethod
-    def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
-        if n_head_kv is not None and n_head != n_head_kv:
-            n_head = n_head_kv
-        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
-                .swapaxes(1, 2)
-                .reshape(weights.shape))
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        n_head = self.hparams["num_attention_heads"]
-        n_kv_head = self.hparams.get("num_key_value_heads")
-
-        if name.endswith(("q_proj.weight", "q_proj.bias")):
-            data_torch = DeepseekModel.permute(data_torch, n_head, n_head)
-        if name.endswith(("k_proj.weight", "k_proj.bias")):
-            data_torch = DeepseekModel.permute(data_torch, n_head, n_kv_head)
-
-        # process the experts separately
-        if name.find("mlp.experts") != -1:
-            n_experts = self.hparams["n_routed_experts"]
-            assert bid is not None
-
-            if self._experts is None:
-                self._experts = [{} for _ in range(self.block_count)]
-
-            self._experts[bid][name] = data_torch
-
-            if len(self._experts[bid]) >= n_experts * 3:
-                tensors: list[tuple[str, Tensor]] = []
-
-                # merge the experts into a single 3d tensor
-                for w_name in ["down_proj", "gate_proj", "up_proj"]:
-                    datas: list[Tensor] = []
-
-                    for xid in range(n_experts):
-                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
-                        datas.append(self._experts[bid][ename])
-                        del self._experts[bid][ename]
-
-                    data_torch = torch.stack(datas, dim=0)
-
-                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
-
-                    new_name = self.map_tensor_name(merged_name)
-
-                    tensors.append((new_name, data_torch))
-                return tensors
-            else:
-                return []
-
-        return [(self.map_tensor_name(name), data_torch)]
-
-    def prepare_tensors(self):
-        super().prepare_tensors()
-
-        if self._experts is not None:
-            # flatten `list[dict[str, Tensor]]` into `list[str]`
-            experts = [k for d in self._experts for k in d.keys()]
-            if len(experts) > 0:
-                raise ValueError(f"Unprocessed experts: {experts}")
-
-
@Model.register("DeepseekV2ForCausalLM")
 class DeepseekV2Model(Model):
    model_arch = gguf.MODEL_ARCH.DEEPSEEK2
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -17,7 +17,7 @@
 #
 #   python3 convert_hf_to_gguf_update.py <huggingface_token>
 #
-# - The convert_hf_to_gguf.py script will have had its get_vocab_base_pre() function updated
+# - Copy-paste the generated get_vocab_base_pre() function into convert_hf_to_gguf.py
 # - Update llama.cpp with the new pre-tokenizer if necessary
 #
 # TODO: generate tokenizer tests for llama.cpp
@@ -72,7 +72,6 @@ models = [
    {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
    {"name": "falcon",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
    {"name": "bert-bge",       "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
-    {"name": "falcon3",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon3-7B-Base", },
    {"name": "bert-bge-large", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/BAAI/bge-large-zh-v1.5", },
    {"name": "mpt",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
    {"name": "starcoder",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
@@ -103,10 +102,6 @@ models = [
    {"name": "exaone",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
    {"name": "phi-2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
    {"name": "chameleon",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", },
-    {"name": "minerva-7b",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", },
-    {"name": "roberta-bpe",    "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"},
-    {"name": "gigachat",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct"},
-    {"name": "megrez",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"},
 ]


--- a/docs/build.md
+++ b/docs/build.md
@@ -26,24 +26,19 @@ cmake --build build --config Release

    1. Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):

-       ```bash
-       cmake -B build -DCMAKE_BUILD_TYPE=Debug
-       cmake --build build
-       ```
+    ```bash
+    cmake -B build -DCMAKE_BUILD_TYPE=Debug
+    cmake --build build
+    ```

    2. Multi-config generators (`-G` param set to Visual Studio, XCode...):

-       ```bash
-       cmake -B build -G "Xcode"
-       cmake --build build --config Debug
-       ```
+    ```bash
+    cmake -B build -G "Xcode"
+    cmake --build build --config Debug
+    ```

    For more details and a list of supported generators, see the [CMake documentation](https://cmake.org/cmake/help/latest/manual/cmake-generators.7.html).
- For static builds, add `-DBUILD_SHARED_LIBS=OFF`:
-  ```
-  cmake -B build -DBUILD_SHARED_LIBS=OFF
-  cmake --build build --config Release
-  ```

 - Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers:
    - Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/de/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...):
@@ -55,14 +50,7 @@ cmake --build build --config Release
    cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF
    cmake --build build-arm64-windows-llvm-release
    ```
-    Building for arm64 can also be done with the MSVC compiler with the build-arm64-windows-MSVC preset, or the standard CMake build instructions. However, note that the MSVC compiler does not support inline ARM assembly code, used e.g. for the accelerated Q4_0_N_M CPU kernels.
-
-    For building with ninja generator and clang compiler as default:
-      -set path:set LIB=C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\um\x64;C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.41.34120\lib\x64\uwp;C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\ucrt\x64
-      ```bash
-      cmake --preset x64-windows-llvm-release
-      cmake --build build-x64-windows-llvm-release
-      ```
+    Building for arm64 can also be done with the MSVC compiler with the build-arm64-windows-MSVC preset, or the standard CMake build instructions. However, note that the MSVC compiler does not support inline ARM assembly code, used e.g. for the accelerated Q4_0_4_8 CPU kernels.

 ## BLAS Build

--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -20,12 +20,7 @@ else()
    add_subdirectory(batched)
    add_subdirectory(embedding)
    add_subdirectory(eval-callback)
-
-    if (NOT WIN32)
-        # disabled on Windows because it uses internal functions not exported with LLAMA_API
-        add_subdirectory(gbnf-validator)
-    endif()
-
+    add_subdirectory(gbnf-validator)
    add_subdirectory(gguf-hash)
    add_subdirectory(gguf-split)
    add_subdirectory(gguf)
@@ -51,17 +46,12 @@ else()
    add_subdirectory(speculative)
    add_subdirectory(speculative-simple)
    add_subdirectory(tokenize)
-    add_subdirectory(tts)
-    add_subdirectory(gen-docs)
    if (NOT GGML_BACKEND_DL)
        # these examples use the backends directly and cannot be built with dynamic loading
        add_subdirectory(convert-llama2c-to-ggml)
        add_subdirectory(cvector-generator)
        add_subdirectory(export-lora)
-        if (NOT WIN32)
-            # disabled on Windows because it uses internal functions not exported with LLAMA_API
-            add_subdirectory(quantize-stats)
-        endif()
+        add_subdirectory(quantize-stats)
        add_subdirectory(llava)
        if (GGML_RPC)
            add_subdirectory(rpc)
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -65,7 +65,6 @@ int main(int argc, char ** argv) {
    llama_context * ctx = llama_new_context_with_model(model, ctx_params);

    auto sparams = llama_sampler_chain_default_params();
-    sparams.no_perf = false;

    llama_sampler * smpl = llama_sampler_chain_init(sparams);

--- a/examples/cvector-generator/mean.hpp
+++ b/examples/cvector-generator/mean.hpp
@@ -15,7 +15,7 @@ static void run(
    for (size_t il = 0; il < v_input.size(); ++il) {
        // prepare output vector
        struct ggml_tensor * ctrl_out = v_output[il];
-        ggml_format_name(ctrl_out, "direction.%zu", il+1);
+        ggml_format_name(ctrl_out, "direction.%ld", il+1);

        // calculate mean vector
        struct ggml_tensor * t_layer = v_input[il];
--- a/examples/cvector-generator/pca.hpp
+++ b/examples/cvector-generator/pca.hpp
@@ -302,7 +302,7 @@ static void run_pca(

        // prepare output vector
        struct ggml_tensor * ctrl_out = v_output[il];
-        ggml_format_name(ctrl_out, "direction.%zu", il+1);
+        ggml_format_name(ctrl_out, "direction.%ld", il+1);

        // run power_iteration
        params.i_layer = il;
--- a/examples/deprecation-warning/deprecation-warning.cpp
+++ b/examples/deprecation-warning/deprecation-warning.cpp
@@ -12,7 +12,7 @@ int main(int argc, char** argv) {
    }

    // Get only the program name from the full path
-    auto pos = filename.find_last_of("/\\");
+    auto pos = filename.find_last_of('/');
    if (pos != std::string::npos) {
        filename = filename.substr(pos+1);
    }
--- a/examples/export-lora/export-lora.cpp
+++ b/examples/export-lora/export-lora.cpp
@@ -265,8 +265,8 @@ struct lora_merge_ctx {
            fout.write((const char *)data.data(), data.size());
        }

-        printf("%s : merged %zu tensors with lora adapters\n", __func__, n_merged);
-        printf("%s : wrote %zu tensors to output file\n", __func__, trans.size());
+        printf("%s : merged %ld tensors with lora adapters\n", __func__, n_merged);
+        printf("%s : wrote %ld tensors to output file\n", __func__, trans.size());
    }

    void copy_tensor(struct ggml_tensor * base) {
@@ -352,7 +352,7 @@ struct lora_merge_ctx {
                const float scale = alpha ? adapters[i]->scale * alpha / rank : adapters[i]->scale;
                delta = ggml_scale(ctx0, delta, scale);
                cur = ggml_add(ctx0, delta, cur);
-                printf("%s :   + merging from adapter[%zu] type=%s\n", __func__, i, ggml_type_name(inp_a[i]->type));
+                printf("%s :   + merging from adapter[%ld] type=%s\n", __func__, i, ggml_type_name(inp_a[i]->type));
                printf("%s :     input_scale=%f calculated_scale=%f rank=%d\n", __func__, adapters[i]->scale, scale, (int) inp_b[i]->ne[0]);
            }
            cur = ggml_cast(ctx0, cur, out->type);
--- a/examples/gbnf-validator/gbnf-validator.cpp
+++ b/examples/gbnf-validator/gbnf-validator.cpp
@@ -11,15 +11,19 @@
 static bool llama_grammar_validate(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) {
    const auto cpts = unicode_cpts_from_utf8(input_str);

-    auto & stacks_cur = llama_grammar_get_stacks(grammar);
+    const llama_grammar_rules  & rules      = llama_grammar_get_rules (grammar);
+          llama_grammar_stacks & stacks_cur = llama_grammar_get_stacks(grammar);

    size_t pos = 0;
    for (const auto & cpt : cpts) {
-        llama_grammar_accept(grammar, cpt);
+        const llama_grammar_stacks stacks_prev = llama_grammar_get_stacks(grammar); // copy
+
+        llama_grammar_accept(rules, stacks_prev, cpt, stacks_cur);

        if (stacks_cur.empty()) {
            error_pos = pos;
            error_msg = "Unexpected character '" + unicode_cpt_to_utf8(cpt) + "'";
+            stacks_cur = stacks_prev;
            return false;
        }
        ++pos;
@@ -78,8 +82,7 @@ int main(int argc, char** argv) {

    llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root");
    if (grammar == nullptr) {
-        fprintf(stdout, "Failed to initialize llama_grammar\n");
-        return 1;
+        throw std::runtime_error("Failed to initialize llama_grammar");
    }
    // Read the input file
    std::string input_str;
--- a/examples/gguf-split/gguf-split.cpp
+++ b/examples/gguf-split/gguf-split.cpp
@@ -287,7 +287,7 @@ struct split_strategy {
    }

    void print_info() {
-        printf("n_split: %zu\n", ctx_outs.size());
+        printf("n_split: %ld\n", ctx_outs.size());
        int i_split = 0;
        for (auto & ctx_out : ctx_outs) {
            // re-calculate the real gguf size for each split (= metadata size + total size of all tensors)
@@ -297,7 +297,7 @@ struct split_strategy {
                total_size += ggml_nbytes(t);
            }
            total_size = total_size / 1000 / 1000; // convert to megabytes
-            printf("split %05d: n_tensors = %d, total_size = %zuM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
+            printf("split %05d: n_tensors = %d, total_size = %ldM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
            i_split++;
        }
    }
--- a/examples/gritlm/gritlm.cpp
+++ b/examples/gritlm/gritlm.cpp
@@ -75,7 +75,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
        }

        std::vector<float> emb_norm(emb_unorm.size());
-        common_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd, 2);
+        common_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd);
        result.push_back(emb_norm);

 #ifdef GRIT_DEBUG
--- a/examples/infill/README.md
+++ b/examples/infill/README.md
@@ -14,7 +14,7 @@ In this section, we cover the most commonly used options for running the `infill
 -   `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
 -   `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
 -   `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
-   `-c N, --ctx-size N`: Set the size of the prompt context. The default is 4096, but if a LLaMA model was built with a longer context, increasing this value will provide better results for longer input/inference.
+-   `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
 -   `--spm-infill`: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.

 ## Input Prompts
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -1521,7 +1521,7 @@ int main(int argc, char ** argv) {
    for (const auto & inst : params_instances) {
        params_idx++;
        if (params.progress) {
-            fprintf(stderr, "llama-bench: benchmark %d/%zu: starting\n", params_idx, params_count);
+            fprintf(stderr, "llama-bench: benchmark %d/%ld: starting\n", params_idx, params_count);
        }
        // keep the same model between tests when possible
        if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
@@ -1573,14 +1573,14 @@ int main(int argc, char ** argv) {
        // warmup run
        if (t.n_prompt > 0) {
            if (params.progress) {
-                fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup prompt run\n", params_idx, params_count);
+                fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup prompt run\n", params_idx, params_count);
            }
            //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
            test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
        }
        if (t.n_gen > 0) {
            if (params.progress) {
-                fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup generation run\n", params_idx, params_count);
+                fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup generation run\n", params_idx, params_count);
            }
            test_gen(ctx, 1, t.n_threads);
        }
@@ -1592,14 +1592,14 @@ int main(int argc, char ** argv) {

            if (t.n_prompt > 0) {
                if (params.progress) {
-                    fprintf(stderr, "llama-bench: benchmark %d/%zu: prompt run %d/%d\n", params_idx, params_count,
+                    fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count,
                            i + 1, params.reps);
                }
                test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
            }
            if (t.n_gen > 0) {
                if (params.progress) {
-                    fprintf(stderr, "llama-bench: benchmark %d/%zu: generation run %d/%d\n", params_idx, params_count,
+                    fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count,
                            i + 1, params.reps);
                }
                test_gen(ctx, t.n_gen, t.n_threads);
--- a/examples/llama.android/llama/build.gradle.kts
+++ b/examples/llama.android/llama/build.gradle.kts
@@ -19,7 +19,6 @@ android {
        externalNativeBuild {
            cmake {
                arguments += "-DLLAMA_BUILD_COMMON=ON"
-                arguments += "-DGGML_LLAMAFILE=OFF"
                arguments += "-DCMAKE_BUILD_TYPE=Release"
                cppFlags += listOf()
                arguments += listOf()
--- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp
+++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp
@@ -305,9 +305,7 @@ Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens,
 extern "C"
 JNIEXPORT void JNICALL
 Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
-    //llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
-    const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
-    delete batch;
+    llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
 }

 extern "C"
--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@@ -210,20 +210,20 @@ actor LlamaContext {

            llama_kv_cache_clear(context)

-            let t_pp_start = DispatchTime.now().uptimeNanoseconds / 1000;
+            let t_pp_start = ggml_time_us()

            if llama_decode(context, batch) != 0 {
                print("llama_decode() failed during prompt")
            }
            llama_synchronize(context)

-            let t_pp_end = DispatchTime.now().uptimeNanoseconds / 1000;
+            let t_pp_end = ggml_time_us()

            // bench text generation

            llama_kv_cache_clear(context)

-            let t_tg_start = DispatchTime.now().uptimeNanoseconds / 1000;
+            let t_tg_start = ggml_time_us()

            for i in 0..<tg {
                llama_batch_clear(&batch)
@@ -238,7 +238,7 @@ actor LlamaContext {
                llama_synchronize(context)
            }

-            let t_tg_end = DispatchTime.now().uptimeNanoseconds / 1000;
+            let t_tg_end = ggml_time_us()

            llama_kv_cache_clear(context)

--- a/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj
+++ b/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj
@@ -7,7 +7,6 @@
 	objects = {

 /* Begin PBXBuildFile section */
-		1809696D2D05A39F00400EE8 /* llama in Frameworks */ = {isa = PBXBuildFile; productRef = 1809696C2D05A39F00400EE8 /* llama */; };
 		549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; };
 		79E1D9CD2B4CD16E005F8E46 /* InputButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 79E1D9CC2B4CD16E005F8E46 /* InputButton.swift */; };
 		7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */; };
@@ -18,6 +17,7 @@
 		8A3F84242AC4C891005E2EE8 /* models in Resources */ = {isa = PBXBuildFile; fileRef = 8A3F84232AC4C891005E2EE8 /* models */; };
 		8A907F332AC7138A006146EA /* LibLlama.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A907F322AC7134E006146EA /* LibLlama.swift */; };
 		8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */; };
+		DF810E132B4A5BA200301144 /* llama in Frameworks */ = {isa = PBXBuildFile; productRef = DF810E122B4A5BA200301144 /* llama */; };
 		F1FE20E22B465ECA00B45541 /* LoadCustomButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */; };
 /* End PBXBuildFile section */

@@ -42,7 +42,7 @@
 			isa = PBXFrameworksBuildPhase;
 			buildActionMask = 2147483647;
 			files = (
-				1809696D2D05A39F00400EE8 /* llama in Frameworks */,
+				DF810E132B4A5BA200301144 /* llama in Frameworks */,
 				549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */,
 				8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */,
 			);
@@ -151,7 +151,7 @@
 			);
 			name = llama.swiftui;
 			packageProductDependencies = (
-				1809696C2D05A39F00400EE8 /* llama */,
+				DF810E122B4A5BA200301144 /* llama */,
 			);
 			productName = llama.swiftui;
 			productReference = 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */;
@@ -429,7 +429,7 @@
 /* End XCConfigurationList section */

 /* Begin XCSwiftPackageProductDependency section */
-		1809696C2D05A39F00400EE8 /* llama */ = {
+		DF810E122B4A5BA200301144 /* llama */ = {
 			isa = XCSwiftPackageProductDependency;
 			productName = llama;
 		};
--- a/examples/llava/CMakeLists.txt
+++ b/examples/llava/CMakeLists.txt
@@ -43,10 +43,3 @@ set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-minicpmv-cli)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
-
-set(TARGET llama-qwen2vl-cli)
-add_executable(${TARGET} qwen2vl-cli.cpp)
-set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-qwen2vl-cli)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_17)
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -8,25 +8,21 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"

-//#ifdef GGML_USE_CUDA
-//#include "ggml-cuda.h"
-//#endif
-//
-//#ifdef GGML_USE_SYCL
-//#include "ggml-sycl.h"
-//#endif
-//
-//#ifdef GGML_USE_METAL
-//#include "ggml-metal.h"
-//#endif
-//
-//#ifdef GGML_USE_CANN
-//#include "ggml-cann.h"
-//#endif
-//
-//#ifdef GGML_USE_VULKAN
-//#include "ggml-vulkan.h"
-//#endif
+#ifdef GGML_USE_CUDA
+#include "ggml-cuda.h"
+#endif
+
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
+#ifdef GGML_USE_CANN
+#include "ggml-cann.h"
+#endif
+
+#ifdef GGML_USE_VULKAN
+#include "ggml-vulkan.h"
+#endif

 #define STB_IMAGE_IMPLEMENTATION
 #include "stb_image.h"
@@ -102,9 +98,7 @@ static std::string format(const char * fmt, ...) {
 #define KEY_HAS_LLAVA_PROJ      "clip.has_llava_projector"
 #define KEY_HAS_MINICPMV_PROJ   "clip.has_minicpmv_projector"
 #define KEY_MINICPMV_VERSION    "clip.minicpmv_version"
-#define KEY_HAS_QWEN2VL_MERGER  "clip.has_qwen2vl_merger"
 #define KEY_USE_GELU            "clip.use_gelu"
-#define KEY_USE_SILU            "clip.use_silu"
 #define KEY_N_EMBD              "clip.%s.embedding_length"
 #define KEY_N_FF                "clip.%s.feed_forward_length"
 #define KEY_N_BLOCK             "clip.%s.block_count"
@@ -131,8 +125,7 @@ static std::string format(const char * fmt, ...) {
 #define TN_TOKEN_EMBD      "%s.token_embd.weight"
 #define TN_POS_EMBD        "%s.position_embd.weight"
 #define TN_CLASS_EMBD      "v.class_embd"
-#define TN_PATCH_EMBD      "v.patch_embd.weight"  // not rename tensor with ".0" postfix for backwrad compat
-#define TN_PATCH_EMBD_1    "v.patch_embd.weight.1"
+#define TN_PATCH_EMBD      "v.patch_embd.weight"
 #define TN_PATCH_BIAS      "v.patch_embd.bias"
 #define TN_ATTN_K          "%s.blk.%d.attn_k.%s"
 #define TN_ATTN_Q          "%s.blk.%d.attn_q.%s"
@@ -166,7 +159,6 @@ enum projector_type {
    PROJECTOR_TYPE_LDP,
    PROJECTOR_TYPE_LDPV2,
    PROJECTOR_TYPE_RESAMPLER,
-    PROJECTOR_TYPE_MERGER,
    PROJECTOR_TYPE_UNKNOWN,
 };

@@ -175,7 +167,6 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
    { PROJECTOR_TYPE_LDP, "ldp" },
    { PROJECTOR_TYPE_LDPV2, "ldpv2"},
    { PROJECTOR_TYPE_RESAMPLER, "resampler"},
-    { PROJECTOR_TYPE_MERGER, "qwen2vl_merger"},
 };


@@ -468,8 +459,7 @@ struct clip_vision_model {

    // embeddings
    struct ggml_tensor * class_embedding;
-    struct ggml_tensor * patch_embeddings_0;
-    struct ggml_tensor * patch_embeddings_1;  // second Conv2D kernel when we decouple Conv3D along temproal dimension (Qwen2VL)
+    struct ggml_tensor * patch_embeddings;
    struct ggml_tensor * patch_bias;
    struct ggml_tensor * position_embeddings;

@@ -559,7 +549,6 @@ struct clip_ctx {
    bool has_vision_encoder  = false;
    bool has_llava_projector = false;
    bool has_minicpmv_projector = false;
-    bool has_qwen2vl_merger = false;
    int minicpmv_version = 2;

    struct clip_vision_model vision_model;
@@ -568,7 +557,6 @@ struct clip_ctx {
    float image_mean[3];
    float image_std[3];
    bool use_gelu = false;
-    bool use_silu = false;
    int32_t ftype = 1;

    bool has_class_embedding = true;
@@ -614,26 +602,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
            image_size_height = imgs->data->ny;
        }
    }
-    else if (ctx->has_qwen2vl_merger) {
-        // use the image's native resolution when image is avaible
-        if (is_inf) {
-        // if (imgs->data->nx && imgs->data->ny) {
-            image_size_width  = imgs->data->nx;
-            image_size_height = imgs->data->ny;
-        }
-    }
    const int patch_size           = hparams.patch_size;
    const int num_patches          = ((image_size_width / patch_size) * (image_size_height / patch_size));
-    const int patches_w            = image_size_width / patch_size;
-    const int patches_h            = image_size_height / patch_size;
    const int num_positions        = num_patches + (ctx->has_class_embedding ? 1 : 0);
-    const int num_position_ids     = ctx->has_qwen2vl_merger ? num_positions * 4 : num_positions;
    const int hidden_size          = hparams.hidden_size;
    const int n_head               = hparams.n_head;
    const int d_head               = hidden_size / n_head;
    int n_layer                    = hparams.n_layer;
    const float eps                = hparams.eps;
-    int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};

    const int batch_size = imgs->size;

@@ -654,30 +630,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
    ggml_set_name(inp_raw, "inp_raw");
    ggml_set_input(inp_raw);

-    struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+    struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1);

-    if (ctx->has_qwen2vl_merger) {
-        GGML_ASSERT(image_size_width % (patch_size * 2) == 0);
-        GGML_ASSERT(image_size_height % (patch_size * 2) == 0);
-
-        auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
-        inp = ggml_add(ctx0, inp, inp_1);
-        inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3));  // [w, h, c, b] -> [c, w, h, b]
-        inp = ggml_reshape_4d(
-            ctx0, inp,
-            hidden_size * 2, patches_w / 2, patches_h, batch_size);
-        inp = ggml_reshape_4d(
-            ctx0, inp,
-            hidden_size * 2, patches_w / 2, 2, batch_size * (patches_h / 2));
-        inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3));
-        inp = ggml_reshape_3d(
-            ctx0, inp,
-            hidden_size, patches_w * patches_h, batch_size);
-    }
-    else {
-        inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
-        inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
-    }
+    inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
+    inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));

    if (ctx->has_patch_bias) {
        // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
@@ -699,14 +655,12 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
        }
    }

-    struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
+    struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
    ggml_set_name(positions, "positions");
    ggml_set_input(positions);

-    if (!ctx->has_qwen2vl_merger) { // qwen2vl use rope position embedding
-        embeddings =
-            ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
-    }
+    embeddings =
+        ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));

    if (ctx->has_minicpmv_projector) {
        int pos_w = image_size_width/patch_size;
@@ -730,8 +684,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
    }

    // loop over layers
-    if (ctx->has_minicpmv_projector || ctx->has_qwen2vl_merger) {
-        // TODO: figure out why we doing thing in this way ???
+    if (ctx->has_minicpmv_projector) {
        n_layer += 1;
    }
    for (int il = 0; il < n_layer - 1; il++) {
@@ -753,13 +706,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
            struct ggml_tensor * Q =
                ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b);

-            Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size);
-            if (ctx->has_qwen2vl_merger) {
-                Q = ggml_rope_multi(
-                    ctx0, Q, positions, nullptr,
-                    d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
-            }
            Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
+            Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size);
            Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
            Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size);

@@ -767,11 +715,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
                ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b);

            K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
-            if (ctx->has_qwen2vl_merger) {
-                K = ggml_rope_multi(
-                    ctx0, K, positions, nullptr,
-                    d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
-            }
            K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
            K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);

@@ -811,8 +754,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32

        if (ctx->use_gelu) {
            cur = ggml_gelu_inplace(ctx0, cur);
-        } else if (ctx->use_silu) {
-            cur = ggml_silu_inplace(ctx0, cur);
        } else {
            cur = ggml_gelu_quick_inplace(ctx0, cur);
        }
@@ -824,7 +765,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
        cur = ggml_add(ctx0, embeddings, cur);

        embeddings = cur;
-
    }

    // post-layernorm
@@ -896,7 +836,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
                mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3));
                mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
                // stride = 1, padding = 1, bias is nullptr
-                block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
+                block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);

                // layer norm
                // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
@@ -944,7 +884,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
            // block_2
            {
                // stride = 2
-                block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);
+                block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);

                // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
                // layer norm
@@ -1005,7 +945,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
            // mlp_2 ne [24, 24, 2048, 1]
            mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
            // weight ne = [3, 3, 2048, 1]
-            struct ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
+            struct ggml_tensor * peg_0 = ggml_conv_depthwise_2d(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
            peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
            peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
            mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3));
@@ -1086,19 +1026,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
            GGML_ASSERT(false);
        }
    }
-    else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
-        embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size);
-
-        embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
-        embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
-
-        // GELU activation
-        embeddings = ggml_gelu(ctx0, embeddings);
-
-        // Second linear layer
-        embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
-        embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
-    }

    // build the graph
    ggml_build_forward_expand(gf, embeddings);
@@ -1222,30 +1149,25 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        }
    }

-//#ifdef GGML_USE_CUDA
-//    new_clip->backend = ggml_backend_cuda_init(0);
-//    LOG_INF("%s: CLIP using CUDA backend\n", __func__);
-//#endif
-//
-//#ifdef GGML_USE_METAL
-//    new_clip->backend = ggml_backend_metal_init();
-//    LOG_INF("%s: CLIP using Metal backend\n", __func__);
-//#endif
-//
-//#ifdef GGML_USE_CANN
-//    new_clip->backend = ggml_backend_cann_init(0);
-//    LOG_INF("%s: CLIP using CANN backend\n", __func__);
-//#endif
-//
-//#ifdef GGML_USE_VULKAN
-//    new_clip->backend = ggml_backend_vk_init(0);
-//    LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
-//#endif
-//
-//#ifdef GGML_USE_SYCL
-//    new_clip->backend = ggml_backend_sycl_init(0);
-//    LOG_INF("%s: CLIP using SYCL backend\n", __func__);
-//#endif
+#ifdef GGML_USE_CUDA
+    new_clip->backend = ggml_backend_cuda_init(0);
+    LOG_INF("%s: CLIP using CUDA backend\n", __func__);
+#endif
+
+#ifdef GGML_USE_METAL
+    new_clip->backend = ggml_backend_metal_init();
+    LOG_INF("%s: CLIP using Metal backend\n", __func__);
+#endif
+
+#ifdef GGML_USE_CANN
+    new_clip->backend = ggml_backend_cann_init(0);
+    LOG_INF("%s: CLIP using CANN backend\n", __func__);
+#endif
+
+#ifdef GGML_USE_VULKAN
+    new_clip->backend = ggml_backend_vk_init(0);
+    LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
+#endif

    if (!new_clip->backend) {
        new_clip->backend = ggml_backend_cpu_init();
@@ -1275,10 +1197,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            new_clip->minicpmv_version = gguf_get_val_i32(ctx, idx);
        }

-        idx = gguf_find_key(ctx, KEY_HAS_QWEN2VL_MERGER);
-        if (idx != -1) {
-            new_clip->has_qwen2vl_merger = gguf_get_val_bool(ctx, idx);
-        }
        // GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search

        GGML_ASSERT(new_clip->has_vision_encoder);
@@ -1287,13 +1205,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        idx = get_key_idx(ctx, KEY_USE_GELU);
        new_clip->use_gelu = gguf_get_val_bool(ctx, idx);

-        try {
-            idx = get_key_idx(ctx, KEY_USE_SILU);
-            new_clip->use_silu = gguf_get_val_bool(ctx, idx);
-        } catch (std::runtime_error & /*e*/) {
-            new_clip->use_silu = false;
-        }
-
        if (verbosity >= 1) {
            LOG_INF("%s: text_encoder:   %d\n", __func__, new_clip->has_text_encoder);
            LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
@@ -1469,16 +1380,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        }

        try {
-            vision_model.patch_embeddings_0    = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
+            vision_model.patch_embeddings    = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
            vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
        } catch(const std::exception& /*e*/) {
            LOG_ERR("%s: failed to load vision model tensors\n", __func__);
        }
-        try {
-            vision_model.patch_embeddings_1    = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD_1);
-        } catch(const std::exception& /*e*/) {
-            new_clip->has_qwen2vl_merger = false;
-        }

        // LLaVA projection
        if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) {
@@ -1566,12 +1472,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            vision_model.mm_model_ln_post_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "weight"));
            vision_model.mm_model_ln_post_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "bias"));
        }
-        else if (new_clip->proj_type == PROJECTOR_TYPE_MERGER) {
-            vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
-            vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
-            vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
-            vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
-        }
        else {
            std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
            throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
@@ -1610,7 +1510,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
        clip_image_f32_batch batch;
        batch.size = 1;
-        batch.data = nullptr;
        ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
        ggml_gallocr_reserve(new_clip->compute_alloc, gf);
        size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
@@ -1624,10 +1523,6 @@ void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size
    ctx_clip->load_image_size = load_image_size;
 }

-struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip) {
-    return ctx_clip->load_image_size;
-}
-
 struct clip_image_size * clip_image_size_init() {
    struct clip_image_size * load_image_size = new struct clip_image_size();
    load_image_size->width = 448;
@@ -2080,23 +1975,6 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
        }
        return true;
    }
-    else if (ctx->has_qwen2vl_merger) {
-        clip_image_u8 * resized = clip_image_u8_init();
-        auto patch_size = clip_patch_size(ctx) * 2;
-        int nx = ceil((float)img->nx / patch_size) * patch_size;
-        int ny = ceil((float)img->ny / patch_size) * patch_size;
-        bicubic_resize(*img, *resized, nx, ny);
-
-        res_imgs->data = new clip_image_f32[1];
-        // clip_image_f32 * res = clip_image_f32_init();
-        normalize_image_u8_to_f32(resized, res_imgs->data, ctx->image_mean, ctx->image_std);
-        // res_imgs->data[0] = *res;
-        res_imgs->size = 1;
-
-        // clip_image_f32_free(res);
-        clip_image_u8_free(resized);
-        return true;
-    }

    bool pad_to_square = true;
    if (!ctx->has_vision_encoder) {
@@ -2286,13 +2164,6 @@ size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
    return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
 }

-size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) {
-    clip_image_f32 img;
-    img.nx = img_w;
-    img.ny = img_h;
-    return clip_n_patches_by_img(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
-}
-
 int32_t clip_image_size(const struct clip_ctx * ctx) {
    return ctx->vision_model.hparams.image_size;
 }
@@ -2314,13 +2185,6 @@ const int32_t * clip_image_grid(const struct clip_ctx * ctx) {
 }

 int clip_n_patches(const struct clip_ctx * ctx) {
-    clip_image_f32 img;
-    img.nx = ctx->vision_model.hparams.image_size;
-    img.ny = ctx->vision_model.hparams.image_size;
-    return clip_n_patches_by_img(ctx, &img);
-}
-
-int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
    const auto & params = ctx->vision_model.hparams;

    int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
@@ -2334,11 +2198,6 @@ int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * i
        else if (ctx->minicpmv_version == 3) {
            n_patches = 64;
        }
-    } else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
-        int patch_size = params.patch_size * 2;
-        int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
-        int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0);
-        n_patches = x_patch * y_patch;
    }

    return n_patches;
@@ -2467,7 +2326,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
    const int image_size = hparams.image_size;
    int image_size_width  = image_size;
    int image_size_height = image_size;
-    if (ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger) {
+    if (ctx->has_minicpmv_projector) {
        image_size_width  = imgs->data[0].nx;
        image_size_height = imgs->data[0].ny;
    }
@@ -2487,7 +2346,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
        for (size_t i = 0; i < imgs->size; i++) {
            const int nx = imgs->data[i].nx;
            const int ny = imgs->data[i].ny;
-            if (!(ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger)) {
+            if (!ctx->has_minicpmv_projector) {
                GGML_ASSERT(nx == image_size && ny == image_size);
            }

@@ -2545,9 +2404,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
            auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));

            float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed));
-            for(int i=0;i < pos_w * pos_h; ++i){
-                for(int j=0; j < embed_dim; ++j){
-                    pos_embed_data[i * embed_dim + j] = pos_embed_t[i][j];
+            for(int i=0;i<pos_w * pos_h;++i){
+                for(int j=0;j<embed_dim;++j){
+                    pos_embed_data[i*embed_dim+j]=pos_embed_t[i][j];
                }
            }

@@ -2567,34 +2426,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
            }
        }

-        if (ctx->has_qwen2vl_merger) {
-            struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
-
-            const int pw = image_size_width / patch_size;
-            const int ph = image_size_height / patch_size;
-            int* positions_data = (int*)malloc(ggml_nbytes(positions));
-
-            int ptr = 0;
-            for (int y = 0; y < ph; y+=2)
-            {
-                for (int x = 0; x < pw; x+=2)
-                {
-                    for (int dy = 0; dy < 2; dy++) {
-                        for (int dx = 0; dx < 2; dx++) {
-                            positions_data[ptr]                 = y + dy;
-                            positions_data[num_patches + ptr]     = x + dx;
-                            positions_data[num_patches * 2 + ptr] = y + dy;
-                            positions_data[num_patches * 3 + ptr] = x + dx;
-                            ptr++;
-                        }
-                    }
-                }
-            }
-
-            ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
-            free(positions_data);
-        }
-        else {
+        {
            struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");

            int* positions_data = (int*)malloc(ggml_nbytes(positions));
@@ -2603,16 +2435,16 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
            }
            ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
            free(positions_data);
+        }

-            {
-                struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
-                int* patches_data = (int*)malloc(ggml_nbytes(patches));
-                for (int i = 0; i < num_patches; i++) {
-                    patches_data[i] = i + 1;
-                }
-                ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
-                free(patches_data);
+        {
+            struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
+            int* patches_data = (int*)malloc(ggml_nbytes(patches));
+            for (int i = 0; i < num_patches; i++) {
+                patches_data[i] = i + 1;
            }
+            ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
+            free(patches_data);
        }
    }

@@ -2785,9 +2617,6 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
            return 3584;
        }
    }
-    if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
-        return ctx->vision_model.mm_1_b->ne[0];
-    }

    std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
    throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
@@ -2799,21 +2628,3 @@ int clip_is_minicpmv(const struct clip_ctx * ctx) {
    }
    return 0;
 }
-
-bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
-    return ctx->has_qwen2vl_merger;
-}
-
-
-bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
-    clip_image_f32 clip_img;
-    clip_img.buf.resize(h * w * 3);
-    for (int i = 0; i < h*w*3; i++)
-    {
-        clip_img.buf[i] = img[i];
-    }
-    clip_img.nx = w;
-    clip_img.ny = h;
-    clip_image_encode(ctx, n_threads, &clip_img, vec);
-    return true;
-}
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@@ -45,7 +45,6 @@ CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity
 CLIP_API void clip_free(struct clip_ctx * ctx);

 CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
-CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w);

 CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx);
 CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx);
@@ -56,13 +55,11 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);

 CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);

-CLIP_API int clip_n_patches        (const struct clip_ctx * ctx);
-CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img);
-CLIP_API int clip_n_mmproj_embd    (const struct clip_ctx * ctx);
+CLIP_API int clip_n_patches    (const struct clip_ctx * ctx);
+CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);

 CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
 CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
-CLIP_API struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip);

 CLIP_API struct clip_image_size * clip_image_size_init();
 CLIP_API struct clip_image_u8  * clip_image_u8_init ();
@@ -89,9 +86,6 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons
 CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);

 CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
-CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
-
-CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);

 #ifdef __cplusplus
 }
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -259,33 +259,25 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli

    const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);

-    if (clip_is_minicpmv(ctx_clip) || clip_is_qwen2vl(ctx_clip)) {
+    if (clip_is_minicpmv(ctx_clip)) {
        std::vector<float *> image_embd_v;
        image_embd_v.resize(img_res_v.size);
        struct clip_image_size * load_image_size = clip_image_size_init();
-
        for (size_t i = 0; i < img_res_v.size; i++) {
            const int64_t t_img_enc_step_start_us = ggml_time_us();
-            image_embd_v[i] = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny));
+            image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip));
            int patch_size=14;
            load_image_size->width = img_res_v.data[i].nx;
            load_image_size->height = img_res_v.data[i].ny;
            clip_add_load_image_size(ctx_clip, load_image_size);
-
            bool encoded = false;
-            if (clip_is_qwen2vl(ctx_clip)) {
+            int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
+            if (has_minicpmv_projector == 2) {
+                encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
+            }
+            else if (has_minicpmv_projector == 3) {
                encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
            }
-            else {
-                int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
-                if (has_minicpmv_projector == 2) {
-                    encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
-                }
-                else if (has_minicpmv_projector == 3) {
-                    encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
-                }
-            }
-
            if (!encoded) {
                LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
                return false;
@@ -298,11 +290,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli

        int n_img_pos_out = 0;
        for (size_t i = 0; i < image_embd_v.size(); i++) {
-            std::memcpy(
-                image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
-                image_embd_v[i],
-                clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny));
-            n_img_pos_out += clip_n_patches_by_img(ctx_clip, &img_res_v.data[i]);
+            std::memcpy(image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd_v[i], clip_embd_nbytes(ctx_clip));
+            n_img_pos_out += clip_n_patches(ctx_clip);
        }
        *n_img_pos = n_img_pos_out;
        for (size_t i = 0; i < image_embd_v.size(); i++) {
@@ -398,13 +387,7 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co
    if (clip_is_minicpmv(ctx_clip)) {
        num_max_patches = 10;
    }
-    float * image_embd;
-    if (clip_is_qwen2vl(ctx_clip)) {
-        // qwen2vl don't split image into chunks, so `num_max_patches` is not needed.
-        image_embd = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img->nx, img->ny));
-    } else {
-        image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
-    }
+    float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
    if (!image_embd) {
        LOG_ERR("Unable to allocate memory for image embeddings\n");
        return false;
--- a/examples/llava/qwen2_vl_surgery.py
+++ b/examples/llava/qwen2_vl_surgery.py
@@ -1,165 +0,0 @@
-import argparse
-from typing import Dict
-
-import torch
-import numpy as np
-from gguf import *
-from transformers import (
-    Qwen2VLForConditionalGeneration,
-    Qwen2VLProcessor,
-    AutoProcessor,
-    Qwen2VLConfig
-)
-
-
-VISION = "clip.vision"
-
-
-def k(raw_key: str, arch: str) -> str:
-    return raw_key.format(arch=arch)
-
-
-def to_gguf_name(name: str) -> str:
-    og = name
-    name = name.replace("text_model", "t").replace("vision_model", "v")
-    name = name.replace("blocks", "blk").replace("embeddings.", "")
-    name = name.replace("attn.", "attn_")
-    name = name.replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("proj.", "out.")
-    # name = name.replace("layrnorm", "ln").replace("layer_norm", "ln").replace("layernorm", "ln")
-    name = name.replace("norm1", "ln1").replace("norm2", "ln2")
-    name = name.replace("merger.mlp", 'mm')
-    print(f"[to_gguf_name] {og} --> {name}")
-    return name
-
-
-def find_vision_tensors(qwen2vl, dtype) -> Dict[str, np.ndarray]:
-    vision_model = qwen2vl.visual
-    tensor_map = {}
-    for name, ten in vision_model.state_dict().items():
-        ten = ten.numpy()
-        if 'qkv' in name:
-            if ten.ndim == 2: # weight
-                c3, _ = ten.shape
-            else:             # bias
-                c3 = ten.shape[0]
-            assert c3 % 3 == 0
-            c = c3 // 3
-            wq = ten[:c]
-            wk = ten[c: c * 2]
-            wv = ten[c * 2:]
-            tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "q")] = wq
-            tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "k")] = wk
-            tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "v")] = wv
-        elif 'merger' in name:
-            if name.endswith("ln_q.weight"):
-                tensor_map['v.post_ln.weight'] = ten
-            elif name.endswith("ln_q.bias"):
-                tensor_map['v.post_ln.bias'] = ten
-            else:
-                # "merger.mlp.%d.weight/bias" --> "mm.%d.weight/bias"
-                tensor_map[to_gguf_name(name)] = ten
-        elif 'patch_embed.proj.weight' in name:
-            # NOTE: split Conv3D into Conv2Ds
-            c1, c2, kt, kh, kw = ten.shape
-            assert kt == 2, "Current implmentation only support temporal_patch_size of 2"
-            tensor_map["v.patch_embd.weight"] = ten[:, :, 0, ...]
-            tensor_map["v.patch_embd.weight.1"] = ten[:, :, 1, ...]
-        else:
-            tensor_map[to_gguf_name(f"vision_model.{name}")] = ten
-
-    for new_name, ten in tensor_map.items():
-        if ten.ndim <= 1 or new_name.endswith("_norm.weight"):
-            tensor_map[new_name] = ten.astype(np.float32)
-        else:
-            tensor_map[new_name] = ten.astype(dtype)
-    tensor_map["v.position_embd.weight"] = np.zeros([10, 10], dtype=np.float32)  # dummy tensor, just here as a placeholder
-    return tensor_map
-
-
-def main(args):
-    if args.data_type == 'fp32':
-        dtype = torch.float32
-        np_dtype = np.float32
-        ftype = 0
-    elif args.data_type == 'fp16':
-        dtype = torch.float32
-        np_dtype = np.float16
-        ftype = 1
-    else:
-        raise ValueError()
-
-    local_model = False
-    model_path = ""
-    model_name = args.model_name
-    print("model_name: ", model_name)
-    qwen2vl = Qwen2VLForConditionalGeneration.from_pretrained(
-        model_name, torch_dtype=dtype, device_map="cpu"
-    )
-    cfg: Qwen2VLConfig = qwen2vl.config  # type: ignore[reportAssignmentType]
-    vcfg = cfg.vision_config
-
-    if os.path.isdir(model_name):
-        local_model = True
-        if model_name.endswith(os.sep):
-            model_name = model_name[:-1]
-        model_path = model_name
-        model_name = os.path.basename(model_name)
-    fname_out = f"{model_name.replace('/', '-').lower()}-vision.gguf"
-
-    fout = GGUFWriter(path=fname_out, arch="clip")
-    fout.add_description("image encoder for Qwen2VL")
-
-    fout.add_file_type(ftype)
-    fout.add_bool("clip.has_text_encoder", False)
-    fout.add_bool("clip.has_vision_encoder", True)
-    fout.add_bool("clip.has_qwen2vl_merger", True)
-    fout.add_string("clip.projector_type", "qwen2vl_merger")
-
-    print(cfg.vision_config)
-    if 'silu' in cfg.vision_config.hidden_act.lower():
-        fout.add_bool("clip.use_silu", True)
-        fout.add_bool("clip.use_gelu", False)
-    elif 'gelu' in cfg.vision_config.hidden_act.lower():
-        fout.add_bool("clip.use_silu", False)
-        fout.add_bool("clip.use_gelu", 'quick' not in cfg.vision_config.hidden_act.lower())
-    else:
-        raise ValueError()
-
-    tensor_map = find_vision_tensors(qwen2vl, np_dtype)
-    for name, data in tensor_map.items():
-        fout.add_tensor(name, data)
-
-    fout.add_uint32("clip.vision.patch_size", vcfg.patch_size)
-    fout.add_uint32("clip.vision.image_size", 14 * 40)  # some reasonable size that is divable by (14*2)
-    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.embed_dim)
-    fout.add_uint32("clip.vision.projection_dim", vcfg.hidden_size)
-    fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), vcfg.num_heads)
-    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
-    fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), vcfg.depth)
-    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), 0)  # not sure what this does, put 0 here as a placeholder
-    fout.add_name(model_name)
-    """
-    HACK: Since vision rope related parameter aren't stored in the `Qwen2VLConfig,
-            it will be hardcoded in the `clip_image_build_graph` from `clip.cpp`.
-    """
-
-    if local_model:
-        processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_path)
-    else:
-        processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_name)
-    fout.add_array("clip.vision.image_mean", processor.image_processor.image_mean) # type: ignore[reportAttributeAccessIssue]
-    fout.add_array("clip.vision.image_std", processor.image_processor.image_std) # type: ignore[reportAttributeAccessIssue]
-
-    fout.write_header_to_file()
-    fout.write_kv_data_to_file()
-    fout.write_tensors_to_file()
-    fout.close()
-    print("save model as: ", fname_out)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("model_name", nargs='?', default="Qwen/Qwen2-VL-2B-Instruct")
-    parser.add_argument("--data_type", nargs='?', choices=['fp32', 'fp16'], default="fp32")
-    args = parser.parse_args()
-    main(args)
--- a/examples/llava/qwen2vl-cli.cpp
+++ b/examples/llava/qwen2vl-cli.cpp
@@ -1,581 +0,0 @@
-#include "arg.h"
-#include "base64.hpp"
-#include "log.h"
-#include "common.h"
-#include "sampling.h"
-#include "clip.h"
-#include "llava.h"
-#include "llama.h"
-#include "ggml.h"
-
-#ifdef GGML_USE_CUDA
-#include "ggml-cuda.h"
-#endif
-#ifdef NDEBUG
-#include "ggml-alloc.h"
-#include "ggml-backend.h"
-#endif
-
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <vector>
-#include <algorithm>
-#include <iostream>
-#include <fstream>
-
-
-static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed,
-                                     int n_batch, int * n_past, int * st_pos_id, struct clip_image_size * image_size) {
-    int n_embd  = llama_n_embd(llama_get_model(ctx_llama));
-    const int patch_size = 14 * 2;
-    const int ph = image_size->height / patch_size + (image_size->height % patch_size > 0);
-    const int pw = image_size->width / patch_size + (image_size->width % patch_size > 0);
-    auto img_tokens = image_embed->n_image_pos;
-    // llama_pos mrope_pos[img_tokens * 4];
-    std::vector<llama_pos> mrope_pos;
-    mrope_pos.resize(img_tokens * 4);
-
-    for (int y = 0; y < ph; y++)
-    {
-        for (int x = 0; x < pw; x++)
-        {
-            int i = y * pw + x;
-            mrope_pos[i] = *st_pos_id;
-            mrope_pos[i + img_tokens] = *st_pos_id + y;
-            mrope_pos[i + img_tokens * 2] = *st_pos_id + x;
-            mrope_pos[i + img_tokens * 3] = 0;
-        }
-    }
-    *st_pos_id += std::max(pw, ph);
-
-    int processed = 0;
-    std::vector<llama_pos> batch_mrope_pos;
-    batch_mrope_pos.resize(img_tokens * 4);
-
-    for (int i = 0; i < img_tokens; i += n_batch) {
-        int n_eval = img_tokens - i;
-        if (n_eval > n_batch) {
-            n_eval = n_batch;
-        }
-
-        // llama_pos batch_mrope_pos[n_eval * 4];
-        std::fill(batch_mrope_pos.begin(), batch_mrope_pos.end(), 0);
-        memcpy(batch_mrope_pos.data(), &mrope_pos[processed], n_eval * sizeof(llama_pos));
-        memcpy(&batch_mrope_pos[n_eval * 1], &mrope_pos[img_tokens * 1 + processed], n_eval * sizeof(llama_pos));
-        memcpy(&batch_mrope_pos[n_eval * 2], &mrope_pos[img_tokens * 2 + processed], n_eval * sizeof(llama_pos));
-        memcpy(&batch_mrope_pos[n_eval * 3], &mrope_pos[img_tokens * 3 + processed], n_eval * sizeof(llama_pos));
-
-        llama_batch batch = {
-            int32_t(n_eval),                // n_tokens
-            nullptr,                        // token
-            (image_embed->embed+i*n_embd),  // embed
-            batch_mrope_pos.data(),         // pos
-            nullptr,  // n_seq_id
-            nullptr,  // seq_id
-            nullptr,  // logits
-        };
-
-        if (llama_decode(ctx_llama, batch)) {
-            LOG_ERR("%s : failed to eval\n", __func__);
-            return false;
-        }
-        *n_past += n_eval;
-        processed += n_eval;
-    }
-    return true;
-}
-
-
-static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past, int * st_pos_id) {
-    int N = (int) tokens.size();
-    std::vector<llama_pos> pos;
-    for (int i = 0; i < N; i += n_batch) {
-        int n_eval = (int) tokens.size() - i;
-        if (n_eval > n_batch) {
-            n_eval = n_batch;
-        }
-        auto batch = llama_batch_get_one(&tokens[i], n_eval);
-        // TODO: add mrope pos ids somewhere else
-        pos.resize(batch.n_tokens * 4);
-        std::fill(pos.begin(), pos.end(), 0);
-        for (int j = 0; j < batch.n_tokens * 3; j ++) {
-            pos[j] = *st_pos_id + (j % batch.n_tokens);
-        }
-        batch.pos = pos.data();
-
-        if (llama_decode(ctx_llama, batch)) {
-            LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
-            return false;
-        }
-        *n_past += n_eval;
-        *st_pos_id += n_eval;
-    }
-    return true;
-}
-
-static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past, int * st_pos_id) {
-    std::vector<llama_token> tokens;
-    tokens.push_back(id);
-    return eval_tokens(ctx_llama, tokens, 1, n_past, st_pos_id);
-}
-
-static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, int * st_pos_id, bool add_bos){
-    std::string              str2     = str;
-    std::vector<llama_token> embd_inp = common_tokenize(ctx_llama, str2, add_bos, true);
-    eval_tokens(ctx_llama, embd_inp, n_batch, n_past, st_pos_id);
-    return true;
-}
-
-static const char * sample(struct common_sampler * smpl,
-                           struct llama_context * ctx_llama,
-                           int * n_past, int * st_pos_id) {
-    const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
-    common_sampler_accept(smpl, id, true);
-    static std::string ret;
-    if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
-        ret = "</s>";
-    } else {
-        ret = common_token_to_piece(ctx_llama, id);
-    }
-    eval_id(ctx_llama, id, n_past, st_pos_id);
-    return ret.c_str();
-}
-
-static const char* IMG_BASE64_TAG_BEGIN = "<img src=\"data:image/jpeg;base64,";
-static const char* IMG_BASE64_TAG_END = "\">";
-
-static void find_image_tag_in_prompt(const std::string& prompt, size_t& begin_out, size_t& end_out) {
-    begin_out = prompt.find(IMG_BASE64_TAG_BEGIN);
-    end_out = prompt.find(IMG_BASE64_TAG_END, (begin_out == std::string::npos) ? 0UL : begin_out);
-}
-
-static bool prompt_contains_image(const std::string& prompt) {
-    size_t begin, end;
-    find_image_tag_in_prompt(prompt, begin, end);
-    return (begin != std::string::npos);
-}
-
-// replaces the base64 image tag in the prompt with `replacement`
-static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip_ctx * ctx_clip, int n_threads, const std::string& prompt) {
-    size_t img_base64_str_start, img_base64_str_end;
-    find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
-    if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
-        LOG_ERR("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
-        return NULL;
-    }
-
-    auto base64_bytes_start = img_base64_str_start + strlen(IMG_BASE64_TAG_BEGIN);
-    auto base64_bytes_count = img_base64_str_end - base64_bytes_start;
-    auto base64_str = prompt.substr(base64_bytes_start, base64_bytes_count );
-
-    auto required_bytes = base64::required_encode_size(base64_str.size());
-    auto img_bytes = std::vector<unsigned char>(required_bytes);
-    base64::decode(base64_str.begin(), base64_str.end(), img_bytes.begin());
-
-    auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
-    if (!embed) {
-        LOG_ERR("%s: could not load image from base64 string.\n", __func__);
-        return NULL;
-    }
-
-    return embed;
-}
-
-static std::string remove_image_from_prompt(const std::string& prompt, const char * replacement = "") {
-    size_t begin, end;
-    find_image_tag_in_prompt(prompt, begin, end);
-    if (begin == std::string::npos || end == std::string::npos) {
-        return prompt;
-    }
-    auto pre = prompt.substr(0, begin);
-    auto post = prompt.substr(end + strlen(IMG_BASE64_TAG_END));
-    return pre + replacement + post;
-}
-
-struct llava_context {
-    struct clip_ctx * ctx_clip = NULL;
-    struct llama_context * ctx_llama = NULL;
-    struct llama_model * model = NULL;
-};
-
-static void print_usage(int, char ** argv) {
-    LOG("\n example usage:\n");
-    LOG("\n     %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
-    LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
-}
-
-static struct llava_image_embed * load_image(llava_context * ctx_llava, common_params * params, const std::string & fname) {
-
-    // load and preprocess the image
-    llava_image_embed * embed = NULL;
-    auto prompt = params->prompt;
-    if (prompt_contains_image(prompt)) {
-        if (!params->image.empty()) {
-            LOG_INF("using base64 encoded image instead of command line image path\n");
-        }
-        embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
-        if (!embed) {
-            LOG_ERR("%s: can't load image from prompt\n", __func__);
-            return NULL;
-        }
-        params->prompt = remove_image_from_prompt(prompt);
-    } else {
-        embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str());
-        if (!embed) {
-            fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
-            return NULL;
-        }
-    }
-
-    return embed;
-}
-
-static void process_prompt(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, common_params * params, const std::string & prompt) {
-    int n_past = 0;
-    int cur_pos_id = 0;
-
-    const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
-
-    std::string system_prompt, user_prompt;
-    size_t image_pos = prompt.find("<|vision_start|>");
-    if (image_pos != std::string::npos) {
-        // new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
-        system_prompt = prompt.substr(0, image_pos);
-        user_prompt = prompt.substr(image_pos + std::string("<|vision_pad|>").length());
-        LOG_INF("system_prompt: %s\n", system_prompt.c_str());
-        if (params->verbose_prompt) {
-            auto tmp = common_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
-            for (int i = 0; i < (int) tmp.size(); i++) {
-                LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
-            }
-        }
-        LOG_INF("user_prompt: %s\n", user_prompt.c_str());
-        if (params->verbose_prompt) {
-            auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
-            for (int i = 0; i < (int) tmp.size(); i++) {
-                LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
-            }
-        }
-    } else {
-        // llava-1.5 native mode
-        system_prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|>";
-        user_prompt = "<|vision_end|>" + prompt + "<|im_end|>\n<|im_start|>assistant\n";
-        if (params->verbose_prompt) {
-            auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
-            for (int i = 0; i < (int) tmp.size(); i++) {
-                LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
-            }
-        }
-    }
-
-    eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, &cur_pos_id, true);
-    if (image_embed != nullptr) {
-        auto image_size = clip_get_load_image_size(ctx_llava->ctx_clip);
-        qwen2vl_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past, &cur_pos_id, image_size);
-    }
-    eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, &cur_pos_id, false);
-
-    // generate the response
-
-    LOG("\n");
-
-    struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling);
-    if (!smpl) {
-        LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
-        exit(1);
-    }
-
-    std::string response = "";
-    for (int i = 0; i < max_tgt_len; i++) {
-        const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past, &cur_pos_id);
-        response += tmp;
-        if (strcmp(tmp, "</s>") == 0) break;
-        if (strstr(tmp, "###")) break; // Yi-VL behavior
-        LOG("%s", tmp);
-        if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
-        if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
-        if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
-
-        fflush(stdout);
-    }
-
-    common_sampler_free(smpl);
-    LOG("\n");
-}
-
-static struct llama_model * llava_init(common_params * params) {
-    llama_backend_init();
-    llama_numa_init(params->numa);
-
-    llama_model_params model_params = common_model_params_to_llama(*params);
-
-    llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
-    if (model == NULL) {
-        LOG_ERR("%s: unable to load model\n" , __func__);
-        return NULL;
-    }
-    return model;
-}
-
-static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
-    const char * clip_path = params->mmproj.c_str();
-
-    auto prompt = params->prompt;
-    if (prompt.empty()) {
-        prompt = "describe the image in detail.";
-    }
-
-    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
-
-
-    llama_context_params ctx_params = common_context_params_to_llama(*params);
-    ctx_params.n_ctx           = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
-
-    llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
-
-    if (ctx_llama == NULL) {
-        LOG_ERR("%s: failed to create the llama_context\n" , __func__);
-        return NULL;
-    }
-
-    auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
-
-    ctx_llava->ctx_llama = ctx_llama;
-    ctx_llava->ctx_clip = ctx_clip;
-    ctx_llava->model = model;
-    return ctx_llava;
-}
-
-static void llava_free(struct llava_context * ctx_llava) {
-    if (ctx_llava->ctx_clip) {
-        clip_free(ctx_llava->ctx_clip);
-        ctx_llava->ctx_clip = NULL;
-    }
-
-    llama_free(ctx_llava->ctx_llama);
-    llama_free_model(ctx_llava->model);
-    llama_backend_free();
-}
-
-#ifndef NDEBUG
-
-static void debug_test_mrope_2d() {
-    // 1. Initialize backend
-    ggml_backend_t backend = NULL;
-    std::string backend_name = "";
-#ifdef GGML_USE_CUDA
-    fprintf(stderr, "%s: using CUDA backend\n", __func__);
-    backend = ggml_backend_cuda_init(0); // init device 0
-    backend_name = "cuda";
-    if (!backend) {
-        fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
-    }
-#endif
-    // if there aren't GPU Backends fallback to CPU backend
-    if (!backend) {
-        backend = ggml_backend_cpu_init();
-        backend_name = "cpu";
-    }
-
-    // Calculate the size needed to allocate
-    size_t ctx_size = 0;
-    ctx_size += 2 * ggml_tensor_overhead(); // tensors
-    // no need to allocate anything else!
-
-    // 2. Allocate `ggml_context` to store tensor data
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ ctx_size,
-        /*.mem_buffer =*/ NULL,
-        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_backend_alloc_ctx_tensors()
-    };
-    struct ggml_context * ctx = ggml_init(params);
-
-    struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 128, 12, 30);
-    ggml_set_name(inp_raw, "inp_raw");
-    ggml_set_input(inp_raw);
-
-    struct ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 30 * 4);
-    ggml_set_name(pos, "pos");
-    ggml_set_input(pos);
-
-    std::vector<float> dummy_q;
-    dummy_q.resize(128 * 12 * 30);
-    std::fill(dummy_q.begin(), dummy_q.end(), 0.1);
-    // memcpy(inp_raw->data, dummy_q.data(), 128 * 12 * 30 * ggml_element_size(inp_raw));
-
-    std::vector<int> pos_id;
-    pos_id.resize(30 * 4);
-    for (int i = 0; i < 30; i ++) {
-        pos_id[i] = i;
-        pos_id[i + 30] = i + 10;
-        pos_id[i + 60] = i + 20;
-        pos_id[i + 90] = i + 30;
-    }
-    int sections[4] = {32, 32, 0, 0};
-
-    // 4. Allocate a `ggml_backend_buffer` to store all tensors
-    ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
-
-    // 5. Copy tensor data from main memory (RAM) to backend buffer
-    ggml_backend_tensor_set(inp_raw, dummy_q.data(), 0, ggml_nbytes(inp_raw));
-    ggml_backend_tensor_set(pos, pos_id.data(), 0, ggml_nbytes(pos));
-
-    // 6. Create a `ggml_cgraph` for mul_mat operation
-    struct ggml_cgraph * gf = NULL;
-    struct ggml_context * ctx_cgraph = NULL;
-
-    // create a temporally context to build the graph
-    struct ggml_init_params params0 = {
-        /*.mem_size   =*/ ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(),
-        /*.mem_buffer =*/ NULL,
-        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
-    };
-    ctx_cgraph = ggml_init(params0);
-    gf = ggml_new_graph(ctx_cgraph);
-
-    struct ggml_tensor * result0 = ggml_rope_multi(
-        ctx_cgraph, inp_raw, pos, nullptr,
-        128/2, sections, LLAMA_ROPE_TYPE_VISION, 32768, 1000000, 1,
-        0, 1, 32, 1);
-
-    // Add "result" tensor and all of its dependencies to the cgraph
-    ggml_build_forward_expand(gf, result0);
-
-    // 7. Create a `ggml_gallocr` for cgraph computation
-    ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
-    ggml_gallocr_alloc_graph(allocr, gf);
-
-    // 9. Run the computation
-    int n_threads = 1; // Optional: number of threads to perform some operations with multi-threading
-    if (ggml_backend_is_cpu(backend)) {
-        ggml_backend_cpu_set_n_threads(backend, n_threads);
-    }
-    ggml_backend_graph_compute(backend, gf);
-
-    // 10. Retrieve results (output tensors)
-    // in this example, output tensor is always the last tensor in the graph
-    struct ggml_tensor * result = result0;
-    // struct ggml_tensor * result = gf->nodes[gf->n_nodes - 1];
-    float * result_data = (float *)malloc(ggml_nbytes(result));
-    // because the tensor data is stored in device buffer, we need to copy it back to RAM
-    ggml_backend_tensor_get(result, result_data, 0, ggml_nbytes(result));
-    const std::string bin_file = "mrope_2d_" + backend_name +".bin";
-    std::ofstream outFile(bin_file, std::ios::binary);
-
-    if (outFile.is_open()) {
-        outFile.write(reinterpret_cast<const char*>(result_data), ggml_nbytes(result));
-        outFile.close();
-        std::cout << "Data successfully written to " + bin_file << std::endl;
-    } else {
-        std::cerr << "Error opening file!" << std::endl;
-    }
-
-    free(result_data);
-    // 11. Free memory and exit
-    ggml_free(ctx_cgraph);
-    ggml_gallocr_free(allocr);
-    ggml_free(ctx);
-    ggml_backend_buffer_free(buffer);
-    ggml_backend_free(backend);
-}
-
-static void debug_dump_img_embed(struct llava_context * ctx_llava) {
-    int n_embd  = llama_n_embd(llama_get_model(ctx_llava->ctx_llama));
-    int ne = n_embd * 4;
-    float vals[56 * 56 * 3];
-    // float embd[ne];
-    std::vector<float> embd;
-    embd.resize(ne);
-
-    for (int i = 0; i < 56*56; i++)
-    {
-        for (int c = 0; c < 3; c++)
-            vals[i * 3 + c] = (float)(i % (56 * 56)) / (56*56);
-    }
-
-    clip_encode_float_image(ctx_llava->ctx_clip, 16, vals, 56, 56, embd.data());
-
-    std::ofstream outFile("img_embed.bin", std::ios::binary);
-    if (outFile.is_open()) {
-        outFile.write(reinterpret_cast<const char*>(embd.data()), ne * sizeof(float));
-
-        outFile.close();
-        std::cout << "Data successfully written to mrope.bin" << std::endl;
-    } else {
-        std::cerr << "Error opening file!" << std::endl;
-    }
-}
-
-#endif
-
-
-int main(int argc, char ** argv) {
-    ggml_time_init();
-
-    common_params params;
-
-    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
-        return 1;
-    }
-
-    common_init();
-
-    if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
-        print_usage(argc, argv);
-        return 1;
-    }
-
-    auto * model = llava_init(&params);
-    if (model == NULL) {
-        fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
-        return 1;
-    }
-
-    if (prompt_contains_image(params.prompt)) {
-        auto * ctx_llava = llava_init_context(&params, model);
-
-        auto * image_embed = load_image(ctx_llava, &params, "");
-
-        // process the prompt
-        process_prompt(ctx_llava, image_embed, &params, params.prompt);
-
-        llama_perf_context_print(ctx_llava->ctx_llama);
-        llava_image_embed_free(image_embed);
-        ctx_llava->model = NULL;
-        llava_free(ctx_llava);
-#ifndef NDEBUG
-    } else if (params.image[0].empty()) {
-        auto ctx_llava = llava_init_context(&params, model);
-
-        debug_test_mrope_2d();
-        debug_dump_img_embed(ctx_llava);
-
-        llama_perf_context_print(ctx_llava->ctx_llama);
-        ctx_llava->model = NULL;
-        llava_free(ctx_llava);
-#endif
-    } else {
-        for (auto & image : params.image) {
-            auto * ctx_llava = llava_init_context(&params, model);
-
-            auto * image_embed = load_image(ctx_llava, &params, image);
-            if (!image_embed) {
-                LOG_ERR("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str());
-                return 1;
-            }
-
-            // process the prompt
-            process_prompt(ctx_llava, image_embed, &params, params.prompt);
-
-            llama_perf_context_print(ctx_llava->ctx_llama);
-            llava_image_embed_free(image_embed);
-            ctx_llava->model = NULL;
-            llava_free(ctx_llava);
-        }
-    }
-
-    llama_free_model(model);
-
-    return 0;
-}
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -66,7 +66,7 @@ In this section, we cover the most commonly used options for running the `llama-
 -   `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g [https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true](https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true)).
 -   `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
 -   `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
-   `-c N, --ctx-size N`: Set the size of the prompt context. The default is 4096, but if a LLaMA model was built with a longer context, increasing this value will provide better results for longer input/inference.
+-   `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
 -   `-mli, --multiline-input`: Allows you to write or paste multiple lines without ending each in '\'
 -   `-t N, --threads N`: Set the number of threads to use during generation. For optimal performance, it is recommended to set this value to the number of physical CPU cores your system has.
 -   `-ngl N, --n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
@@ -131,7 +131,7 @@ During text generation, LLaMA models have a limited context size, which means th

 ### Context Size

- `-c N, --ctx-size N`: Set the size of the prompt context (default: 4096, 0 = loaded from model). If a LLaMA model was built with a longer context, increasing this value will yield the best results on longer input/inference.
+- `-c N, --ctx-size N`: Set the size of the prompt context (default: 0, 0 = loaded from model). The LLaMA models were built with a context of 2048-8192, which will yield the best results on longer input/inference.

 ### Extended Context Size

@@ -177,11 +177,16 @@ Example usage: `--temp 0`

 -   `--repeat-penalty N`: Control the repetition of token sequences in the generated text default: 1.0, 1.0 = disabled).
 -   `--repeat-last-n N`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size).
+-   `--no-penalize-nl`: Disable penalization for newline tokens when applying the repeat penalty.

 The `repeat-penalty` option helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1.

 The `repeat-last-n` option controls the number of tokens in the history to consider for penalizing repetition. A larger value will look further back in the generated text to prevent repetitions, while a smaller value will only consider recent tokens. A value of 0 disables the penalty, and a value of -1 sets the number of tokens considered equal to the context size (`ctx-size`).

+Use the `--no-penalize-nl` option to disable newline penalization when applying the repeat penalty. This option is particularly useful for generating chat conversations, dialogues, code, poetry, or any text where newline tokens play a significant role in structure and formatting. Disabling newline penalization helps maintain the natural flow and intended formatting in these specific use cases.
+
+Example usage: `--repeat-penalty 1.15 --repeat-last-n 128 --no-penalize-nl`
+
 ### DRY Repetition Penalty

 DRY (Don't Repeat Yourself) sampling is an effective technique for reducing repetition in generated text even across long contexts by penalizing tokens based on their recent usage patterns (original [PR link](https://github.com/oobabooga/text-generation-webui/pull/5677)).
@@ -343,7 +348,6 @@ These options provide extra functionality and customization when running the LLa

 -   `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
 -   `--verbose-prompt`: Print the prompt before generating text.
-   `--no-display-prompt`: Don't print prompt at generation.
 -   `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used.
 -   `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance.
 -   `-hfr URL --hf-repo URL`: The url to the Hugging Face model repository. Used in conjunction with `--hf-file` or `-hff`. The model is downloaded and stored in the file provided by `-m` or `--model`. If `-m` is not provided, the model is auto-stored in the path specified by the `LLAMA_CACHE` environment variable  or in an OS-specific local cache.
--- a/examples/quantize/README.md
+++ b/examples/quantize/README.md
@@ -54,6 +54,8 @@ As the models are currently fully loaded into memory, you will need adequate dis

 Several quantization methods are supported. They differ in the resulting model disk size and inference speed.

+The quantization formats `Q4_0_4_4`, `Q4_0_4_8` and `Q4_0_8_8` are block interleaved variants of the `Q4_0` format, providing a data layout that is better suited for specific implementations of optimized mulmat kernels. Since these formats differ only in data layout, they have the same quantized size as the `Q4_0` format.
+
 *(outdated)*

 | Model | Measure      |    F16 |   Q4_0 |   Q4_1 |   Q5_0 |   Q5_1 |   Q8_0 |
@@ -81,7 +83,7 @@ Several quantization methods are supported. They differ in the resulting model d
  - [#4930 - imatrix for all k-quants](https://github.com/ggerganov/llama.cpp/pull/4930)
  - [#4951 - imatrix on the GPU](https://github.com/ggerganov/llama.cpp/pull/4957)
  - [#4969 - imatrix for legacy quants](https://github.com/ggerganov/llama.cpp/pull/4969)
-  - [#4996 - k-quants tuning](https://github.com/ggerganov/llama.cpp/pull/4996)
+  - [#4996 - k-qunats tuning](https://github.com/ggerganov/llama.cpp/pull/4996)
  - [#5060 - Q3_K_XS](https://github.com/ggerganov/llama.cpp/pull/5060)
  - [#5196 - 3-bit i-quants](https://github.com/ggerganov/llama.cpp/pull/5196)
  - [quantization tuning](https://github.com/ggerganov/llama.cpp/pull/5320), [another one](https://github.com/ggerganov/llama.cpp/pull/5334), and [another one](https://github.com/ggerganov/llama.cpp/pull/5361)
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -48,6 +48,9 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
    { "Q5_K_M",   LLAMA_FTYPE_MOSTLY_Q5_K_M,   " 5.33G, +0.0569 ppl @ Llama-3-8B",  },
    { "Q6_K",     LLAMA_FTYPE_MOSTLY_Q6_K,     " 6.14G, +0.0217 ppl @ Llama-3-8B",  },
    { "Q8_0",     LLAMA_FTYPE_MOSTLY_Q8_0,     " 7.96G, +0.0026 ppl @ Llama-3-8B",  },
+    { "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B",  },
+    { "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B",  },
+    { "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B",  },
    { "F16",      LLAMA_FTYPE_MOSTLY_F16,      "14.00G, +0.0020 ppl @ Mistral-7B",  },
    { "BF16",     LLAMA_FTYPE_MOSTLY_BF16,     "14.00G, -0.0050 ppl @ Mistral-7B",  },
    { "F32",      LLAMA_FTYPE_ALL_F32,         "26.00G              @ 7B",          },
--- a/examples/retrieval/retrieval.cpp
+++ b/examples/retrieval/retrieval.cpp
@@ -107,7 +107,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
        }

        float * out = output + batch.seq_id[i][0] * n_embd;
-        common_embd_normalize(embd, out, n_embd, 2);
+        common_embd_normalize(embd, out, n_embd);
    }
 }

@@ -143,7 +143,7 @@ int main(int argc, char ** argv) {
        std::vector<chunk> file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator);
        chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end());
    }
-    LOG_INF("Number of chunks: %zu\n", chunks.size());
+    LOG_INF("Number of chunks: %ld\n", chunks.size());

    llama_backend_init();
    llama_numa_init(params.numa);
--- a/examples/rpc/rpc-server.cpp
+++ b/examples/rpc/rpc-server.cpp
@@ -12,10 +12,6 @@
 #include "ggml-vulkan.h"
 #endif

-#ifdef GGML_USE_SYCL
-#include "ggml-sycl.h"
-#endif
-
 #include "ggml-rpc.h"
 #ifdef _WIN32
 #  include <windows.h>
@@ -95,12 +91,6 @@ static ggml_backend_t create_backend() {
    if (!backend) {
        fprintf(stderr, "%s: ggml_backend_vulkan_init() failed\n", __func__);
    }
-#elif GGML_USE_SYCL
-    fprintf(stderr, "%s: using SYCL backend\n", __func__);
-    backend = ggml_backend_sycl_init(0); // init device 0
-    if (!backend) {
-        fprintf(stderr, "%s: ggml_backend_sycl_init() failed\n", __func__);
-    }
 #endif

    // if there aren't GPU Backends fallback to CPU backend
@@ -116,8 +106,6 @@ static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
    ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
 #elif GGML_USE_VULKAN
    ggml_backend_vk_get_device_memory(0, free_mem, total_mem);
-#elif GGML_USE_SYCL
-    ggml_backend_sycl_get_device_memory(0, free_mem, total_mem);
 #else
    #ifdef _WIN32
        MEMORYSTATUSEX status;
--- a/examples/run/CMakeLists.txt
+++ b/examples/run/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-run)
 add_executable(${TARGET} run.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
--- a/examples/run/README.md
+++ b/examples/run/README.md
@@ -3,49 +3,5 @@
 The purpose of this example is to demonstrate a minimal usage of llama.cpp for running models.

 ```bash
-llama-run granite-code
-```
-
-```bash
-llama-run -h
-Description:
-  Runs a llm
-
-Usage:
-  llama-run [options] model [prompt]
-
-Options:
-  -c, --context-size <value>
-      Context size (default: 2048)
-  -n, --ngl <value>
-      Number of GPU layers (default: 0)
-  --temp <value>
-      Temperature (default: 0.8)
-  -v, --verbose, --log-verbose
-      Set verbosity level to infinity (i.e. log all messages, useful for debugging)
-  -h, --help
-      Show help message
-
-Commands:
-  model
-      Model is a string with an optional prefix of
-      huggingface:// (hf://), ollama://, https:// or file://.
-      If no protocol is specified and a file exists in the specified
-      path, file:// is assumed, otherwise if a file does not exist in
-      the specified path, ollama:// is assumed. Models that are being
-      pulled are downloaded with .partial extension while being
-      downloaded and then renamed as the file without the .partial
-      extension when complete.
-
-Examples:
-  llama-run llama3
-  llama-run ollama://granite-code
-  llama-run ollama://smollm:135m
-  llama-run hf://QuantFactory/SmolLM-135M-GGUF/SmolLM-135M.Q2_K.gguf
-  llama-run huggingface://bartowski/SmolLM-1.7B-Instruct-v0.2-GGUF/SmolLM-1.7B-Instruct-v0.2-IQ3_M.gguf
-  llama-run https://example.com/some-file1.gguf
-  llama-run some-file2.gguf
-  llama-run file://some-file3.gguf
-  llama-run --ngl 999 some-file4.gguf
-  llama-run --ngl 999 some-file5.gguf Hello World
-```
+./llama-run Meta-Llama-3.1-8B-Instruct.gguf
+...
--- a/examples/run/run.cpp
+++ b/examples/run/run.cpp
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@@ -15,8 +15,13 @@ set(TARGET_SRCS
    httplib.h
 )
 set(PUBLIC_ASSETS
-    index.html.gz
+    index.html
+    completion.js
    loading.html
+    deps_daisyui.min.css
+    deps_markdown-it.js
+    deps_tailwindcss.js
+    deps_vue.esm-browser.js
 )

 foreach(asset ${PUBLIC_ASSETS})
@@ -28,13 +33,11 @@ foreach(asset ${PUBLIC_ASSETS})
        OUTPUT "${output}"
        COMMAND "${CMAKE_COMMAND}" "-DINPUT=${input}" "-DOUTPUT=${output}" -P "${PROJECT_SOURCE_DIR}/scripts/xxd.cmake"
    )
-    set_source_files_properties(${output} PROPERTIES GENERATED TRUE)
 endforeach()

 add_executable(${TARGET} ${TARGET_SRCS})
 install(TARGETS ${TARGET} RUNTIME)

-target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
 target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})

 if (LLAMA_SERVER_SSL)
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -62,8 +62,8 @@ The project is under active development, and we are [looking for feedback and co
 | `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: 32.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) |
 | `-dkvc, --dump-kv-cache` | verbose print of the KV cache |
 | `-nkvo, --no-kv-offload` | disable KV offload<br/>(env: LLAMA_ARG_NO_KV_OFFLOAD) |
-| `-ctk, --cache-type-k TYPE` | KV cache data type for K<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
-| `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
+| `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
+| `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
 | `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: 0.1, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
 | `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
 | `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
@@ -104,6 +104,7 @@ The project is under active development, and we are [looking for feedback and co
 | `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) |
 | `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: dkypmxt) |
 | `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
+| `--penalize-nl` | penalize newline tokens (default: false) |
 | `--temp N` | temperature (default: 0.8) |
 | `--top-k N` | top-k sampling (default: 40, 0 = disabled) |
 | `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) |
@@ -137,7 +138,6 @@ The project is under active development, and we are [looking for feedback and co
 | -------- | ----------- |
 | `--no-context-shift` | disables context shift on inifinite text generation (default: disabled)<br/>(env: LLAMA_ARG_NO_CONTEXT_SHIFT) |
 | `-sp, --special` | special tokens output enabled (default: false) |
-| `--no-warmup` | skip warming up the model with an empty run |
 | `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
 | `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified<br/>(env: LLAMA_ARG_POOLING) |
 | `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
@@ -146,7 +146,6 @@ The project is under active development, and we are [looking for feedback and co
 | `--host HOST` | ip address to listen (default: 127.0.0.1)<br/>(env: LLAMA_ARG_HOST) |
 | `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) |
 | `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) |
-| `--no-webui` | Disable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_NO_WEBUI) |
 | `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
 | `--reranking, --rerank` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
 | `--api-key KEY` | API key to use for authentication (default: none)<br/>(env: LLAMA_API_KEY) |
@@ -164,13 +163,13 @@ The project is under active development, and we are [looking for feedback and co
 | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>list of built-in templates:<br/>chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, exaone3, gemma, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, monarch, openchat, orion, phi3, rwkv-world, vicuna, vicuna-orca, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
 | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)<br/> |
 | `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
-| `--draft-max, --draft, --draft-n N` | number of tokens to draft for speculative decoding (default: 16)<br/>(env: LLAMA_ARG_DRAFT_MAX) |
-| `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 5)<br/>(env: LLAMA_ARG_DRAFT_MIN) |
-| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.9)<br/>(env: LLAMA_ARG_DRAFT_P_MIN) |
-| `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE_DRAFT) |
+| `--draft-max, --draft, --draft-n N` | number of tokens to draft for speculative decoding (default: 16) |
+| `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 5) |
+| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.9) |
+| `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model) |
 | `-devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)<br/>use --list-devices to see a list of available devices |
-| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | number of layers to store in VRAM for the draft model<br/>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
-| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused)<br/>(env: LLAMA_ARG_MODEL_DRAFT) |
+| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | number of layers to store in VRAM for the draft model |
+| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused) |


 Note: If both command line argument and environment variable are both set for the same param, the argument will take precedence over env var.
@@ -218,37 +217,6 @@ services:
  cmake --build build --config Release -t llama-server
  ```

-## Web UI
-
-The project includes a web-based user interface that enables interaction with the model through the `/chat/completions` endpoint.
-
-The web UI is developed using:
- `vue` framework for frontend development
- `tailwindcss` and `daisyui` for styling
- `vite` for build tooling
-
-A pre-built version is available as a single HTML file under `/public` directory.
-
-To build or to run the dev server (with hot reload):
-
-```sh
-# make sure you have nodejs installed
-cd examples/server/webui
-npm i
-
-# to run the dev server
-npm run dev
-
-# to build the public/index.html
-npm run build
-```
-
-NOTE: if you are using the vite dev server, you can change the API base URL to llama.cpp. To do that, run this code snippet in browser's console:
-
-```js
-localStorage.setItem('base', 'http://localhost:8080')
-```
-
 ## Quick Start

 To get started right away, run the following command, making sure to use the correct path for the model you have:
@@ -303,23 +271,23 @@ mkdir llama-client
 cd llama-client
 ```

-Create an index.js file and put this inside:
+Create an index.js file and put this inside:

 ```javascript
-const prompt = "Building a website can be done in 10 simple steps:"
+const prompt = `Building a website can be done in 10 simple steps:`;

-async function test() {
+async function Test() {
    let response = await fetch("http://127.0.0.1:8080/completion", {
-        method: "POST",
+        method: 'POST',
        body: JSON.stringify({
            prompt,
-            n_predict: 64,
+            n_predict: 512,
        })
    })
    console.log((await response.json()).content)
 }

-test()
+Test()
 ```

 And run it:
@@ -343,180 +311,140 @@ node index.js

 ### POST `/completion`: Given a `prompt`, it returns the predicted completion.

-> [!IMPORTANT]
->
-> This endpoint is **not** OAI-compatible
+    *Options:*

-*Options:*
+    `prompt`: Provide the prompt for this completion as a string or as an array of strings or numbers representing tokens. Internally, if `cache_prompt` is `true`, the prompt is compared to the previous completion and only the "unseen" suffix is evaluated. A `BOS` token is inserted at the start, if all of the following conditions are true:

-`prompt`: Provide the prompt for this completion as a string or as an array of strings or numbers representing tokens. Internally, if `cache_prompt` is `true`, the prompt is compared to the previous completion and only the "unseen" suffix is evaluated. A `BOS` token is inserted at the start, if all of the following conditions are true:
+      - The prompt is a string or an array with the first element given as a string
+      - The model's `tokenizer.ggml.add_bos_token` metadata is `true`

-  - The prompt is a string or an array with the first element given as a string
-  - The model's `tokenizer.ggml.add_bos_token` metadata is `true`
+    These input shapes and data types are allowed for `prompt`:

-These input shapes and data type are allowed for `prompt`:
+      - Single string: `"string"`
+      - Single sequence of tokens: `[12, 34, 56]`
+      - Mixed tokens and strings: `[12, 34, "string", 56, 78]`

-  - Single string: `"string"`
-  - Single sequence of tokens: `[12, 34, 56]`
-  - Mixed tokens and strings: `[12, 34, "string", 56, 78]`
+    Multiple prompts are also supported. In this case, the completion result will be an array.

-Multiple prompts are also supported. In this case, the completion result will be an array.
+      - Only strings: `["string1", "string2"]`
+      - Strings and sequences of tokens: `["string1", [12, 34, 56]]`
+      - Mixed types: `[[12, 34, "string", 56, 78], [12, 34, 56], "string"]`

-  - Only strings: `["string1", "string2"]`
-  - Strings and sequences of tokens: `["string1", [12, 34, 56]]`
-  - Mixed types: `[[12, 34, "string", 56, 78], [12, 34, 56], "string"]`
+    `temperature`: Adjust the randomness of the generated text. Default: `0.8`

-`temperature`: Adjust the randomness of the generated text. Default: `0.8`
+    `dynatemp_range`: Dynamic temperature range. The final temperature will be in the range of `[temperature - dynatemp_range; temperature + dynatemp_range]` Default: `0.0`, which is disabled.

-`dynatemp_range`: Dynamic temperature range. The final temperature will be in the range of `[temperature - dynatemp_range; temperature + dynatemp_range]` Default: `0.0`, which is disabled.
+    `dynatemp_exponent`: Dynamic temperature exponent. Default: `1.0`

-`dynatemp_exponent`: Dynamic temperature exponent. Default: `1.0`
+    `top_k`: Limit the next token selection to the K most probable tokens.  Default: `40`

-`top_k`: Limit the next token selection to the K most probable tokens.  Default: `40`
+    `top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P. Default: `0.95`

-`top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P. Default: `0.95`
+    `min_p`: The minimum probability for a token to be considered, relative to the probability of the most likely token. Default: `0.05`

-`min_p`: The minimum probability for a token to be considered, relative to the probability of the most likely token. Default: `0.05`
+    `n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity.

-`n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity.
+    `n_indent`: Specify the minimum line indentation for the generated text in number of whitespace characters. Useful for code completion tasks. Default: `0`

-`n_indent`: Specify the minimum line indentation for the generated text in number of whitespace characters. Useful for code completion tasks. Default: `0`
+    `n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. The number excludes the BOS token.
+    By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt.

-`n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. The number excludes the BOS token.
-By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt.
+    `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.

-`stream`: Allows receiving each predicted token in real-time instead of waiting for the completion to finish (uses a different response format). To enable this, set to `true`.
+    `stop`: Specify a JSON array of stopping strings.
+    These words will not be included in the completion, so make sure to add them to the prompt for the next iteration. Default: `[]`

-`stop`: Specify a JSON array of stopping strings.
-These words will not be included in the completion, so make sure to add them to the prompt for the next iteration. Default: `[]`
+    `typical_p`: Enable locally typical sampling with parameter p. Default: `1.0`, which is disabled.

-`typical_p`: Enable locally typical sampling with parameter p. Default: `1.0`, which is disabled.
+    `repeat_penalty`: Control the repetition of token sequences in the generated text. Default: `1.1`

-`repeat_penalty`: Control the repetition of token sequences in the generated text. Default: `1.1`
+    `repeat_last_n`: Last n tokens to consider for penalizing repetition. Default: `64`, where `0` is disabled and `-1` is ctx-size.

-`repeat_last_n`: Last n tokens to consider for penalizing repetition. Default: `64`, where `0` is disabled and `-1` is ctx-size.
+    `penalize_nl`: Penalize newline tokens when applying the repeat penalty. Default: `true`

-`presence_penalty`: Repeat alpha presence penalty. Default: `0.0`, which is disabled.
+    `presence_penalty`: Repeat alpha presence penalty. Default: `0.0`, which is disabled.

-`frequency_penalty`: Repeat alpha frequency penalty. Default: `0.0`, which is disabled.
+    `frequency_penalty`: Repeat alpha frequency penalty. Default: `0.0`, which is disabled.

-`dry_multiplier`: Set the DRY (Don't Repeat Yourself) repetition penalty multiplier. Default: `0.0`, which is disabled.
+    `dry_multiplier`: Set the DRY (Don't Repeat Yourself) repetition penalty multiplier. Default: `0.0`, which is disabled.

-`dry_base`: Set the DRY repetition penalty base value. Default: `1.75`
+    `dry_base`: Set the DRY repetition penalty base value. Default: `1.75`

-`dry_allowed_length`: Tokens that extend repetition beyond this receive exponentially increasing penalty: multiplier * base ^ (length of repeating sequence before token - allowed length). Default: `2`
+    `dry_allowed_length`: Tokens that extend repetition beyond this receive exponentially increasing penalty: multiplier * base ^ (length of repeating sequence before token - allowed length). Default: `2`

-`dry_penalty_last_n`: How many tokens to scan for repetitions. Default: `-1`, where `0` is disabled and `-1` is context size.
+    `dry_penalty_last_n`: How many tokens to scan for repetitions. Default: `-1`, where `0` is disabled and `-1` is context size.

-`dry_sequence_breakers`: Specify an array of sequence breakers for DRY sampling. Only a JSON array of strings is accepted. Default: `['\n', ':', '"', '*']`
+    `dry_sequence_breakers`: Specify an array of sequence breakers for DRY sampling. Only a JSON array of strings is accepted. Default: `['\n', ':', '"', '*']`

-`xtc_probability`: Set the chance for token removal via XTC sampler. Default: `0.0`, which is disabled.
+    `xtc_probability`: Set the chance for token removal via XTC sampler. Default: `0.0`, which is disabled.

-`xtc_threshold`: Set a minimum probability threshold for tokens to be removed via XTC sampler. Default: `0.1` (> `0.5` disables XTC)
+    `xtc_threshold`: Set a minimum probability threshold for tokens to be removed via XTC sampler. Default: `0.1` (> `0.5` disables XTC)

-`mirostat`: Enable Mirostat sampling, controlling perplexity during text generation. Default: `0`, where `0` is disabled, `1` is Mirostat, and `2` is Mirostat 2.0.
+    `mirostat`: Enable Mirostat sampling, controlling perplexity during text generation. Default: `0`, where `0` is disabled, `1` is Mirostat, and `2` is Mirostat 2.0.

-`mirostat_tau`: Set the Mirostat target entropy, parameter tau. Default: `5.0`
+    `mirostat_tau`: Set the Mirostat target entropy, parameter tau. Default: `5.0`

-`mirostat_eta`: Set the Mirostat learning rate, parameter eta.  Default: `0.1`
+    `mirostat_eta`: Set the Mirostat learning rate, parameter eta.  Default: `0.1`

-`grammar`: Set grammar for grammar-based sampling.  Default: no grammar
+    `grammar`: Set grammar for grammar-based sampling.  Default: no grammar

-`json_schema`: Set a JSON schema for grammar-based sampling (e.g. `{"items": {"type": "string"}, "minItems": 10, "maxItems": 100}` of a list of strings, or `{}` for any JSON). See [tests](../../tests/test-json-schema-to-grammar.cpp) for supported features.  Default: no JSON schema.
+    `json_schema`: Set a JSON schema for grammar-based sampling (e.g. `{"items": {"type": "string"}, "minItems": 10, "maxItems": 100}` of a list of strings, or `{}` for any JSON). See [tests](../../tests/test-json-schema-to-grammar.cpp) for supported features.  Default: no JSON schema.

-`seed`: Set the random number generator (RNG) seed.  Default: `-1`, which is a random seed.
+    `seed`: Set the random number generator (RNG) seed.  Default: `-1`, which is a random seed.

-`ignore_eos`: Ignore end of stream token and continue generating.  Default: `false`
+    `ignore_eos`: Ignore end of stream token and continue generating.  Default: `false`

-`logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings, e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does. Default: `[]`
+    `logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings, e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does. Default: `[]`

-`n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token given the sampling settings. Note that for temperature < 0 the tokens are sampled greedily but token probabilities are still being calculated via a simple softmax of the logits without considering any other sampler settings. Default: `0`
+    `n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token given the sampling settings. Note that for temperature < 0 the tokens are sampled greedily but token probabilities are still being calculated via a simple softmax of the logits without considering any other sampler settings. Default: `0`

-`min_keep`: If greater than 0, force samplers to return N possible tokens at minimum. Default: `0`
+    `min_keep`: If greater than 0, force samplers to return N possible tokens at minimum. Default: `0`

-`t_max_predict_ms`: Set a time limit in milliseconds for the prediction (a.k.a. text-generation) phase. The timeout will trigger if the generation takes more than the specified time (measured since the first token was generated) and if a new-line character has already been generated. Useful for FIM applications. Default: `0`, which is disabled.
+    `t_max_predict_ms`: Set a time limit in milliseconds for the prediction (a.k.a. text-generation) phase. The timeout will trigger if the generation takes more than the specified time (measured since the first token was generated) and if a new-line character has already been generated. Useful for FIM applications. Default: `0`, which is disabled.

-`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
+    `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be referenced in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.

-`id_slot`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot.  Default: `-1`
+    `id_slot`: Assign the completion task to a specific slot. If it is -1, the task will be assigned to an idle slot.  Default: `-1`

-`cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `true`
+    `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `true`

-`return_tokens`: Return the raw generated token ids in the `tokens` field. Otherwise `tokens` remains empty. Default: `false`
+    `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values.

-`samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values.
-
-`timings_per_token`: Include prompt processing and text generation speed information in each response.  Default: `false`
-
-`post_sampling_probs`: Returns the probabilities of top `n_probs` tokens after applying sampling chain.
-
-`response_fields`: A list of response fields, for example: `"response_fields": ["content", "generation_settings/n_predict"]`. If the specified field is missing, it will simply be omitted from the response without triggering an error. Note that fields with a slash will be unnested; for example, `generation_settings/n_predict` will move the field `n_predict` from the `generation_settings` object to the root of the response and give it a new name.
+    `timings_per_token`: Include prompt processing and text generation speed information in each response.  Default: `false`

 **Response format**

- Note: In streaming mode (`stream`), only `content`, `tokens` and `stop` will be returned until end of completion. Responses are sent using the [Server-sent events](https://html.spec.whatwg.org/multipage/server-sent-events.html) standard. Note: the browser's `EventSource` interface cannot be used due to its lack of `POST` request support.
+- Note: When using streaming mode (`stream`), only `content` and `stop` will be returned until end of completion.

- `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has a nested array `top_logprobs`. It contains at **maximum** `n_probs` elements:
-  ```json
-  {
-    "content": "<the generated completion text>",
-    "tokens": [ generated token ids if requested ],
+- `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has the following structure:
+
+```json
+{
+  "content": "<the token selected by the model>",
+  "probs": [
+    {
+      "prob": float,
+      "tok_str": "<most likely token>"
+    },
+    {
+      "prob": float,
+      "tok_str": "<second most likely token>"
+    },
    ...
-    "probs": [
-      {
-        "id": <token id>,
-        "logprob": float,
-        "token": "<most likely token>",
-        "bytes": [int, int, ...],
-        "top_logprobs": [
-          {
-            "id": <token id>,
-            "logprob": float,
-            "token": "<token text>",
-            "bytes": [int, int, ...],
-          },
-          {
-            "id": <token id>,
-            "logprob": float,
-            "token": "<token text>",
-            "bytes": [int, int, ...],
-          },
-          ...
-        ]
-      },
-      {
-        "id": <token id>,
-        "logprob": float,
-        "token": "<most likely token>",
-        "bytes": [int, int, ...],
-        "top_logprobs": [
-          ...
-        ]
-      },
-      ...
-    ]
-  },
-  ```
-  Please note that if `post_sampling_probs` is set to `true`:
-    - `logprob` will be replaced with `prob`, with the value between 0.0 and 1.0
-    - `top_logprobs` will be replaced with `top_probs`. Each element contains:
-      - `id`: token ID
-      - `token`: token in string
-      - `bytes`: token in bytes
-      - `prob`: token probability, with the value between 0.0 and 1.0
-    - Number of elements in `top_probs` may be less than `n_probs`
+  ]
+},
+```
+
+Notice that each `probs` is an array of length `n_probs`.

 - `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
- `tokens`: Same as `content` but represented as raw token ids. Only populated if `"return_tokens": true` or `"stream": true` in the request.
 - `stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options)
 - `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`. These options may differ from the original ones in some way (e.g. bad values filtered out, strings converted to tokens, etc.).
- `model`: The model alias (for model path, please use `/props` endpoint)
- `prompt`: The processed `prompt` (special tokens may be added)
- `stop_type`: Indicating whether the completion has stopped. Possible values are:
-  - `none`: Generating (not stopped)
-  - `eos`: Stopped because it encountered the EOS token
-  - `limit`: Stopped because `n_predict` tokens were generated before stop words or EOS was encountered
-  - `word`: Stopped due to encountering a stopping word from `stop` JSON array provided
+- `model`: The path to the model loaded with `-m`
+- `prompt`: The provided `prompt`
+- `stopped_eos`: Indicating whether the completion has stopped because it encountered the EOS token
+- `stopped_limit`: Indicating whether the completion stopped because `n_predict` tokens were generated before stop words or EOS was encountered
+- `stopped_word`: Indicating whether the completion stopped due to encountering a stopping word from `stop` JSON array provided
 - `stopping_word`: The stopping word encountered which stopped the generation (or "" if not stopped due to a stopping word)
 - `timings`: Hash of timing information about the completion such as the number of tokens `predicted_per_second`
 - `tokens_cached`: Number of tokens from the prompt which could be re-used from previous completion (`n_past`)
@@ -525,13 +453,13 @@ These words will not be included in the completion, so make sure to add them to

 ### POST `/tokenize`: Tokenize a given text

-*Options:*
+    *Options:*

-`content`: (Required) The text to tokenize.
+    `content`: (Required) The text to tokenize.

-`add_special`: (Optional) Boolean indicating if special tokens, i.e. `BOS`, should be inserted.  Default: `false`
+    `add_special`: (Optional) Boolean indicating if special tokens, i.e. `BOS`, should be inserted.  Default: `false`

-`with_pieces`: (Optional) Boolean indicating whether to return token pieces along with IDs.  Default: `false`
+    `with_pieces`: (Optional) Boolean indicating whether to return token pieces along with IDs.  Default: `false`

 **Response:**

@@ -568,52 +496,52 @@ With input 'á' (utf8 hex: C3 A1) on tinyllama/stories260k

 ### POST `/detokenize`: Convert tokens to text

-*Options:*
+    *Options:*

-`tokens`: Set the tokens to detokenize.
+    `tokens`: Set the tokens to detokenize.

 ### POST `/embedding`: Generate embedding of a given text

 The same as [the embedding example](../embedding) does.

-*Options:*
+    *Options:*

-`content`: Set the text to process.
+    `content`: Set the text to process.

-`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `content`. You can determine the place of the image in the content as in the following: `Image: [img-21].\nCaption: This is a picture of a house`. In this case, `[img-21]` will be replaced by the embeddings of the image with id `21` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 21}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
+    `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be referenced in `content`. You can determine the place of the image in the content as in the following: `Image: [img-21].\nCaption: This is a picture of a house`. In this case, `[img-21]` will be replaced by the embeddings of the image with id `21` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 21}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.

 ### POST `/reranking`: Rerank documents according to a given query

 Similar to https://jina.ai/reranker/ but might change in the future.
 Requires a reranker model (such as [bge-reranker-v2-m3](https://huggingface.co/BAAI/bge-reranker-v2-m3)) and the `--embedding --pooling rank` options.

-*Options:*
+    *Options:*

-`query`: The query against which the documents will be ranked.
+    `query`: The query against which the documents will be ranked.

-`documents`: An array strings representing the documents to be ranked.
+    `documents`: An array of strings representing the documents to be ranked.

-*Aliases:*
-  - `/rerank`
-  - `/v1/rerank`
-  - `/v1/reranking`
+    *Aliases:*
+      - `/rerank`
+      - `/v1/rerank`
+      - `/v1/reranking`

-*Examples:*
+    *Examples:*

-```shell
-curl http://127.0.0.1:8012/v1/rerank \
-    -H "Content-Type: application/json" \
-    -d '{
-        "model": "some-model",
-            "query": "What is panda?",
-            "top_n": 3,
-            "documents": [
-                "hi",
-            "it is a bear",
-            "The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China."
-            ]
-    }' | jq
-```
+    ```shell
+    curl http://127.0.0.1:8012/v1/rerank \
+        -H "Content-Type: application/json" \
+        -d '{
+            "model": "some-model",
+                "query": "What is panda?",
+                "top_n": 3,
+                "documents": [
+                    "hi",
+                "it is a bear",
+                "The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China."
+                ]
+        }' | jq
+    ```

 ### POST `/infill`: For code infilling.

@@ -657,83 +585,14 @@ This endpoint is public (no API key check). By default, it is read-only. To make

 ```json
 {
-  "default_generation_settings": {
-    "id": 0,
-    "id_task": -1,
-    "n_ctx": 1024,
-    "speculative": false,
-    "is_processing": false,
-    "params": {
-      "n_predict": -1,
-      "seed": 4294967295,
-      "temperature": 0.800000011920929,
-      "dynatemp_range": 0.0,
-      "dynatemp_exponent": 1.0,
-      "top_k": 40,
-      "top_p": 0.949999988079071,
-      "min_p": 0.05000000074505806,
-      "xtc_probability": 0.0,
-      "xtc_threshold": 0.10000000149011612,
-      "typical_p": 1.0,
-      "repeat_last_n": 64,
-      "repeat_penalty": 1.0,
-      "presence_penalty": 0.0,
-      "frequency_penalty": 0.0,
-      "dry_multiplier": 0.0,
-      "dry_base": 1.75,
-      "dry_allowed_length": 2,
-      "dry_penalty_last_n": -1,
-      "dry_sequence_breakers": [
-        "\n",
-        ":",
-        "\"",
-        "*"
-      ],
-      "mirostat": 0,
-      "mirostat_tau": 5.0,
-      "mirostat_eta": 0.10000000149011612,
-      "stop": [],
-      "max_tokens": -1,
-      "n_keep": 0,
-      "n_discard": 0,
-      "ignore_eos": false,
-      "stream": true,
-      "n_probs": 0,
-      "min_keep": 0,
-      "grammar": "",
-      "samplers": [
-        "dry",
-        "top_k",
-        "typ_p",
-        "top_p",
-        "min_p",
-        "xtc",
-        "temperature"
-      ],
-      "speculative.n_max": 16,
-      "speculative.n_min": 5,
-      "speculative.p_min": 0.8999999761581421,
-      "timings_per_token": false
-    },
-    "prompt": "",
-    "next_token": {
-      "has_next_token": true,
-      "has_new_line": false,
-      "n_remain": -1,
-      "n_decoded": 0,
-      "stopping_word": ""
-    }
-  },
+  "default_generation_settings": { ... },
  "total_slots": 1,
-  "model_path": "../models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf",
-  "chat_template": "...",
-  "build_info": "b(build number)-(build commit hash)"
+  "chat_template": ""
 }
 ```

 - `default_generation_settings` - the default generation settings for the `/completion` endpoint, which has the same fields as the `generation_settings` response object from the `/completion` endpoint.
 - `total_slots` - the total number of slots for process requests (defined by `--parallel` option)
- `model_path` - the path to model file (same with `-m` argument)
 - `chat_template` - the model's original Jinja2 prompt template

 ### POST `/props`: Change server global properties.
@@ -748,131 +607,89 @@ To use this endpoint with POST method, you need to start server with `--props`

 Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used.

-*Options:*
+    *Options:*

-See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such as `mirostat` are supported.
+    See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such as `mirostat` are supported.

-The `response_format` parameter supports both plain JSON output (e.g. `{"type": "json_object"}`) and schema-constrained JSON (e.g. `{"type": "json_object", "schema": {"type": "string", "minLength": 10, "maxLength": 100}}` or `{"type": "json_schema", "schema": {"properties": { "name": { "title": "Name",  "type": "string" }, "date": { "title": "Date",  "type": "string" }, "participants": { "items": {"type: "string" }, "title": "Participants",  "type": "string" } } } }`), similar to other OpenAI-inspired API providers.
+    The `response_format` parameter supports both plain JSON output (e.g. `{"type": "json_object"}`) and schema-constrained JSON (e.g. `{"type": "json_object", "schema": {"type": "string", "minLength": 10, "maxLength": 100}}` or `{"type": "json_schema", "schema": {"properties": { "name": { "title": "Name",  "type": "string" }, "date": { "title": "Date",  "type": "string" }, "participants": { "items": {"type": "string" }, "title": "Participants",  "type": "string" } } } }`), similar to other OpenAI-inspired API providers.

-*Examples:*
+    *Examples:*

-You can use either Python `openai` library with appropriate checkpoints:
+    You can use either Python `openai` library with appropriate checkpoints:

-```python
-import openai
+    ```python
+    import openai

-client = openai.OpenAI(
-    base_url="http://localhost:8080/v1", # "http://<Your api-server IP>:port"
-    api_key = "sk-no-key-required"
-)
+    client = openai.OpenAI(
+        base_url="http://localhost:8080/v1", # "http://<Your api-server IP>:port"
+        api_key = "sk-no-key-required"
+    )

-completion = client.chat.completions.create(
-model="gpt-3.5-turbo",
-messages=[
-    {"role": "system", "content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."},
-    {"role": "user", "content": "Write a limerick about python exceptions"}
-]
-)
+    completion = client.chat.completions.create(
+    model="gpt-3.5-turbo",
+    messages=[
+        {"role": "system", "content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."},
+        {"role": "user", "content": "Write a limerick about python exceptions"}
+    ]
+    )

-print(completion.choices[0].message)
-```
+    print(completion.choices[0].message)
+    ```

-... or raw HTTP requests:
+    ... or raw HTTP requests:

-```shell
-curl http://localhost:8080/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer no-key" \
-d '{
-"model": "gpt-3.5-turbo",
-"messages": [
-{
-    "role": "system",
-    "content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."
-},
-{
-    "role": "user",
-    "content": "Write a limerick about python exceptions"
-}
-]
-}'
-```
+    ```shell
+    curl http://localhost:8080/v1/chat/completions \
+    -H "Content-Type: application/json" \
+    -H "Authorization: Bearer no-key" \
+    -d '{
+    "model": "gpt-3.5-turbo",
+    "messages": [
+    {
+        "role": "system",
+        "content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."
+    },
+    {
+        "role": "user",
+        "content": "Write a limerick about python exceptions"
+    }
+    ]
+    }'
+    ```

 ### POST `/v1/embeddings`: OpenAI-compatible embeddings API

-This endpoint requires that the model uses a pooling different than type `none`. The embeddings are normalized using the Eucledian norm.
+    *Options:*

-*Options:*
+    See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-reference/embeddings).

-See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-reference/embeddings).
+    *Examples:*

-*Examples:*
+  - input as string

- input as string
+    ```shell
+    curl http://localhost:8080/v1/embeddings \
+    -H "Content-Type: application/json" \
+    -H "Authorization: Bearer no-key" \
+    -d '{
+            "input": "hello",
+            "model":"GPT-4",
+            "encoding_format": "float"
+    }'
+    ```

-  ```shell
-  curl http://localhost:8080/v1/embeddings \
-  -H "Content-Type: application/json" \
-  -H "Authorization: Bearer no-key" \
-  -d '{
-          "input": "hello",
-          "model":"GPT-4",
-          "encoding_format": "float"
-  }'
-  ```
+  - `input` as string array

- `input` as string array
-
-  ```shell
-  curl http://localhost:8080/v1/embeddings \
-  -H "Content-Type: application/json" \
-  -H "Authorization: Bearer no-key" \
-  -d '{
-          "input": ["hello", "world"],
-          "model":"GPT-4",
-          "encoding_format": "float"
-  }'
-  ```
-
-### POST `/embeddings`: non-OpenAI-compatible embeddings API
-
-This endpoint supports all poolings, including `--pooling none`. When the pooling is `none`, the responses will contain the *unnormalized* embeddings for *all* input tokens. For all other pooling types, only the pooled embeddings are returned, normalized using Euclidian norm.
-
-Note that the response format of this endpoint is different from `/v1/embeddings`.
-
-*Options:*
-
-Same as the `/v1/embeddings` endpoint.
-
-*Examples:*
-
-Same as the `/v1/embeddings` endpoint.
-
-**Response format**
-
-```json
-[
-  {
-    "index": 0,
-    "embedding": [
-      [ ... embeddings for token 0   ... ],
-      [ ... embeddings for token 1   ... ],
-      [ ... ]
-      [ ... embeddings for token N-1 ... ],
-    ]
-  },
-  ...
-  {
-    "index": P,
-    "embedding": [
-      [ ... embeddings for token 0   ... ],
-      [ ... embeddings for token 1   ... ],
-      [ ... ]
-      [ ... embeddings for token N-1 ... ],
-    ]
-  }
-]
-```
+    ```shell
+    curl http://localhost:8080/v1/embeddings \
+    -H "Content-Type: application/json" \
+    -H "Authorization: Bearer no-key" \
+    -d '{
+            "input": ["hello", "world"],
+            "model":"GPT-4",
+            "encoding_format": "float"
+    }'
+    ```

 ### GET `/slots`: Returns the current slots processing state

@@ -889,73 +706,56 @@ Example:

 ```json
 [
-  {
-    "id": 0,
-    "id_task": -1,
-    "n_ctx": 1024,
-    "speculative": false,
-    "is_processing": false,
-    "params": {
-      "n_predict": -1,
-      "seed": 4294967295,
-      "temperature": 0.800000011920929,
-      "dynatemp_range": 0.0,
-      "dynatemp_exponent": 1.0,
-      "top_k": 40,
-      "top_p": 0.949999988079071,
-      "min_p": 0.05000000074505806,
-      "xtc_probability": 0.0,
-      "xtc_threshold": 0.10000000149011612,
-      "typical_p": 1.0,
-      "repeat_last_n": 64,
-      "repeat_penalty": 1.0,
-      "presence_penalty": 0.0,
-      "frequency_penalty": 0.0,
-      "dry_multiplier": 0.0,
-      "dry_base": 1.75,
-      "dry_allowed_length": 2,
-      "dry_penalty_last_n": -1,
-      "dry_sequence_breakers": [
-        "\n",
-        ":",
-        "\"",
-        "*"
-      ],
-      "mirostat": 0,
-      "mirostat_tau": 5.0,
-      "mirostat_eta": 0.10000000149011612,
-      "stop": [],
-      "max_tokens": -1,
-      "n_keep": 0,
-      "n_discard": 0,
-      "ignore_eos": false,
-      "stream": true,
-      "n_probs": 0,
-      "min_keep": 0,
-      "grammar": "",
-      "samplers": [
-        "dry",
-        "top_k",
-        "typ_p",
-        "top_p",
-        "min_p",
-        "xtc",
-        "temperature"
-      ],
-      "speculative.n_max": 16,
-      "speculative.n_min": 5,
-      "speculative.p_min": 0.8999999761581421,
-      "timings_per_token": false
-    },
-    "prompt": "",
-    "next_token": {
-      "has_next_token": true,
-      "has_new_line": false,
-      "n_remain": -1,
-      "n_decoded": 0,
-      "stopping_word": ""
+    {
+        "dynatemp_exponent": 1.0,
+        "dynatemp_range": 0.0,
+        "frequency_penalty": 0.0,
+        "grammar": "",
+        "id": 0,
+        "ignore_eos": false,
+        "is_processing": false,
+        "logit_bias": [],
+        "min_p": 0.05000000074505806,
+        "mirostat": 0,
+        "mirostat_eta": 0.10000000149011612,
+        "mirostat_tau": 5.0,
+        "model": "llama-2-7b-32k-instruct.Q2_K.gguf",
+        "n_ctx": 2048,
+        "n_keep": 0,
+        "n_predict": 100000,
+        "n_probs": 0,
+        "next_token": {
+            "has_next_token": true,
+            "n_remain": -1,
+            "n_decoded": 0,
+            "stopped_eos": false,
+            "stopped_limit": false,
+            "stopped_word": false,
+            "stopping_word": ""
+        },
+        "penalize_nl": true,
+        "presence_penalty": 0.0,
+        "prompt": "Say hello to llama.cpp",
+        "repeat_last_n": 64,
+        "repeat_penalty": 1.100000023841858,
+        "samplers": [
+            "top_k",
+            "typical_p",
+            "top_p",
+            "min_p",
+            "temperature"
+        ],
+        "seed": 42,
+        "stop": [
+            "\n"
+        ],
+        "stream": false,
+        "task_id": 0,
+        "temperature": 0.0,
+        "top_k": 40,
+        "top_p": 0.949999988079071,
+        "typical_p": 1.0
    }
-  }
 ]
 ```

@@ -975,9 +775,9 @@ Available metrics:

 ### POST `/slots/{id_slot}?action=save`: Save the prompt cache of the specified slot to a file.

-*Options:*
+    *Options:*

-`filename`: Name of the file to save the slot's prompt cache. The file will be saved in the directory specified by the `--slot-save-path` server parameter.
+    `filename`: Name of the file to save the slot's prompt cache. The file will be saved in the directory specified by the `--slot-save-path` server parameter.

 **Response format**

@@ -995,9 +795,9 @@ Available metrics:

 ### POST `/slots/{id_slot}?action=restore`: Restore the prompt cache of the specified slot from a file.

-*Options:*
+    *Options:*

-`filename`: Name of the file to restore the slot's prompt cache from. The file should be located in the directory specified by the `--slot-save-path` server parameter.
+    `filename`: Name of the file to restore the slot's prompt cache from. The file should be located in the directory specified by the `--slot-save-path` server parameter.

 **Response format**

--- a/examples/server/deps.sh
+++ b/examples/server/deps.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+# Download and update deps for binary
+set -eu # abort on the first failed download or unset variable
+# get the directory of this script file
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+PUBLIC="$DIR/public"
+
+echo "download js bundle files"
+
+# Note for contributors: Always pin to a specific version "maj.min.patch" to avoid breaking the CI
+# curl -f turns an HTTP error into a failure instead of saving the error page as a dependency
+curl -fL https://cdn.tailwindcss.com/3.4.14 > "$PUBLIC/deps_tailwindcss.js"
+echo >> "$PUBLIC/deps_tailwindcss.js" # add newline
+
+curl -fL https://cdnjs.cloudflare.com/ajax/libs/daisyui/4.12.14/styled.min.css > "$PUBLIC/deps_daisyui.min.css"
+curl -fL https://cdnjs.cloudflare.com/ajax/libs/daisyui/4.12.14/themes.min.css >> "$PUBLIC/deps_daisyui.min.css"
+echo >> "$PUBLIC/deps_daisyui.min.css" # add newline
+
+curl -fL https://unpkg.com/vue@3.5.12/dist/vue.esm-browser.js > "$PUBLIC/deps_vue.esm-browser.js"
+echo >> "$PUBLIC/deps_vue.esm-browser.js" # add newline
+
+curl -fL https://cdnjs.cloudflare.com/ajax/libs/markdown-it/13.0.2/markdown-it.js > "$PUBLIC/deps_markdown-it.js"
+echo >> "$PUBLIC/deps_markdown-it.js" # add newline
+
+ls -lah "$PUBLIC"
--- a/examples/server/public/completion.js
+++ b/examples/server/public/completion.js
@@ -0,0 +1,225 @@
+const paramDefaults = { // request defaults; llama() spreads the caller's params over these
+  stream: true,
+  temperature: 0.2,
+};
+
+let generation_settings = null; // cache: set by llama() from the final stop chunk, or by llamaModelInfo() from /props
+
+export class CompletionError extends Error { // `name` tags the failure kind, e.g. 'ServerError'
+  constructor(message, name, data) {
+    super(message);
+    Object.assign(this, { name, data }); // keep `data` too: it was accepted but silently dropped before
+  }
+};
+
+// Completes the prompt as an async generator. Recommended for most use cases.
+// POSTs `params` (merged over paramDefaults) plus `prompt` to `config.endpoint`
+// (default '/completion') under `config.api_url`, yields each parsed SSE event and
+// returns the concatenated content. Throws CompletionError on any non-200 reply.
+//
+//    import { llama } from '/completion.js'
+//
+//    const request = llama("Tell me a joke", {n_predict: 800})
+//    for await (const chunk of request) {
+//      document.write(chunk.data.content)
+//    }
+export async function* llama(prompt, params = {}, config = {}) {
+  let controller = config.controller;
+  const api_url = config.api_url?.replace(/\/+$/, '') || "";
+
+  if (!controller) {
+    controller = new AbortController();
+  }
+
+  const completionParams = { ...paramDefaults, ...params, prompt };
+
+  const response = await fetch(`${api_url}${config.endpoint || '/completion'}`, {
+    method: 'POST',
+    body: JSON.stringify(completionParams),
+    headers: {
+      'Connection': 'keep-alive',
+      'Content-Type': 'application/json',
+      'Accept': 'text/event-stream',
+      ...(params.api_key ? {'Authorization': `Bearer ${params.api_key}`} : {})
+    },
+    signal: controller.signal,
+  });
+
+  if (response.status !== 200) {
+    // Every non-200 reply is a failure, even when the body carries no usable error message
+    let message = `HTTP error ${response.status}`;
+    try {
+      const body = await response.json();
+      message = body?.error?.message || message;
+    } catch (err) {
+      message = err.message;
+    }
+    throw new CompletionError(message, 'ServerError');
+  }
+
+  const reader = response.body.getReader();
+  const decoder = new TextDecoder();
+
+  let content = "";
+  let leftover = ""; // Buffer for partially read lines
+
+  try {
+    let cont = true;
+
+    while (cont) {
+      const result = await reader.read();
+      if (result.done) {
+        break;
+      }
+
+      // Add any leftover data to the current chunk; stream mode keeps multi-byte UTF-8 split across chunks intact
+      const text = leftover + decoder.decode(result.value, { stream: true });
+
+      // Check if the last character is a line break
+      const endsWithLineBreak = text.endsWith('\n');
+
+      // Split the text into lines
+      let lines = text.split('\n');
+
+      // If the text doesn't end with a line break, then the last line is incomplete
+      // Store it in leftover to be added to the next chunk of data
+      if (!endsWithLineBreak) {
+        leftover = lines.pop();
+      } else {
+        leftover = ""; // Reset leftover if we have a line break at the end
+      }
+
+      // Parse all sse events and add them to result (no `g` flag: exec must start at index 0 for every line)
+      const regex = /^(\S+):\s(.*)$/m;
+      for (const line of lines) {
+        const match = regex.exec(line);
+        if (match) {
+          result[match[1]] = match[2];
+          if (result.data === '[DONE]') {
+            cont = false;
+            break;
+          }
+
+          // since we know this is llama.cpp, let's just decode the json in data
+          if (result.data) {
+            result.data = JSON.parse(result.data);
+            content += result.data.content;
+
+            // yield
+            yield result;
+
+            // if we got a stop token from server, we will break here
+            if (result.data.stop) {
+              if (result.data.generation_settings) {
+                generation_settings = result.data.generation_settings;
+              }
+              cont = false;
+              break;
+            }
+          }
+          if (result.error) {
+            let slotUnavailable = false;
+            try {
+              result.error = JSON.parse(result.error);
+              slotUnavailable = result.error.message.includes('slot unavailable');
+              if (!slotUnavailable) console.error(`llama.cpp error [${result.error.code} - ${result.error.type}]: ${result.error.message}`);
+            } catch(e) {
+              console.error(`llama.cpp error ${result.error}`)
+            }
+            // Throw outside the try/catch above: thrown inside it, the error was
+            // swallowed by that catch and never reached upstream callers
+            if (slotUnavailable) throw new Error('slot unavailable');
+          }
+        }
+      }
+    }
+  } catch (e) {
+    if (e.name !== 'AbortError') {
+      console.error("llama error: ", e);
+    }
+    throw e;
+  }
+  finally {
+    controller.abort();
+  }
+
+  return content;
+}
+
+// Call llama, return an event target that you can subscribe to.
+// Events: "message" (detail: chunk data), "generation_settings", "timings",
+// "done" (detail: { content }) and "error" (detail: the error thrown by llama).
+//
+// Example:
+//
+//    import { llamaEventTarget } from '/completion.js'
+//    const conn = llamaEventTarget(prompt)
+//    conn.addEventListener("message", (chunk) => {
+//      document.write(chunk.detail.content)
+//    })
+export const llamaEventTarget = (prompt, params = {}, config = {}) => {
+  const eventTarget = new EventTarget();
+  (async () => {
+    let content = "";
+    for await (const chunk of llama(prompt, params, config)) {
+      if (chunk.data) {
+        content += chunk.data.content;
+        eventTarget.dispatchEvent(new CustomEvent("message", { detail: chunk.data }));
+      }
+      if (chunk.data?.generation_settings) {
+        eventTarget.dispatchEvent(new CustomEvent("generation_settings", { detail: chunk.data.generation_settings }));
+      }
+      if (chunk.data?.timings) {
+        eventTarget.dispatchEvent(new CustomEvent("timings", { detail: chunk.data.timings }));
+      }
+    }
+    eventTarget.dispatchEvent(new CustomEvent("done", { detail: { content } }));
+  })().catch((error) => eventTarget.dispatchEvent(new CustomEvent("error", { detail: error }))); // surface failures instead of an unhandled rejection
+  return eventTarget;
+}
+
+// Call llama, return a promise that resolves to the completed text. This does not support streaming
+//
+// `params` and `config` are passed straight through to llama(). The promise
+// rejects with whatever llama() throws.
+//
+// Example:
+//
+//     llamaPromise(prompt).then((content) => {
+//       document.write(content)
+//     })
+//
+//     or
+//
+//     const content = await llamaPromise(prompt)
+//     document.write(content)
+//
+export const llamaPromise = async (prompt, params = {}, config = {}) => {
+  // Collects the streamed chunks from llama() and resolves to the full text.
+  // An async function already returns a promise that rejects when llama()
+  // throws, so the former `new Promise(async ...)` wrapper is not needed.
+  let content = "";
+  for await (const chunk of llama(prompt, params, config)) {
+    content += chunk.data.content;
+  }
+  return content;
+};
+
+/**
+ * (deprecated) Streams a completion for `params.prompt`, passing every chunk to `callback`.
+ */
+export const llamaComplete = async (params, controller, callback) => {
+  const stream = llama(params.prompt, params, { controller });
+  // Forward each yielded chunk to the caller-supplied callback, in order
+  for await (const chunk of stream) callback(chunk);
+}
+
+// Get the model info from the server. This is useful for getting the context window and so on.
+export const llamaModelInfo = async (config = {}) => {
+  // Serve the module-level cache when a previous call (or a finished completion) filled it
+  if (generation_settings) return generation_settings;
+  const baseUrl = config.api_url?.replace(/\/+$/, '') || "";
+  const response = await fetch(`${baseUrl}/props`);
+  generation_settings = (await response.json()).default_generation_settings;
+  return generation_settings;
+}
--- a/examples/server/public/deps_daisyui.min.css
+++ b/examples/server/public/deps_daisyui.min.css
--- a/examples/server/public/deps_markdown-it.js
+++ b/examples/server/public/deps_markdown-it.js
--- a/examples/server/public/deps_tailwindcss.js
+++ b/examples/server/public/deps_tailwindcss.js
--- a/examples/server/public/deps_vue.esm-browser.js
+++ b/examples/server/public/deps_vue.esm-browser.js
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@@ -0,0 +1,730 @@
+<html>
+<head>
+  <meta charset="UTF-8">
+  <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" />
+  <meta name="color-scheme" content="light dark">
+  <title>🦙 llama.cpp - chat</title>
+
+  <!-- Note: dependencies can be updated using ./deps.sh script -->
+  <link href="./deps_daisyui.min.css" rel="stylesheet" type="text/css" />
+  <script src="./deps_tailwindcss.js"></script>
+  <style type="text/tailwindcss">
+    .markdown {
+      h1, h2, h3, h4, h5, h6, ul, ol, li { all: revert; }
+      pre {
+        @apply whitespace-pre-wrap rounded-lg p-2;
+        border: 1px solid currentColor;
+      }
+      /* TODO: fix markdown table */
+    }
+    /*
+      Note for daisyui: because we're using a subset of daisyui via CDN, many things won't be included
+      We can manually add the missing styles from https://cdnjs.cloudflare.com/ajax/libs/daisyui/4.12.14/full.css
+    */
+    .bg-base-100 {background-color: var(--fallback-b1,oklch(var(--b1)/1))}
+    .bg-base-200 {background-color: var(--fallback-b2,oklch(var(--b2)/1))}
+    .bg-base-300 {background-color: var(--fallback-b3,oklch(var(--b3)/1))}
+    .text-base-content {color: var(--fallback-bc,oklch(var(--bc)/1))}
+    .show-on-hover {
+      @apply opacity-0 group-hover:opacity-100;
+    }
+    .btn-mini {
+      @apply cursor-pointer hover:shadow-md;
+    }
+    .chat-screen { max-width: 900px; }
+    /* because the default bubble color is quite dark, we will make a custom one using bg-base-300 */
+    .chat-bubble-base-300 {
+      --tw-bg-opacity: 1;
+      --tw-text-opacity: 1;
+      @apply bg-base-300 text-base-content;
+    }
+  </style>
+</head>
+
+<body>
+  <div id="app" class="flex flex-row opacity-0"> <!-- opacity-0 will be removed on app mounted -->
+    <!-- sidebar -->
+    <div class="flex flex-col bg-black bg-opacity-5 w-64 py-8 px-4 h-screen overflow-y-auto">
+      <h2 class="font-bold mb-4 ml-4">Conversations</h2>
+
+      <!-- list of conversations -->
+      <div :class="{
+        'btn btn-ghost justify-start': true,
+        'btn-active': messages.length === 0,
+      }" @click="newConversation">
+        + New conversation
+      </div>
+      <div v-for="conv in conversations" :class="{
+        'btn btn-ghost justify-start font-normal': true,
+        'btn-active': conv.id === viewingConvId,
+      }" @click="setViewingConv(conv.id)">
+        <span class="truncate">{{ conv.messages[0].content }}</span>
+      </div>
+      <div class="text-center text-xs opacity-40 mt-auto mx-4">
+        Conversations are saved to browser's localStorage
+      </div>
+    </div>
+
+    <div class="chat-screen flex flex-col w-screen h-screen px-8 mx-auto">
+      <!-- header -->
+      <div class="flex flex-row items-center">
+        <div class="grow text-2xl font-bold mt-8 mb-6">
+          🦙 llama.cpp - chat
+        </div>
+
+        <!-- action buttons (top right) -->
+        <div class="flex items-center">
+          <button v-if="messages.length > 0" class="btn mr-1" @click="deleteConv(viewingConvId)" :disabled="isGenerating">
+            <!-- delete conversation button -->
+            <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-trash" viewBox="0 0 16 16">
+              <path d="M5.5 5.5A.5.5 0 0 1 6 6v6a.5.5 0 0 1-1 0V6a.5.5 0 0 1 .5-.5m2.5 0a.5.5 0 0 1 .5.5v6a.5.5 0 0 1-1 0V6a.5.5 0 0 1 .5-.5m3 .5a.5.5 0 0 0-1 0v6a.5.5 0 0 0 1 0z"/>
+              <path d="M14.5 3a1 1 0 0 1-1 1H13v9a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2V4h-.5a1 1 0 0 1-1-1V2a1 1 0 0 1 1-1H6a1 1 0 0 1 1-1h2a1 1 0 0 1 1 1h3.5a1 1 0 0 1 1 1zM4.118 4 4 4.059V13a1 1 0 0 0 1 1h6a1 1 0 0 0 1-1V4.059L11.882 4zM2.5 3h11V2h-11z"/>
+            </svg>
+          </button>
+          <button v-if="messages.length > 0" class="btn mr-1" @click="downloadConv(viewingConvId)" :disabled="isGenerating">
+              <!-- download conversation button -->
+              <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-download" viewBox="0 0 16 16">
+                  <path d="M.5 9.9a.5.5 0 0 1 .5.5v2.5a1 1 0 0 0 1 1h12a1 1 0 0 0 1-1v-2.5a.5.5 0 0 1 1 0v2.5a2 2 0 0 1-2 2H2a2 2 0 0 1-2-2v-2.5a.5.5 0 0 1 .5-.5"/>
+                  <path d="M7.646 11.854a.5.5 0 0 0 .708 0l3-3a.5.5 0 0 0-.708-.708L8.5 10.293V1.5a.5.5 0 0 0-1 0v8.793L5.354 8.146a.5.5 0 1 0-.708.708z"/>
+            </svg>
+          </button>
+          <button class="btn" @click="showConfigDialog = true" :disabled="isGenerating">
+            <!-- edit config button -->
+            <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-gear" viewBox="0 0 16 16">
+              <path d="M8 4.754a3.246 3.246 0 1 0 0 6.492 3.246 3.246 0 0 0 0-6.492M5.754 8a2.246 2.246 0 1 1 4.492 0 2.246 2.246 0 0 1-4.492 0"/>
+              <path d="M9.796 1.343c-.527-1.79-3.065-1.79-3.592 0l-.094.319a.873.873 0 0 1-1.255.52l-.292-.16c-1.64-.892-3.433.902-2.54 2.541l.159.292a.873.873 0 0 1-.52 1.255l-.319.094c-1.79.527-1.79 3.065 0 3.592l.319.094a.873.873 0 0 1 .52 1.255l-.16.292c-.892 1.64.901 3.434 2.541 2.54l.292-.159a.873.873 0 0 1 1.255.52l.094.319c.527 1.79 3.065 1.79 3.592 0l.094-.319a.873.873 0 0 1 1.255-.52l.292.16c1.64.893 3.434-.902 2.54-2.541l-.159-.292a.873.873 0 0 1 .52-1.255l.319-.094c1.79-.527 1.79-3.065 0-3.592l-.319-.094a.873.873 0 0 1-.52-1.255l.16-.292c.893-1.64-.902-3.433-2.541-2.54l-.292.159a.873.873 0 0 1-1.255-.52zm-2.633.283c.246-.835 1.428-.835 1.674 0l.094.319a1.873 1.873 0 0 0 2.693 1.115l.291-.16c.764-.415 1.6.42 1.184 1.185l-.159.292a1.873 1.873 0 0 0 1.116 2.692l.318.094c.835.246.835 1.428 0 1.674l-.319.094a1.873 1.873 0 0 0-1.115 2.693l.16.291c.415.764-.42 1.6-1.185 1.184l-.291-.159a1.873 1.873 0 0 0-2.693 1.116l-.094.318c-.246.835-1.428.835-1.674 0l-.094-.319a1.873 1.873 0 0 0-2.692-1.115l-.292.16c-.764.415-1.6-.42-1.184-1.185l.159-.291A1.873 1.873 0 0 0 1.945 8.93l-.319-.094c-.835-.246-.835-1.428 0-1.674l.319-.094A1.873 1.873 0 0 0 3.06 4.377l-.16-.292c-.415-.764.42-1.6 1.185-1.184l.292.159a1.873 1.873 0 0 0 2.692-1.115z"/>
+            </svg>
+          </button>
+
+          <!-- theme controller is copied from https://daisyui.com/components/theme-controller/ -->
+          <div class="dropdown dropdown-end dropdown-bottom">
+            <div tabindex="0" role="button" class="btn m-1">
+              Theme
+              <svg width="12px" height="12px" class="inline-block h-2 w-2 fill-current opacity-60" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 2048 2048">
+                <path d="M1799 349l242 241-1017 1017L7 590l242-241 775 775 775-775z"></path>
+              </svg>
+            </div>
+            <ul tabindex="0" class="dropdown-content bg-base-300 rounded-box z-[1] w-52 p-2 shadow-2xl h-80 overflow-y-auto">
+              <li>
+                <button
+                  class="btn btn-sm btn-block w-full btn-ghost justify-start"
+                  :class="{ 'btn-active': selectedTheme === 'auto' }"
+                  @click="setSelectedTheme('auto')">
+                  auto
+                </button>
+              </li>
+              <li v-for="theme in themes">
+                <input
+                  type="radio"
+                  name="theme-dropdown"
+                  class="theme-controller btn btn-sm btn-block w-full btn-ghost justify-start"
+                  :aria-label="theme"
+                  :value="theme"
+                  :checked="selectedTheme === theme"
+                  @click="setSelectedTheme(theme)" />
+              </li>
+            </ul>
+          </div>
+        </div>
+      </div>
+
+      <!-- chat messages -->
+      <div id="messages-list" class="flex flex-col grow overflow-y-auto">
+        <div class="mt-auto flex justify-center">
+          <!-- placeholder to shift the message to the bottom -->
+           {{ messages.length === 0 ? 'Send a message to start' : '' }}
+        </div>
+        <div v-for="msg in messages" class="group">
+          <div :class="{
+            'chat': true,
+            'chat-start': msg.role !== 'user',
+            'chat-end': msg.role === 'user',
+          }">
+            <div :class="{
+              'chat-bubble markdown': true,
+              'chat-bubble-base-300': msg.role !== 'user',
+            }">
+              <!-- textarea for editing message -->
+              <template v-if="editingMsg && editingMsg.id === msg.id">
+                <textarea
+                  class="textarea textarea-bordered bg-base-100 text-base-content w-96"
+                  v-model="msg.content"></textarea>
+                <br/>
+                <button class="btn btn-ghost mt-2 mr-2" @click="editingMsg = null">Cancel</button>
+                <button class="btn mt-2" @click="editUserMsgAndRegenerate(msg)">Submit</button>
+              </template>
+              <!-- render message as markdown -->
+              <vue-markdown v-else :source="msg.content" />
+            </div>
+          </div>
+
+          <!-- actions for each message -->
+          <div :class="{'text-right': msg.role === 'user'}" class="mx-4 mt-2 mb-2">
+            <!-- user message: edit button, revealed on hover like the assistant actions below -->
+            <button v-if="msg.role === 'user'" class="badge btn-mini show-on-hover" @click="editingMsg = msg" :disabled="isGenerating">
+              ✍️ Edit
+            </button>
+            <!-- assistant message -->
+            <button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="regenerateMsg(msg)" :disabled="isGenerating">
+              🔄 Regenerate
+            </button>
+            <button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="copyMsg(msg)" :disabled="isGenerating">
+              📋 Copy
+            </button>
+          </div>
+        </div>
+
+        <!-- pending (ongoing) assistant message -->
+        <div id="pending-msg" class="chat chat-start">
+          <div v-if="pendingMsg" class="chat-bubble markdown chat-bubble-base-300">
+            <span v-if="!pendingMsg.content" class="loading loading-dots loading-md"></span>
+            <vue-markdown v-else :source="pendingMsg.content" />
+          </div>
+        </div>
+      </div>
+
+      <!-- chat input -->
+      <div class="flex flex-row items-center mt-8 mb-6">
+        <textarea
+          class="textarea textarea-bordered w-full"
+          placeholder="Type a message (Shift+Enter to add a new line)"
+          v-model="inputMsg"
+          @keydown.enter.exact.prevent="sendMessage"
+          @keydown.enter.shift.exact.prevent="inputMsg += '\n'"
+          :disabled="isGenerating"
+          id="msg-input"
+        ></textarea>
+        <button v-if="!isGenerating" class="btn btn-primary ml-2" @click="sendMessage" :disabled="inputMsg.length === 0">Send</button>
+        <button v-else class="btn btn-neutral ml-2" @click="stopGeneration">Stop</button>
+      </div>
+    </div>
+
+    <!-- modal for editing config -->
+    <dialog class="modal" :class="{'modal-open': showConfigDialog}">
+      <div class="modal-box">
+        <h3 class="text-lg font-bold mb-6">Settings</h3>
+        <div class="h-[calc(90vh-12rem)] overflow-y-auto">
+          <p class="opacity-40 mb-6">Settings below are saved in browser's localStorage</p>
+          <settings-modal-short-input :config-key="'apiKey'" :config-default="configDefault" :config-info="configInfo" v-model="config.apiKey"></settings-modal-short-input>
+          <label class="form-control mb-2">
+            <div class="label">System Message</div>
+            <textarea class="textarea textarea-bordered h-24" :placeholder="'Default: ' + configDefault.systemMessage" v-model="config.systemMessage"></textarea>
+          </label>
+          <template v-for="configKey in ['temperature', 'top_k', 'top_p', 'min_p', 'max_tokens']">
+            <settings-modal-short-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]" />
+          </template>
+          <!-- TODO: add more sampling-related configs, please regroup them into different "collapse" sections -->
+          <!-- Section: Other sampler settings -->
+          <details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
+            <summary class="collapse-title font-bold">Other sampler settings</summary>
+            <div class="collapse-content">
+              <!-- Samplers queue -->
+              <settings-modal-short-input label="Samplers queue" :config-key="'samplers'" :config-default="configDefault" :config-info="configInfo" v-model="config.samplers"></settings-modal-short-input>
+              <!-- Samplers -->
+              <template v-for="configKey in ['dynatemp_range', 'dynatemp_exponent', 'typical_p', 'xtc_probability', 'xtc_threshold']">
+                <settings-modal-short-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]" />
+              </template>
+            </div>
+          </details>
+          <!-- Section: Penalties settings -->
+          <details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
+            <summary class="collapse-title font-bold">Penalties settings</summary>
+            <div class="collapse-content">
+              <template v-for="configKey in ['repeat_last_n', 'repeat_penalty', 'presence_penalty', 'frequency_penalty', 'dry_multiplier', 'dry_base', 'dry_allowed_length', 'dry_penalty_last_n']">
+                <settings-modal-short-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]" />
+              </template>
+            </div>
+          </details>
+          <!-- Section: Advanced config -->
+          <details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
+            <summary class="collapse-title font-bold">Advanced config</summary>
+            <div class="collapse-content">
+              <label class="form-control mb-2">
+                <!-- Custom parameters input -->
+                <div class="label inline">Custom JSON config (For more info, refer to <a class="underline" href="https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md" target="_blank" rel="noopener noreferrer">server documentation</a>)</div>
+                <textarea class="textarea textarea-bordered h-24" placeholder="Example: { &quot;mirostat&quot;: 1, &quot;min_p&quot;: 0.1 }" v-model="config.custom"></textarea>
+              </label>
+            </div>
+          </details>
+        </div>
+
+        <!-- action buttons -->
+        <div class="modal-action">
+          <button class="btn" @click="resetConfigDialog">Reset to default</button>
+          <button class="btn" @click="closeAndDiscardConfigDialog">Close</button>
+          <button class="btn btn-primary" @click="closeAndSaveConfigDialog">Save and close</button>
+        </div>
+      </div>
+    </dialog>
+  </div>
+
+  <!-- Template to be used by settings modal -->
+  <template id="settings-modal-short-input">
+    <label class="input input-bordered join-item grow flex items-center gap-2 mb-2">
+      <!-- Show help message on hovering on the input label -->
+      <div class="dropdown dropdown-hover">
+        <div tabindex="0" role="button" class="font-bold">{{ label || configKey }}</div>
+        <div class="dropdown-content menu bg-base-100 rounded-box z-10 w-64 p-2 shadow mt-4">
+          {{ configInfo[configKey] || '(no help message available)' }}
+        </div>
+      </div>
+      <!-- Here we forward v-model from parent to child component, see: https://stackoverflow.com/questions/47311936/v-model-and-child-components -->
+      <input type="text" class="grow" :placeholder="'Default: ' + (configDefault[configKey] || 'none')" :value="modelValue" @input="$emit('update:modelValue', $event.target.value)" />
+    </label>
+  </template>
+
+  <script src="./deps_markdown-it.js"></script>
+  <script type="module">
+    import { createApp, defineComponent, shallowRef, computed, h } from './deps_vue.esm-browser.js';
+    import { llama } from './completion.js';
+
+    // utility functions
+    const isString = (x) => !!x.toLowerCase;
+    const isNumeric = (n) => !isString(n) && !isNaN(n);
+    const escapeAttr = (str) => str.replace(/>/g, '&gt;').replace(/"/g, '&quot;');
+    const copyStr = (str) => navigator.clipboard.writeText(str);
+
+    // constants
+    // Base URL passed to llama() as `api_url`: a 'base' entry in localStorage takes
+    // precedence (debugging), otherwise '.' resolved against document.baseURI is used.
+    const BASE_URL = localStorage.getItem('base') // for debugging
+      || (new URL('.', document.baseURI).href).toString(); // for production
+    // Default value for every user-configurable setting.
+    // StorageUtils.getConfig() merges the saved config over this object, and the
+    // settings dialog resets to it.
+    const CONFIG_DEFAULT = {
+      // Note: in order not to introduce breaking changes, please keep the same data type (number, string, etc) if you want to change the default value. Do not use null or undefined for default value.
+      apiKey: '',
+      systemMessage: 'You are a helpful assistant.',
+      // make sure these default values are in sync with `common.h`
+      samplers: 'dkypmxt',
+      temperature: 0.8,
+      dynatemp_range: 0.0,
+      dynatemp_exponent: 1.0,
+      top_k: 40,
+      top_p: 0.95,
+      min_p: 0.05,
+      xtc_probability: 0.0,
+      xtc_threshold: 0.1,
+      typical_p: 1.0,
+      repeat_last_n: 64,
+      repeat_penalty: 1.0,
+      presence_penalty: 0.0,
+      frequency_penalty: 0.0,
+      dry_multiplier: 0.0,
+      dry_base: 1.75,
+      dry_allowed_length: 2,
+      dry_penalty_last_n: -1,
+      max_tokens: -1,
+      custom: '', // custom json-stringified object
+    };
+    // Help text for each config key, shown when hovering the label in the settings dialog.
+    // An empty string makes the dialog fall back to '(no help message available)'.
+    const CONFIG_INFO = {
+      apiKey: 'Set the API Key if you are using --api-key option for the server.',
+      systemMessage: 'The starting message that defines how model should behave.',
+      samplers: 'The order at which samplers are applied, in simplified way. Default is "dkypmxt": dry->top_k->typ_p->top_p->min_p->xtc->temperature',
+      temperature: 'Controls the randomness of the generated text by affecting the probability distribution of the output tokens. Higher = more random, lower = more focused.',
+      dynatemp_range: 'Addon for the temperature sampler. The added value to the range of dynamic temperature, which adjusts probabilities by entropy of tokens.',
+      dynatemp_exponent: 'Addon for the temperature sampler. Smoothes out the probability redistribution based on the most probable token.',
+      top_k: 'Keeps only k top tokens.',
+      top_p: 'Limits tokens to those that together have a cumulative probability of at least p',
+      min_p: 'Limits tokens based on the minimum probability for a token to be considered, relative to the probability of the most likely token.',
+      xtc_probability: 'XTC sampler cuts out top tokens; this parameter controls the chance of cutting tokens at all. 0 disables XTC.',
+      xtc_threshold: 'XTC sampler cuts out top tokens; this parameter controls the token probability that is required to cut that token.',
+      typical_p: 'Sorts and limits tokens based on the difference between log-probability and entropy.',
+      repeat_last_n: 'Last n tokens to consider for penalizing repetition',
+      repeat_penalty: 'Controls the repetition of token sequences in the generated text',
+      presence_penalty: 'Limits tokens based on whether they appear in the output or not.',
+      frequency_penalty: 'Limits tokens based on how often they appear in the output.',
+      dry_multiplier: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling multiplier.',
+      dry_base: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling base value.',
+      dry_allowed_length: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the allowed length for DRY sampling.',
+      dry_penalty_last_n: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets DRY penalty for the last n tokens.',
+      max_tokens: 'The maximum number of token per output.',
+      custom: '', // no help text for the custom JSON config
+    };
+    // config keys having numeric value (i.e. temperature, top_k, top_p, etc)
+    // derived from CONFIG_DEFAULT: every key whose default value passes isNumeric()
+    const CONFIG_NUMERIC_KEYS = Object.entries(CONFIG_DEFAULT).filter(e => isNumeric(e[1])).map(e => e[0]);
+    // list of themes supported by daisyui
+    const THEMES = ['light', 'dark', 'cupcake', 'bumblebee', 'emerald', 'corporate', 'synthwave', 'retro', 'cyberpunk', 'valentine', 'halloween', 'garden', 'forest', 'aqua', 'lofi', 'pastel', 'fantasy', 'wireframe', 'black', 'luxury', 'dracula', 'cmyk', 'autumn', 'business', 'acid', 'lemonade', 'night', 'coffee', 'winter', 'dim', 'nord', 'sunset'];
+
+    // markdown support
+    const VueMarkdown = defineComponent(
+      (props) => {
+        const md = shallowRef(new markdownit({ breaks: true }));
+        const origFenchRenderer = md.value.renderer.rules.fence;
+        md.value.renderer.rules.fence = (tokens, idx, ...args) => {
+          const content = tokens[idx].content;
+          const origRendered = origFenchRenderer(tokens, idx, ...args);
+          return `<div class="relative my-4">
+            <div class="text-right sticky top-4 mb-2 mr-2 h-0">
+              <button class="badge btn-mini" onclick="copyStr(${escapeAttr(JSON.stringify(content))})">📋 Copy</button>
+            </div>
+            ${origRendered}
+          </div>`;
+        };
+        window.copyStr = copyStr;
+        const content = computed(() => md.value.render(props.source));
+        return () => h("div", { innerHTML: content.value });
+      },
+      { props: ["source"] }
+    );
+
+    // input field to be used by settings modal
+    // The markup comes from the <template id="settings-modal-short-input"> element;
+    // the component receives the value through `modelValue` (v-model on the parent side).
+    const SettingsModalShortInput = defineComponent({
+      template: document.getElementById('settings-modal-short-input').innerHTML,
+      props: {
+        label: { type: String, required: false }, // optional display label (the template falls back to configKey)
+        configKey: String,
+        configDefault: Object,
+        configInfo: Object,
+        modelValue: [Object, String, Number],
+      },
+    });
+
+    // coversations is stored in localStorage
+    // format: { [convId]: { id: string, lastModified: number, messages: [...] } }
+    // convId is a string prefixed with 'conv-'
+    const StorageUtils = {
+      // manage conversations
+      getAllConversations() {
+        const res = [];
+        for (const key in localStorage) {
+          if (key.startsWith('conv-')) {
+            res.push(JSON.parse(localStorage.getItem(key)));
+          }
+        }
+        res.sort((a, b) => b.lastModified - a.lastModified);
+        return res;
+      },
+      // can return null if convId does not exist
+      getOneConversation(convId) {
+        return JSON.parse(localStorage.getItem(convId) || 'null');
+      },
+      // if convId does not exist, create one
+      appendMsg(convId, msg) {
+        if (msg.content === null) return;
+        const conv = StorageUtils.getOneConversation(convId) || {
+          id: convId,
+          lastModified: Date.now(),
+          messages: [],
+        };
+        conv.messages.push(msg);
+        conv.lastModified = Date.now();
+        localStorage.setItem(convId, JSON.stringify(conv));
+      },
+      getNewConvId() {
+        return `conv-${Date.now()}`;
+      },
+      remove(convId) {
+        localStorage.removeItem(convId);
+      },
+      filterAndKeepMsgs(convId, predicate) {
+        const conv = StorageUtils.getOneConversation(convId);
+        if (!conv) return;
+        conv.messages = conv.messages.filter(predicate);
+        conv.lastModified = Date.now();
+        localStorage.setItem(convId, JSON.stringify(conv));
+      },
+      popMsg(convId) {
+        const conv = StorageUtils.getOneConversation(convId);
+        if (!conv) return;
+        const msg = conv.messages.pop();
+        conv.lastModified = Date.now();
+        if (conv.messages.length === 0) {
+          StorageUtils.remove(convId);
+        } else {
+          localStorage.setItem(convId, JSON.stringify(conv));
+        }
+        return msg;
+      },
+
+      // manage config
+      getConfig() {
+        const savedVal = JSON.parse(localStorage.getItem('config') || '{}');
+        // to prevent breaking changes in the future, we always provide default value for missing keys
+        return {
+          ...CONFIG_DEFAULT,
+          ...savedVal,
+        };
+      },
+      setConfig(config) {
+        localStorage.setItem('config', JSON.stringify(config));
+      },
+      getTheme() {
+        return localStorage.getItem('theme') || 'auto';
+      },
+      setTheme(theme) {
+        if (theme === 'auto') {
+          localStorage.removeItem('theme');
+        } else {
+          localStorage.setItem('theme', theme);
+        }
+      },
+    };
+
+    // scroll to bottom of chat messages
+    // if requiresNearBottom is true, only auto-scroll if user is near bottom
+    const chatScrollToBottom = (requiresNearBottom) => {
+      const msgListElem = document.getElementById('messages-list');
+      const spaceToBottom = msgListElem.scrollHeight - msgListElem.scrollTop - msgListElem.clientHeight;
+      if (!requiresNearBottom || (spaceToBottom < 100)) {
+        setTimeout(() => msgListElem.scrollTo({ top: msgListElem.scrollHeight }), 1);
+      }
+    };
+
+    const mainApp = createApp({
+      components: {
+        VueMarkdown,
+        SettingsModalShortInput,
+      },
+      // Initial reactive state; conversations, theme and config are loaded from localStorage.
+      data() {
+        return {
+          conversations: StorageUtils.getAllConversations(),
+          messages: [], // { id: number, role: 'user' | 'assistant', content: string }
+          viewingConvId: StorageUtils.getNewConvId(), // starts on a fresh (not yet stored) conversation id
+          inputMsg: '',
+          isGenerating: false,
+          pendingMsg: null, // the on-going message from assistant
+          stopGeneration: () => {}, // replaced by an abort callback while a generation is running
+          selectedTheme: StorageUtils.getTheme(),
+          config: StorageUtils.getConfig(),
+          showConfigDialog: false,
+          editingMsg: null, // the message currently being edited, if any
+          // const
+          themes: THEMES,
+          configDefault: {...CONFIG_DEFAULT},
+          configInfo: {...CONFIG_INFO},
+        }
+      },
+      computed: {},
+      mounted() {
+        document.getElementById('app').classList.remove('opacity-0'); // show app
+        // scroll to the bottom when the pending message height is updated
+        const pendingMsgElem = document.getElementById('pending-msg');
+        const resizeObserver = new ResizeObserver(() => {
+          if (this.isGenerating) chatScrollToBottom(true);
+        });
+        resizeObserver.observe(pendingMsgElem);
+      },
+      methods: {
+        // Select `theme` in the UI state and persist the choice via StorageUtils.
+        setSelectedTheme(theme) {
+          this.selectedTheme = theme;
+          StorageUtils.setTheme(theme);
+        },
+        newConversation() {
+          if (this.isGenerating) return;
+          this.viewingConvId = StorageUtils.getNewConvId();
+          this.editingMsg = null;
+          this.fetchMessages();
+          chatScrollToBottom();
+        },
+        setViewingConv(convId) {
+          if (this.isGenerating) return;
+          this.viewingConvId = convId;
+          this.editingMsg = null;
+          this.fetchMessages();
+          chatScrollToBottom();
+        },
+        deleteConv(convId) {
+          if (this.isGenerating) return;
+          if (window.confirm('Are you sure to delete this conversation?')) {
+            StorageUtils.remove(convId);
+            if (this.viewingConvId === convId) {
+              this.viewingConvId = StorageUtils.getNewConvId();
+              this.editingMsg = null;
+            }
+            this.fetchConversation();
+            this.fetchMessages();
+          }
+        },
+        downloadConv(convId) {
+          const conversation = StorageUtils.getOneConversation(convId);
+          if (!conversation) {
+            alert('Conversation not found.');
+            return;
+          }
+          const conversationJson = JSON.stringify(conversation, null, 2);
+          const blob = new Blob([conversationJson], { type: 'application/json' });
+          const url = URL.createObjectURL(blob);
+          const a = document.createElement('a');
+          a.href = url;
+          a.download = `conversation_${convId}.json`;
+          document.body.appendChild(a);
+          a.click();
+          document.body.removeChild(a);
+          URL.revokeObjectURL(url);
+        },
+        async sendMessage() {
+          if (!this.inputMsg) return;
+          const currConvId = this.viewingConvId;
+
+          StorageUtils.appendMsg(currConvId, {
+            id: Date.now(),
+            role: 'user',
+            content: this.inputMsg,
+          });
+          this.fetchConversation();
+          this.fetchMessages();
+          this.inputMsg = '';
+          this.editingMsg = null;
+          this.generateMessage(currConvId);
+          chatScrollToBottom();
+        },
+        async generateMessage(currConvId) {
+          if (this.isGenerating) return;
+          this.pendingMsg = { id: Date.now()+1, role: 'assistant', content: null };
+          this.isGenerating = true;
+          this.editingMsg = null;
+
+          try {
+            const abortController = new AbortController();
+            this.stopGeneration = () => abortController.abort();
+            const params = {
+              messages: [
+                { role: 'system', content: this.config.systemMessage },
+                ...this.messages,
+              ],
+              stream: true,
+              cache_prompt: true,
+              samplers: this.config.samplers,
+              temperature: this.config.temperature,
+              dynatemp_range: this.config.dynatemp_range,
+              dynatemp_exponent: this.config.dynatemp_exponent,
+              top_k: this.config.top_k,
+              top_p: this.config.top_p,
+              min_p: this.config.min_p,
+              typical_p: this.config.typical_p,
+              xtc_probability: this.config.xtc_probability,
+              xtc_threshold: this.config.xtc_threshold,
+              repeat_last_n: this.config.repeat_last_n,
+              repeat_penalty: this.config.repeat_penalty,
+              presence_penalty: this.config.presence_penalty,
+              frequency_penalty: this.config.frequency_penalty,
+              dry_multiplier: this.config.dry_multiplier,
+              dry_base: this.config.dry_base,
+              dry_allowed_length: this.config.dry_allowed_length,
+              dry_penalty_last_n: this.config.dry_penalty_last_n,
+              max_tokens: this.config.max_tokens,
+              ...(this.config.custom.length ? JSON.parse(this.config.custom) : {}),
+              ...(this.config.apiKey ? { api_key: this.config.apiKey } : {}),
+            };
+            const config = {
+              controller: abortController,
+              api_url: BASE_URL,
+              endpoint: '/chat/completions',
+            };
+            for await (const chunk of llama(prompt, params, config)) {
+              const stop = chunk.data.stop;
+              const addedContent = chunk.data.choices[0].delta.content;
+              const lastContent = this.pendingMsg.content || '';
+              if (addedContent) {
+                this.pendingMsg = {
+                  id: this.pendingMsg.id,
+                  role: 'assistant',
+                  content: lastContent + addedContent,
+                };
+              }
+            }
+
+            StorageUtils.appendMsg(currConvId, this.pendingMsg);
+            this.fetchConversation();
+            this.fetchMessages();
+            setTimeout(() => document.getElementById('msg-input').focus(), 1);
+          } catch (error) {
+            if (error.name === 'AbortError') {
+              // user stopped the generation via stopGeneration() function
+              StorageUtils.appendMsg(currConvId, this.pendingMsg);
+              this.fetchConversation();
+              this.fetchMessages();
+            } else {
+              console.error(error);
+              alert(error);
+              // pop last user message
+              const lastUserMsg = StorageUtils.popMsg(currConvId);
+              this.inputMsg = lastUserMsg ? lastUserMsg.content : '';
+            }
+          }
+
+          this.pendingMsg = null;
+          this.isGenerating = false;
+          this.stopGeneration = () => {};
+          this.fetchMessages();
+          chatScrollToBottom();
+        },
+
+        // message actions
+        regenerateMsg(msg) {
+          if (this.isGenerating) return;
+          // TODO: somehow keep old history (like how ChatGPT has different "tree"). This can be done by adding "sub-conversations" with "subconv-" prefix, and new message will have a list of subconvIds
+          const currConvId = this.viewingConvId;
+          StorageUtils.filterAndKeepMsgs(currConvId, (m) => m.id < msg.id);
+          this.fetchConversation();
+          this.fetchMessages();
+          this.generateMessage(currConvId);
+        },
+        // Copy the message content to the clipboard.
+        copyMsg(msg) {
+          copyStr(msg.content);
+        },
+        editUserMsgAndRegenerate(msg) {
+          if (this.isGenerating) return;
+          const currConvId = this.viewingConvId;
+          const newContent = msg.content;
+          this.editingMsg = null;
+          StorageUtils.filterAndKeepMsgs(currConvId, (m) => m.id < msg.id);
+          StorageUtils.appendMsg(currConvId, {
+            id: Date.now(),
+            role: 'user',
+            content: newContent,
+          });
+          this.fetchConversation();
+          this.fetchMessages();
+          this.generateMessage(currConvId);
+        },
+
+        // settings dialog methods
+        closeAndSaveConfigDialog() {
+          try {
+            if (this.config.custom.length) JSON.parse(this.config.custom);
+          } catch (error) {
+            alert('Invalid JSON for custom config. Please either fix it or leave it empty.');
+            return;
+          }
+          for (const key of CONFIG_NUMERIC_KEYS) {
+            if (isNaN(this.config[key]) || this.config[key].toString().trim().length === 0) {
+              alert(`Invalid number for ${key} (expected an integer or a float)`);
+              return;
+            }
+            this.config[key] = parseFloat(this.config[key]);
+          }
+          this.showConfigDialog = false;
+          StorageUtils.setConfig(this.config);
+        },
+        // Close the dialog and throw away unsaved edits by reloading the stored config.
+        closeAndDiscardConfigDialog() {
+          this.showConfigDialog = false;
+          this.config = StorageUtils.getConfig();
+        },
+        // After confirmation, replace the edited config with a copy of CONFIG_DEFAULT.
+        // Nothing is written to storage here.
+        resetConfigDialog() {
+          if (window.confirm('Are you sure to reset all settings?')) {
+            this.config = {...CONFIG_DEFAULT};
+          }
+        },
+
+        // sync state functions
+        // Reload the conversation list from storage.
+        fetchConversation() {
+          this.conversations = StorageUtils.getAllConversations();
+        },
+        // Reload the messages of the viewed conversation; an empty list when it is not stored.
+        fetchMessages() {
+          this.messages = StorageUtils.getOneConversation(this.viewingConvId)?.messages ?? [];
+        },
+      },
+    });
+    // errors reported through the app's errorHandler are shown with alert()
+    mainApp.config.errorHandler = alert;
+    // Mount the app; if mounting throws, replace the page content with a
+    // recovery screen offering to clear localStorage and reload.
+    try {
+      mainApp.mount('#app');
+    } catch (err) {
+      console.error(err);
+      document.getElementById('app').innerHTML = `<div style="margin:2em auto">
+        Failed to start app. Please try clearing localStorage and try again.<br/>
+        <br/>
+        <button class="btn" onClick="localStorage.clear(); window.location.reload();">Clear localStorage</button>
+      </div>`;
+    }
+  </script>
+</body>
+
+</html>
--- a/examples/server/public/index.html.gz
+++ b/examples/server/public/index.html.gz
--- a/examples/server/public_legacy/index-new.html
+++ b/examples/server/public_legacy/index-new.html
@@ -39,6 +39,7 @@
      temperature: 0.8, // adapt all following parameters to optimized min-p requierements. If for non-english, set to 0.6 or lower
      repeat_last_n: 0, // 0 = disable penalty, -1 = context size
      repeat_penalty: 1.0, // 1.0 = disabled
+      penalize_nl: false, // true only useful for infinite completion
      dry_multiplier: 0.0, // 0.0 = disabled, 0.8 works well
      dry_base: 1.75,     // 0.0 = disabled
      dry_allowed_length: 2, // tokens extending repetitions beyond this receive penalty, 2 works well
--- a/examples/server/public_legacy/index.html
+++ b/examples/server/public_legacy/index.html
@@ -303,6 +303,7 @@
      temperature: 0.7,
      repeat_last_n: 256, // 0 = disable penalty, -1 = context size
      repeat_penalty: 1.18, // 1.0 = disabled
+      penalize_nl: false,
      dry_multiplier: 0.0, // 0.0 = disabled, 0.8 works well
      dry_base: 1.75,     // 0.0 = disabled
      dry_allowed_length: 2, // tokens extending repetitions beyond this receive penalty, 2 works well
@@ -1005,6 +1006,7 @@
            ${FloatField({ label: "Temperature", max: 2.0, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })}
            ${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })}
            ${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })}
+            ${BoolField({ label: "Penalize repetition of newlines", name: "penalize_nl", value: params.value.penalize_nl })}
            ${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })}
            ${FloatField({ label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })}
            ${FloatField({ label: "Min-P sampling", max: 1.0, min: 0.0, name: "min_p", step: 0.01, value: params.value.min_p })}
--- a/examples/server/public_simplechat/simplechat.js
+++ b/examples/server/public_simplechat/simplechat.js
@@ -407,9 +407,6 @@ class SimpleChat {
                if (curLine.startsWith("data:")) {
                    curLine = curLine.substring(5);
                }
-                if (curLine.trim() === "[DONE]") {
-                    break;
-                }
                let curJson = JSON.parse(curLine);
                console.debug("DBUG:SC:PART:Json:", curJson);
                this.append_response(this.response_extract_stream(curJson, apiEP));
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
--- a/examples/server/tests/README.md
+++ b/examples/server/tests/README.md
@@ -44,10 +44,4 @@ To run with stdout/stderr display in real time (verbose output, but useful for d
 DEBUG=1 ./tests.sh -s -v -x
 ```

-Hint: You can compile and run test in single command, useful for local developement:
-
-```shell
-cmake --build build -j --target llama-server && ./examples/server/tests/tests.sh
-```
-
 To see all available arguments, please refer to [pytest documentation](https://docs.pytest.org/en/stable/how-to/usage.html)
--- a/examples/server/tests/tests.sh
+++ b/examples/server/tests/tests.sh
@@ -1,9 +1,5 @@
 #!/bin/bash

-# make sure we are in the right directory
-SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-cd $SCRIPT_DIR
-
 set -eu

 if [ $# -lt 1 ]
--- a/examples/server/tests/unit/test_basic.py
+++ b/examples/server/tests/unit/test_basic.py
@@ -1,5 +1,4 @@
 import pytest
-import requests
 from utils import *

 server = ServerPreset.tinyllama2()
@@ -23,12 +22,7 @@ def test_server_props():
    server.start()
    res = server.make_request("GET", "/props")
    assert res.status_code == 200
-    assert ".gguf" in res.body["model_path"]
    assert res.body["total_slots"] == server.n_slots
-    default_val = res.body["default_generation_settings"]
-    assert server.n_ctx is not None and server.n_slots is not None
-    assert default_val["n_ctx"] == server.n_ctx / server.n_slots
-    assert default_val["params"]["seed"] == server.seed


 def test_server_models():
@@ -39,31 +33,6 @@ def test_server_models():
    assert len(res.body["data"]) == 1
    assert res.body["data"][0]["id"] == server.model_alias

-
-def test_server_slots():
-    global server
-
-    # without slots endpoint enabled, this should return error
-    server.server_slots = False
-    server.start()
-    res = server.make_request("GET", "/slots")
-    assert res.status_code == 501 # ERROR_TYPE_NOT_SUPPORTED
-    assert "error" in res.body
-    server.stop()
-
-    # with slots endpoint enabled, this should return slots info
-    server.server_slots = True
-    server.n_slots = 2
-    server.start()
-    res = server.make_request("GET", "/slots")
-    assert res.status_code == 200
-    assert len(res.body) == server.n_slots
-    assert server.n_ctx is not None and server.n_slots is not None
-    assert res.body[0]["n_ctx"] == server.n_ctx / server.n_slots
-    assert "params" in res.body[0]
-    assert res.body[0]["params"]["seed"] == server.seed
-
-
 def test_load_split_model():
    global server
    server.model_hf_repo = "ggml-org/models"
@@ -77,20 +46,3 @@ def test_load_split_model():
    })
    assert res.status_code == 200
    assert match_regex("(little|girl)+", res.body["content"])
-
-
-def test_no_webui():
-    global server
-    # default: webui enabled
-    server.start()
-    url = f"http://{server.server_host}:{server.server_port}"
-    res = requests.get(url)
-    assert res.status_code == 200
-    assert "<html>" in res.text
-    server.stop()
-
-    # with --no-webui
-    server.no_webui = True
-    server.start()
-    res = requests.get(url)
-    assert res.status_code == 404
--- a/examples/server/tests/unit/test_chat_completion.py
+++ b/examples/server/tests/unit/test_chat_completion.py
@@ -12,13 +12,13 @@ def create_server():


@pytest.mark.parametrize(
-    "model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason",
+    "model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,truncated",
    [
-        (None, "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, "length"),
-        ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length"),
+        ("llama-2", "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, False),
+        ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, False),
    ]
 )
-def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason):
+def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, truncated):
    global server
    server.start()
    res = server.make_request("POST", "/chat/completions", data={
@@ -30,29 +30,29 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte
        ],
    })
    assert res.status_code == 200
-    assert "cmpl" in res.body["id"] # make sure the completion id has the expected format
-    assert res.body["system_fingerprint"].startswith("b")
-    assert res.body["model"] == model if model is not None else server.model_alias
    assert res.body["usage"]["prompt_tokens"] == n_prompt
    assert res.body["usage"]["completion_tokens"] == n_predicted
    choice = res.body["choices"][0]
    assert "assistant" == choice["message"]["role"]
    assert match_regex(re_content, choice["message"]["content"])
-    assert choice["finish_reason"] == finish_reason
+    if truncated:
+        assert choice["finish_reason"] == "length"
+    else:
+        assert choice["finish_reason"] == "stop"


@pytest.mark.parametrize(
-    "system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason",
+    "model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,truncated",
    [
-        ("Book", "What is the best book", 8, "(Suddenly)+", 77, 8, "length"),
-        ("You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length"),
+        ("llama-2", "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, False),
+        ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, False),
    ]
 )
-def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason):
+def test_chat_completion_stream(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, truncated):
    global server
-    server.model_alias = None # try using DEFAULT_OAICOMPAT_MODEL
    server.start()
    res = server.make_stream_request("POST", "/chat/completions", data={
+        "model": model,
        "max_tokens": max_tokens,
        "messages": [
            {"role": "system", "content": system_prompt},
@@ -61,20 +61,18 @@ def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_conte
        "stream": True,
    })
    content = ""
-    last_cmpl_id = None
    for data in res:
        choice = data["choices"][0]
-        assert data["system_fingerprint"].startswith("b")
-        assert "gpt-3.5" in data["model"] # DEFAULT_OAICOMPAT_MODEL, maybe changed in the future
-        if last_cmpl_id is None:
-            last_cmpl_id = data["id"]
-        assert last_cmpl_id == data["id"] # make sure the completion id is the same for all events in the stream
        if choice["finish_reason"] in ["stop", "length"]:
            assert data["usage"]["prompt_tokens"] == n_prompt
            assert data["usage"]["completion_tokens"] == n_predicted
            assert "content" not in choice["delta"]
            assert match_regex(re_content, content)
-            assert choice["finish_reason"] == finish_reason
+            # FIXME: not sure why this is incorrect in stream mode
+            # if truncated:
+            #   assert choice["finish_reason"] == "length"
+            # else:
+            #   assert choice["finish_reason"] == "stop"
        else:
            assert choice["finish_reason"] is None
            content += choice["delta"]["content"]
@@ -94,8 +92,8 @@ def test_chat_completion_with_openai_library():
        seed=42,
        temperature=0.8,
    )
-    assert res.system_fingerprint is not None and res.system_fingerprint.startswith("b")
-    assert res.choices[0].finish_reason == "length"
+    print(res)
+    assert res.choices[0].finish_reason == "stop"
    assert res.choices[0].message.content is not None
    assert match_regex("(Suddenly)+", res.choices[0].message.content)

@@ -165,64 +163,3 @@ def test_chat_completion_with_timings_per_token():
        assert "predicted_per_second" in data["timings"]
        assert "predicted_n" in data["timings"]
        assert data["timings"]["predicted_n"] <= 10
-
-
-def test_logprobs():
-    global server
-    server.start()
-    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}")
-    res = client.chat.completions.create(
-        model="gpt-3.5-turbo-instruct",
-        temperature=0.0,
-        messages=[
-            {"role": "system", "content": "Book"},
-            {"role": "user", "content": "What is the best book"},
-        ],
-        max_tokens=5,
-        logprobs=True,
-        top_logprobs=10,
-    )
-    output_text = res.choices[0].message.content
-    aggregated_text = ''
-    assert res.choices[0].logprobs is not None
-    assert res.choices[0].logprobs.content is not None
-    for token in res.choices[0].logprobs.content:
-        aggregated_text += token.token
-        assert token.logprob <= 0.0
-        assert token.bytes is not None
-        assert len(token.top_logprobs) > 0
-    assert aggregated_text == output_text
-
-
-def test_logprobs_stream():
-    global server
-    server.start()
-    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}")
-    res = client.chat.completions.create(
-        model="gpt-3.5-turbo-instruct",
-        temperature=0.0,
-        messages=[
-            {"role": "system", "content": "Book"},
-            {"role": "user", "content": "What is the best book"},
-        ],
-        max_tokens=5,
-        logprobs=True,
-        top_logprobs=10,
-        stream=True,
-    )
-    output_text = ''
-    aggregated_text = ''
-    for data in res:
-        choice = data.choices[0]
-        if choice.finish_reason is None:
-            if choice.delta.content:
-                output_text += choice.delta.content
-            assert choice.logprobs is not None
-            assert choice.logprobs.content is not None
-            for token in choice.logprobs.content:
-                aggregated_text += token.token
-                assert token.logprob <= 0.0
-                assert token.bytes is not None
-                assert token.top_logprobs is not None
-                assert len(token.top_logprobs) > 0
-    assert aggregated_text == output_text
--- a/examples/server/tests/unit/test_completion.py
+++ b/examples/server/tests/unit/test_completion.py
@@ -10,29 +10,22 @@ def create_server():
    global server
    server = ServerPreset.tinyllama2()

-@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated,return_tokens", [
-    ("I believe the meaning of life is", 8, "(going|bed)+", 18, 8, False, False),
-    ("Write a joke about AI from a very long prompt which will not be truncated", 256, "(princesses|everyone|kids|Anna|forest)+", 46, 64, False, True),
+@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated", [
+    ("I believe the meaning of life is", 8, "(going|bed)+", 18, 8, False),
+    ("Write a joke about AI from a very long prompt which will not be truncated", 256, "(princesses|everyone|kids|Anna|forest)+", 46, 64, False),
 ])
-def test_completion(prompt: str, n_predict: int, re_content: str, n_prompt: int, n_predicted: int, truncated: bool, return_tokens: bool):
+def test_completion(prompt: str, n_predict: int, re_content: str, n_prompt: int, n_predicted: int, truncated: bool):
    global server
    server.start()
    res = server.make_request("POST", "/completion", data={
        "n_predict": n_predict,
        "prompt": prompt,
-        "return_tokens": return_tokens,
    })
    assert res.status_code == 200
    assert res.body["timings"]["prompt_n"] == n_prompt
    assert res.body["timings"]["predicted_n"] == n_predicted
    assert res.body["truncated"] == truncated
-    assert type(res.body["has_new_line"]) == bool
    assert match_regex(re_content, res.body["content"])
-    if return_tokens:
-        assert len(res.body["tokens"]) > 0
-        assert all(type(tok) == int for tok in res.body["tokens"])
-    else:
-        assert res.body["tokens"] == []


@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated", [
@@ -49,42 +42,15 @@ def test_completion_stream(prompt: str, n_predict: int, re_content: str, n_promp
    })
    content = ""
    for data in res:
-        assert "stop" in data and type(data["stop"]) == bool
        if data["stop"]:
            assert data["timings"]["prompt_n"] == n_prompt
            assert data["timings"]["predicted_n"] == n_predicted
            assert data["truncated"] == truncated
-            assert data["stop_type"] == "limit"
-            assert type(data["has_new_line"]) == bool
-            assert "generation_settings" in data
-            assert server.n_predict is not None
-            assert data["generation_settings"]["n_predict"] == min(n_predict, server.n_predict)
-            assert data["generation_settings"]["seed"] == server.seed
            assert match_regex(re_content, content)
        else:
-            assert len(data["tokens"]) > 0
-            assert all(type(tok) == int for tok in data["tokens"])
            content += data["content"]


-def test_completion_stream_vs_non_stream():
-    global server
-    server.start()
-    res_stream = server.make_stream_request("POST", "/completion", data={
-        "n_predict": 8,
-        "prompt": "I believe the meaning of life is",
-        "stream": True,
-    })
-    res_non_stream = server.make_request("POST", "/completion", data={
-        "n_predict": 8,
-        "prompt": "I believe the meaning of life is",
-    })
-    content_stream = ""
-    for data in res_stream:
-        content_stream += data["content"]
-    assert content_stream == res_non_stream.body["content"]
-
-
@pytest.mark.parametrize("n_slots", [1, 2])
 def test_consistent_result_same_seed(n_slots: int):
    global server
@@ -95,7 +61,7 @@ def test_consistent_result_same_seed(n_slots: int):
        res = server.make_request("POST", "/completion", data={
            "prompt": "I believe the meaning of life is",
            "seed": 42,
-            "temperature": 0.0,
+            "temperature": 1.0,
            "cache_prompt": False,  # TODO: remove this once test_cache_vs_nocache_prompt is fixed
        })
        if last_res is not None:
@@ -120,10 +86,9 @@ def test_different_result_different_seed(n_slots: int):
            assert res.body["content"] != last_res.body["content"]
        last_res = res

-# TODO figure why it don't work with temperature = 1
-# @pytest.mark.parametrize("temperature", [0.0, 1.0])
+
@pytest.mark.parametrize("n_batch", [16, 32])
-@pytest.mark.parametrize("temperature", [0.0])
+@pytest.mark.parametrize("temperature", [0.0, 1.0])
 def test_consistent_result_different_batch_size(n_batch: int, temperature: float):
    global server
    server.n_batch = n_batch
@@ -256,117 +221,3 @@ def test_completion_parallel_slots(n_slots: int, n_requests: int):
        assert len(res.body["content"]) > 10
        # FIXME: the result is not deterministic when using other slot than slot 0
        # assert match_regex(re_content, res.body["content"])
-
-
-@pytest.mark.parametrize(
-    "prompt,n_predict,response_fields",
-    [
-        ("I believe the meaning of life is", 8, []),
-        ("I believe the meaning of life is", 32, ["content", "generation_settings/n_predict", "prompt"]),
-    ],
-)
-def test_completion_response_fields(
-    prompt: str, n_predict: int, response_fields: list[str]
-):
-    global server
-    server.start()
-    res = server.make_request(
-        "POST",
-        "/completion",
-        data={
-            "n_predict": n_predict,
-            "prompt": prompt,
-            "response_fields": response_fields,
-        },
-    )
-    assert res.status_code == 200
-    assert "content" in res.body
-    assert len(res.body["content"])
-    if len(response_fields):
-        assert res.body["generation_settings/n_predict"] == n_predict
-        assert res.body["prompt"] == "<s> " + prompt
-        assert isinstance(res.body["content"], str)
-        assert len(res.body) == len(response_fields)
-    else:
-        assert len(res.body)
-        assert "generation_settings" in res.body
-
-
-def test_n_probs():
-    global server
-    server.start()
-    res = server.make_request("POST", "/completion", data={
-        "prompt": "I believe the meaning of life is",
-        "n_probs": 10,
-        "temperature": 0.0,
-        "n_predict": 5,
-    })
-    assert res.status_code == 200
-    assert "completion_probabilities" in res.body
-    assert len(res.body["completion_probabilities"]) == 5
-    for tok in res.body["completion_probabilities"]:
-        assert "id" in tok and tok["id"] > 0
-        assert "token" in tok and type(tok["token"]) == str
-        assert "logprob" in tok and tok["logprob"] <= 0.0
-        assert "bytes" in tok and type(tok["bytes"]) == list
-        assert len(tok["top_logprobs"]) == 10
-        for prob in tok["top_logprobs"]:
-            assert "id" in prob and prob["id"] > 0
-            assert "token" in prob and type(prob["token"]) == str
-            assert "logprob" in prob and prob["logprob"] <= 0.0
-            assert "bytes" in prob and type(prob["bytes"]) == list
-
-
-def test_n_probs_stream():
-    global server
-    server.start()
-    res = server.make_stream_request("POST", "/completion", data={
-        "prompt": "I believe the meaning of life is",
-        "n_probs": 10,
-        "temperature": 0.0,
-        "n_predict": 5,
-        "stream": True,
-    })
-    for data in res:
-        if data["stop"] == False:
-            assert "completion_probabilities" in data
-            assert len(data["completion_probabilities"]) == 1
-            for tok in data["completion_probabilities"]:
-                assert "id" in tok and tok["id"] > 0
-                assert "token" in tok and type(tok["token"]) == str
-                assert "logprob" in tok and tok["logprob"] <= 0.0
-                assert "bytes" in tok and type(tok["bytes"]) == list
-                assert len(tok["top_logprobs"]) == 10
-                for prob in tok["top_logprobs"]:
-                    assert "id" in prob and prob["id"] > 0
-                    assert "token" in prob and type(prob["token"]) == str
-                    assert "logprob" in prob and prob["logprob"] <= 0.0
-                    assert "bytes" in prob and type(prob["bytes"]) == list
-
-
-def test_n_probs_post_sampling():
-    global server
-    server.start()
-    res = server.make_request("POST", "/completion", data={
-        "prompt": "I believe the meaning of life is",
-        "n_probs": 10,
-        "temperature": 0.0,
-        "n_predict": 5,
-        "post_sampling_probs": True,
-    })
-    assert res.status_code == 200
-    assert "completion_probabilities" in res.body
-    assert len(res.body["completion_probabilities"]) == 5
-    for tok in res.body["completion_probabilities"]:
-        assert "id" in tok and tok["id"] > 0
-        assert "token" in tok and type(tok["token"]) == str
-        assert "prob" in tok and 0.0 < tok["prob"] <= 1.0
-        assert "bytes" in tok and type(tok["bytes"]) == list
-        assert len(tok["top_probs"]) == 10
-        for prob in tok["top_probs"]:
-            assert "id" in prob and prob["id"] > 0
-            assert "token" in prob and type(prob["token"]) == str
-            assert "prob" in prob and 0.0 <= prob["prob"] <= 1.0
-            assert "bytes" in prob and type(prob["bytes"]) == list
-        # because the test model usually output token with either 100% or 0% probability, we need to check all the top_probs
-        assert any(prob["prob"] == 1.0 for prob in tok["top_probs"])
--- a/examples/server/tests/unit/test_embedding.py
+++ b/examples/server/tests/unit/test_embedding.py
@@ -1,5 +1,3 @@
-import base64
-import struct
 import pytest
 from openai import OpenAI
 from utils import *
@@ -16,9 +14,8 @@ def create_server():

 def test_embedding_single():
    global server
-    server.pooling = 'last'
    server.start()
-    res = server.make_request("POST", "/v1/embeddings", data={
+    res = server.make_request("POST", "/embeddings", data={
        "input": "I believe the meaning of life is",
    })
    assert res.status_code == 200
@@ -32,9 +29,8 @@ def test_embedding_single():

 def test_embedding_multiple():
    global server
-    server.pooling = 'last'
    server.start()
-    res = server.make_request("POST", "/v1/embeddings", data={
+    res = server.make_request("POST", "/embeddings", data={
        "input": [
            "I believe the meaning of life is",
            "Write a joke about AI from a very long prompt which will not be truncated",
@@ -49,72 +45,10 @@ def test_embedding_multiple():
        assert len(d['embedding']) > 1


-@pytest.mark.parametrize(
-    "input,is_multi_prompt",
-    [
-        # do not crash on empty input
-        ("", False),
-        # single prompt
-        ("string", False),
-        ([12, 34, 56], False),
-        ([12, 34, "string", 56, 78], False),
-        # multiple prompts
-        (["string1", "string2"], True),
-        (["string1", [12, 34, 56]], True),
-        ([[12, 34, 56], [12, 34, 56]], True),
-        ([[12, 34, 56], [12, "string", 34, 56]], True),
-    ]
-)
-def test_embedding_mixed_input(input, is_multi_prompt: bool):
-    global server
-    server.start()
-    res = server.make_request("POST", "/v1/embeddings", data={"input": input})
-    assert res.status_code == 200
-    data = res.body['data']
-    if is_multi_prompt:
-        assert len(data) == len(input)
-        for d in data:
-            assert 'embedding' in d
-            assert len(d['embedding']) > 1
-    else:
-        assert 'embedding' in data[0]
-        assert len(data[0]['embedding']) > 1
-
-
-def test_embedding_pooling_none():
-    global server
-    server.pooling = 'none'
-    server.start()
-    res = server.make_request("POST", "/embeddings", data={
-        "input": "hello hello hello",
-    })
-    assert res.status_code == 200
-    assert 'embedding' in res.body[0]
-    assert len(res.body[0]['embedding']) == 5 # 3 text tokens + 2 special
-
-    # make sure embedding vector is not normalized
-    for x in res.body[0]['embedding']:
-        assert abs(sum([x ** 2 for x in x]) - 1) > EPSILON
-
-
-def test_embedding_pooling_none_oai():
-    global server
-    server.pooling = 'none'
-    server.start()
-    res = server.make_request("POST", "/v1/embeddings", data={
-        "input": "hello hello hello",
-    })
-
-    # /v1/embeddings does not support pooling type 'none'
-    assert res.status_code == 400
-    assert "error" in res.body
-
-
 def test_embedding_openai_library_single():
    global server
-    server.pooling = 'last'
    server.start()
-    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
+    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}")
    res = client.embeddings.create(model="text-embedding-3-small", input="I believe the meaning of life is")
    assert len(res.data) == 1
    assert len(res.data[0].embedding) > 1
@@ -122,9 +56,8 @@ def test_embedding_openai_library_single():

 def test_embedding_openai_library_multiple():
    global server
-    server.pooling = 'last'
    server.start()
-    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
+    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}")
    res = client.embeddings.create(model="text-embedding-3-small", input=[
        "I believe the meaning of life is",
        "Write a joke about AI from a very long prompt which will not be truncated",
@@ -138,9 +71,8 @@ def test_embedding_openai_library_multiple():

 def test_embedding_error_prompt_too_long():
    global server
-    server.pooling = 'last'
    server.start()
-    res = server.make_request("POST", "/v1/embeddings", data={
+    res = server.make_request("POST", "/embeddings", data={
        "input": "This is a test " * 512,
    })
    assert res.status_code != 200
@@ -148,9 +80,8 @@ def test_embedding_error_prompt_too_long():


 def test_same_prompt_give_same_result():
-    server.pooling = 'last'
    server.start()
-    res = server.make_request("POST", "/v1/embeddings", data={
+    res = server.make_request("POST", "/embeddings", data={
        "input": [
            "I believe the meaning of life is",
            "I believe the meaning of life is",
@@ -166,72 +97,3 @@ def test_same_prompt_give_same_result():
        vi = res.body['data'][i]['embedding']
        for x, y in zip(v0, vi):
            assert abs(x - y) < EPSILON
-
-
-@pytest.mark.parametrize(
-    "content,n_tokens",
-    [
-        ("I believe the meaning of life is", 9),
-        ("This is a test", 6),
-    ]
-)
-def test_embedding_usage_single(content, n_tokens):
-    global server
-    server.start()
-    res = server.make_request("POST", "/v1/embeddings", data={"input": content})
-    assert res.status_code == 200
-    assert res.body['usage']['prompt_tokens'] == res.body['usage']['total_tokens']
-    assert res.body['usage']['prompt_tokens'] == n_tokens
-
-
-def test_embedding_usage_multiple():
-    global server
-    server.start()
-    res = server.make_request("POST", "/v1/embeddings", data={
-        "input": [
-            "I believe the meaning of life is",
-            "I believe the meaning of life is",
-        ],
-    })
-    assert res.status_code == 200
-    assert res.body['usage']['prompt_tokens'] == res.body['usage']['total_tokens']
-    assert res.body['usage']['prompt_tokens'] == 2 * 9
-
-
-def test_embedding_openai_library_base64():
-    server.start()
-    test_input = "Test base64 embedding output"
-
-    # get embedding in default format
-    res = server.make_request("POST", "/v1/embeddings", data={
-        "input": test_input
-    })
-    assert res.status_code == 200
-    vec0 = res.body["data"][0]["embedding"]
-
-    # get embedding in base64 format
-    res = server.make_request("POST", "/v1/embeddings", data={
-        "input": test_input,
-        "encoding_format": "base64"
-    })
-
-    assert res.status_code == 200
-    assert "data" in res.body
-    assert len(res.body["data"]) == 1
-
-    embedding_data = res.body["data"][0]
-    assert "embedding" in embedding_data
-    assert isinstance(embedding_data["embedding"], str)
-
-    # Verify embedding is valid base64
-    decoded = base64.b64decode(embedding_data["embedding"])
-    # Verify decoded data can be converted back to float array
-    float_count = len(decoded) // 4  # 4 bytes per float
-    floats = struct.unpack(f'{float_count}f', decoded)
-    assert len(floats) > 0
-    assert all(isinstance(x, float) for x in floats)
-    assert len(floats) == len(vec0)
-
-    # make sure the decoded data is the same as the original
-    for x, y in zip(floats, vec0):
-        assert abs(x - y) < EPSILON
--- a/examples/server/tests/unit/test_infill.py
+++ b/examples/server/tests/unit/test_infill.py
@@ -13,28 +13,28 @@ def test_infill_without_input_extra():
    global server
    server.start()
    res = server.make_request("POST", "/infill", data={
-        "input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n",
-        "prompt": "    int n_threads = llama_",
+        "prompt": "Complete this",
+        "input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n    int n_threads = llama_",
        "input_suffix": "}\n",
    })
    assert res.status_code == 200
-    assert match_regex("(Ann|small|shiny)+", res.body["content"])
+    assert match_regex("(One|day|she|saw|big|scary|bird)+", res.body["content"])


 def test_infill_with_input_extra():
    global server
    server.start()
    res = server.make_request("POST", "/infill", data={
+        "prompt": "Complete this",
        "input_extra": [{
            "filename": "llama.h",
            "text": "LLAMA_API int32_t llama_n_threads();\n"
        }],
-        "input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n",
-        "prompt": "    int n_threads = llama_",
+        "input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n    int n_threads = llama_",
        "input_suffix": "}\n",
    })
    assert res.status_code == 200
-    assert match_regex("(Dad|excited|park)+", res.body["content"])
+    assert match_regex("(cuts|Jimmy|mom|came|into|the|room)+", res.body["content"])


@pytest.mark.parametrize("input_extra", [
@@ -48,30 +48,10 @@ def test_invalid_input_extra_req(input_extra):
    global server
    server.start()
    res = server.make_request("POST", "/infill", data={
+        "prompt": "Complete this",
        "input_extra": [input_extra],
-        "input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n",
-        "prompt": "    int n_threads = llama_",
+        "input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n    int n_threads = llama_",
        "input_suffix": "}\n",
    })
    assert res.status_code == 400
    assert "error" in res.body
-
-
-@pytest.mark.skipif(not is_slow_test_allowed(), reason="skipping slow test")
-def test_with_qwen_model():
-    global server
-    server.model_file = None
-    server.model_hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-IQ3_XXS-GGUF"
-    server.model_hf_file = "qwen2.5-coder-1.5b-iq3_xxs-imat.gguf"
-    server.start(timeout_seconds=600)
-    res = server.make_request("POST", "/infill", data={
-        "input_extra": [{
-            "filename": "llama.h",
-            "text": "LLAMA_API int32_t llama_n_threads();\n"
-        }],
-        "input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n",
-        "prompt": "    int n_threads = llama_",
-        "input_suffix": "}\n",
-    })
-    assert res.status_code == 200
-    assert res.body["content"] == "n_threads();\n    printf(\"Number of threads: %d\\n\", n_threads);\n    return 0;\n"
--- a/examples/server/tests/unit/test_rerank.py
+++ b/examples/server/tests/unit/test_rerank.py
@@ -53,26 +53,3 @@ def test_invalid_rerank_req(documents):
    })
    assert res.status_code == 400
    assert "error" in res.body
-
-
-@pytest.mark.parametrize(
-    "query,doc1,doc2,n_tokens",
-    [
-        ("Machine learning is", "A machine", "Learning is", 19),
-        ("Which city?", "Machine learning is ", "Paris, capitale de la", 26),
-    ]
-)
-def test_rerank_usage(query, doc1, doc2, n_tokens):
-    global server
-    server.start()
-
-    res = server.make_request("POST", "/rerank", data={
-        "query": query,
-        "documents": [
-            doc1,
-            doc2,
-        ]
-    })
-    assert res.status_code == 200
-    assert res.body['usage']['prompt_tokens'] == res.body['usage']['total_tokens']
-    assert res.body['usage']['prompt_tokens'] == n_tokens
--- a/examples/server/tests/unit/test_speculative.py
+++ b/examples/server/tests/unit/test_speculative.py
@@ -82,37 +82,6 @@ def test_different_draft_min_draft_max():
        last_content = res.body["content"]


-def test_slot_ctx_not_exceeded():
-    global server
-    server.n_ctx = 64
-    server.start()
-    res = server.make_request("POST", "/completion", data={
-        "prompt": "Hello " * 56,
-        "temperature": 0.0,
-        "top_k": 1,
-        "speculative.p_min": 0.0,
-    })
-    assert res.status_code == 200
-    assert len(res.body["content"]) > 0
-
-
-def test_with_ctx_shift():
-    global server
-    server.n_ctx = 64
-    server.start()
-    res = server.make_request("POST", "/completion", data={
-        "prompt": "Hello " * 56,
-        "temperature": 0.0,
-        "top_k": 1,
-        "n_predict": 64,
-        "speculative.p_min": 0.0,
-    })
-    assert res.status_code == 200
-    assert len(res.body["content"]) > 0
-    assert res.body["tokens_predicted"] == 64
-    assert res.body["truncated"] == True
-
-
@pytest.mark.parametrize("n_slots,n_requests", [
    (1, 2),
    (2, 2),
--- a/examples/server/tests/utils.py
+++ b/examples/server/tests/utils.py
@@ -64,8 +64,6 @@ class ServerProcess:
    server_embeddings: bool | None = False
    server_reranking: bool | None = False
    server_metrics: bool | None = False
-    server_slots: bool | None = False
-    pooling: str | None = None
    draft: int | None = None
    api_key: str | None = None
    response_format: str | None = None
@@ -73,7 +71,6 @@ class ServerProcess:
    disable_ctx_shift: int | None = False
    draft_min: int | None = None
    draft_max: int | None = None
-    no_webui: bool | None = None

    # session variables
    process: subprocess.Popen | None = None
@@ -94,6 +91,7 @@ class ServerProcess:
        else:
            server_path = "../../../build/bin/llama-server"
        server_args = [
+            "--slots",  # requires to get slot status via /slots endpoint
            "--host",
            self.server_host,
            "--port",
@@ -131,10 +129,6 @@ class ServerProcess:
            server_args.append("--reranking")
        if self.server_metrics:
            server_args.append("--metrics")
-        if self.server_slots:
-            server_args.append("--slots")
-        if self.pooling:
-            server_args.extend(["--pooling", self.pooling])
        if self.model_alias:
            server_args.extend(["--alias", self.model_alias])
        if self.n_ctx:
@@ -162,8 +156,6 @@ class ServerProcess:
            server_args.extend(["--draft-max", self.draft_max])
        if self.draft_min:
            server_args.extend(["--draft-min", self.draft_min])
-        if self.no_webui:
-            server_args.append("--no-webui")

        args = [str(arg) for arg in [server_path, *server_args]]
        print(f"bench: starting server with: {' '.join(args)}")
@@ -189,7 +181,7 @@ class ServerProcess:
        start_time = time.time()
        while time.time() - start_time < timeout_seconds:
            try:
-                response = self.make_request("GET", "/health", headers={
+                response = self.make_request("GET", "/slots", headers={
                    "Authorization": f"Bearer {self.api_key}" if self.api_key else None
                })
                if response.status_code == 200:
@@ -232,7 +224,7 @@ class ServerProcess:
        result.headers = dict(response.headers)
        result.status_code = response.status_code
        result.body = response.json() if parse_body else None
-        print("Response from server", json.dumps(result.body, indent=2))
+        print("Response from server", result.body)
        return result

    def make_stream_request(
@@ -253,7 +245,7 @@ class ServerProcess:
                break
            elif line.startswith('data: '):
                data = json.loads(line[6:])
-                print("Partial response from server", json.dumps(data, indent=2))
+                print("Partial response from server", data)
                yield data


@@ -377,6 +369,3 @@ def match_regex(regex: str, text: str) -> bool:
        ).search(text)
        is not None
    )
-
-def is_slow_test_allowed():
-    return os.environ.get("SLOW_TESTS") == "1" or os.environ.get("SLOW_TESTS") == "ON"
--- a/Show More
+++ b/Show More