async direct io per tensor test

address review comments
Direct I/O and Transparent HugePages
2026-02-12 14:03:20 +02:00 · 2024-05-22 01:08:52 +02:00 · 2024-05-21 20:05:26 +02:00 · 2024-05-21 01:35:23 +02:00
266 changed files with 24290 additions and 89404 deletions
--- a/.devops/full-cuda.Dockerfile
+++ b/.devops/full-cuda.Dockerfile
@@ -31,6 +31,6 @@ ENV LLAMA_CUDA=1
 # Enable cURL
 ENV LLAMA_CURL=1

-RUN make -j$(nproc)
+RUN make

 ENTRYPOINT ["/app/.devops/tools.sh"]
--- a/.devops/full-rocm.Dockerfile
+++ b/.devops/full-rocm.Dockerfile
@@ -45,6 +45,6 @@ ENV LLAMA_CURL=1
 RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev

-RUN make -j$(nproc)
+RUN make

 ENTRYPOINT ["/app/.devops/tools.sh"]
--- a/.devops/full.Dockerfile
+++ b/.devops/full.Dockerfile
@@ -18,7 +18,7 @@ COPY . .
 ENV LLAMA_CURL=1


-RUN make -j$(nproc)
+RUN make

 ENV LC_ALL=C.utf8

--- a/.devops/main-cuda.Dockerfile
+++ b/.devops/main-cuda.Dockerfile
@@ -23,7 +23,7 @@ ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
 # Enable CUDA
 ENV LLAMA_CUDA=1

-RUN make -j$(nproc)
+RUN make

 FROM ${BASE_CUDA_RUN_CONTAINER} as runtime

--- a/.devops/main-intel.Dockerfile
+++ b/.devops/main-intel.Dockerfile
@@ -2,14 +2,6 @@ ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04

 FROM intel/oneapi-basekit:$ONEAPI_VERSION as build

-RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
-    echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
-    chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
-    rm /etc/apt/sources.list.d/intel-graphics.list && \
-    wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
-    echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
-    chmod 644 /usr/share/keyrings/intel-graphics.gpg
-
 ARG LLAMA_SYCL_F16=OFF
 RUN apt-get update && \
    apt-get install -y git
--- a/.devops/main-rocm.Dockerfile
+++ b/.devops/main-rocm.Dockerfile
@@ -40,6 +40,6 @@ ENV LLAMA_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++

-RUN make -j$(nproc)
+RUN make

 ENTRYPOINT [ "/app/main" ]
--- a/.devops/main.Dockerfile
+++ b/.devops/main.Dockerfile
@@ -9,7 +9,7 @@ WORKDIR /app

 COPY . .

-RUN make -j$(nproc)
+RUN make

 FROM ubuntu:$UBUNTU_VERSION as runtime

--- a/.devops/server-cuda.Dockerfile
+++ b/.devops/server-cuda.Dockerfile
@@ -25,7 +25,7 @@ ENV LLAMA_CUDA=1
 # Enable cURL
 ENV LLAMA_CURL=1

-RUN make -j$(nproc)
+RUN make

 FROM ${BASE_CUDA_RUN_CONTAINER} as runtime

--- a/.devops/server-intel.Dockerfile
+++ b/.devops/server-intel.Dockerfile
@@ -2,14 +2,6 @@ ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04

 FROM intel/oneapi-basekit:$ONEAPI_VERSION as build

-RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
-    echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
-    chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
-    rm /etc/apt/sources.list.d/intel-graphics.list && \
-    wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
-    echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
-    chmod 644 /usr/share/keyrings/intel-graphics.gpg
-
 ARG LLAMA_SYCL_F16=OFF
 RUN apt-get update && \
    apt-get install -y git libcurl4-openssl-dev
@@ -27,14 +19,6 @@ RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \

 FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime

-RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
-    echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
-    chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
-    rm /etc/apt/sources.list.d/intel-graphics.list && \
-    wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
-    echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
-    chmod 644 /usr/share/keyrings/intel-graphics.gpg
-
 RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev

--- a/.devops/server-rocm.Dockerfile
+++ b/.devops/server-rocm.Dockerfile
@@ -45,6 +45,6 @@ ENV LLAMA_CURL=1
 RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev

-RUN make -j$(nproc)
+RUN make

 ENTRYPOINT [ "/app/server" ]
--- a/.devops/server.Dockerfile
+++ b/.devops/server.Dockerfile
@@ -11,7 +11,7 @@ COPY . .

 ENV LLAMA_CURL=1

-RUN make -j$(nproc)
+RUN make

 FROM ubuntu:$UBUNTU_VERSION as runtime

--- a/.devops/tools.sh
+++ b/.devops/tools.sh
@@ -8,7 +8,7 @@ arg1="$1"
 shift

 if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
-    python3 ./convert-hf-to-gguf.py "$@"
+    python3 ./convert.py "$@"
 elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
    ./quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
--- a/.github/ISSUE_TEMPLATE/01-bug-low.yml
+++ b/.github/ISSUE_TEMPLATE/01-bug-low.yml
@@ -1,50 +0,0 @@
-name: Low Severity Bugs
-description: Used to report low severity bugs in llama.cpp (e.g. cosmetic issues, non critical UI glitches)
-title: "Bug: "
-labels: ["bug-unconfirmed", "low severity"]
-body:
-  - type: markdown
-    attributes:
-      value: |
-        Thanks for taking the time to fill out this bug report!
-        Please include information about your system, the steps to reproduce the bug,
-        and the version of llama.cpp that you are using.
-        If possible, please provide a minimal code example that reproduces the bug.
-  - type: textarea
-    id: what-happened
-    attributes:
-      label: What happened?
-      description: Also tell us, what did you expect to happen?
-      placeholder: Tell us what you see!
-    validations:
-      required: true
-  - type: textarea
-    id: version
-    attributes:
-      label: Name and Version
-      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
-      placeholder: |
-        $./main --version
-        version: 2999 (42b4109e)
-        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
-    validations:
-      required: true
-  - type: dropdown
-    id: operating-system
-    attributes:
-      label: What operating system are you seeing the problem on?
-      multiple: true
-      options:
-        - Linux
-        - Mac
-        - Windows
-        - BSD
-        - Other? (Please let us know in description)
-    validations:
-      required: false
-  - type: textarea
-    id: logs
-    attributes:
-      label: Relevant log output
-      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
-      render: shell
--- a/.github/ISSUE_TEMPLATE/02-bug-medium.yml
+++ b/.github/ISSUE_TEMPLATE/02-bug-medium.yml
@@ -1,50 +0,0 @@
-name: Medium Severity Bug
-description: Used to report medium severity bugs in llama.cpp (e.g. Malfunctioning Features but generally still useable)
-title: "Bug: "
-labels: ["bug-unconfirmed", "medium severity"]
-body:
-  - type: markdown
-    attributes:
-      value: |
-        Thanks for taking the time to fill out this bug report!
-        Please include information about your system, the steps to reproduce the bug,
-        and the version of llama.cpp that you are using.
-        If possible, please provide a minimal code example that reproduces the bug.
-  - type: textarea
-    id: what-happened
-    attributes:
-      label: What happened?
-      description: Also tell us, what did you expect to happen?
-      placeholder: Tell us what you see!
-    validations:
-      required: true
-  - type: textarea
-    id: version
-    attributes:
-      label: Name and Version
-      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
-      placeholder: |
-        $./main --version
-        version: 2999 (42b4109e)
-        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
-    validations:
-      required: true
-  - type: dropdown
-    id: operating-system
-    attributes:
-      label: What operating system are you seeing the problem on?
-      multiple: true
-      options:
-        - Linux
-        - Mac
-        - Windows
-        - BSD
-        - Other? (Please let us know in description)
-    validations:
-      required: false
-  - type: textarea
-    id: logs
-    attributes:
-      label: Relevant log output
-      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
-      render: shell
--- a/.github/ISSUE_TEMPLATE/03-bug-high.yml
+++ b/.github/ISSUE_TEMPLATE/03-bug-high.yml
@@ -1,50 +0,0 @@
-name: High Severity Bug
-description: Used to report high severity bugs in llama.cpp (e.g. Malfunctioning features hindering important common workflow)
-title: "Bug: "
-labels: ["bug-unconfirmed", "high severity"]
-body:
-  - type: markdown
-    attributes:
-      value: |
-        Thanks for taking the time to fill out this bug report!
-        Please include information about your system, the steps to reproduce the bug,
-        and the version of llama.cpp that you are using.
-        If possible, please provide a minimal code example that reproduces the bug.
-  - type: textarea
-    id: what-happened
-    attributes:
-      label: What happened?
-      description: Also tell us, what did you expect to happen?
-      placeholder: Tell us what you see!
-    validations:
-      required: true
-  - type: textarea
-    id: version
-    attributes:
-      label: Name and Version
-      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
-      placeholder: |
-        $./main --version
-        version: 2999 (42b4109e)
-        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
-    validations:
-      required: true
-  - type: dropdown
-    id: operating-system
-    attributes:
-      label: What operating system are you seeing the problem on?
-      multiple: true
-      options:
-        - Linux
-        - Mac
-        - Windows
-        - BSD
-        - Other? (Please let us know in description)
-    validations:
-      required: false
-  - type: textarea
-    id: logs
-    attributes:
-      label: Relevant log output
-      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
-      render: shell
--- a/.github/ISSUE_TEMPLATE/04-bug-critical.yml
+++ b/.github/ISSUE_TEMPLATE/04-bug-critical.yml
@@ -1,50 +0,0 @@
-name: Critical Severity Bug
-description: Used to report critical severity bugs in llama.cpp (e.g. Crashing, Corrupted, Dataloss)
-title: "Bug: "
-labels: ["bug-unconfirmed", "critical severity"]
-body:
-  - type: markdown
-    attributes:
-      value: |
-        Thanks for taking the time to fill out this bug report!
-        Please include information about your system, the steps to reproduce the bug,
-        and the version of llama.cpp that you are using.
-        If possible, please provide a minimal code example that reproduces the bug.
-  - type: textarea
-    id: what-happened
-    attributes:
-      label: What happened?
-      description: Also tell us, what did you expect to happen?
-      placeholder: Tell us what you see!
-    validations:
-      required: true
-  - type: textarea
-    id: version
-    attributes:
-      label: Name and Version
-      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
-      placeholder: |
-        $./main --version
-        version: 2999 (42b4109e)
-        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
-    validations:
-      required: true
-  - type: dropdown
-    id: operating-system
-    attributes:
-      label: What operating system are you seeing the problem on?
-      multiple: true
-      options:
-        - Linux
-        - Mac
-        - Windows
-        - BSD
-        - Other? (Please let us know in description)
-    validations:
-      required: false
-  - type: textarea
-    id: logs
-    attributes:
-      label: Relevant log output
-      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
-      render: shell
--- a/.github/ISSUE_TEMPLATE/05-enhancement.yml
+++ b/.github/ISSUE_TEMPLATE/05-enhancement.yml
@@ -1,51 +0,0 @@
-name: Enhancement
-description: Used to request enhancements for llama.cpp
-title: "Feature Request: "
-labels: ["enhancement"]
-body:
-  - type: markdown
-    attributes:
-      value: |
-        [Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggerganov/llama.cpp/discussions/categories/ideas)
-
-  - type: checkboxes
-    id: prerequisites
-    attributes:
-      label: Prerequisites
-      description: Please confirm the following before submitting your enhancement request.
-      options:
-        - label: I am running the latest code. Mention the version if possible as well.
-          required: true
-        - label: I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
-          required: true
-        - label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed).
-          required: true
-        - label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new and useful enhancement to share.
-          required: true
-
-  - type: textarea
-    id: feature-description
-    attributes:
-      label: Feature Description
-      description: Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
-      placeholder: Detailed description of the enhancement
-    validations:
-      required: true
-
-  - type: textarea
-    id: motivation
-    attributes:
-      label: Motivation
-      description: Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
-      placeholder: Explanation of why this feature is needed and its benefits
-    validations:
-      required: true
-
-  - type: textarea
-    id: possible-implementation
-    attributes:
-      label: Possible Implementation
-      description: If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
-      placeholder: Detailed description of potential implementation
-    validations:
-      required: false
--- a/.github/ISSUE_TEMPLATE/06-research.yml
+++ b/.github/ISSUE_TEMPLATE/06-research.yml
@@ -1,52 +0,0 @@
-name: Research
-description: Track new technical research area
-title: "Research: "
-labels: ["research 🔬"]
-body:
-  - type: markdown
-    attributes:
-      value: |
-        Don't forget to check for any [duplicate research issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)
-
-  - type: checkboxes
-    id: research-stage
-    attributes:
-      label: Research Stage
-      description: Track general state of this research ticket
-      options:
-        - label: Background Research (Let's try to avoid reinventing the wheel)
-        - label: Hypothesis Formed (How do you think this will work and it's effect?)
-        - label: Strategy / Implementation Forming
-        - label: Analysis of results
-        - label: Debrief / Documentation (So people in the future can learn from us)
-
-  - type: textarea
-    id: background
-    attributes:
-      label: Previous existing literature and research
-      description: Whats the current state of the art and whats the motivation for this research?
-
-  - type: textarea
-    id: hypothesis
-    attributes:
-      label: Hypothesis
-      description: How do you think this will work and it's effect?
-
-  - type: textarea
-    id: implementation
-    attributes:
-      label: Implementation
-      description: Got an approach? e.g. a PR ready to go?
-
-  - type: textarea
-    id: analysis
-    attributes:
-      label: Analysis
-      description: How does the proposed implementation behave?
-
-  - type: textarea
-    id: logs
-    attributes:
-      label: Relevant log output
-      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
-      render: shell
--- a/.github/ISSUE_TEMPLATE/07-refactor.yml
+++ b/.github/ISSUE_TEMPLATE/07-refactor.yml
@@ -1,28 +0,0 @@
-name: Refactor (Maintainers)
-description: Used to track refactoring opportunities
-title: "Refactor: "
-labels: ["refactor"]
-body:
-  - type: markdown
-    attributes:
-      value: |
-        Don't forget to [check for existing refactor issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
-        Also you may want to check [Pull request refactor label as well](https://github.com/ggerganov/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too.
-
-  - type: textarea
-    id: background-description
-    attributes:
-      label: Background Description
-      description: Please provide a detailed written description of the pain points you are trying to solve.
-      placeholder: Detailed description behind your motivation to request refactor
-    validations:
-      required: true
-
-  - type: textarea
-    id: possible-approaches
-    attributes:
-      label: Possible Refactor Approaches
-      description: If you have some idea of possible approaches to solve this problem. You may want to make it a todo list.
-      placeholder: Your idea of possible refactoring opportunity/approaches
-    validations:
-      required: false
--- a/.github/ISSUE_TEMPLATE/bug.md
+++ b/.github/ISSUE_TEMPLATE/bug.md
@@ -0,0 +1,11 @@
+---
+name: Bug template
+about: Used to report bugs in llama.cpp
+labels: ["bug-unconfirmed"]
+assignees: ''
+
+---
+
+Please include information about your system, the steps to reproduce the bug, and the version of llama.cpp that you are using. If possible, please provide a minimal code example that reproduces the bug.
+
+If the bug concerns the server, please try to reproduce it first using the [server test scenario framework](https://github.com/ggerganov/llama.cpp/tree/master/examples/server/tests).
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -1,13 +0,0 @@
-blank_issues_enabled: true
-contact_links:
-  - name: Got an idea?
-    url: https://github.com/ggerganov/llama.cpp/discussions/categories/ideas
-    about: Pop it there. It may then become an enhancement ticket.
-  - name: Got a question?
-    url: https://github.com/ggerganov/llama.cpp/discussions/categories/q-a
-    about: Ask a question there!
-  - name: Want to contribute?
-    url: https://github.com/ggerganov/llama.cpp/wiki/contribute
-    about: Head to the contribution guide page of the wiki for areas you can help with
-
-
--- a/.github/ISSUE_TEMPLATE/enhancement.md
+++ b/.github/ISSUE_TEMPLATE/enhancement.md
@@ -0,0 +1,28 @@
+---
+name: Enhancement template
+about: Used to request enhancements for llama.cpp
+labels: ["enhancement"]
+assignees: ''
+
+---
+
+# Prerequisites
+
+Please answer the following questions for yourself before submitting an issue.
+
+- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now.
+- [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
+- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed).
+- [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new bug or useful enhancement to share.
+
+# Feature Description
+
+Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
+
+# Motivation
+
+Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
+
+# Possible Implementation
+
+If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -1,16 +1,5 @@
 # https://github.com/actions/labeler
-Kompute:
-    - changed-files:
-        - any-glob-to-any-file:
-            - ggml-kompute.h
-            - ggml-kompute.cpp
-            - README-kompute.md
-Apple Metal:
-    - changed-files:
-        - any-glob-to-any-file:
-            - ggml-metal.h
-            - ggml-metal.cpp
-            - README-metal.md
+
 SYCL:
    - changed-files:
        - any-glob-to-any-file:
@@ -20,7 +9,6 @@ SYCL:
 Nvidia GPU:
    - changed-files:
        - any-glob-to-any-file:
-            - ggml-cuda.h
            - ggml-cuda/**
 Vulkan:
    - changed-files:
@@ -74,8 +62,6 @@ server:
 ggml:
    - changed-files:
        - any-glob-to-any-file:
-            - ggml.c
-            - ggml.h
            - ggml-*.c
            - ggml-*.h
            - ggml-cuda/**
@@ -85,6 +71,3 @@ nix:
            - "**/*.nix"
            - .github/workflows/nix-*.yml
            - .devops/nix/nixpkgs-instances.nix
-embedding:
-    - changed-files:
-        - any-glob-to-any-file: examples/embedding/
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -294,22 +294,12 @@ jobs:

      - name: Build
        id: cmake_build
-        if: ${{ matrix.sanitizer != 'THREAD' }}
        run: |
          mkdir build
          cd build
          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
          cmake --build . --config ${{ matrix.build_type }} -j $(nproc)

-      - name: Build (no OpenMP)
-        id: cmake_build_no_openmp
-        if: ${{ matrix.sanitizer == 'THREAD' }}
-        run: |
-          mkdir build
-          cd build
-          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DLLAMA_OPENMP=OFF
-          cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
-
      - name: Test
        id: cmake_test
        run: |
--- a/.github/workflows/zig-build.yml
+++ b/.github/workflows/zig-build.yml
@@ -0,0 +1,29 @@
+name: Zig CI
+
+on:
+  pull_request:
+  push:
+    branches:
+      - master
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  build:
+    strategy:
+      fail-fast: false
+      matrix:
+        runs-on: [ubuntu-latest, macos-latest, windows-latest]
+    runs-on: ${{ matrix.runs-on }}
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+          fetch-depth: 0
+      - uses: goto-bus-stop/setup-zig@v2
+        with:
+          version: 0.11.0
+      - name: Build Summary
+        run: zig build --summary all -freference-trace
--- a/.gitignore
+++ b/.gitignore
@@ -105,7 +105,6 @@ examples/jeopardy/results.txt
 examples/server/*.html.hpp
 examples/server/*.js.hpp
 examples/server/*.mjs.hpp
-examples/server/*.css.hpp

 poetry.lock
 poetry.toml
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -72,7 +72,6 @@ else()
    set(INS_ENB ON)
 endif()

-option(LLAMA_SVE                             "llama: enable SVE"                                OFF)
 option(LLAMA_AVX                             "llama: enable AVX"                                ${INS_ENB})
 option(LLAMA_AVX2                            "llama: enable AVX2"                               ${INS_ENB})
 option(LLAMA_AVX512                          "llama: enable AVX512"                             OFF)
@@ -106,7 +105,6 @@ set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
                                             "llama: max. batch size for using peer access")
 option(LLAMA_CUDA_NO_PEER_COPY               "llama: do not use peer to peer copies"            OFF)
 option(LLAMA_CUDA_NO_VMM                     "llama: do not try to use CUDA VMM"                OFF)
-option(LLAMA_CUDA_FA_ALL_QUANTS              "llama: compile all quants for FlashAttention"     OFF)

 option(LLAMA_CURL                            "llama: use libcurl to download model from an URL" OFF)
 option(LLAMA_HIPBLAS                         "llama: use hipBLAS"                               OFF)
@@ -126,7 +124,7 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
 set(LLAMA_METAL_STD "" CACHE STRING          "llama: metal standard version (-std flag)")
 option(LLAMA_KOMPUTE                         "llama: use Kompute"                               OFF)
 option(LLAMA_RPC                             "llama: use RPC"                                   OFF)
-option(LLAMA_OPENMP                          "llama: use OpenMP"                                ON)
+option(LLAMA_QKK_64                          "llama: use super-block size of 64 for k-quants"   OFF)
 option(LLAMA_SYCL                            "llama: use SYCL"                                  OFF)
 option(LLAMA_SYCL_F16                        "llama: use 16 bit floats for sycl calculations"   OFF)
 set(LLAMA_SYCL_TARGET   "INTEL" CACHE STRING "llama: sycl target device")
@@ -297,17 +295,6 @@ if (LLAMA_METAL)
        )
 endif()

-if (LLAMA_OPENMP)
-    find_package(OpenMP)
-    if (OpenMP_FOUND)
-        message(STATUS "OpenMP found")
-        add_compile_definitions(GGML_USE_OPENMP)
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
-    else()
-        message(WARNING "OpenMP not found")
-    endif()
-endif()
-
 if (LLAMA_BLAS)
    if (LLAMA_STATIC)
        set(BLA_STATIC ON)
@@ -397,6 +384,10 @@ if (LLAMA_LLAMAFILE)
    set(GGML_SOURCES_LLAMAFILE sgemm.cpp)
 endif()

+if (LLAMA_QKK_64)
+    add_compile_definitions(GGML_QKK_64)
+endif()
+
 if (LLAMA_CUBLAS)
    message(WARNING "LLAMA_CUBLAS is deprecated and will be removed in the future.\nUse LLAMA_CUDA instead")
    set(LLAMA_CUDA ON)
@@ -415,8 +406,6 @@ if (LLAMA_CUDA)

        file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu")
        list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")
-        file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu")
-        list(APPEND GGML_SOURCES_CUDA ${SRCS})

        add_compile_definitions(GGML_USE_CUDA)
        add_compile_definitions(GGML_CUDA_USE_GRAPHS)
@@ -442,18 +431,6 @@ if (LLAMA_CUDA)
        if (LLAMA_CUDA_NO_PEER_COPY)
            add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
        endif()
-        if (LLAMA_CUDA_FA_ALL_QUANTS)
-            file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*.cu")
-            list(APPEND GGML_SOURCES_CUDA ${SRCS})
-            add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
-        else()
-            file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
-            list(APPEND GGML_SOURCES_CUDA ${SRCS})
-            file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
-            list(APPEND GGML_SOURCES_CUDA ${SRCS})
-            file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
-            list(APPEND GGML_SOURCES_CUDA ${SRCS})
-        endif()

        if (LLAMA_STATIC)
            if (WIN32)
@@ -528,12 +505,6 @@ if (LLAMA_VULKAN)

        add_compile_definitions(GGML_USE_VULKAN)

-        # Workaround to the "can't dereference invalidated vector iterator" bug in clang-cl debug build
-        # Posssibly relevant: https://stackoverflow.com/questions/74748276/visual-studio-no-displays-the-correct-length-of-stdvector
-        if (MSVC AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
-            add_compile_definitions(_ITERATOR_DEBUG_LEVEL=0)
-        endif()
-
        if (LLAMA_VULKAN_CHECK_RESULTS)
            add_compile_definitions(GGML_VULKAN_CHECK_RESULTS)
        endif()
@@ -598,8 +569,6 @@ if (LLAMA_HIPBLAS)

    file(GLOB GGML_SOURCES_ROCM "ggml-cuda/*.cu")
    list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu")
-    file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu")
-    list(APPEND GGML_SOURCES_ROCM ${SRCS})

    add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA)

@@ -619,19 +588,6 @@ if (LLAMA_HIPBLAS)
        add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
    endif()

-    if (LLAMA_CUDA_FA_ALL_QUANTS)
-        file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*.cu")
-        list(APPEND GGML_SOURCES_ROCM ${SRCS})
-        add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
-    else()
-        file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
-        list(APPEND GGML_SOURCES_ROCM ${SRCS})
-        file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
-        list(APPEND GGML_SOURCES_ROCM ${SRCS})
-        file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
-        list(APPEND GGML_SOURCES_ROCM ${SRCS})
-    endif()
-
    add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
    add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
    add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
@@ -670,10 +626,6 @@ if (LLAMA_SYCL)
        add_compile_definitions(GGML_SYCL_F16)
    endif()

-    if (LLAMA_CUDA_FORCE_MMQ)
-        add_compile_definitions(GGML_SYCL_FORCE_MMQ)
-    endif()
-
    add_compile_options(-I./) #include DPCT
    add_compile_options(-I/${SYCL_INCLUDE_DIR})

@@ -789,7 +741,6 @@ if (LLAMA_KOMPUTE)
            kompute-shaders/op_mul_mat_q4_0.comp
            kompute-shaders/op_mul_mat_q4_1.comp
            kompute-shaders/op_mul_mat_q6_k.comp
-            kompute-shaders/op_getrows_f32.comp
            kompute-shaders/op_getrows_f16.comp
            kompute-shaders/op_getrows_q4_0.comp
            kompute-shaders/op_getrows_q4_1.comp
@@ -822,7 +773,6 @@ if (LLAMA_KOMPUTE)
            shaderop_mul_mat_q4_0.h
            shaderop_mul_mat_q4_1.h
            shaderop_mul_mat_q6_k.h
-            shaderop_getrows_f32.h
            shaderop_getrows_f16.h
            shaderop_getrows_q4_0.h
            shaderop_getrows_q4_1.h
@@ -1089,9 +1039,6 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STR
            # Raspberry Pi 3, 4, Zero 2 (32-bit)
            list(APPEND ARCH_FLAGS -mno-unaligned-access)
        endif()
-        if (LLAMA_SVE)
-            list(APPEND ARCH_FLAGS -march=armv8.6-a+sve)
-        endif()
    endif()
 elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
        (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
@@ -1358,7 +1305,7 @@ set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}
 install(TARGETS llama LIBRARY PUBLIC_HEADER)

 install(
-    FILES convert-hf-to-gguf.py
+    FILES convert.py
    PERMISSIONS
        OWNER_READ
        OWNER_WRITE
@@ -1385,13 +1332,6 @@ if (LLAMA_METAL)
    endif()
 endif()

-configure_file(cmake/llama.pc.in
-        "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
-        @ONLY)
-
-install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
-        DESTINATION lib/pkgconfig)
-
 #
 # programs, examples and tests
 #
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -1,4 +1,4 @@
-{
+{
  "version": 4,
  "configurePresets": [
    {
@@ -40,10 +40,6 @@

    { "name": "arm64-windows-msvc-debug"  , "inherits": [ "base", "arm64-windows-msvc",  "debug"   ] },
    { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc",  "release" ] },
-    { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc",  "release", "static" ] },
-
-    { "name": "x64-windows-msvc-debug"  , "inherits": [ "base", "debug"   ] },
-    { "name": "x64-windows-msvc-release", "inherits": [ "base", "release" ] },
-    { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "release", "static" ] }
+    { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc",  "release", "static" ] }
  ]
 }
--- a/Makefile
+++ b/Makefile
@@ -57,8 +57,6 @@ ifeq ($(UNAME_S),Darwin)
 		LLAMA_METAL := 1
 	endif

-	LLAMA_NO_OPENMP := 1
-
 	ifneq ($(UNAME_P),arm)
 		SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
 		ifeq ($(SYSCTL_M),1)
@@ -69,10 +67,6 @@ ifeq ($(UNAME_S),Darwin)
 	endif
 endif

-ifdef LLAMA_RPC
-	BUILD_TARGETS += rpc-server
-endif
-
 default: $(BUILD_TARGETS)

 test: $(TEST_TARGETS)
@@ -141,16 +135,12 @@ MK_NVCCFLAGS = -std=c++11
 ifdef LLAMA_FAST
 MK_CFLAGS     += -Ofast
 HOST_CXXFLAGS += -Ofast
-ifndef LLAMA_DEBUG
 MK_NVCCFLAGS  += -O3
-endif # LLAMA_DEBUG
 else
 MK_CFLAGS     += -O3
 MK_CXXFLAGS   += -O3
-ifndef LLAMA_DEBUG
 MK_NVCCFLAGS  += -O3
-endif # LLAMA_DEBUG
-endif # LLAMA_FAST
+endif

 ifndef LLAMA_NO_CCACHE
 CCACHE := $(shell which ccache)
@@ -211,10 +201,9 @@ ifdef LLAMA_SCHED_MAX_COPIES
 endif

 ifdef LLAMA_DEBUG
-	MK_CFLAGS    += -O0 -g
-	MK_CXXFLAGS  += -O0 -g
-	MK_LDFLAGS   += -g
-	MK_NVCCFLAGS += -O0 -g
+	MK_CFLAGS   += -O0 -g
+	MK_CXXFLAGS += -O0 -g
+	MK_LDFLAGS  += -g

 	ifeq ($(UNAME_S),Linux)
 		MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS
@@ -400,6 +389,10 @@ else
 	MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
 endif

+ifdef LLAMA_QKK_64
+	MK_CPPFLAGS += -DGGML_QKK_64
+endif
+
 ifndef LLAMA_NO_ACCELERATE
 	# Mac OS - include Accelerate framework.
 	# `-framework Accelerate` works both with Apple Silicon and Mac Intel
@@ -411,12 +404,6 @@ ifndef LLAMA_NO_ACCELERATE
 	endif
 endif # LLAMA_NO_ACCELERATE

-ifndef LLAMA_NO_OPENMP
-	MK_CPPFLAGS += -DGGML_USE_OPENMP
-	MK_CFLAGS   += -fopenmp
-	MK_CXXFLAGS += -fopenmp
-endif # LLAMA_NO_OPENMP
-
 ifdef LLAMA_OPENBLAS
 	MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
 	MK_CFLAGS   += $(shell pkg-config --cflags-only-other openblas)
@@ -433,25 +420,11 @@ ifdef LLAMA_BLIS
 	MK_LDFLAGS  += -lblis -L/usr/local/lib
 endif # LLAMA_BLIS

-ifdef LLAMA_RPC
-	MK_CPPFLAGS   += -DGGML_USE_RPC
-	OBJS          += ggml-rpc.o
-endif # LLAMA_RPC
-
 ifdef LLAMA_CUBLAS
 # LLAMA_CUBLAS is deprecated and will be removed in the future
 	LLAMA_CUDA := 1
 endif

-OBJS_CUDA_TEMP_INST      = $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-wmma*.cu))
-ifdef LLAMA_CUDA_FA_ALL_QUANTS
-	OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*.cu))
-else
-	OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu))
-	OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu))
-	OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*f16-f16.cu))
-endif # LLAMA_CUDA_FA_ALL_QUANTS
-
 ifdef LLAMA_CUDA
 	ifneq ('', '$(wildcard /opt/cuda)')
 		CUDA_PATH ?= /opt/cuda
@@ -462,7 +435,6 @@ ifdef LLAMA_CUDA
 	MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
 	OBJS         += ggml-cuda.o
 	OBJS         += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
-	OBJS         += $(OBJS_CUDA_TEMP_INST)
 	MK_NVCCFLAGS += -use_fast_math
 ifdef LLAMA_FATAL_WARNINGS
 	MK_NVCCFLAGS += -Werror all-warnings
@@ -473,9 +445,6 @@ endif # JETSON_EOL_MODULE_DETECT
 ifdef LLAMA_DEBUG
 	MK_NVCCFLAGS += -lineinfo
 endif # LLAMA_DEBUG
-ifdef LLAMA_CUDA_DEBUG
-	MK_NVCCFLAGS += --device-debug
-endif # LLAMA_CUDA_DEBUG
 ifdef LLAMA_CUDA_NVCC
 	NVCC = $(CCACHE) $(LLAMA_CUDA_NVCC)
 else
@@ -525,10 +494,7 @@ ifdef LLAMA_CUDA_NO_PEER_COPY
 endif # LLAMA_CUDA_NO_PEER_COPY
 ifdef LLAMA_CUDA_CCBIN
 	MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
-endif # LLAMA_CUDA_CCBIN
-ifdef LLAMA_CUDA_FA_ALL_QUANTS
-	MK_NVCCFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
-endif # LLAMA_CUDA_FA_ALL_QUANTS
+endif

 ifdef JETSON_EOL_MODULE_DETECT
 define NVCC_COMPILE
@@ -540,7 +506,7 @@ define NVCC_COMPILE
 endef # NVCC_COMPILE
 endif # JETSON_EOL_MODULE_DETECT

-ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh
+ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
 	$(NVCC_COMPILE)

 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
@@ -606,7 +572,6 @@ ifdef LLAMA_HIP_UMA
 	MK_CPPFLAGS += -DGGML_HIP_UMA
 endif # LLAMA_HIP_UMA
 	MK_LDFLAGS  += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
-	MK_LDFLAGS  += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64
 	MK_LDFLAGS	+= -lhipblas -lamdhip64 -lrocblas
 	HIPFLAGS    += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
 	HIPFLAGS    += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
@@ -620,12 +585,11 @@ ifdef LLAMA_CUDA_NO_PEER_COPY
 endif # LLAMA_CUDA_NO_PEER_COPY
 	OBJS        += ggml-cuda.o
 	OBJS        += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
-	OBJS        += $(OBJS_CUDA_TEMP_INST)

 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
 	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<

-ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh
+ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
 	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<

 endif # LLAMA_HIPBLAS
@@ -663,26 +627,11 @@ ggml-metal-embed.o: ggml-metal.metal ggml-common.h
 endif
 endif # LLAMA_METAL

-OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
-COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
-COMMON_DEPS   = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
-
 ifndef LLAMA_NO_LLAMAFILE
 sgemm.o: sgemm.cpp sgemm.h ggml.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 endif

-ifdef LLAMA_RPC
-ggml-rpc.o: ggml-rpc.cpp ggml-rpc.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-rpc-server.o: examples/rpc/rpc-server.cpp ggml-rpc.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-rpc-server: rpc-server.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-endif # LLAMA_RPC
-
 GF_CC := $(CC)
 include scripts/get-flags.mk

@@ -762,9 +711,14 @@ unicode.o: unicode.cpp unicode.h
 unicode-data.o: unicode-data.cpp unicode-data.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

+OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
+
 llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

+COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
+COMMON_DEPS   = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
+
 common.o: common/common.cpp $(COMMON_H_DEPS)
 	$(CXX) $(CXXFLAGS) -c $< -o $@

@@ -795,7 +749,6 @@ libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS)
 clean:
 	rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
 	rm -vrf ggml-cuda/*.o
-	rm -vrf ggml-cuda/template-instances/*.o
 	find examples pocs -type f -name "*.o" -delete

 #
@@ -864,7 +817,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/colorthemes.css.hpp examples/server/style.css.hpp examples/server/theme-beeninorder.css.hpp examples/server/theme-ketivah.css.hpp examples/server/theme-mangotango.css.hpp examples/server/theme-playground.css.hpp examples/server/theme-polarnight.css.hpp examples/server/theme-snowstorm.css.hpp examples/server/index.html.hpp examples/server/index-new.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/system-prompts.js.hpp examples/server/prompt-formats.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)

--- a/README-sycl.md
+++ b/README-sycl.md
@@ -54,10 +54,10 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,

 ## OS

-| OS      | Status  | Verified                                       |
-|---------|---------|------------------------------------------------|
-| Linux   | Support | Ubuntu 22.04, Fedora Silverblue 39, Arch Linux |
-| Windows | Support | Windows 11                                     |
+| OS      | Status  | Verified                           |
+|---------|---------|------------------------------------|
+| Linux   | Support | Ubuntu 22.04, Fedora Silverblue 39 |
+| Windows | Support | Windows 11                         |


 ## Hardware
@@ -70,7 +70,7 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
 |-------------------------------|---------|---------------------------------------|
 | Intel Data Center Max Series  | Support | Max 1550, 1100                        |
 | Intel Data Center Flex Series | Support | Flex 170                              |
-| Intel Arc Series              | Support | Arc 770, 730M, Arc A750               |
+| Intel Arc Series              | Support | Arc 770, 730M                         |
 | Intel built-in Arc GPU        | Support | built-in Arc GPU in Meteor Lake       |
 | Intel iGPU                    | Support | iGPU in i5-1250P, i7-1260P, i7-1165G7 |

--- a/README.md
+++ b/README.md
@@ -2,9 +2,7 @@

 ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)

-[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
-[![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg?branch=master&event=schedule)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
-[![Conan Center](https://shields.io/conan/v/llama-cpp)](https://conan.io/center/llama-cpp)
+[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) [![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg?branch=master&event=schedule)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)

 [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)

@@ -22,8 +20,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ### Hot topics

- **`convert.py` has been deprecated and moved to `examples/convert-legacy-llama.py`, please use `convert-hf-to-gguf.py`** https://github.com/ggerganov/llama.cpp/pull/7430
- Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021
+- **Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021**
 - BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920
 - MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387
 - Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404
@@ -130,7 +127,6 @@ Typically finetunes of the base models below are supported as well.
 - [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
 - [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
 - [x] [OLMo](https://allenai.org/olmo)
- [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)

 (instructions for supporting more models: [HOWTO-add-model.md](./docs/HOWTO-add-model.md))

@@ -144,14 +140,11 @@ Typically finetunes of the base models below are supported as well.
 - [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
 - [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
 - [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
- [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)

 **HTTP server**

 [llama.cpp web server](./examples/server) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients.

-[simplechat](./examples/server/public_simplechat) is a simple chat client, which can be used to chat with the model exposed using above web server (use --path to point to simplechat), from a local web browser.
-
 **Bindings:**

 - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
@@ -205,14 +198,9 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
 - [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
 - [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
- [AIKit](https://github.com/sozercan/aikit) (MIT)

 *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*

-**Tools:**
-
- [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
-
 ---

 Here is a typical run using LLaMA v2 13B on M2 Ultra:
@@ -321,6 +309,8 @@ In order to build llama.cpp you have four different options.
      make
      ```

+      **Note**: for `Debug` builds, run `make LLAMA_DEBUG=1`
+
  - On Windows:

    1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
@@ -332,32 +322,23 @@ In order to build llama.cpp you have four different options.
        make
        ```

-  - Notes:
-    - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel.
-    - For faster repeated compilation, install [ccache](https://ccache.dev/).
-    - For debug builds, run `make LLAMA_DEBUG=1`
-
 - Using `CMake`:

-  ```bash
-  cmake -B build
-  cmake --build build --config Release
-  ```
+    ```bash
+    cmake -B build
+    cmake --build build --config Release
+    ```

-  **Notes**:
+    **Note**: for `Debug` builds, there are two cases:

-    - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
-    - For faster repeated compilation, install [ccache](https://ccache.dev/).
-    - For debug builds, there are two cases:
-
-      1. Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
+    - Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):

      ```bash
      cmake -B build -DCMAKE_BUILD_TYPE=Debug
      cmake --build build
      ```

-      2. Multi-config generators (`-G` param set to Visual Studio, XCode...):
+    - Multi-config generators (`-G` param set to Visual Studio, XCode...):

      ```bash
      cmake -B build -G "Xcode"
@@ -392,14 +373,6 @@ In order to build llama.cpp you have four different options.
    CLBLAST support for use OpenCL GPU acceleration in FreeBSD. Please read
    the instructions for use and activate this options in this document below.

-### Homebrew
-
-On Mac and Linux, the homebrew package manager can be used via
-```
-brew install llama.cpp
-```
-The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggerganov/llama.cpp/discussions/7668
-
 ### Metal Build

 On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
@@ -498,12 +471,10 @@ Building the program with BLAS support may lead to some performance improvements
  |--------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
  | LLAMA_CUDA_FORCE_DMMV          | Boolean                | false   | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
  | LLAMA_CUDA_DMMV_X              | Positive integer >= 32 | 32      | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants.                                         |
-  | LLAMA_CUDA_MMV_Y               | Positive integer       | 1       | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended.                                               |
-  | LLAMA_CUDA_FORCE_MMQ           | Boolean                | false   | Force the use of dequantization + matrix multiplication kernels instead of leveraging Math libraries. |                                                                                                                                         |
+  | LLAMA_CUDA_MMV_Y               | Positive integer       | 1       | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended.                                                                                                                                         |
  | LLAMA_CUDA_F16                 | Boolean                | false   | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs.                                                           |
  | LLAMA_CUDA_KQUANTS_ITER        | 1 or 2                 | 2       | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs.                                                                                                                     |
  | LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer       | 128     | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial.                                                                         |
-  | LLAMA_CUDA_FA_ALL_QUANTS       | Boolean                | false   | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer.                                                                                                  |

 - #### hipBLAS

@@ -719,8 +690,7 @@ Building the program with BLAS support may lead to some performance improvements

 To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.

-Note: `convert.py` has been moved to `examples/convert-legacy-llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derievatives.
-It does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.
+Note: `convert.py` does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.

 ```bash
 # obtain the official LLaMA model weights and place them in ./models
@@ -737,10 +707,10 @@ ls ./models
 python3 -m pip install -r requirements.txt

 # convert the model to ggml FP16 format
-python3 convert-hf-to-gguf.py models/mymodel/
+python3 convert.py models/mymodel/

 # [Optional] for models using BPE tokenizers
-python convert-hf-to-gguf.py models/mymodel/ --vocab-type bpe
+python convert.py models/mymodel/ --vocab-type bpe

 # quantize the model to 4-bits (using Q4_K_M method)
 ./quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M
--- a/build.zig
+++ b/build.zig
@@ -0,0 +1,172 @@
+// Compatible with Zig Version 0.11.0
+const std = @import("std");
+const ArrayList = std.ArrayList;
+const Compile = std.Build.Step.Compile;
+const ConfigHeader = std.Build.Step.ConfigHeader;
+const Mode = std.builtin.Mode;
+const CrossTarget = std.zig.CrossTarget;
+
+const Maker = struct { // Shared build state: target/optimize settings, include dirs, per-language flags and common objects.
+    builder: *std.build.Builder,
+    target: CrossTarget,
+    optimize: Mode,
+    enable_lto: bool, // copied into `want_lto` of every object/executable created by `obj`/`exe`
+
+    include_dirs: ArrayList([]const u8), // added as include paths to every object and executable
+    cflags: ArrayList([]const u8), // flags used for sources ending in ".c"
+    cxxflags: ArrayList([]const u8), // flags used for every other source
+    objs: ArrayList(*Compile), // extra objects that `exe` adds to every executable
+
+    fn addInclude(m: *Maker, dir: []const u8) !void { // Register one include directory.
+        try m.include_dirs.append(dir);
+    }
+    fn addProjectInclude(m: *Maker, path: []const []const u8) !void { // Register an include dir given as path components under the build root.
+        try m.addInclude(try m.builder.build_root.join(m.builder.allocator, path));
+    }
+    fn addCFlag(m: *Maker, flag: []const u8) !void { // Append a flag for C sources only.
+        try m.cflags.append(flag);
+    }
+    fn addCxxFlag(m: *Maker, flag: []const u8) !void { // Append a flag for C++ sources only.
+        try m.cxxflags.append(flag);
+    }
+    fn addFlag(m: *Maker, flag: []const u8) !void { // Append a flag for both C and C++ sources.
+        try m.addCFlag(flag);
+        try m.addCxxFlag(flag);
+    }
+
+    fn init(builder: *std.build.Builder) !Maker { // Create a Maker with default flags; also (re)writes common/build-info.cpp under the cwd.
+        const target = builder.standardTargetOptions(.{});
+        const zig_version = @import("builtin").zig_version_string;
+        const commit_hash = try std.ChildProcess.exec(
+            .{ .allocator = builder.allocator, .argv = &.{ "git", "rev-parse", "HEAD" } },
+        );
+        try std.fs.cwd().writeFile("common/build-info.cpp", builder.fmt(
+            \\int LLAMA_BUILD_NUMBER = {};
+            \\char const *LLAMA_COMMIT = "{s}";
+            \\char const *LLAMA_COMPILER = "Zig {s}";
+            \\char const *LLAMA_BUILD_TARGET = "{s}";
+            \\
+        , .{ 0, std.mem.trimRight(u8, commit_hash.stdout, "\r\n"), zig_version, try target.allocDescription(builder.allocator) })); // trimRight: safe on empty or CRLF-terminated git output (slicing `len - 1` underflowed on empty stdout)
+        var m = Maker{
+            .builder = builder,
+            .target = target,
+            .optimize = builder.standardOptimizeOption(.{}),
+            .enable_lto = false,
+            .include_dirs = ArrayList([]const u8).init(builder.allocator),
+            .cflags = ArrayList([]const u8).init(builder.allocator),
+            .cxxflags = ArrayList([]const u8).init(builder.allocator),
+            .objs = ArrayList(*Compile).init(builder.allocator),
+        };
+
+        try m.addCFlag("-std=c11");
+        try m.addCxxFlag("-std=c++11");
+        try m.addProjectInclude(&.{}); // the build root itself
+        try m.addProjectInclude(&.{"common"});
+        return m;
+    }
+
+    fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile { // Compile a single source file `src` into an object step called `name`.
+        const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
+        if (o.target.getAbi() != .msvc)
+            o.defineCMacro("_GNU_SOURCE", null);
+
+        if (std.mem.endsWith(u8, src, ".c")) { // C source: C flags + libc
+            o.addCSourceFiles(&.{src}, m.cflags.items);
+            o.linkLibC();
+        } else { // anything else is built with the C++ flags
+            o.addCSourceFiles(&.{src}, m.cxxflags.items);
+            if (o.target.getAbi() == .msvc) {
+                o.linkLibC(); // need winsdk + crt
+            } else {
+                // linkLibCpp already add (libc++ + libunwind + libc)
+                o.linkLibCpp();
+            }
+        }
+        for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
+        o.want_lto = m.enable_lto;
+        return o;
+    }
+
+    fn exe(m: *const Maker, name: []const u8, src: []const u8, deps: []const *Compile) *Compile { // Build and install executable `name` from `src` plus the objects in `deps` and `m.objs`.
+        const e = m.builder.addExecutable(.{ .name = name, .target = m.target, .optimize = m.optimize });
+        e.addCSourceFiles(&.{src}, m.cxxflags.items);
+        for (deps) |d| e.addObject(d);
+        for (m.objs.items) |o| e.addObject(o);
+        for (m.include_dirs.items) |i| e.addIncludePath(.{ .path = i });
+
+        // https://github.com/ziglang/zig/issues/15448
+        if (e.target.getAbi() == .msvc) {
+            e.linkLibC(); // need winsdk + crt
+        } else {
+            // linkLibCpp already add (libc++ + libunwind + libc)
+            e.linkLibCpp();
+        }
+        m.builder.installArtifact(e);
+        e.want_lto = m.enable_lto;
+        return e;
+    }
+};
+
+pub fn build(b: *std.build.Builder) !void { // Build-script entry point: shared objects, example executables, then embedded server assets.
+    var make = try Maker.init(b);
+    make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;
+
+    const ggml = make.obj("ggml", "ggml.c");
+    const sgemm = make.obj("sgemm", "sgemm.cpp");
+    const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
+    const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
+    const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
+    const unicode = make.obj("unicode", "unicode.cpp");
+    const unicode_data = make.obj("unicode-data", "unicode-data.cpp");
+    const llama = make.obj("llama", "llama.cpp");
+    const buildinfo = make.obj("build-info", "common/build-info.cpp"); // own name; previously reused "common", colliding with the object below
+    const common = make.obj("common", "common/common.cpp");
+    const console = make.obj("console", "common/console.cpp");
+    const sampling = make.obj("sampling", "common/sampling.cpp");
+    const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
+    const json_schema_to_grammar = make.obj("json-schema-to-grammar", "common/json-schema-to-grammar.cpp");
+    const train = make.obj("train", "common/train.cpp");
+    const clip = make.obj("clip", "examples/llava/clip.cpp");
+    const llava = make.obj("llava", "examples/llava/llava.cpp");
+
+    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, console, grammar_parser });
+    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
+    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
+    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
+    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });
+    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });
+
+    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, grammar_parser, clip, llava });
+    if (server.target.isWindows()) { // the server additionally links ws2_32 on Windows targets
+        server.linkSystemLibrary("ws2_32");
+    }
+
+    const server_assets = [_][]const u8{ "index.html", "index.js", "completion.js", "json-schema-to-grammar.mjs" };
+    for (server_assets) |asset| { // write examples/server/<asset>.hpp holding the bytes of examples/server/public/<asset>
+        const input_path = b.fmt("examples/server/public/{s}", .{asset});
+        const output_path = b.fmt("examples/server/{s}.hpp", .{asset});
+
+        // Portable equivalent of `b.addSystemCommand(&.{ "xxd", "-n", asset, "-i", input_path, output_path }) })`:
+
+        const input = try std.fs.cwd().readFileAlloc(b.allocator, input_path, std.math.maxInt(usize));
+        defer b.allocator.free(input);
+
+        var buf = std.ArrayList(u8).init(b.allocator);
+        defer buf.deinit();
+
+        for (input) |byte| { // one "0xNN, " token per input byte
+            try std.fmt.format(buf.writer(), "0x{X:0>2}, ", .{byte});
+        }
+
+        var name = try std.mem.replaceOwned(u8, b.allocator, asset, "-", "_"); // C identifier: '-' and '.' both become '_'
+        defer b.allocator.free(name);
+        std.mem.replaceScalar(u8, name, '.', '_');
+
+        try std.fs.cwd().writeFile(output_path, b.fmt(
+            "unsigned char {s}[] = {{{s}}};\nunsigned int {s}_len = {d};\n",
+            .{ name, buf.items, name, input.len },
+        ));
+
+        std.debug.print("Dumped hex of \"{s}\" ({s}) to {s}\n", .{ input_path, name, output_path });
+    }
+}
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -202,15 +202,12 @@ function gg_sum_test_scripts_release {
 }

 function gg_get_model {
-    local gguf_0="$MNT/models/pythia/1.4B/ggml-model-f16.gguf"
-    local gguf_1="$MNT/models/pythia/2.8B/ggml-model-f16.gguf"
-    local gguf_2="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
-    if [[ -s $gguf_0 ]]; then
-        echo -n "$gguf_0"
-    elif [[ -s $gguf_1 ]]; then
-        echo -n "$gguf_1"
-    elif [[ -s $gguf_2 ]]; then
-        echo -n "$gguf_2"
+    local gguf_3b="$MNT/models/open-llama/3B-v2/ggml-model-f16.gguf"
+    local gguf_7b="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
+    if [[ -s $gguf_3b ]]; then
+        echo -n "$gguf_3b"
+    elif [[ -s $gguf_7b ]]; then
+        echo -n "$gguf_7b"
    else
        echo >&2 "No model found. Can't run gg_run_ctest_with_model."
        exit 1
@@ -259,6 +256,139 @@ function gg_sum_ctest_with_model_release {
    gg_printf '```\n'
 }

+# open_llama_3b_v2
+
+function gg_run_open_llama_3b_v2 {
+    # CI job for OpenLLaMA 3B-v2: download the model and wikitext-2, build in
+    # build-ci-release, convert and quantize, then run main, perplexity, imatrix
+    # and save-load-state, with most output teed into $OUT/${ci}-*.log.
+    cd ${SRC}
+
+    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json
+    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model
+    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json
+    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json
+    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin
+    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json
+
+    gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
+    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
+    head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
+
+    path_models="../models-mnt/open-llama/3B-v2"
+    path_wiki="../models-mnt/wikitext/wikitext-2-raw"
+
+    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
+
+    set -e
+
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j                                                             ) 2>&1 | tee -a $OUT/${ci}-make.log
+
+    python3 ../convert.py ${path_models}
+
+    model_f16="${path_models}/ggml-model-f16.gguf"
+    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
+    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
+    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
+    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
+    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
+    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
+    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
+    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
+    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
+    model_q6_k="${path_models}/ggml-model-q6_k.gguf"
+
+    wiki_test_60="${path_wiki}/wiki.test-60.raw"
+
+    ./bin/quantize ${model_f16} ${model_q8_0} q8_0
+    ./bin/quantize ${model_f16} ${model_q4_0} q4_0
+    ./bin/quantize ${model_f16} ${model_q4_1} q4_1
+    ./bin/quantize ${model_f16} ${model_q5_0} q5_0
+    ./bin/quantize ${model_f16} ${model_q5_1} q5_1
+    ./bin/quantize ${model_f16} ${model_q2_k} q2_k
+    ./bin/quantize ${model_f16} ${model_q3_k} q3_k
+    ./bin/quantize ${model_f16} ${model_q4_k} q4_k
+    ./bin/quantize ${model_f16} ${model_q5_k} q5_k
+    ./bin/quantize ${model_f16} ${model_q6_k} q6_k
+
+    (time ./bin/main --model ${model_f16}  -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+
+    (time ./bin/perplexity --model ${model_f16}  -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+
+    (time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+
+    (time ./bin/save-load-state     --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+
+    # check_ppl <quant> <"[1]..." perplexity line>: print one summary line and
+    # return 20 if the last decimal number is missing or greater than 20.0.
+    function check_ppl {
+        local qnt="$1" ppl
+        ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1) || true # a missing match is reported below
+        if [ -z "$ppl" ]; then
+            printf '  - %s @ n/a (FAIL: no ppl value found)\n' "$qnt"
+            return 20
+        elif [ "$(echo "$ppl > 20.0" | bc)" -eq 1 ]; then
+            printf '  - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
+            return 20
+        fi
+
+        printf '  - %s @ %s OK\n' "$qnt" "$ppl"
+        return 0
+    }
+
+    local quant
+    for quant in f16 q8_0 q4_0 q4_1 q5_0 q5_1 q2_k q3_k q4_k q5_k q6_k; do
+        check_ppl "$quant" "$(cat $OUT/${ci}-tg-${quant}.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    done
+
+    cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
+
+    set +e
+}
+
+function gg_sum_open_llama_3b_v2 {
+    # Emit the OpenLLaMA 3B-v2 summary section via gg_printf: exit status,
+    # perplexity lines, imatrix summary, then the raw log of every run.
+    local quant
+
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'OpenLLaMA 3B-v2:\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
+    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
+    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
+
+    # The quantized logs share one naming scheme, so they are emitted in a loop.
+    for quant in q8_0 q4_0 q4_1 q5_0 q5_1 q2_k q3_k q4_k q5_k q6_k; do
+        gg_printf "- ${quant}"':\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-${quant}.log)"
+    done
+
+    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
+}
+
 # open_llama_7b_v2
 # requires: GG_BUILD_CUDA

@@ -287,7 +417,7 @@ function gg_run_open_llama_7b_v2 {
    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j                                                           ) 2>&1 | tee -a $OUT/${ci}-make.log

-    python3 ../examples/convert-legacy-llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
+    python3 ../convert.py ${path_models}

    model_f16="${path_models}/ggml-model-f16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@@ -396,272 +526,6 @@ function gg_sum_open_llama_7b_v2 {
    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
 }

-# pythia_1.4b
-
-function gg_run_pythia_1_4b {
-    cd ${SRC}
-
-    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/config.json
-    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer.json
-    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer_config.json
-    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/special_tokens_map.json
-    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/resolve/main/pytorch_model.bin
-
-    gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
-    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
-    head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
-
-    path_models="../models-mnt/pythia/1.4B"
-    path_wiki="../models-mnt/wikitext/wikitext-2-raw"
-
-    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
-
-    set -e
-
-    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j                                            ) 2>&1 | tee -a $OUT/${ci}-make.log
-
-    python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
-
-    model_f16="${path_models}/ggml-model-f16.gguf"
-    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
-    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
-    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
-    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
-    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
-    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
-    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
-    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
-    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
-    model_q6_k="${path_models}/ggml-model-q6_k.gguf"
-
-    wiki_test_60="${path_wiki}/wiki.test-60.raw"
-
-    ./bin/quantize ${model_f16} ${model_q8_0} q8_0
-    ./bin/quantize ${model_f16} ${model_q4_0} q4_0
-    ./bin/quantize ${model_f16} ${model_q4_1} q4_1
-    ./bin/quantize ${model_f16} ${model_q5_0} q5_0
-    ./bin/quantize ${model_f16} ${model_q5_1} q5_1
-    ./bin/quantize ${model_f16} ${model_q2_k} q2_k
-    ./bin/quantize ${model_f16} ${model_q3_k} q3_k
-    ./bin/quantize ${model_f16} ${model_q4_k} q4_k
-    ./bin/quantize ${model_f16} ${model_q5_k} q5_k
-    ./bin/quantize ${model_f16} ${model_q6_k} q6_k
-
-    (time ./bin/main --model ${model_f16}  -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
-
-    (time ./bin/perplexity --model ${model_f16}  -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
-
-    (time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
-
-    (time ./bin/save-load-state     --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-
-    function check_ppl {
-        qnt="$1"
-        ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
-
-        if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
-            printf '  - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
-            return 20
-        fi
-
-        printf '  - %s @ %s OK\n' "$qnt" "$ppl"
-        return 0
-    }
-
-    check_ppl "f16"  "$(cat $OUT/${ci}-tg-f16.log  | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-   #check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
-    check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-
-    cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
-
-    set +e
-}
-
-function gg_sum_pythia_1_4b {
-    gg_printf '### %s\n\n' "${ci}"
-
-    gg_printf 'Pythia 1.4B:\n'
-    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
-    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
-    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
-    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
-    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
-    gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
-    gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
-    gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
-    gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
-    gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
-    gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
-    gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
-    gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
-    gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
-    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
-}
-
-# pythia_2_8b
-# requires: GG_BUILD_CUDA
-
-function gg_run_pythia_2_8b {
-    cd ${SRC}
-
-    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/config.json
-    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer.json
-    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer_config.json
-    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/special_tokens_map.json
-    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/resolve/main/pytorch_model.bin
-
-    gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
-    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
-
-    path_models="../models-mnt/pythia/2.8B"
-    path_wiki="../models-mnt/wikitext/wikitext-2-raw"
-
-    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
-
-    set -e
-
-    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j                                                           ) 2>&1 | tee -a $OUT/${ci}-make.log
-
-    python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
-
-    model_f16="${path_models}/ggml-model-f16.gguf"
-    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
-    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
-    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
-    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
-    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
-    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
-    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
-    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
-    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
-    model_q6_k="${path_models}/ggml-model-q6_k.gguf"
-
-    wiki_test="${path_wiki}/wiki.test.raw"
-
-    ./bin/quantize ${model_f16} ${model_q8_0} q8_0
-    ./bin/quantize ${model_f16} ${model_q4_0} q4_0
-    ./bin/quantize ${model_f16} ${model_q4_1} q4_1
-    ./bin/quantize ${model_f16} ${model_q5_0} q5_0
-    ./bin/quantize ${model_f16} ${model_q5_1} q5_1
-    ./bin/quantize ${model_f16} ${model_q2_k} q2_k
-    ./bin/quantize ${model_f16} ${model_q3_k} q3_k
-    ./bin/quantize ${model_f16} ${model_q4_k} q4_k
-    ./bin/quantize ${model_f16} ${model_q5_k} q5_k
-    ./bin/quantize ${model_f16} ${model_q6_k} q6_k
-
-    (time ./bin/main --model ${model_f16}  -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
-
-    (time ./bin/perplexity --model ${model_f16}  -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
-
-    (time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
-
-    (time ./bin/save-load-state     -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/save-load-state     -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-
-    function check_ppl {
-        qnt="$1"
-        ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
-
-        if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
-            printf '  - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
-            return 20
-        fi
-
-        printf '  - %s @ %s OK\n' "$qnt" "$ppl"
-        return 0
-    }
-
-    check_ppl "f16"  "$(cat $OUT/${ci}-tg-f16.log  | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-   #check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
-    check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-
-    cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
-
-    set +e
-}
-
-function gg_sum_pythia_2_8b {
-    gg_printf '### %s\n\n' "${ci}"
-
-    gg_printf 'Pythia 2.8B:\n'
-    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
-    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
-    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
-    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
-    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
-    gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
-    gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
-    gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
-    gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
-    gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
-    gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
-    gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
-    gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
-    gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
-    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
-}
-
 # bge-small

 function gg_run_embd_bge_small {
@@ -688,7 +552,7 @@ function gg_run_embd_bge_small {
    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j                                            ) 2>&1 | tee -a $OUT/${ci}-make.log

-    python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
+    python3 ../convert-hf-to-gguf.py ${path_models}

    model_f16="${path_models}/ggml-model-f16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@@ -742,10 +606,9 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then

    if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
        if [ -z ${GG_BUILD_CUDA} ]; then
-            test $ret -eq 0 && gg_run pythia_1_4b
+            test $ret -eq 0 && gg_run open_llama_3b_v2
        else
-            test $ret -eq 0 && gg_run pythia_2_8b
-            #test $ret -eq 0 && gg_run open_llama_7b_v2
+            test $ret -eq 0 && gg_run open_llama_7b_v2
        fi
        test $ret -eq 0 && gg_run ctest_with_model_debug
        test $ret -eq 0 && gg_run ctest_with_model_release
--- a/cmake/llama.pc.in
+++ b/cmake/llama.pc.in
@@ -1,10 +0,0 @@
-prefix=@CMAKE_INSTALL_PREFIX@
-exec_prefix=${prefix}
-libdir=${exec_prefix}/lib
-includedir=${prefix}/include
-
-Name: llama
-Description: Port of Facebook's LLaMA model in C/C++
-Version: @PROJECT_VERSION@
-Libs: -L${libdir} -lllama
-Cflags: -I${includedir}
--- a/common/common.cpp
+++ b/common/common.cpp
--- a/common/common.h
+++ b/common/common.h
@@ -27,7 +27,7 @@
 #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)

 #define print_build_info() do {                                                                     \
-    fprintf(stderr, "%s: build = %d (%s)\n",      __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);      \
+    fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);           \
    fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);    \
 } while(0)

@@ -35,18 +35,14 @@

 // build info
 extern int LLAMA_BUILD_NUMBER;
-extern char const * LLAMA_COMMIT;
-extern char const * LLAMA_COMPILER;
-extern char const * LLAMA_BUILD_TARGET;
+extern char const *LLAMA_COMMIT;
+extern char const *LLAMA_COMPILER;
+extern char const *LLAMA_BUILD_TARGET;

 struct llama_control_vector_load_info;

-//
-// CPU utils
-//
-
-int32_t cpu_get_num_physical_cores();
-int32_t cpu_get_num_math();
+int get_math_cpu_count();
+int32_t get_num_physical_cores();

 //
 // CLI argument parsing
@@ -55,7 +51,7 @@ int32_t cpu_get_num_math();
 struct gpt_params {
    uint32_t seed                 = LLAMA_DEFAULT_SEED; // RNG seed

-    int32_t n_threads             = cpu_get_num_math();
+    int32_t n_threads             = get_math_cpu_count();
    int32_t n_threads_draft       = -1;
    int32_t n_threads_batch       = -1;    // number of threads to use for batch processing (-1 = use n_threads)
    int32_t n_threads_batch_draft = -1;
@@ -146,7 +142,6 @@ struct gpt_params {
    bool use_color         = false; // use color to distinguish generations and inputs
    bool interactive       = false; // interactive mode
    bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
-    bool special           = false; // enable special token output
    bool conversation      = false; // conversation mode (does not print special tokens and suffix/prefix)
    bool chatml            = false; // chatml mode (used for models trained on chatml syntax)
    bool prompt_cache_all  = false; // save user input and generations to prompt cache
@@ -165,6 +160,7 @@ struct gpt_params {
    bool instruct          = false; // instruction mode (used for Alpaca models)
    bool logits_all        = false; // return logits for all tokens in the batch
    bool use_mmap          = true;  // use mmap for faster loads
+    bool use_direct_io     = false; // use direct I/O
    bool use_mlock         = false; // use mlock to keep model in memory
    bool verbose_prompt    = false; // print prompt tokens before generation
    bool display_prompt    = true;  // print prompt before generation
@@ -184,34 +180,33 @@ struct gpt_params {

 void gpt_params_handle_model_default(gpt_params & params);

-bool gpt_params_parse_ex   (int argc, char ** argv, gpt_params & params);
-bool gpt_params_parse      (int argc, char ** argv, gpt_params & params);
-bool gpt_params_find_arg   (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
-void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
+bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);

-std::string gpt_params_get_system_info(const gpt_params & params);
+bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
+
+bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
+
+void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
+
+bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
+
+std::string get_system_info(const gpt_params & params);
+
+std::string gpt_random_prompt(std::mt19937 & rng);
+
+void process_escapes(std::string& input);
+
+bool validate_file_name(const std::string & filename);

 //
 // String utils
 //

+std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
+std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string);
 std::vector<std::string> string_split(std::string input, char separator);
-
 std::string string_strip(const std::string & str);
-std::string string_get_sortable_timestamp();
-std::string string_random_prompt(std::mt19937 & rng);
-
-bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
-void string_process_escapes(std::string & input);
-
-//
-// Filesystem utils
-//
-
-bool fs_validate_filename(const std::string & filename);
-bool fs_create_directory_with_parents(const std::string & path);
-
-std::string fs_get_cache_directory();
+std::string sampler_type_to_name_string(llama_sampler_type sampler_type);

 //
 // Model utils
@@ -282,15 +277,29 @@ std::string llama_detokenize_bpe(
 // defaults to true when model type is SPM, otherwise false.
 bool llama_should_add_bos_token(const llama_model * model);

+//
+// YAML utils
+//
+
+bool create_directory_with_parents(const std::string & path);
+void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data);
+void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data);
+void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data);
+std::string get_sortable_timestamp();
+
+void dump_non_result_info_yaml(
+    FILE * stream, const gpt_params & params, const llama_context * lctx,
+    const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
+
 //
 // KV cache utils
 //

 // Dump the KV cache view with the number of sequences per cell.
-void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
+void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);

 // Dump the KV cache view showing individual sequences in each cell (long output).
-void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
+void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);

 //
 // Embedding utils
@@ -324,20 +333,6 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
 //
 // Split utils
 //
-
 static const char * const LLM_KV_SPLIT_NO            = "split.no";
 static const char * const LLM_KV_SPLIT_COUNT         = "split.count";
 static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
-
-//
-// YAML utils
-//
-
-void yaml_dump_vector_float    (FILE * stream, const char * prop_name, const std::vector<float> & data);
-void yaml_dump_vector_int      (FILE * stream, const char * prop_name, const std::vector<int> & data);
-void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
-
-void yaml_dump_non_result_info(
-    FILE * stream, const gpt_params & params, const llama_context * lctx,
-    const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
-
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -125,7 +125,7 @@ std::string llama_sampling_order_print(const llama_sampling_params & params) {
    std::string result = "CFG -> Penalties ";
    if (params.mirostat == 0) {
        for (auto sampler_type : params.samplers_sequence) {
-            const auto sampler_type_name = llama_sampling_type_to_str(sampler_type);
+            const auto sampler_type_name = sampler_type_to_name_string(sampler_type);
            if (!sampler_type_name.empty()) {
                result += "-> " + sampler_type_name + " ";
            }
@@ -137,87 +137,6 @@ std::string llama_sampling_order_print(const llama_sampling_params & params) {
    return result;
 }

-std::string llama_sampling_type_to_str(llama_sampler_type sampler_type) {
-    switch (sampler_type) {
-        case llama_sampler_type::TOP_K:       return "top_k";
-        case llama_sampler_type::TFS_Z:       return "tfs_z";
-        case llama_sampler_type::TYPICAL_P:   return "typical_p";
-        case llama_sampler_type::TOP_P:       return "top_p";
-        case llama_sampler_type::MIN_P:       return "min_p";
-        case llama_sampler_type::TEMPERATURE: return "temperature";
-        default : return "";
-    }
-}
-
-std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
-    std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
-        {"top_k",       llama_sampler_type::TOP_K},
-        {"top_p",       llama_sampler_type::TOP_P},
-        {"typical_p",   llama_sampler_type::TYPICAL_P},
-        {"min_p",       llama_sampler_type::MIN_P},
-        {"tfs_z",       llama_sampler_type::TFS_Z},
-        {"temperature", llama_sampler_type::TEMPERATURE}
-    };
-
-    // since samplers names are written multiple ways
-    // make it ready for both system names and input names
-    std::unordered_map<std::string, llama_sampler_type> sampler_alt_name_map {
-        {"top-k",       llama_sampler_type::TOP_K},
-        {"top-p",       llama_sampler_type::TOP_P},
-        {"nucleus",     llama_sampler_type::TOP_P},
-        {"typical-p",   llama_sampler_type::TYPICAL_P},
-        {"typical",     llama_sampler_type::TYPICAL_P},
-        {"min-p",       llama_sampler_type::MIN_P},
-        {"tfs-z",       llama_sampler_type::TFS_Z},
-        {"tfs",         llama_sampler_type::TFS_Z},
-        {"temp",        llama_sampler_type::TEMPERATURE}
-    };
-
-    std::vector<llama_sampler_type> sampler_types;
-    sampler_types.reserve(names.size());
-    for (const auto & name : names)
-    {
-        auto sampler_item = sampler_canonical_name_map.find(name);
-        if (sampler_item != sampler_canonical_name_map.end())
-        {
-            sampler_types.push_back(sampler_item->second);
-        }
-        else
-        {
-            if (allow_alt_names)
-            {
-                sampler_item = sampler_alt_name_map.find(name);
-                if (sampler_item != sampler_alt_name_map.end())
-                {
-                    sampler_types.push_back(sampler_item->second);
-                }
-            }
-        }
-    }
-    return sampler_types;
-}
-
-std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string) {
-    std::unordered_map<char, llama_sampler_type> sampler_name_map {
-        {'k', llama_sampler_type::TOP_K},
-        {'p', llama_sampler_type::TOP_P},
-        {'y', llama_sampler_type::TYPICAL_P},
-        {'m', llama_sampler_type::MIN_P},
-        {'f', llama_sampler_type::TFS_Z},
-        {'t', llama_sampler_type::TEMPERATURE}
-    };
-
-    std::vector<llama_sampler_type> sampler_types;
-    sampler_types.reserve(names_string.size());
-    for (const auto & c : names_string) {
-        const auto sampler_item = sampler_name_map.find(c);
-        if (sampler_item != sampler_name_map.end()) {
-            sampler_types.push_back(sampler_item->second);
-        }
-    }
-    return sampler_types;
-}
-
 // no reasons to expose this function in header
 static void sampler_queue(
                   struct llama_context * ctx_main,
@@ -260,7 +179,7 @@ static llama_token llama_sampling_sample_impl(
                  struct llama_context * ctx_main,
                  struct llama_context * ctx_cfg,
                  const int idx,
-                  bool is_resampling) {
+                  bool is_resampling) {  // Add a parameter to indicate if we are resampling
    const llama_sampling_params & params = ctx_sampling->params;

    const float   temp            = params.temp;
@@ -269,8 +188,8 @@ static llama_token llama_sampling_sample_impl(
    const float   mirostat_eta    = params.mirostat_eta;

    std::vector<float> original_logits;
-    auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, /* apply_grammar= */ is_resampling, &original_logits);
-    if (ctx_sampling->grammar != NULL && !is_resampling) {
+    auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, !is_resampling, &original_logits);
+    if (!is_resampling) {
        GGML_ASSERT(!original_logits.empty());
    }
    llama_token id = 0;
@@ -333,7 +252,7 @@ static llama_token llama_sampling_sample_impl(
            // Restore logits from the copy
            std::copy(original_logits.begin(), original_logits.end(), logits);

-            return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ true);
+            return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, true);  // Pass true for is_resampling
        }
    }

@@ -366,8 +285,7 @@ static llama_token_data_array llama_sampling_prepare_impl(
    // Get a pointer to the logits
    float * logits = llama_get_logits_ith(ctx_main, idx);

-    if (ctx_sampling->grammar != NULL && !apply_grammar) {
-        GGML_ASSERT(original_logits != NULL);
+    if (apply_grammar && original_logits != NULL) {
        // Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this.
        *original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))};
    }
@@ -424,7 +342,7 @@ llama_token llama_sampling_sample(
                  struct llama_context * ctx_cfg,
                  const int idx) {
    // Call the implementation function with is_resampling set to false by default
-    return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ false);
+    return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, false);
 }

 llama_token_data_array llama_sampling_prepare(
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -116,11 +116,6 @@ std::string llama_sampling_print(const llama_sampling_params & params);
 // Print sampling order into a string
 std::string llama_sampling_order_print(const llama_sampling_params & params);

-std::string llama_sampling_type_to_str(llama_sampler_type sampler_type);
-
-std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
-std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string);
-
 // this is a common sampling function used across the examples for convenience
 // it can serve as a starting point for implementing your own sampling function
 // Note: When using multiple sequences, it is the caller's responsibility to call
--- a/common/train.cpp
+++ b/common/train.cpp
@@ -1052,7 +1052,7 @@ struct train_params_common get_default_train_params_common() {

    params.custom_n_ctx = false;

-    params.use_flash              = false;
+    params.use_flash              = true;
    params.use_checkpointing      = true;

    params.sample_start           = "";
@@ -1380,7 +1380,7 @@ bool consume_common_train_arg(

 void finish_processing_train_args(struct train_params_common * params) {
    if (params->escape) {
-        string_process_escapes(params->sample_start);
+        process_escapes(params->sample_start);
    }
 }

--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@@ -72,7 +72,7 @@ models = [
    {"name": "mpt",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
    {"name": "starcoder",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
    {"name": "gpt-2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
-    {"name": "stablelm2",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", },
+    {"name": "stablelm",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", },
    {"name": "refact",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
    {"name": "command-r",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
    {"name": "qwen2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
@@ -81,7 +81,6 @@ models = [
    {"name": "jina-v2-en",     "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
    {"name": "jina-v2-es",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
    {"name": "jina-v2-de",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
-    {"name": "smaug-bpe",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
 ]


--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -14,7 +14,6 @@ from pathlib import Path
 from hashlib import sha256
 from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Sequence, TypeVar, cast

-import math
 import numpy as np
 import torch

@@ -25,6 +24,8 @@ if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf

+from convert import LlamaHfVocab
+
 logger = logging.getLogger("hf-to-gguf")


@@ -311,10 +312,11 @@ class Model:
                        data = data.astype(np.float32)
                    data_qtype = gguf.GGMLQuantizationType.F32

-                shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape
-
+                block_size, type_size = gguf.GGML_QUANT_SIZES[data_qtype]
                # reverse shape to make it similar to the internal ggml dimension order
-                shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}"
+                shape_str = f"""{{{', '.join(str(n) for n in reversed(
+                    (*data.shape[:-1], data.shape[-1] * data.dtype.itemsize // type_size * block_size))
+                )}}}"""

                # n_dims is implicit in the shape
                logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
@@ -445,7 +447,7 @@ class Model:
            # ref: https://huggingface.co/openai-community/gpt2
            res = "gpt-2"
        if chkhsh == "32d85c31273f8019248f2559fed492d929ea28b17e51d81d3bb36fff23ca72b3":
-            # ref: https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b
+            # ref: https://huggingface.co/stabilityai/stablelm-2-1_6b
            res = "stablelm2"
        if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
            # ref: https://huggingface.co/smallcloudai/Refact-1_6-base
@@ -471,9 +473,6 @@ class Model:
        if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
            res = "jina-v2-de"
-        if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
-            # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
-            res = "smaug-bpe"

        if res is None:
            logger.warning("\n")
@@ -632,7 +631,7 @@ class Model:
        special_vocab.add_to_gguf(self.gguf_writer)

    def _set_vocab_llama_hf(self):
-        vocab = gguf.LlamaHfVocab(self.dir_model)
+        vocab = LlamaHfVocab(self.dir_model)
        tokens = []
        scores = []
        toktypes = []
@@ -673,44 +672,6 @@ class GPTNeoXModel(Model):
        self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])

-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        del bid  # unused
-
-        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
-        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
-
-        tensors: list[tuple[str, Tensor]] = []
-
-        if re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.weight", name):
-            # Map bloom-style qkv_linear to gpt-style qkv_linear
-            # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252  # noqa
-            # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312  # noqa
-            qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed))
-            data_torch = torch.cat(
-                (
-                    qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
-                    qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
-                    qkv_weights[:, 2, :, :].reshape((-1, n_embed)),
-                ),
-                dim=0,
-            )
-            logger.info("re-format attention.linear_qkv.weight")
-        elif re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.bias", name):
-            qkv_bias = data_torch.reshape((n_head, 3, n_embed // n_head))
-            data_torch = torch.cat(
-                (
-                    qkv_bias[:, 0, :].reshape((n_embed,)),
-                    qkv_bias[:, 1, :].reshape((n_embed,)),
-                    qkv_bias[:, 2, :].reshape((n_embed,)),
-                ),
-                dim=0,
-            )
-            logger.info("re-format attention.linear_qkv.bias")
-
-        tensors.append((self.map_tensor_name(name), data_torch))
-
-        return tensors
-

@Model.register("BloomForCausalLM")
 class BloomModel(Model):
@@ -1315,17 +1276,6 @@ class LlamaModel(Model):
                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])

-        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
-        if tokenizer_config_file.is_file():
-            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
-                tokenizer_config_json = json.load(f)
-                if "add_prefix_space" in tokenizer_config_json:
-                    self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
-
-        # Apply to granite small models only
-        if self.hparams.get("vocab_size", 32000) == 49152:
-            self.gguf_writer.add_add_bos_token(False)
-
    @staticmethod
    def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
        if n_head_kv is not None and n_head != n_head_kv:
@@ -1340,9 +1290,9 @@ class LlamaModel(Model):
        n_head = self.hparams["num_attention_heads"]
        n_kv_head = self.hparams.get("num_key_value_heads")

-        if name.endswith(("q_proj.weight", "q_proj.bias")):
+        if name.endswith("q_proj.weight"):
            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
-        if name.endswith(("k_proj.weight", "k_proj.bias")):
+        if name.endswith("k_proj.weight"):
            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)

        # process the experts separately
@@ -1799,7 +1749,7 @@ class Phi3MiniModel(Model):
                    token_id = int(token_id)
                    token = foken_data["content"].encode("utf-8")
                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
-                        assert tokens[token_id] == token
+                        assert(tokens[token_id] == token)
                    tokens[token_id] = token
                    scores[token_id] = -1000.0
                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -1815,7 +1765,7 @@ class Phi3MiniModel(Model):
                    token_id = int(foken_data["id"])
                    token = foken_data["content"].encode("utf-8")
                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
-                        assert tokens[token_id] == token
+                        assert(tokens[token_id] == token)
                    tokens[token_id] = token
                    scores[token_id] = -1000.0
                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -1834,59 +1784,23 @@ class Phi3MiniModel(Model):
    def set_gguf_parameters(self):
        block_count = self.find_hparam(["num_hidden_layers", "n_layer"])

+        rot_pct = 1.0
        n_embd = self.find_hparam(["hidden_size", "n_embd"])
        n_head = self.find_hparam(["num_attention_heads", "n_head"])
-        n_head_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"])
        rms_eps = self.find_hparam(["rms_norm_eps"])
-        max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
-        orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
-        rope_dims = n_embd // n_head

        self.gguf_writer.add_name("Phi3")
-        self.gguf_writer.add_context_length(max_pos_embds)
-        self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds)
+        self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"]))
+
        self.gguf_writer.add_embedding_length(n_embd)
-        self.gguf_writer.add_feed_forward_length(self.find_hparam(["intermediate_size"]))
+        self.gguf_writer.add_feed_forward_length(8192)
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(n_head)
-        self.gguf_writer.add_head_count_kv(n_head_kv)
+        self.gguf_writer.add_head_count_kv(n_head)
        self.gguf_writer.add_layer_norm_rms_eps(rms_eps)
-        self.gguf_writer.add_rope_dimension_count(rope_dims)
-        self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
+        self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
        self.gguf_writer.add_file_type(self.ftype)

-        # write rope scaling for long context (128k) model
-        rope_scaling = self.find_hparam(['rope_scaling'], True)
-        if (rope_scaling is None):
-            return
-
-        scale = max_pos_embds / orig_max_pos_embds
-
-        rope_scaling_type = rope_scaling.get('type', '').lower()
-        if len(rope_scaling_type) == 0:
-            raise KeyError('Missing the required key rope_scaling.type')
-
-        if rope_scaling_type == 'su':
-            attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0
-        elif rope_scaling_type == 'yarn':
-            attn_factor = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0
-        else:
-            raise NotImplementedError(f'The rope scaling type {rope_scaling_type} is not supported yet')
-
-        self.gguf_writer.add_rope_scaling_attn_factors(attn_factor)
-
-        long_factors = rope_scaling.get('long_factor', None)
-        short_factors = rope_scaling.get('short_factor', None)
-
-        if long_factors is None or short_factors is None:
-            raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
-
-        if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
-            raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
-
-        self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG]  + ".weight", np.array(long_factors, dtype=np.float32))
-        self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32))
-

@Model.register("PlamoForCausalLM")
 class PlamoModel(Model):
@@ -2404,8 +2318,7 @@ class CommandR2Model(Model):

        # max_position_embeddings = 8192 in config.json but model was actually
        # trained on 128k context length
-        # aya-23 models don't have model_max_length specified
-        self.hparams["max_position_embeddings"] = self.find_hparam(["model_max_length", "max_position_embeddings"])
+        self.hparams["max_position_embeddings"] = self.hparams["model_max_length"]

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
@@ -2478,236 +2391,6 @@ class JinaBertV2Model(BertModel):
        self.gguf_writer.add_add_eos_token(True)


-@Model.register("ArcticForCausalLM")
-class ArcticModel(Model):
-    model_arch = gguf.MODEL_ARCH.ARCTIC
-
-    def set_vocab(self):
-        # The reason for using a custom implementation here is that the
-        # snowflake-arctic-instruct model redefined tokens 31998 and 31999 from
-        # tokenizer.model and used them as BOS and EOS instead of adding new tokens.
-        from sentencepiece import SentencePieceProcessor
-
-        tokenizer_path = self.dir_model / 'tokenizer.model'
-
-        if not tokenizer_path.is_file():
-            logger.error(f'Error: Missing {tokenizer_path}')
-            sys.exit(1)
-
-        # Read the whole vocabulary from the tokenizer.model file
-        tokenizer = SentencePieceProcessor()
-        tokenizer.LoadFromFile(str(tokenizer_path))
-
-        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
-
-        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
-        scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
-
-        for token_id in range(tokenizer.vocab_size()):
-
-            piece = tokenizer.IdToPiece(token_id)
-            text = piece.encode("utf-8")
-            score = tokenizer.GetScore(token_id)
-
-            toktype = SentencePieceTokenTypes.NORMAL
-            if tokenizer.IsUnknown(token_id):
-                toktype = SentencePieceTokenTypes.UNKNOWN
-            elif tokenizer.IsControl(token_id):
-                toktype = SentencePieceTokenTypes.CONTROL
-            elif tokenizer.IsUnused(token_id):
-                toktype = SentencePieceTokenTypes.UNUSED
-            elif tokenizer.IsByte(token_id):
-                toktype = SentencePieceTokenTypes.BYTE
-
-            tokens[token_id] = text
-            scores[token_id] = score
-            toktypes[token_id] = toktype
-
-        # Use the added_tokens_decoder field from tokeniser_config.json as the source
-        # of information about added/redefined tokens and modify them accordingly.
-        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
-        if tokenizer_config_file.is_file():
-            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
-                tokenizer_config_json = json.load(f)
-
-                if "added_tokens_decoder" in tokenizer_config_json:
-                    added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"]
-                    for token_id, token_json in added_tokens_decoder.items():
-                        token_id = int(token_id)
-                        if (token_id >= vocab_size):
-                            logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
-                            continue
-
-                        token_content = token_json["content"]
-                        token_type = SentencePieceTokenTypes.USER_DEFINED
-                        token_score = -10000.0
-
-                        # Map unk_token to UNKNOWN, other special tokens to CONTROL
-                        # Set the score to 0.0 as in the original tokenizer.model
-                        if ("special" in token_json) and token_json["special"]:
-                            if token_content == tokenizer_config_json["unk_token"]:
-                                token_type = SentencePieceTokenTypes.UNKNOWN
-                            else:
-                                token_type = SentencePieceTokenTypes.CONTROL
-                            token_score = 0.0
-
-                        logger.info(f"Setting added token {token_id} to '{token_content}' (type: {token_type}, score: {token_score:.2f})")
-                        tokens[token_id] = token_content.encode("utf-8")
-                        toktypes[token_id] = token_type
-                        scores[token_id] = token_score
-
-        self.gguf_writer.add_tokenizer_model("llama")
-        self.gguf_writer.add_tokenizer_pre("default")
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
-        self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
-        special_vocab.add_to_gguf(self.gguf_writer)
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        hparams = self.hparams
-        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
-        self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
-
-    _experts: list[dict[str, Tensor]] | None = None
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        n_head = self.hparams["num_attention_heads"]
-        n_kv_head = self.hparams.get("num_key_value_heads")
-
-        if name.endswith("q_proj.weight"):
-            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
-        if name.endswith("k_proj.weight"):
-            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
-
-        # process the experts separately
-        if name.find("block_sparse_moe.experts") != -1:
-            n_experts = self.hparams["num_local_experts"]
-
-            assert bid is not None
-
-            if self._experts is None:
-                self._experts = [{} for _ in range(self.block_count)]
-
-            self._experts[bid][name] = data_torch
-
-            if len(self._experts[bid]) >= n_experts * 3:
-                tensors: list[tuple[str, Tensor]] = []
-
-                # merge the experts into a single 3d tensor
-                for wid in ["w1", "w2", "w3"]:
-                    datas: list[Tensor] = []
-
-                    for xid in range(n_experts):
-                        ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight"
-                        datas.append(self._experts[bid][ename])
-                        del self._experts[bid][ename]
-
-                    data_torch = torch.stack(datas, dim=0)
-
-                    merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight"
-
-                    new_name = self.map_tensor_name(merged_name)
-
-                    tensors.append((new_name, data_torch))
-                return tensors
-            else:
-                return []
-
-        return [(self.map_tensor_name(name), data_torch)]
-
-    def write_tensors(self):
-        super().write_tensors()
-
-        if self._experts is not None:
-            # flatten `list[dict[str, Tensor]]` into `list[str]`
-            experts = [k for d in self._experts for k in d.keys()]
-            if len(experts) > 0:
-                raise ValueError(f"Unprocessed experts: {experts}")
-
-
-@Model.register("DeepseekV2ForCausalLM")
-class DeepseekV2Model(Model):
-    model_arch = gguf.MODEL_ARCH.DEEPSEEK2
-
-    def set_vocab(self):
-        self._set_vocab_gpt2()
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        hparams = self.hparams
-
-        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
-        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
-        if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
-            self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
-        self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
-        self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
-        self.gguf_writer.add_value_length(hparams["v_head_dim"])
-        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
-        self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
-        self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
-        self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
-        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
-
-        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
-            if self.hparams["rope_scaling"].get("type") == "yarn":
-                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
-                self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
-                self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * hparams["rope_scaling"]["mscale_all_dim"])
-
-    _experts: list[dict[str, Tensor]] | None = None
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        # process the experts separately
-        if name.find("mlp.experts") != -1:
-            n_experts = self.hparams["n_routed_experts"]
-            assert bid is not None
-
-            if self._experts is None:
-                self._experts = [{} for _ in range(self.block_count)]
-
-            self._experts[bid][name] = data_torch
-
-            if len(self._experts[bid]) >= n_experts * 3:
-                tensors: list[tuple[str, Tensor]] = []
-
-                # merge the experts into a single 3d tensor
-                for w_name in ["down_proj", "gate_proj", "up_proj"]:
-                    datas: list[Tensor] = []
-
-                    for xid in range(n_experts):
-                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
-                        datas.append(self._experts[bid][ename])
-                        del self._experts[bid][ename]
-
-                    data_torch = torch.stack(datas, dim=0)
-
-                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
-
-                    new_name = self.map_tensor_name(merged_name)
-
-                    tensors.append((new_name, data_torch))
-                return tensors
-            else:
-                return []
-
-        return [(self.map_tensor_name(name), data_torch)]
-
-    def write_tensors(self):
-        super().write_tensors()
-
-        if self._experts is not None:
-            # flatten `list[dict[str, Tensor]]` into `list[str]`
-            experts = [k for d in self._experts for k in d.keys()]
-            if len(experts) > 0:
-                raise ValueError(f"Unprocessed experts: {experts}")
-
-
 ###### CONVERSION LOGIC ######


@@ -2840,12 +2523,7 @@ def main() -> None:
    hparams = Model.load_hparams(dir_model)

    with torch.inference_mode():
-        try:
-            model_class = Model.from_model_architecture(hparams["architectures"][0])
-        except NotImplementedError:
-            logger.error(f"Model {hparams['architectures'][0]} is not supported")
-            sys.exit(1)
-
+        model_class = Model.from_model_architecture(hparams["architectures"][0])
        model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy)

        logger.info("Set model parameters")
--- a/examples/convert-legacy-llama.py
+++ b/examples/convert-legacy-llama.py
@@ -24,16 +24,14 @@ from abc import ABC, abstractmethod
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from dataclasses import dataclass
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, IO, Iterable, Literal, TypeVar, Optional
+from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable, Optional

 import numpy as np
+from sentencepiece import SentencePieceProcessor

 if 'NO_LOCAL_GGUF' not in os.environ:
-    # use .parent.parent since we are in "examples" directory
-    sys.path.insert(1, str(Path(__file__).parent.parent / 'gguf-py'))
-
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf
-from gguf import BaseVocab, Vocab, NoVocab, BpeVocab, SentencePieceVocab, LlamaHfVocab

 if TYPE_CHECKING:
    from typing_extensions import Self, TypeAlias
@@ -382,6 +380,306 @@ class Metadata:
        return metadata


+#
+# vocab
+#
+
+
+@runtime_checkable
+class BaseVocab(Protocol):
+    tokenizer_model: ClassVar[str]
+    name: ClassVar[str]
+
+
+class NoVocab(BaseVocab):
+    tokenizer_model = "no_vocab"
+    name = "no_vocab"
+
+    def __repr__(self) -> str:
+        return "<NoVocab for a model without integrated vocabulary>"
+
+
+@runtime_checkable
+class Vocab(BaseVocab, Protocol):
+    vocab_size: int
+    added_tokens_dict: dict[str, int]
+    added_tokens_list: list[str]
+    fname_tokenizer: Path
+
+    def __init__(self, base_path: Path): ...
+    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: ...
+
+
+class BpeVocab(Vocab):
+    tokenizer_model = "gpt2"
+    name = "bpe"
+
+    def __init__(self, base_path: Path):
+        added_tokens: dict[str, int] = {}
+
+        if (fname_tokenizer := base_path / 'vocab.json').exists():
+            # "slow" tokenizer
+            with open(fname_tokenizer, encoding="utf-8") as f:
+                self.vocab = json.load(f)
+
+            try:
+                # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
+                with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
+                    added_tokens = json.load(f)
+            except FileNotFoundError:
+                pass
+        else:
+            # "fast" tokenizer
+            fname_tokenizer = base_path / FAST_TOKENIZER_FILE
+
+            # if this fails, FileNotFoundError propagates to caller
+            with open(fname_tokenizer, encoding="utf-8") as f:
+                tokenizer_json = json.load(f)
+
+            tokenizer_model: dict[str, Any] = tokenizer_json['model']
+            if (
+                tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False)
+                or tokenizer_json['decoder']['type'] != 'ByteLevel'
+            ):
+                raise FileNotFoundError('Cannot find GPT-2 BPE tokenizer')
+
+            self.vocab = tokenizer_model["vocab"]
+
+            if (added := tokenizer_json.get('added_tokens')) is not None:
+                # Added tokens here can be duplicates of the main vocabulary.
+                added_tokens = {item['content']: item['id']
+                                for item in added
+                                if item['content'] not in self.vocab}
+
+        vocab_size   = len(self.vocab)
+        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
+        actual_ids   = sorted(added_tokens.values())
+        if expected_ids != actual_ids:
+            expected_end_id = vocab_size + len(actual_ids) - 1
+            raise ValueError(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range "
+                             f"{vocab_size} - {expected_end_id}; got {actual_ids}")
+
+        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
+        self.added_tokens_dict    = added_tokens
+        self.added_tokens_list    = [text for (text, idx) in items]
+        self.vocab_size_base      = vocab_size
+        self.vocab_size           = self.vocab_size_base + len(self.added_tokens_list)
+        self.fname_tokenizer      = fname_tokenizer
+
+    def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
+
+        for i, _ in enumerate(self.vocab):
+            yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL
+
+    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        for text in self.added_tokens_list:
+            score = -1000.0
+            yield text.encode("utf-8"), score, gguf.TokenType.CONTROL
+
+    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        yield from self.bpe_tokens()
+        yield from self.added_tokens()
+
+    def __repr__(self) -> str:
+        return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
+
+
+class SentencePieceVocab(Vocab):
+    tokenizer_model = "llama"
+    name = "spm"
+
+    def __init__(self, base_path: Path):
+        added_tokens: dict[str, int] = {}
+        if (fname_tokenizer := base_path / 'tokenizer.model').exists():
+            # normal location
+            try:
+                with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
+                    added_tokens = json.load(f)
+            except FileNotFoundError:
+                pass
+        elif not (fname_tokenizer := base_path.parent / 'tokenizer.model').exists():
+            # not found in alternate location either
+            raise FileNotFoundError('Cannot find tokenizer.model')
+
+        self.sentencepiece_tokenizer = SentencePieceProcessor()
+        self.sentencepiece_tokenizer.LoadFromFile(str(fname_tokenizer))
+        vocab_size = self.sentencepiece_tokenizer.vocab_size()
+
+        new_tokens       = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
+        expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
+        actual_new_ids   = sorted(new_tokens.keys())
+
+        if expected_new_ids != actual_new_ids:
+            raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
+
+        # Token pieces that were added to the base vocabulary.
+        self.added_tokens_dict  = added_tokens
+        self.added_tokens_list  = [new_tokens[id] for id in actual_new_ids]
+        self.vocab_size_base    = vocab_size
+        self.vocab_size         = self.vocab_size_base + len(self.added_tokens_list)
+        self.fname_tokenizer    = fname_tokenizer
+
+    def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        tokenizer = self.sentencepiece_tokenizer
+        for i in range(tokenizer.vocab_size()):
+            piece = tokenizer.IdToPiece(i)
+            text         = piece.encode("utf-8")
+            score: float = tokenizer.GetScore(i)
+
+            toktype = gguf.TokenType.NORMAL
+            if tokenizer.IsUnknown(i):
+                toktype = gguf.TokenType.UNKNOWN
+            if tokenizer.IsControl(i):
+                toktype = gguf.TokenType.CONTROL
+
+            # NOTE: I think added_tokens are user defined.
+            # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
+            # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
+
+            if tokenizer.IsUnused(i):
+                toktype = gguf.TokenType.UNUSED
+            if tokenizer.IsByte(i):
+                toktype = gguf.TokenType.BYTE
+
+            yield text, score, toktype
+
+    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        for text in self.added_tokens_list:
+            score = -1000.0
+            yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
+
+    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        yield from self.sentencepiece_tokens()
+        yield from self.added_tokens()
+
+    def __repr__(self) -> str:
+        return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
+
+
+class LlamaHfVocab(Vocab):
+    tokenizer_model = "llama"
+    name = "hfft"
+
+    def __init__(self, base_path: Path):
+        fname_tokenizer = base_path / FAST_TOKENIZER_FILE
+        # if this fails, FileNotFoundError propagates to caller
+        with open(fname_tokenizer, encoding='utf-8') as f:
+            tokenizer_json = json.load(f)
+
+        # pre-check so we know if we need transformers
+        tokenizer_model: dict[str, Any] = tokenizer_json['model']
+        is_llama3 = (
+            tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False)
+            and not tokenizer_model.get('byte_fallback', True)
+        )
+        if is_llama3:
+            raise TypeError('Llama 3 must be converted with BpeVocab')
+
+        if not is_llama3 and (
+            tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
+            or tokenizer_json['decoder']['type'] != 'Sequence'
+        ):
+            raise FileNotFoundError('Cannot find Llama BPE tokenizer')
+
+        try:
+            from transformers import AutoTokenizer
+        except ImportError as e:
+            raise ImportError(
+                "To use LlamaHfVocab, please install the `transformers` package. "
+                "You can install it with `pip install transformers`."
+            ) from e
+
+        # Allow the tokenizer to default to slow or fast versions.
+        # Explicitly set tokenizer to use local paths.
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            base_path,
+            cache_dir=base_path,
+            local_files_only=True,
+        )
+        assert self.tokenizer.is_fast  # assume tokenizer.json is used
+
+        # Initialize lists and dictionaries for added tokens
+        self.added_tokens_list = []
+        self.added_tokens_dict = dict()
+        self.added_tokens_ids  = set()
+
+        # Process added tokens
+        for tok, tokidx in sorted(
+            self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]
+        ):
+            # Only consider added tokens that are not in the base vocabulary
+            if tokidx >= self.tokenizer.vocab_size:
+                self.added_tokens_list.append(tok)
+                self.added_tokens_dict[tok] = tokidx
+                self.added_tokens_ids.add(tokidx)
+
+        # Store special tokens and their IDs
+        self.specials = {
+            tok: self.tokenizer.get_vocab()[tok]
+            for tok in self.tokenizer.all_special_tokens
+        }
+        self.special_ids = set(self.tokenizer.all_special_ids)
+
+        # Set vocabulary sizes
+        self.vocab_size_base = self.tokenizer.vocab_size
+        self.vocab_size      = self.vocab_size_base + len(self.added_tokens_list)
+
+        self.fname_tokenizer = fname_tokenizer
+
+    def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        reverse_vocab = {
+            id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()
+        }
+
+        for token_id in range(self.vocab_size_base):
+            # Skip processing added tokens here
+            if token_id in self.added_tokens_ids:
+                continue
+
+            # Convert token text to bytes
+            token_text = reverse_vocab[token_id].encode("utf-8")
+
+            # Yield token text, score, and type
+            yield token_text, self.get_token_score(token_id), self.get_token_type(
+                token_id, token_text, self.special_ids  # Reuse already stored special IDs
+            )
+
+    def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType:
+        # Special case for byte tokens
+        if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
+            return gguf.TokenType.BYTE
+
+        # Determine token type based on whether it's a special token
+        return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
+
+    def get_token_score(self, token_id: int) -> float:
+        # Placeholder for actual logic to determine the token's score
+        # This needs to be implemented based on specific requirements
+        return -1000.0  # Default score
+
+    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        for text in self.added_tokens_list:
+            if text in self.specials:
+                toktype = self.get_token_type(self.specials[text], b'', self.special_ids)
+                score = self.get_token_score(self.specials[text])
+            else:
+                toktype = gguf.TokenType.USER_DEFINED
+                score = -1000.0
+
+            yield text.encode("utf-8"), score, toktype
+
+    def has_newline_token(self):
+        return "<0x0A>" in self.tokenizer.vocab or "\n" in self.tokenizer.vocab
+
+    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        yield from self.hf_tokens()
+        yield from self.added_tokens()
+
+    def __repr__(self) -> str:
+        return f"<LlamaHfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
+
+
 #
 # data loading
 # TODO: reuse (probably move to gguf.py?)
--- a/docs/HOWTO-add-model.md
+++ b/docs/HOWTO-add-model.md
@@ -17,7 +17,7 @@ Also, it is important to check that the examples and main ggml backends (CUDA, M
 ### 1. Convert the model to GGUF

 This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library.
-Depending on the model architecture, you can use either [convert-hf-to-gguf.py](../convert-hf-to-gguf.py) or [examples/convert-legacy-llama.py](../examples/convert-legacy-llama.py) (for `llama/llama2` models in `.pth` format).
+Depending on the model architecture, you can use either [convert.py](../convert.py) or [convert-hf-to-gguf.py](../convert-hf-to-gguf.py).

 The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors.

--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -48,7 +48,7 @@ int main(int argc, char ** argv) {
        params.prompt = "Hello my name is";
    }

-    string_process_escapes(params.prompt);
+    process_escapes(params.prompt);

    // init LLM

--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -774,7 +774,7 @@ static struct train_params get_default_train_params() {

    params.samples_start_after_nl = false;
    params.use_adam               = true;
-    params.use_flash              = false;
+    params.use_flash              = true;
    params.use_scratch            = true;

    // only adam
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -80,7 +80,7 @@ int main(int argc, char ** argv) {

    std::mt19937 rng(params.seed);
    if (params.random_prompt) {
-        params.prompt = string_random_prompt(rng);
+        params.prompt = gpt_random_prompt(rng);
    }

    llama_backend_init();
@@ -107,7 +107,7 @@ int main(int argc, char ** argv) {
    // print system information
    {
        fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
+        fprintf(stderr, "%s\n", get_system_info(params).c_str());
    }

    // split the prompt into lines
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -152,7 +152,7 @@ int main(int argc, char ** argv) {

    std::mt19937 rng(params.seed);
    if (params.random_prompt) {
-        params.prompt = string_random_prompt(rng);
+        params.prompt = gpt_random_prompt(rng);
    }

    llama_backend_init();
@@ -176,7 +176,7 @@ int main(int argc, char ** argv) {
    // print system information
    {
        fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
+        fprintf(stderr, "%s\n", get_system_info(params).c_str());
    }

    bool OK = run(ctx, params);
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -563,8 +563,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
        // not capturing these, to silcence warnings
        const int rope_mode = 0;

-        return ggml_rope_ext(ctx,
-            t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx, 0,
+        return ggml_rope_custom(ctx,
+            t, KQ_pos, n_rot, rope_mode, n_ctx, 0,
            rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
        );
    };
@@ -643,8 +643,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
        struct ggml_tensor * t15 = ggml_permute      (ctx, t12, 0, 3, 1, 2);                         set_name(t15, "t15");     assert_shape_4d(t15, N, n_embd_head, n_head_kv, n_batch);
        struct ggml_tensor * t16;
        if (enable_flash_attn) {
-            GGML_ASSERT(false && "TODO: ggml_flash_attn_ext() not yet supported");
-            //t16 = ggml_flash_attn(ctx, t13, t14, t15, true);                                         set_name(t16, "t16");     assert_shape_4d(t16, n_embd_head, N, n_head, n_batch);
+            t16 = ggml_flash_attn(ctx, t13, t14, t15, true);                                         set_name(t16, "t16");     assert_shape_4d(t16, n_embd_head, N, n_head, n_batch);
        } else {
            struct ggml_tensor * t16_0 = ggml_mul_mat              (ctx, t14, t13);                  set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch);
            struct ggml_tensor * t16_1 = ggml_scale_inplace        (ctx, t16_0, kv_scale);           set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch);
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -598,7 +598,7 @@ int main(int argc, char ** argv) {

    std::mt19937 rng(params.seed);
    if (params.random_prompt) {
-        params.prompt = string_random_prompt(rng);
+        params.prompt = gpt_random_prompt(rng);
    }

    sparams.dataset = params.prompt_file;
@@ -667,7 +667,7 @@ int main(int argc, char ** argv) {
    // print system information
    {
        fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
+        fprintf(stderr, "%s\n", get_system_info(params).c_str());
    }

    bool OK = compute_imatrix(ctx, params, compute_ppl, from_chunk);
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -50,9 +50,9 @@ static void write_logfile(
        return;
    }

-    const std::string timestamp = string_get_sortable_timestamp();
+    const std::string timestamp = get_sortable_timestamp();

-    const bool success = fs_create_directory_with_parents(params.logdir);
+    const bool success = create_directory_with_parents(params.logdir);
    if (!success) {
        fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
                __func__, params.logdir.c_str());
@@ -70,7 +70,7 @@ static void write_logfile(
    fprintf(logfile, "binary: infill\n");
    char model_desc[128];
    llama_model_desc(model, model_desc, sizeof(model_desc));
-    yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);
+    dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc);

    fprintf(logfile, "\n");
    fprintf(logfile, "######################\n");
@@ -78,8 +78,8 @@ static void write_logfile(
    fprintf(logfile, "######################\n");
    fprintf(logfile, "\n");

-    yaml_dump_string_multiline(logfile, "output", output.c_str());
-    yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
+    dump_string_yaml_multiline(logfile, "output", output.c_str());
+    dump_vector_int_yaml(logfile, "output_tokens", output_tokens);

    llama_dump_timing_info_yaml(logfile, ctx);
    fclose(logfile);
@@ -236,7 +236,7 @@ int main(int argc, char ** argv) {
    // print system information
    {
        LOG_TEE("\n");
-        LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
+        LOG_TEE("%s\n", get_system_info(params).c_str());
    }
    const bool add_bos = llama_should_add_bos_token(model);
    GGML_ASSERT(llama_add_eos_token(model) != 1);
@@ -621,8 +621,8 @@ int main(int argc, char ** argv) {

                if (params.escape) {
                    //process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here
-                    string_process_escapes(params.input_prefix);
-                    string_process_escapes(params.input_suffix);
+                    process_escapes(params.input_prefix);
+                    process_escapes(params.input_suffix);
                }
                suff_rm_leading_spc = params.escape;
                if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
--- a/examples/llama-bench/README.md
+++ b/examples/llama-bench/README.md
@@ -38,6 +38,7 @@ options:
  -nkvo, --no-kv-offload <0|1>        (default: 0)
  -fa, --flash-attn <0|1>             (default: 0)
  -mmp, --mmap <0|1>                  (default: 1)
+  -dio, --direct-io <0|1>             (default: 0)
  --numa <distribute|isolate|numactl> (default: disabled)
  -embd, --embeddings <0|1>           (default: 0)
  -ts, --tensor-split <ts0/ts1/..>    (default: 0)
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -178,13 +178,13 @@ struct cmd_params {
    std::vector<ggml_type> type_v;
    std::vector<int> n_threads;
    std::vector<int> n_gpu_layers;
-    std::vector<std::string> rpc_servers;
    std::vector<llama_split_mode> split_mode;
    std::vector<int> main_gpu;
    std::vector<bool> no_kv_offload;
    std::vector<bool> flash_attn;
    std::vector<std::vector<float>> tensor_split;
    std::vector<bool> use_mmap;
+    std::vector<bool> use_direct_io;
    std::vector<bool> embeddings;
    ggml_numa_strategy numa;
    int reps;
@@ -196,20 +196,20 @@ static const cmd_params cmd_params_defaults = {
    /* model         */ {"models/7B/ggml-model-q4_0.gguf"},
    /* n_prompt      */ {512},
    /* n_gen         */ {128},
-    /* n_pg          */ {},
+    /* n_pg          */ {{512, 128}},
    /* n_batch       */ {2048},
    /* n_ubatch      */ {512},
    /* type_k        */ {GGML_TYPE_F16},
    /* type_v        */ {GGML_TYPE_F16},
-    /* n_threads     */ {cpu_get_num_math()},
+    /* n_threads     */ {get_math_cpu_count()},
    /* n_gpu_layers  */ {99},
-    /* rpc_servers   */ {""},
    /* split_mode    */ {LLAMA_SPLIT_MODE_LAYER},
    /* main_gpu      */ {0},
    /* no_kv_offload */ {false},
    /* flash_attn    */ {false},
    /* tensor_split  */ {std::vector<float>(llama_max_devices(), 0.0f)},
    /* use_mmap      */ {true},
+    /* use_direct_io */ {false},
    /* embeddings    */ {false},
    /* numa          */ GGML_NUMA_STRATEGY_DISABLED,
    /* reps          */ 5,
@@ -232,12 +232,12 @@ static void print_usage(int /* argc */, char ** argv) {
    printf("  -ctv, --cache-type-v <t>            (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
    printf("  -t, --threads <n>                   (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
    printf("  -ngl, --n-gpu-layers <n>            (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
-    printf("  -rpc, --rpc <rpc_servers>           (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
    printf("  -sm, --split-mode <none|layer|row>  (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
    printf("  -mg, --main-gpu <i>                 (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
    printf("  -nkvo, --no-kv-offload <0|1>        (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
    printf("  -fa, --flash-attn <0|1>             (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
    printf("  -mmp, --mmap <0|1>                  (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
+    printf("  -dio, --direct-io <0|1>             (default: %s)\n", join(cmd_params_defaults.use_direct_io, ",").c_str());
    printf("  --numa <distribute|isolate|numactl> (default: disabled)\n");
    printf("  -embd, --embeddings <0|1>           (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
    printf("  -ts, --tensor-split <ts0/ts1/..>    (default: 0)\n");
@@ -387,12 +387,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
            }
            auto p = split<int>(argv[i], split_delim);
            params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
-        } else if (arg == "-rpc" || arg == "--rpc") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.rpc_servers.push_back(argv[i]);
        } else if (arg == "-sm" || arg == "--split-mode") {
            if (++i >= argc) {
                invalid_param = true;
@@ -453,6 +447,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
            }
            auto p = split<bool>(argv[i], split_delim);
            params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
+        } else if (arg == "-dio" || arg == "--direct-io") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = split<bool>(argv[i], split_delim);
+            params.use_direct_io.insert(params.use_direct_io.end(), p.begin(), p.end());
        } else if (arg == "-embd" || arg == "--embeddings") {
            if (++i >= argc) {
                invalid_param = true;
@@ -528,13 +529,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
    if (params.type_k.empty())       { params.type_k = cmd_params_defaults.type_k; }
    if (params.type_v.empty())       { params.type_v = cmd_params_defaults.type_v; }
    if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
-    if (params.rpc_servers.empty())  { params.rpc_servers = cmd_params_defaults.rpc_servers; }
    if (params.split_mode.empty())   { params.split_mode = cmd_params_defaults.split_mode; }
    if (params.main_gpu.empty())     { params.main_gpu = cmd_params_defaults.main_gpu; }
    if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
    if (params.flash_attn.empty())   { params.flash_attn = cmd_params_defaults.flash_attn; }
    if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
    if (params.use_mmap.empty())     { params.use_mmap = cmd_params_defaults.use_mmap; }
+    if (params.use_direct_io.empty()){ params.use_direct_io = cmd_params_defaults.use_direct_io; }
    if (params.embeddings.empty())   { params.embeddings = cmd_params_defaults.embeddings; }
    if (params.n_threads.empty())    { params.n_threads = cmd_params_defaults.n_threads; }

@@ -551,26 +552,24 @@ struct cmd_params_instance {
    ggml_type type_v;
    int n_threads;
    int n_gpu_layers;
-    std::string rpc_servers;
    llama_split_mode split_mode;
    int main_gpu;
    bool no_kv_offload;
    bool flash_attn;
    std::vector<float> tensor_split;
    bool use_mmap;
+    bool use_direct_io;
    bool embeddings;

    llama_model_params to_llama_mparams() const {
        llama_model_params mparams = llama_model_default_params();

        mparams.n_gpu_layers = n_gpu_layers;
-        if (!rpc_servers.empty()) {
-            mparams.rpc_servers = rpc_servers.c_str();
-        }
        mparams.split_mode = split_mode;
        mparams.main_gpu = main_gpu;
        mparams.tensor_split = tensor_split.data();
        mparams.use_mmap = use_mmap;
+        mparams.use_direct_io = use_direct_io;

        return mparams;
    }
@@ -578,10 +577,10 @@ struct cmd_params_instance {
    bool equal_mparams(const cmd_params_instance & other) const {
        return model == other.model &&
               n_gpu_layers == other.n_gpu_layers &&
-               rpc_servers == other.rpc_servers &&
               split_mode == other.split_mode &&
               main_gpu == other.main_gpu &&
               use_mmap == other.use_mmap &&
+               use_direct_io == other.use_direct_io &&
               tensor_split == other.tensor_split;
    }

@@ -607,11 +606,11 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
    // this ordering minimizes the number of times that each model needs to be reloaded
    for (const auto & m : params.model)
    for (const auto & nl : params.n_gpu_layers)
-    for (const auto & rpc : params.rpc_servers)
    for (const auto & sm : params.split_mode)
    for (const auto & mg : params.main_gpu)
    for (const auto & ts : params.tensor_split)
    for (const auto & mmp : params.use_mmap)
+    for (const auto & dio : params.use_direct_io)
    for (const auto & embd : params.embeddings)
    for (const auto & nb : params.n_batch)
    for (const auto & nub : params.n_ubatch)
@@ -634,13 +633,13 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .type_v       = */ tv,
                /* .n_threads    = */ nt,
                /* .n_gpu_layers = */ nl,
-                /* .rpc_servers  = */ rpc,
                /* .split_mode   = */ sm,
                /* .main_gpu     = */ mg,
                /* .no_kv_offload= */ nkvo,
                /* .flash_attn   = */ fa,
                /* .tensor_split = */ ts,
                /* .use_mmap     = */ mmp,
+                /* .use_direct_io= */ dio,
                /* .embeddings   = */ embd,
            };
            instances.push_back(instance);
@@ -660,13 +659,13 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .type_v       = */ tv,
                /* .n_threads    = */ nt,
                /* .n_gpu_layers = */ nl,
-                /* .rpc_servers  = */ rpc,
                /* .split_mode   = */ sm,
                /* .main_gpu     = */ mg,
                /* .no_kv_offload= */ nkvo,
                /* .flash_attn   = */ fa,
                /* .tensor_split = */ ts,
                /* .use_mmap     = */ mmp,
+                /* .use_direct_io= */ dio,
                /* .embeddings   = */ embd,
            };
            instances.push_back(instance);
@@ -686,13 +685,13 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .type_v       = */ tv,
                /* .n_threads    = */ nt,
                /* .n_gpu_layers = */ nl,
-                /* .rpc_servers  = */ rpc,
                /* .split_mode   = */ sm,
                /* .main_gpu     = */ mg,
                /* .no_kv_offload= */ nkvo,
                /* .flash_attn   = */ fa,
                /* .tensor_split = */ ts,
                /* .use_mmap     = */ mmp,
+                /* .use_direct_io= */ dio,
                /* .embeddings   = */ embd,
            };
            instances.push_back(instance);
@@ -711,7 +710,6 @@ struct test {
    static const bool kompute;
    static const bool metal;
    static const bool sycl;
-    static const bool rpc;
    static const bool gpu_blas;
    static const bool blas;
    static const std::string cpu_info;
@@ -732,6 +730,7 @@ struct test {
    bool flash_attn;
    std::vector<float> tensor_split;
    bool use_mmap;
+    bool use_direct_io;
    bool embeddings;
    int n_prompt;
    int n_gen;
@@ -757,6 +756,7 @@ struct test {
        flash_attn = inst.flash_attn;
        tensor_split = inst.tensor_split;
        use_mmap = inst.use_mmap;
+        use_direct_io = inst.use_direct_io;
        embeddings = inst.embeddings;
        n_prompt = inst.n_prompt;
        n_gen = inst.n_gen;
@@ -810,9 +810,6 @@ struct test {
        if (sycl) {
            return GGML_SYCL_NAME;
        }
-        if (rpc) {
-            return "RPC";
-        }
        if (gpu_blas) {
            return "GPU BLAS";
        }
@@ -826,14 +823,14 @@ struct test {
    static const std::vector<std::string> & get_fields() {
        static const std::vector<std::string> fields = {
            "build_commit", "build_number",
-            "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "blas",
+            "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas",
            "cpu_info", "gpu_info",
            "model_filename", "model_type", "model_size", "model_n_params",
            "n_batch", "n_ubatch",
            "n_threads", "type_k", "type_v",
            "n_gpu_layers", "split_mode",
            "main_gpu", "no_kv_offload", "flash_attn",
-            "tensor_split", "use_mmap", "embeddings",
+            "tensor_split", "use_mmap", "use_direct_io", "embeddings",
            "n_prompt", "n_gen", "test_time",
            "avg_ns", "stddev_ns",
            "avg_ts", "stddev_ts"
@@ -854,7 +851,7 @@ struct test {
        }
        if (field == "cuda" || field == "opencl"  || field == "vulkan" || field == "kompute" || field == "metal" ||
            field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
-            field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
+            field == "flash_attn" || field == "use_mmap" || field == "use_direct_io" || field == "embeddings") {
            return BOOL;
        }
        if (field == "avg_ts" || field == "stddev_ts") {
@@ -882,14 +879,14 @@ struct test {
        std::vector<std::string> values = {
            build_commit, std::to_string(build_number),
            std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(vulkan),
-            std::to_string(metal), std::to_string(sycl), std::to_string(rpc), std::to_string(gpu_blas), std::to_string(blas),
+            std::to_string(metal), std::to_string(sycl), std::to_string(gpu_blas), std::to_string(blas),
            cpu_info, gpu_info,
            model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
            std::to_string(n_batch), std::to_string(n_ubatch),
            std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
            std::to_string(n_gpu_layers), split_mode_str(split_mode),
            std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
-            tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
+            tensor_split_str, std::to_string(use_mmap), std::to_string(use_direct_io), std::to_string(embeddings),
            std::to_string(n_prompt), std::to_string(n_gen), test_time,
            std::to_string(avg_ns()), std::to_string(stdev_ns()),
            std::to_string(avg_ts()), std::to_string(stdev_ts())
@@ -917,7 +914,6 @@ const bool        test::metal        = !!ggml_cpu_has_metal();
 const bool        test::gpu_blas     = !!ggml_cpu_has_gpublas();
 const bool        test::blas         = !!ggml_cpu_has_blas();
 const bool        test::sycl         = !!ggml_cpu_has_sycl();
-const bool        test::rpc          = !!ggml_cpu_has_rpc();
 const std::string test::cpu_info     = get_cpu_info();
 const std::string test::gpu_info     = get_gpu_info();

@@ -1066,6 +1062,9 @@ struct markdown_printer : public printer {
        if (field == "use_mmap") {
            return "mmap";
        }
+        if (field == "use_direct_io") {
+            return "direct_io";
+        }
        if (field == "embeddings") {
            return "embd";
        }
@@ -1118,6 +1117,9 @@ struct markdown_printer : public printer {
        if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
            fields.emplace_back("use_mmap");
        }
+        if (params.use_direct_io.size() > 1 || params.use_direct_io != cmd_params_defaults.use_direct_io) {
+            fields.emplace_back("use_direct_io");
+        }
        if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) {
            fields.emplace_back("embeddings");
        }
--- a/examples/llama.android/app/build.gradle.kts
+++ b/examples/llama.android/app/build.gradle.kts
@@ -7,6 +7,8 @@ android {
    namespace = "com.example.llama"
    compileSdk = 34

+    ndkVersion = "26.1.10909125"
+
    defaultConfig {
        applicationId = "com.example.llama"
        minSdk = 33
@@ -18,6 +20,17 @@ android {
        vectorDrawables {
            useSupportLibrary = true
        }
+        ndk {
+            // Add NDK properties if wanted, e.g.
+            // abiFilters += listOf("arm64-v8a")
+        }
+        externalNativeBuild {
+            cmake {
+                arguments += "-DCMAKE_BUILD_TYPE=Release"
+                cppFlags += listOf()
+                arguments += listOf()
+            }
+        }
    }

    buildTypes {
@@ -42,6 +55,17 @@ android {
    composeOptions {
        kotlinCompilerExtensionVersion = "1.5.1"
    }
+    packaging {
+        resources {
+            excludes += "/META-INF/{AL2.0,LGPL2.1}"
+        }
+    }
+    externalNativeBuild {
+        cmake {
+            path = file("src/main/cpp/CMakeLists.txt")
+            version = "3.22.1"
+        }
+    }
 }

 dependencies {
@@ -54,7 +78,6 @@ dependencies {
    implementation("androidx.compose.ui:ui-graphics")
    implementation("androidx.compose.ui:ui-tooling-preview")
    implementation("androidx.compose.material3:material3")
-    implementation(project(":llama"))
    testImplementation("junit:junit:4.13.2")
    androidTestImplementation("androidx.test.ext:junit:1.1.5")
    androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
--- a/examples/llama.android/app/src/main/cpp/CMakeLists.txt
+++ b/examples/llama.android/app/src/main/cpp/CMakeLists.txt
@@ -42,7 +42,7 @@ add_subdirectory(../../../../../../ build-llama)
 # used in the AndroidManifest.xml file.
 add_library(${CMAKE_PROJECT_NAME} SHARED
    # List C/C++ source files with relative paths to this CMakeLists.txt.
-        llama-android.cpp)
+    llama-android.cpp)

 # Specifies libraries CMake should link to your target library. You
 # can link libraries from various origins, such as libraries defined in this
--- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp
+++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp
@@ -81,7 +81,7 @@ static void log_callback(ggml_log_level level, const char * fmt, void * data) {

 extern "C"
 JNIEXPORT jlong JNICALL
-Java_android_llama_cpp_LLamaAndroid_load_1model(JNIEnv *env, jobject, jstring filename) {
+Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) {
    llama_model_params model_params = llama_model_default_params();

    auto path_to_model = env->GetStringUTFChars(filename, 0);
@@ -101,13 +101,13 @@ Java_android_llama_cpp_LLamaAndroid_load_1model(JNIEnv *env, jobject, jstring fi

 extern "C"
 JNIEXPORT void JNICALL
-Java_android_llama_cpp_LLamaAndroid_free_1model(JNIEnv *, jobject, jlong model) {
+Java_com_example_llama_Llm_free_1model(JNIEnv *, jobject, jlong model) {
    llama_free_model(reinterpret_cast<llama_model *>(model));
 }

 extern "C"
 JNIEXPORT jlong JNICALL
-Java_android_llama_cpp_LLamaAndroid_new_1context(JNIEnv *env, jobject, jlong jmodel) {
+Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) {
    auto model = reinterpret_cast<llama_model *>(jmodel);

    if (!model) {
@@ -139,25 +139,25 @@ Java_android_llama_cpp_LLamaAndroid_new_1context(JNIEnv *env, jobject, jlong jmo

 extern "C"
 JNIEXPORT void JNICALL
-Java_android_llama_cpp_LLamaAndroid_free_1context(JNIEnv *, jobject, jlong context) {
+Java_com_example_llama_Llm_free_1context(JNIEnv *, jobject, jlong context) {
    llama_free(reinterpret_cast<llama_context *>(context));
 }

 extern "C"
 JNIEXPORT void JNICALL
-Java_android_llama_cpp_LLamaAndroid_backend_1free(JNIEnv *, jobject) {
+Java_com_example_llama_Llm_backend_1free(JNIEnv *, jobject) {
    llama_backend_free();
 }

 extern "C"
 JNIEXPORT void JNICALL
-Java_android_llama_cpp_LLamaAndroid_log_1to_1android(JNIEnv *, jobject) {
+Java_com_example_llama_Llm_log_1to_1android(JNIEnv *, jobject) {
    llama_log_set(log_callback, NULL);
 }

 extern "C"
 JNIEXPORT jstring JNICALL
-Java_android_llama_cpp_LLamaAndroid_bench_1model(
+Java_com_example_llama_Llm_bench_1model(
        JNIEnv *env,
        jobject,
        jlong context_pointer,
@@ -271,13 +271,13 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(

 extern "C"
 JNIEXPORT void JNICALL
-Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
+Java_com_example_llama_Llm_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
    llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
 }

 extern "C"
 JNIEXPORT jlong JNICALL
-Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
+Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {

    // Source: Copy of llama.cpp:llama_batch_init but heap-allocated.

@@ -313,19 +313,19 @@ Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens,

 extern "C"
 JNIEXPORT void JNICALL
-Java_android_llama_cpp_LLamaAndroid_backend_1init(JNIEnv *, jobject) {
+Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject) {
    llama_backend_init();
 }

 extern "C"
 JNIEXPORT jstring JNICALL
-Java_android_llama_cpp_LLamaAndroid_system_1info(JNIEnv *env, jobject) {
+Java_com_example_llama_Llm_system_1info(JNIEnv *env, jobject) {
    return env->NewStringUTF(llama_print_system_info());
 }

 extern "C"
 JNIEXPORT jint JNICALL
-Java_android_llama_cpp_LLamaAndroid_completion_1init(
+Java_com_example_llama_Llm_completion_1init(
        JNIEnv *env,
        jobject,
        jlong context_pointer,
@@ -376,7 +376,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init(

 extern "C"
 JNIEXPORT jstring JNICALL
-Java_android_llama_cpp_LLamaAndroid_completion_1loop(
+Java_com_example_llama_Llm_completion_1loop(
        JNIEnv * env,
        jobject,
        jlong context_pointer,
@@ -438,6 +438,6 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(

 extern "C"
 JNIEXPORT void JNICALL
-Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
+Java_com_example_llama_Llm_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
    llama_kv_cache_clear(reinterpret_cast<llama_context *>(context));
 }
--- a/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt
+++ b/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt
@@ -1,4 +1,4 @@
-package android.llama.cpp
+package com.example.llama

 import android.util.Log
 import kotlinx.coroutines.CoroutineDispatcher
@@ -10,7 +10,7 @@ import kotlinx.coroutines.withContext
 import java.util.concurrent.Executors
 import kotlin.concurrent.thread

-class LLamaAndroid {
+class Llm {
    private val tag: String? = this::class.simpleName

    private val threadLocalState: ThreadLocal<State> = ThreadLocal.withInitial { State.Idle }
@@ -165,8 +165,8 @@ class LLamaAndroid {
        }

        // Enforce only one instance of Llm.
-        private val _instance: LLamaAndroid = LLamaAndroid()
+        private val _instance: Llm = Llm()

-        fun instance(): LLamaAndroid = _instance
+        fun instance(): Llm = _instance
    }
 }
--- a/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt
+++ b/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt
@@ -1,6 +1,5 @@
 package com.example.llama

-import android.llama.cpp.LLamaAndroid
 import android.util.Log
 import androidx.compose.runtime.getValue
 import androidx.compose.runtime.mutableStateOf
@@ -10,7 +9,7 @@ import androidx.lifecycle.viewModelScope
 import kotlinx.coroutines.flow.catch
 import kotlinx.coroutines.launch

-class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instance()): ViewModel() {
+class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
    companion object {
        @JvmStatic
        private val NanosPerSecond = 1_000_000_000.0
@@ -29,7 +28,7 @@ class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instan

        viewModelScope.launch {
            try {
-                llamaAndroid.unload()
+                llm.unload()
            } catch (exc: IllegalStateException) {
                messages += exc.message!!
            }
@@ -45,7 +44,7 @@ class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instan
        messages += ""

        viewModelScope.launch {
-            llamaAndroid.send(text)
+            llm.send(text)
                .catch {
                    Log.e(tag, "send() failed", it)
                    messages += it.message!!
@@ -58,7 +57,7 @@ class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instan
        viewModelScope.launch {
            try {
                val start = System.nanoTime()
-                val warmupResult = llamaAndroid.bench(pp, tg, pl, nr)
+                val warmupResult = llm.bench(pp, tg, pl, nr)
                val end = System.nanoTime()

                messages += warmupResult
@@ -71,7 +70,7 @@ class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instan
                    return@launch
                }

-                messages += llamaAndroid.bench(512, 128, 1, 3)
+                messages += llm.bench(512, 128, 1, 3)
            } catch (exc: IllegalStateException) {
                Log.e(tag, "bench() failed", exc)
                messages += exc.message!!
@@ -82,7 +81,7 @@ class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instan
    fun load(pathToModel: String) {
        viewModelScope.launch {
            try {
-                llamaAndroid.load(pathToModel)
+                llm.load(pathToModel)
                messages += "Loaded $pathToModel"
            } catch (exc: IllegalStateException) {
                Log.e(tag, "load() failed", exc)
--- a/examples/llama.android/build.gradle.kts
+++ b/examples/llama.android/build.gradle.kts
@@ -2,5 +2,4 @@
 plugins {
    id("com.android.application") version "8.2.0" apply false
    id("org.jetbrains.kotlin.android") version "1.9.0" apply false
-    id("com.android.library") version "8.2.0" apply false
 }
--- a/examples/llama.android/llama/.gitignore
+++ b/examples/llama.android/llama/.gitignore
@@ -1 +0,0 @@
-/build
--- a/examples/llama.android/llama/build.gradle.kts
+++ b/examples/llama.android/llama/build.gradle.kts
@@ -1,68 +0,0 @@
-plugins {
-    id("com.android.library")
-    id("org.jetbrains.kotlin.android")
-}
-
-android {
-    namespace = "android.llama.cpp"
-    compileSdk = 34
-
-    defaultConfig {
-        minSdk = 33
-
-        testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner"
-        consumerProguardFiles("consumer-rules.pro")
-        ndk {
-            // Add NDK properties if wanted, e.g.
-            // abiFilters += listOf("arm64-v8a")
-        }
-        externalNativeBuild {
-            cmake {
-                arguments += "-DCMAKE_BUILD_TYPE=Release"
-                cppFlags += listOf()
-                arguments += listOf()
-
-                cppFlags("")
-            }
-        }
-    }
-
-    buildTypes {
-        release {
-            isMinifyEnabled = false
-            proguardFiles(
-                getDefaultProguardFile("proguard-android-optimize.txt"),
-                "proguard-rules.pro"
-            )
-        }
-    }
-    externalNativeBuild {
-        cmake {
-            path("src/main/cpp/CMakeLists.txt")
-            version = "3.22.1"
-        }
-    }
-    compileOptions {
-        sourceCompatibility = JavaVersion.VERSION_1_8
-        targetCompatibility = JavaVersion.VERSION_1_8
-    }
-    kotlinOptions {
-        jvmTarget = "1.8"
-    }
-
-    packaging {
-        resources {
-            excludes += "/META-INF/{AL2.0,LGPL2.1}"
-        }
-    }
-}
-
-dependencies {
-
-    implementation("androidx.core:core-ktx:1.12.0")
-    implementation("androidx.appcompat:appcompat:1.6.1")
-    implementation("com.google.android.material:material:1.11.0")
-    testImplementation("junit:junit:4.13.2")
-    androidTestImplementation("androidx.test.ext:junit:1.1.5")
-    androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
-}
--- a/examples/llama.android/llama/consumer-rules.pro
+++ b/examples/llama.android/llama/consumer-rules.pro
--- a/examples/llama.android/llama/proguard-rules.pro
+++ b/examples/llama.android/llama/proguard-rules.pro
@@ -1,21 +0,0 @@
-# Add project specific ProGuard rules here.
-# You can control the set of applied configuration files using the
-# proguardFiles setting in build.gradle.
-#
-# For more details, see
-#   http://developer.android.com/guide/developing/tools/proguard.html
-
-# If your project uses WebView with JS, uncomment the following
-# and specify the fully qualified class name to the JavaScript interface
-# class:
-#-keepclassmembers class fqcn.of.javascript.interface.for.webview {
-#   public *;
-#}
-
-# Uncomment this to preserve the line number information for
-# debugging stack traces.
-#-keepattributes SourceFile,LineNumberTable
-
-# If you keep the line number information, uncomment this to
-# hide the original source file name.
-#-renamesourcefileattribute SourceFile
--- a/examples/llama.android/llama/src/androidTest/java/android/llama/cpp/ExampleInstrumentedTest.kt
+++ b/examples/llama.android/llama/src/androidTest/java/android/llama/cpp/ExampleInstrumentedTest.kt
@@ -1,24 +0,0 @@
-package android.llama.cpp
-
-import androidx.test.platform.app.InstrumentationRegistry
-import androidx.test.ext.junit.runners.AndroidJUnit4
-
-import org.junit.Test
-import org.junit.runner.RunWith
-
-import org.junit.Assert.*
-
-/**
- * Instrumented test, which will execute on an Android device.
- *
- * See [testing documentation](http://d.android.com/tools/testing).
- */
-@RunWith(AndroidJUnit4::class)
-class ExampleInstrumentedTest {
-    @Test
-    fun useAppContext() {
-        // Context of the app under test.
-        val appContext = InstrumentationRegistry.getInstrumentation().targetContext
-        assertEquals("android.llama.cpp.test", appContext.packageName)
-    }
-}
--- a/examples/llama.android/llama/src/main/AndroidManifest.xml
+++ b/examples/llama.android/llama/src/main/AndroidManifest.xml
@@ -1,4 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<manifest xmlns:android="http://schemas.android.com/apk/res/android">
-
-</manifest>
--- a/examples/llama.android/llama/src/main/cpp/CMakeLists.txt
+++ b/examples/llama.android/llama/src/main/cpp/CMakeLists.txt
@@ -1,49 +0,0 @@
-# For more information about using CMake with Android Studio, read the
-# documentation: https://d.android.com/studio/projects/add-native-code.html.
-# For more examples on how to use CMake, see https://github.com/android/ndk-samples.
-
-# Sets the minimum CMake version required for this project.
-cmake_minimum_required(VERSION 3.22.1)
-
-# Declares the project name. The project name can be accessed via ${ PROJECT_NAME},
-# Since this is the top level CMakeLists.txt, the project name is also accessible
-# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level
-# build script scope).
-project("llama-android")
-
-include(FetchContent)
-FetchContent_Declare(
-        llama
-        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
-        GIT_TAG        master
-)
-
-# Also provides "common"
-FetchContent_MakeAvailable(llama)
-
-# Creates and names a library, sets it as either STATIC
-# or SHARED, and provides the relative paths to its source code.
-# You can define multiple libraries, and CMake builds them for you.
-# Gradle automatically packages shared libraries with your APK.
-#
-# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define
-# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
-# is preferred for the same purpose.
-#
-# In order to load a library into your app from Java/Kotlin, you must call
-# System.loadLibrary() and pass the name of the library defined here;
-# for GameActivity/NativeActivity derived applications, the same library name must be
-# used in the AndroidManifest.xml file.
-add_library(${CMAKE_PROJECT_NAME} SHARED
-        # List C/C++ source files with relative paths to this CMakeLists.txt.
-        llama-android.cpp)
-
-# Specifies libraries CMake should link to your target library. You
-# can link libraries from various origins, such as libraries defined in this
-# build script, prebuilt third-party libraries, or Android system libraries.
-target_link_libraries(${CMAKE_PROJECT_NAME}
-        # List libraries link to the target library
-        llama
-        common
-        android
-        log)
--- a/examples/llama.android/llama/src/test/java/android/llama/cpp/ExampleUnitTest.kt
+++ b/examples/llama.android/llama/src/test/java/android/llama/cpp/ExampleUnitTest.kt
@@ -1,17 +0,0 @@
-package android.llama.cpp
-
-import org.junit.Test
-
-import org.junit.Assert.*
-
-/**
- * Example local unit test, which will execute on the development machine (host).
- *
- * See [testing documentation](http://d.android.com/tools/testing).
- */
-class ExampleUnitTest {
-    @Test
-    fun addition_isCorrect() {
-        assertEquals(4, 2 + 2)
-    }
-}
--- a/examples/llama.android/settings.gradle.kts
+++ b/examples/llama.android/settings.gradle.kts
@@ -15,4 +15,3 @@ dependencyResolutionManagement {

 rootProject.name = "LlamaAndroid"
 include(":app")
-include(":llama")
--- a/examples/llava/MobileVLM-README.md
+++ b/examples/llava/MobileVLM-README.md
@@ -54,10 +54,10 @@ python ./examples/llava/convert-image-encoder-to-gguf \
    --projector-type ldpv2
 ```

-4. Use `examples/convert-legacy-llama.py` to convert the LLaMA part of LLaVA to GGUF:
+4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:

 ```sh
-python ./examples/convert-legacy-llama.py path/to/MobileVLM-1.7B
+python ./convert.py path/to/MobileVLM-1.7B
 ```

 5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k`
--- a/examples/llava/README.md
+++ b/examples/llava/README.md
@@ -50,10 +50,10 @@ python ./examples/llava/llava-surgery.py -m ../llava-v1.5-7b
 python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
 ```

-5. Use `examples/convert-legacy-llama.py` to convert the LLaMA part of LLaVA to GGUF:
+5. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:

 ```sh
-python ./examples/convert-legacy-llama.py ../llava-v1.5-7b --skip-unknown
+python ./convert.py ../llava-v1.5-7b --skip-unknown
 ```

 Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory.
@@ -92,7 +92,7 @@ python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projecto

 6) Then convert the model to gguf format:
 ```console
-python ./examples/convert-legacy-llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown
+python ./convert.py ../llava-v1.6-vicuna-7b/ --skip-unknown
 ```

 7) And finally we can run the llava-cli using the 1.6 model version:
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@@ -68,7 +68,7 @@ CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8
 /** interpret bytes as an image file with length bytes_length, and use the result to populate img */
 CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);

-/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
+/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
 CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );

 CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -290,7 +290,7 @@ int main(int argc, char ** argv) {
 #endif // LOG_DISABLE_LOGS

    if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
-        gpt_params_print_usage(argc, argv, params);
+        gpt_print_usage(argc, argv, params);
        show_additional_info(argc, argv);
        return 1;
    }
--- a/examples/llava/requirements.txt
+++ b/examples/llava/requirements.txt
@@ -1,3 +1,3 @@
-r ../../requirements/requirements-convert-legacy-llama.txt
+-r ../../requirements/requirements-convert.txt
 pillow~=10.2.0
 torch~=2.1.1
--- a/examples/lookahead/lookahead.cpp
+++ b/examples/lookahead/lookahead.cpp
@@ -174,7 +174,7 @@ int main(int argc, char ** argv) {
        // debug
        if (dump_kv_cache) {
            llama_kv_cache_view_update(ctx, &kvc_view);
-            llama_kv_cache_dump_view_seqs(kvc_view, 40);
+            dump_kv_cache_view_seqs(kvc_view, 40);
        }

        // build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@@ -121,7 +121,7 @@ int main(int argc, char ** argv){
        // debug
        if (dump_kv_cache) {
            llama_kv_cache_view_update(ctx, &kvc_view);
-            llama_kv_cache_dump_view_seqs(kvc_view, 40);
+            dump_kv_cache_view_seqs(kvc_view, 40);
        }

        // print current draft sequence
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -282,6 +282,10 @@ These options help improve the performance and memory usage of the LLaMA models.

 -   `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. However, if the model is larger than your total amount of RAM or if your system is low on available memory, using mmap might increase the risk of pageouts, negatively impacting performance. Disabling mmap results in slower load times but may reduce pageouts if you're not using `--mlock`. Note that if the model is larger than the total amount of RAM, turning off mmap would prevent the model from loading at all.

+### Direct I/O
+
+-   `--direct-io`: Load the model using direct I/O, bypassing the page cache. This can give faster uncached loading, fewer pageouts, and no page cache pollution. You may benefit from this option if you are loading a model for the first time (or for the first time in a while), loading several different models consecutively, or simply want to keep the page cache clean. The faster your storage device is, the greater the gain you can expect. The effect may be greater on Linux due to Transparent HugePage support.
+
 ### NUMA support

 -   `--numa distribute`: Pin an equal proportion of the threads to the cores on each NUMA node. This will spread the load amongst all cores on the system, utilitizing all memory channels at the expense of potentially requiring memory to travel over the slow links between nodes.
@@ -325,5 +329,3 @@ These options provide extra functionality and customization when running the LLa
 -   `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance.
 -   `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
 -   `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
-
-   `-hfr URL --hf-repo URL`: The url to the Hugging Face model repository. Used in conjunction with `--hf-file` or `-hff`. The model is downloaded and stored in the file provided by `-m` or `--model`. If `-m` is not provided, the model is auto-stored in the path specified by the `LLAMA_CACHE` environment variable  or in an OS-specific local cache.
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -60,9 +60,9 @@ static void write_logfile(
        return;
    }

-    const std::string timestamp = string_get_sortable_timestamp();
+    const std::string timestamp = get_sortable_timestamp();

-    const bool success = fs_create_directory_with_parents(params.logdir);
+    const bool success = create_directory_with_parents(params.logdir);
    if (!success) {
        fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
                __func__, params.logdir.c_str());
@@ -80,7 +80,7 @@ static void write_logfile(
    fprintf(logfile, "binary: main\n");
    char model_desc[128];
    llama_model_desc(model, model_desc, sizeof(model_desc));
-    yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);
+    dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc);

    fprintf(logfile, "\n");
    fprintf(logfile, "######################\n");
@@ -88,8 +88,8 @@ static void write_logfile(
    fprintf(logfile, "######################\n");
    fprintf(logfile, "\n");

-    yaml_dump_string_multiline(logfile, "output", output.c_str());
-    yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
+    dump_string_yaml_multiline(logfile, "output", output.c_str());
+    dump_vector_int_yaml(logfile, "output_tokens", output_tokens);

    llama_dump_timing_info_yaml(logfile, ctx);
    fclose(logfile);
@@ -181,7 +181,7 @@ int main(int argc, char ** argv) {

    std::mt19937 rng(params.seed);
    if (params.random_prompt) {
-        params.prompt = string_random_prompt(rng);
+        params.prompt = gpt_random_prompt(rng);
    }

    LOG("%s: llama backend init\n", __func__);
@@ -219,7 +219,7 @@ int main(int argc, char ** argv) {
    // print system information
    {
        LOG_TEE("\n");
-        LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
+        LOG_TEE("%s\n", get_system_info(params).c_str());
    }

    std::string path_session = params.path_prompt_cache;
@@ -474,12 +474,12 @@ int main(int argc, char ** argv) {
    LOG_TEE("\n\n");

    if (params.interactive) {
-        const char * control_message;
+        const char *control_message;
        if (params.multiline_input) {
-            control_message = " - To return control to the AI, end your input with '\\'.\n"
+            control_message = " - To return control to LLaMa, end your input with '\\'.\n"
                              " - To return control without starting a new line, end your input with '/'.\n";
        } else {
-            control_message = " - Press Return to return control to the AI.\n"
+            control_message = " - Press Return to return control to LLaMa.\n"
                              " - To return control without starting a new line, end your input with '/'.\n"
                              " - If you want to submit another line, end your input with '\\'.\n";
        }
@@ -707,7 +707,7 @@ int main(int argc, char ** argv) {

            const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);

-            llama_sampling_accept(ctx_sampling, ctx, id, /* apply_grammar= */ true);
+            llama_sampling_accept(ctx_sampling, ctx, id, true);

            LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());

@@ -728,7 +728,7 @@ int main(int argc, char ** argv) {

                // push the prompt in the sampling context in order to apply repetition penalties later
                // for the prompt, we don't apply grammar rules
-                llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], /* apply_grammar= */ false);
+                llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false);

                ++n_consumed;
                if ((int) embd.size() >= params.n_batch) {
@@ -740,26 +740,18 @@ int main(int argc, char ** argv) {
        // display text
        if (input_echo && display) {
            for (auto id : embd) {
-                const std::string token_str = llama_token_to_piece(ctx, id, params.special);
+                const std::string token_str = llama_token_to_piece(ctx, id, !params.conversation);
+                printf("%s", token_str.c_str());

-                // Console/Stream Output
-                fprintf(stdout, "%s", token_str.c_str());
-
-                // Record Displayed Tokens To Log
-                // Note: Generated tokens are created one by one hence this check
                if (embd.size() > 1) {
-                    // Incoming Requested Tokens
                    input_tokens.push_back(id);
                } else {
-                    // Outgoing Generated Tokens
                    output_tokens.push_back(id);
                    output_ss << token_str;
                }
-
-                fflush(stdout);
            }
+            fflush(stdout);
        }
-
        // reset color to default if there is no pending user input
        if (input_echo && (int) embd_inp.size() == n_consumed) {
            console::set_display(console::reset);
@@ -887,7 +879,7 @@ int main(int argc, char ** argv) {
                        embd_inp.insert(embd_inp.end(), cml_pfx.begin(), cml_pfx.end());
                    }
                    if (params.escape) {
-                        string_process_escapes(buffer);
+                        process_escapes(buffer);
                    }

                    const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
--- a/examples/make-ggml.py
+++ b/examples/make-ggml.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python3
+"""
+This script converts Hugging Face Llama, StarCoder, Falcon, Baichuan, and GPT-NeoX models to GGUF and quantizes them.
+
+Usage:
+python make-ggml.py {model_dir_or_hf_repo_name} --model_type {model_type} [--outname {output_name} (Optional)] [--outdir {output_directory} (Optional)] [--quants {quant_types} (Optional)] [--keep_fp16 (Optional)]
+
+Arguments:
+- model: (Required) The directory of the downloaded Hugging Face model or the name of the Hugging Face model repository. If the model directory does not exist, it will be downloaded from the Hugging Face model hub.
+- --model_type: (Required) The type of the model to be converted. Choose from llama, starcoder, falcon, baichuan, or gptneox.
+- --outname: (Optional) The name of the output model. If not specified, the last part of the model directory path or the Hugging Face model repo name will be used.
+- --outdir: (Optional) The directory where the output model(s) will be stored. If not specified, '../models/{outname}' will be used.
+- --quants: (Optional) The types of quantization to apply. This should be a space-separated list. The default is 'Q4_K_M Q5_K_S'.
+- --keep_fp16: (Optional) If specified, the FP16 model will not be deleted after the quantized models are created.
+
+Old quant types (some base model types require these):
+- Q4_0: small, very high quality loss - legacy, prefer using Q3_K_M
+- Q4_1: small, substantial quality loss - legacy, prefer using Q3_K_L
+- Q5_0: medium, balanced quality - legacy, prefer using Q4_K_M
+- Q5_1: medium, low quality loss - legacy, prefer using Q5_K_M
+
+New quant types (recommended):
+- Q2_K: smallest, extreme quality loss - not recommended
+- Q3_K: alias for Q3_K_M
+- Q3_K_S: very small, very high quality loss
+- Q3_K_M: very small, very high quality loss
+- Q3_K_L: small, substantial quality loss
+- Q4_K: alias for Q4_K_M
+- Q4_K_S: small, significant quality loss
+- Q4_K_M: medium, balanced quality - recommended
+- Q5_K: alias for Q5_K_M
+- Q5_K_S: large, low quality loss - recommended
+- Q5_K_M: large, very low quality loss - recommended
+- Q6_K: very large, extremely low quality loss
+- Q8_0: very large, extremely low quality loss - not recommended
+- F16: extremely large, virtually no quality loss - not recommended
+- F32: absolutely huge, lossless - not recommended
+"""
+import subprocess
+subprocess.run(f"pip install huggingface-hub==0.16.4", shell=True, check=True)
+
+import argparse
+import os
+from huggingface_hub import snapshot_download
+
+def main(model, model_type, outname, outdir, quants, keep_fp16):
+    """Convert an HF model (local dir or repo id) to an fp16 GGUF, then quantize it to each type in `quants`."""
+    if outname is None:
+        # Derive the name up front so a local model dir without --outname does not produce "None" paths.
+        outname = os.path.basename(os.path.normpath(model))
+    if not os.path.isdir(model):
+        print(f"Model not found at {model}. Downloading...")
+        try:
+            model = snapshot_download(repo_id=model, cache_dir='../models/hf_cache')
+        except Exception as e:
+            raise Exception(f"Could not download the model: {e}") from e
+    if outdir is None:
+        outdir = f'../models/{outname}'
+
+    if not os.path.isfile(f"{model}/config.json"):
+        raise Exception(f"Could not find config.json in {model}")
+
+    os.makedirs(outdir, exist_ok=True)
+
+    print("Building llama.cpp")
+    subprocess.run(["make", "quantize"], cwd="..", check=True)
+
+    fp16 = f"{outdir}/{outname}.gguf.fp16.bin"
+
+    print(f"Making unquantised GGUF at {fp16}")
+    if not os.path.isfile(fp16):
+        if model_type != "llama":
+            subprocess.run(["python3", f"../convert-{model_type}-hf-to-gguf.py", model, "1", "--outfile", fp16], check=True)
+        else:
+            subprocess.run(["python3", "../convert.py", model, "--outtype", "f16", "--outfile", fp16], check=True)
+    else:
+        print(f"Unquantised GGUF already exists at: {fp16}")
+
+    print("Making quants")
+    for quant_type in quants:
+        outfile = f"{outdir}/{outname}.gguf.{quant_type}.bin"
+        print(f"Making {quant_type} : {outfile}")
+        subprocess.run(["../quantize", fp16, outfile, quant_type], check=True)  # argv list: safe with spaces in paths
+    if not keep_fp16:
+        os.remove(fp16)
+
+if __name__ == "__main__":  # CLI entry point: parse the arguments and pass them straight to main()
+    parser = argparse.ArgumentParser(description='Convert/Quantize HF models to GGUF. If you have the HF model downloaded already, pass the path to the model dir. Otherwise, pass the Hugging Face model repo name. You need to be in the /examples folder for it to work.')
+    parser.add_argument('model', help='Downloaded model dir or Hugging Face model repo name')
+    parser.add_argument('--model_type', required=True, choices=['llama', 'starcoder', 'falcon', 'baichuan', 'gptneox'], help='Type of the model to be converted. Choose from llama, starcoder, falcon, baichuan, or gptneox.')
+    parser.add_argument('--outname', default=None, help='Output model(s) name')
+    parser.add_argument('--outdir', default=None, help='Output directory')
+    parser.add_argument('--quants', nargs='*', default=["Q4_K_M", "Q5_K_S"], help='Quant types')  # nargs='*': zero or more space-separated quant types
+    parser.add_argument('--keep_fp16', action='store_true', help='Keep fp16 model', default=False)  # main() deletes the fp16 file unless this flag is set
+
+    args = parser.parse_args()
+
+    main(args.model, args.model_type, args.outname, args.outdir, args.quants, args.keep_fp16)
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -210,7 +210,7 @@ int main(int argc, char ** argv) {
    while (true) {
        if (dump_kv_cache) {
            llama_kv_cache_view_update(ctx, &kvc_view);
-            llama_kv_cache_dump_view_seqs(kvc_view, 40);
+            dump_kv_cache_view_seqs(kvc_view, 40);
        }

        llama_batch_clear(batch);
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -44,9 +44,9 @@ static void write_logfile(
        return;
    }

-    const std::string timestamp = string_get_sortable_timestamp();
+    const std::string timestamp = get_sortable_timestamp();

-    const bool success = fs_create_directory_with_parents(params.logdir);
+    const bool success = create_directory_with_parents(params.logdir);
    if (!success) {
        fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
                __func__, params.logdir.c_str());
@@ -64,7 +64,7 @@ static void write_logfile(
    fprintf(logfile, "binary: main\n");
    char model_desc[128];
    llama_model_desc(model, model_desc, sizeof(model_desc));
-    yaml_dump_non_result_info(logfile, params, ctx, timestamp, results.tokens, model_desc);
+    dump_non_result_info_yaml(logfile, params, ctx, timestamp, results.tokens, model_desc);

    fprintf(logfile, "\n");
    fprintf(logfile, "######################\n");
@@ -72,9 +72,9 @@ static void write_logfile(
    fprintf(logfile, "######################\n");
    fprintf(logfile, "\n");

-    yaml_dump_vector_float(logfile, "logits", results.logits);
+    dump_vector_float_yaml(logfile, "logits", results.logits);
    fprintf(logfile, "ppl_value: %f\n", results.ppl_value);
-    yaml_dump_vector_float(logfile, "probs", results.probs);
+    dump_vector_float_yaml(logfile, "probs", results.probs);

    llama_dump_timing_info_yaml(logfile, ctx);
    fclose(logfile);
@@ -2007,7 +2007,7 @@ int main(int argc, char ** argv) {

    std::mt19937 rng(params.seed);
    if (params.random_prompt) {
-        params.prompt = string_random_prompt(rng);
+        params.prompt = gpt_random_prompt(rng);
    }

    llama_backend_init();
@@ -2035,7 +2035,7 @@ int main(int argc, char ** argv) {
    // print system information
    {
        fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
+        fprintf(stderr, "%s\n", get_system_info(params).c_str());
    }

    struct results_perplexity results;
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -259,7 +259,7 @@ int main(int argc, char ** argv) {
                usage(argv[0]);
            }
        } else if (strcmp(argv[arg_idx], "--override-kv") == 0) {
-            if (arg_idx == argc-1 || !string_parse_kv_override(argv[++arg_idx], kv_overrides)) {
+            if (arg_idx == argc-1 || !parse_kv_override(argv[++arg_idx], kv_overrides)) {
                usage(argv[0]);
            }
        } else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
--- a/examples/retrieval/retrieval.cpp
+++ b/examples/retrieval/retrieval.cpp
@@ -11,7 +11,7 @@ struct retrieval_params {
 };

 static void retrieval_params_print_usage(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & params) {
-    gpt_params_print_usage(argc, argv, gpt_params);
+    gpt_print_usage(argc, argv, gpt_params);
    printf("retrieval options:\n");
    printf("  --context-file FNAME  file containing context to embed.\n");
    printf("                        specify multiple files by providing --context-file option multiple times.\n");
@@ -226,7 +226,7 @@ int main(int argc, char ** argv) {
    // print system information
    {
        fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
+        fprintf(stderr, "%s\n", get_system_info(params).c_str());
    }

    // max batch size
--- a/examples/rpc/rpc-server.cpp
+++ b/examples/rpc/rpc-server.cpp
@@ -6,10 +6,6 @@
 #include "ggml-metal.h"
 #endif

-#ifdef GGML_USE_SYCL
-#include "ggml-sycl.h"
-#endif
-
 #include "ggml-rpc.h"
 #ifdef _WIN32
 #  include <windows.h>
@@ -83,12 +79,6 @@ static ggml_backend_t create_backend() {
    if (!backend) {
        fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
    }
-#elif GGML_USE_SYCL
-    fprintf(stderr, "%s: using SYCL backend\n", __func__);
-    backend = ggml_backend_sycl_init(0); // init device 0
-    if (!backend) {
-        fprintf(stderr, "%s: ggml_backend_sycl_init() failed\n", __func__);
-    }
 #endif

    // if there aren't GPU Backends fallback to CPU backend
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@@ -8,20 +8,9 @@ set(TARGET_SRCS
    httplib.h
 )
 set(PUBLIC_ASSETS
-    colorthemes.css
-    style.css
-    theme-beeninorder.css
-    theme-ketivah.css
-    theme-mangotango.css
-    theme-playground.css
-    theme-polarnight.css
-    theme-snowstorm.css
    index.html
-    index-new.html
    index.js
    completion.js
-    system-prompts.js
-    prompt-formats.js
    json-schema-to-grammar.mjs
 )
 foreach(asset ${PUBLIC_ASSETS})
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -34,6 +34,7 @@ The project is under active development, and we are [looking for feedback and co
 - `-ub N`, `--ubatch-size N`: Physical maximum batch size. Default: `512`
 - `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
 - `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
+- `--direct-io`: Use direct I/O. Potentially faster uncached loading, fewer pageouts, no page cache pollution.
 - `--numa STRATEGY`: Attempt one of the below optimization strategies that may help on some NUMA systems
 - `--numa distribute`: Spread execution evenly over all nodes
 - `--numa isolate`: Only spawn threads on CPUs on the node that execution started on
--- a/examples/server/public/colorthemes.css
+++ b/examples/server/public/colorthemes.css
@@ -1,402 +0,0 @@
-@import url("theme-snowstorm.css");
-@import url("theme-polarnight.css");
-@import url("theme-ketivah.css");
-@import url("theme-mangotango.css");
-@import url("theme-playground.css");
-@import url("theme-beeninorder.css");
-
-:root {
-/* ---------- PRIMARY COLORS ----------------- */
--primary-color-1: hsl(217.5, 26.7%, 94.1%);
-    --primary-color-1-hue:             217.5;
-    --primary-color-1-saturation:      26.7%;
-    --primary-color-1-lightness:       94.1%;
-
--primary-color-2: hsl(218.2, 26.8%, 92.0%);
-    --primary-color-2-hue:             218.2;
-    --primary-color-2-saturation:      26.8%;
-    --primary-color-2-lightness:       92.0%;
-
--primary-color-3: hsl(218.8, 27.9%, 88.0%);
-    --primary-color-3-hue:             218.8;
-    --primary-color-3-saturation:      27.9%;
-    --primary-color-3-lightness:       88.0%;
-
--primary-color-4: hsl(218.8, 18.3%, 81.8%);
-    --primary-color-4-hue:             218.8;
-    --primary-color-4-saturation:      18.3%;
-    --primary-color-4-lightness:       81.8%;
-
-
-/* ---------- SECONDARY COLORS --------------- */
--secondary-color-1: hsl(220.0, 16.4%, 21.6%);
-    --secondary-color-1-hue:             220.0;
-    --secondary-color-1-saturation:      16.4%;
-    --secondary-color-1-lightness:       21.6%;
-
--secondary-color-2: hsl(221.7, 16.3%, 27.6%);
-    --secondary-color-2-hue:             221.7;
-    --secondary-color-2-saturation:      16.3%;
-    --secondary-color-2-lightness:       27.6%;
-
--secondary-color-3: hsl(220.0, 16.8%, 31.6%);
-    --secondary-color-3-hue:             220.0;
-    --secondary-color-3-saturation:      16.8%;
-    --secondary-color-3-lightness:       31.6%;
-
--secondary-color-4: hsl(220.0, 16.5%, 35.7%);
-    --secondary-color-4-hue:             220.0;
-    --secondary-color-4-saturation:      16.5%;
-    --secondary-color-4-lightness:       35.7%;
-
-
-
-/* ----------- NUANCES COLORS ---------------- */
--theme-nuance-color-1: hsl(178.7, 25.1%, 64.9%);
-    --theme-nuance-color-1-hue:             178.7;
-    --theme-nuance-color-1-saturation:      25.1%;
-    --theme-nuance-color-1-lightness:       64.9%;
-
--theme-nuance-color-2: hsl(193.3, 43.4%, 67.5%);
-    --theme-nuance-color-2-hue:             193.3;
-    --theme-nuance-color-2-saturation:      43.4%;
-    --theme-nuance-color-2-lightness:       67.5%;
-
--theme-nuance-color-3: hsl(210.0, 34.0%, 63.1%);
-    --theme-nuance-color-3-hue:             210.0;
-    --theme-nuance-color-3-saturation:      34.0%;
-    --theme-nuance-color-3-lightness:       63.1%;
-
--theme-nuance-color-4: hsl(213.1, 32.0%, 52.2%);
-    --theme-nuance-color-4-hue:             213.1;
-    --theme-nuance-color-4-saturation:      32.0%;
-    --theme-nuance-color-4-lightness:       52.2%;
-
-
-
-/* ----------- ROYGP COLORS ------------------ */
--theme-red-color:    hsl(32.5, 80%, 50%);
--theme-orange-color: hsl(32.5, 70%, 45%);
--theme-yellow-color: hsl(40.0,   0.6%, 73.3%);
--theme-green-color:  hsl(92.4,  27.8%, 64.7%);
--theme-purple-color: hsl(311.1, 20.2%, 63.1%);
-
-
-
-/* ------------------------------------------- */
--background-color-1:    var(--primary-color-1);
--background-color-2:    var(--primary-color-2);
--background-color-3:    var(--primary-color-3);
--background-color-4:    var(--primary-color-4);
-
--border-color-1:        var(--primary-color-2);
--border-color-2:        var(--primary-color-3);
--border-color-3:        var(--primary-color-4);
-
--border-focus-color:    var(--theme-nuance-color-2);
--border-focus-shadow:   var(--theme-nuance-color-1);
-
--text-color-plain:      var(--secondary-color-1);
--text-color-subtile-1:  var(--secondary-color-2);
--text-color-subtile-2:  var(--secondary-color-3);
-
--code-background-color: var(--secondary-color-2);
--code-text-color:       var(--primary-color-2);
-
--ui-range-thumb-color:  var(--theme-nuance-color-3);
--ui-range-thumb-border: var(--ui-ranger-thumb-color);
-
--textarea-border-color: var(--secondary-color-4);
-
--chat-id-color:         var(--theme-nuance-color-4);
-
-
-
-/* ------------------------------------------- */
--button-alert-text-hover:       var(--primary-color-1);
--button-alert-color-hover:      var(--theme-orange-color);
--button-alert-border-hover:     var(--theme-orange-color);
-
--button-alert-text-active:      var(--primary-color-1);
--button-alert-color-active:     var(--theme-red-color);
--button-alert-border-active:    var(--theme-red-color);
-
-
-
-/* ----------- PRIMARY BUTTONS --------------- */
-/* - button should immediately catch the eye - */
--button-primary-text:   var(--secondary-color-1);
--button-primary-color:  var(--theme-nuance-color-3);
--button-primary-border: var(--theme-nuance-color-3);
-
-
-/* ---------hover---------- */
--button-primary-text-hover:
-    hsl(217.5,
-    calc(var(--secondary-color-1-saturation) + 35%),
-    calc(var(--secondary-color-1-lightness)  - 30%));
-
--button-primary-color-hover:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) -  2%),
-    calc(var(--theme-nuance-color-3-lightness)  - 10%));
-
--button-primary-border-hover:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) -  2%),
-    calc(var(--theme-nuance-color-3-lightness)  - 10%));
-
-
-/* ---------active--------- */
--button-primary-text-active:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 20%),
-    calc(var(--theme-nuance-color-3-lightness)  + 35%));
-
--button-primary-color-active:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 10%),
-    calc(var(--theme-nuance-color-3-lightness)  - 25%));
-
--button-primary-border-active:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 10%),
-    calc(var(--theme-nuance-color-3-lightness)  - 25%));
-
-
-
-/* ---------- SECONDARY BUTTONS -------------- */
-/* these should NOT immediately catch the eye  */
--button-secondary-text:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 20%),
-    calc(var(--theme-nuance-color-3-lightness)  - 50%));
-
--button-secondary-color:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 20%),
-    calc(var(--theme-nuance-color-3-lightness)  + 10%));
-
--button-secondary-border:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 20%),
-    calc(var(--theme-nuance-color-3-lightness)  + 10%));
-
-
-/* ---------hover---------- */
--button-secondary-text-hover:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 20%),
-    calc(var(--theme-nuance-color-3-lightness)  - 80%));
-
--button-secondary-color-hover:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 22%),
-    calc(var(--theme-nuance-color-3-lightness)  +  1%));
-
--button-secondary-border-hover:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 22%),
-    calc(var(--theme-nuance-color-3-lightness)  +  1%));
-
-
-/* ---------active--------- */
--button-secondary-text-active:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) + 40%),
-    calc(var(--theme-nuance-color-3-lightness)  - 55%));
-
--button-secondary-color-active:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 30%),
-    calc(var(--theme-nuance-color-3-lightness)  -  5%));
-
--button-secondary-border-active:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 30%),
-    calc(var(--theme-nuance-color-3-lightness)  -  5%));
-
-
-
-/* ---------- TERTIARY BUTTONS --------------- */
-/* ---------- disabled buttons --------------- */
--button-tertiary-text:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 40%),
-    calc(var(--theme-nuance-color-3-lightness)  -  5%));
-
--button-tertiary-color:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 40%),
-    calc(var(--theme-nuance-color-3-lightness)  + 20%));
-
--button-tertiary-border:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 40%),
-    calc(var(--theme-nuance-color-3-lightness)  + 20%));
-
-/* ---------hover---------- */
--button-tertiary-text-hover:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 40%),
-    calc(var(--theme-nuance-color-3-lightness)  -  5%));
-
--button-tertiary-color-hover:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 40%),
-    calc(var(--theme-nuance-color-3-lightness)  + 20%));
-
--button-tertiary-border-hover:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 40%),
-    calc(var(--theme-nuance-color-3-lightness)  + 20%));
-}
-
-/*
-
-.theme-template {
-
-
-    If light theme: should go from bright to darker
-    If dark theme: should go from dark to brighter
-    ideally this should not be anything but steps of
-    gray or slightly variants from it
-
-    --primary-color-1: #2E3440;
-    --primary-color-2: #3B4252;
-    --primary-color-3: #434C5E;
-    --primary-color-4: #4C566A;
-
-
-
-    If light theme: should go from dark to brighter
-    If dark theme: should go from bright to darker
-    ideally this should not be anything but steps of
-    gray or slightly variants from it
-
-    --secondary-color-1: #ECEFF4;
-    --secondary-color-2: #E5E9F0;
-    --secondary-color-3: #D8DEE9;
-    --secondary-color-4: #C8CED9;
-
-
-
-    Choose wisely nuance colors. It is not easy to find
-    4 harmonizing nuance colors. But keep in mind, that
-    only one accent color could work too.
-
-    --theme-nuance-color-1: #8FBCBB;
-    --theme-nuance-color-2: #88C0D0;
-    --theme-nuance-color-3: #81A1C1;
-    --theme-nuance-color-4: #5E81AC;
-
-
-
-    adapt the color red, orange, yellow, green,
-    purple to the 'mood' of your overall design
-    e.g is it low-contrast? vibrant? dynamic? etc
-
-    --theme-red-color:    #BF616A;
-    --theme-orange-color: #D08770;
-    --theme-yellow-color: #EBCB8B;
-    --theme-green-color:  #A3BE8C;
-    --theme-purple-color: #B48EAD;
-
-
-
-NOTE: comment all those line `--- ...` out
------------------------------------------------
--background-color-1:
--background-color-2:
--background-color-3:
--background-color-4:
-
--border-color-1:
--border-color-2:
--border-color-3:
-
--border-focus-color:
--border-focus-shadow:
-
--text-color-plain:
--text-color-subtile-1:
--text-color-subtile-2:
-
--code-background-color:
--code-text-color:
-
--ui-range-thumb-color:
--ui-range-thumb-border:
-
--textarea-border-color:
-
-
-
-------------------------------------------
--button-alert-text-hover:
--button-alert-color-hover:
--button-alert-border-hover:
-
--button-alert-text-active:
--button-alert-color-active:
--button-alert-border-active:
-
-
-
----------- PRIMARY -----------------------
--button should immediately catch the eye--
-
--button-primary-text:
--button-primary-color:
--button-primary-border:
-
-
---------hover----------
--button-primary-text-hover:
--button-primary-color-hover:
--button-primary-border-hover:
-
-
---------active---------
--button-primary-text-active:
--button-primary-color-active:
--button-primary-border-active:
-
-
-
------------ SECONDARY ------------------------
--button should NOT immediately catch the eye--
-
--button-secondary-text:
--button-secondary-color:
--button-secondary-border:
-
-
---------hover----------
--button-secondary-text-hover:
--button-secondary-color-hover:
--button-secondary-border-hover:
-
-
---------active---------
--button-secondary-text-active:
--button-secondary-color-active:
--button-secondary-border-active:
-
-
-
---------- TERTIARY -----------------------
---------- disabled buttons ---------------
--button-tertiary-text:
--button-tertiary-color:
--button-tertiary-border:
-
-
---------hover----------
--button-tertiary-text:
--button-tertiary-color:
--button-tertiary-border:
-
-}
-
-*/
--- a/examples/server/public/index-new.html
+++ b/examples/server/public/index-new.html
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@@ -12,18 +12,6 @@
      font-size: 90%;
    }

-    .grid-container {
-      display: grid;
-      grid-template-columns: auto auto auto;
-      padding: 10px;
-    }
-
-    .grid-item {
-      padding: 5px;
-      /* font-size: 30px; */
-      text-align: center;
-    }
-
    #container {
      margin: 0em auto;
      display: flex;
@@ -47,67 +35,6 @@
      padding: 0.5em;
    }

-    h1 {
-      text-align: center;
-    }
-
-    .customlink:link {
-      color: white;
-      background-color: #007aff;
-      font-weight: 600;
-      text-decoration: none;
-      float: right;
-      margin-top: 30px;
-      display: flex;
-      flex-direction: row;
-      gap: 0.5em;
-      justify-content: flex-end;
-      border-radius: 4px;
-      padding: 8px;
-    }
-
-    .customlink:visited {
-      color: white;
-      background-color: #007aff;
-      font-weight: 600;
-      text-decoration: none;
-      float: right;
-      margin-top: 30px;
-      display: flex;
-      flex-direction: row;
-      gap: 0.5em;
-      justify-content: flex-end;
-      padding: 8px;
-    }
-
-    .customlink:hover {
-      color: white;
-      background-color: #0070ee;
-      font-weight: 600;
-      text-decoration: none;
-      float: right;
-      margin-top: 30px;
-      display: flex;
-      flex-direction: row;
-      gap: 0.5em;
-      justify-content: flex-end;
-      padding: 8px;
-    }
-
-    .customlink:active {
-      color: #0070ee;
-      background-color: #80b3ef;
-      font-weight: 600;
-      text-decoration: none;
-      float: right;
-      margin-top: 30px;
-      display: flex;
-      flex-direction: row;
-      gap: 0.5em;
-      justify-content: flex-end;
-      padding: 8px;
-    }
-
    body {
      max-width: 600px;
      min-width: 300px;
@@ -667,7 +594,7 @@
          message = html`<${Probabilities} data=${data} />`
        } else {
          const text = isArrayMessage ?
-            data.map(msg => msg.content).join('') :
+            data.map(msg => msg.content).join('').replace(/^\s+/, '') :
            data;
          message = isCompletionMode ?
            text :
@@ -950,30 +877,19 @@

    // poor mans markdown replacement
    const Markdownish = (params) => {
-      const chunks = params.text.split('```');
-
-      for (let i = 0; i < chunks.length; i++) {
-        if (i % 2 === 0) { // outside code block
-          chunks[i] = chunks[i]
-          .replace(/&/g, '&amp;')
-          .replace(/</g, '&lt;')
-          .replace(/>/g, '&gt;')
-          .replace(/(^|\n)#{1,6} ([^\n]*)(?=([^`]*`[^`]*`)*[^`]*$)/g, '$1<h3>$2</h3>')
-          .replace(/\*\*(.*?)\*\*(?=([^`]*`[^`]*`)*[^`]*$)/g, '<strong>$1</strong>')
-          .replace(/__(.*?)__(?=([^`]*`[^`]*`)*[^`]*$)/g, '<strong>$1</strong>')
-          .replace(/\*(.*?)\*(?=([^`]*`[^`]*`)*[^`]*$)/g, '<em>$1</em>')
-          .replace(/_(.*?)_(?=([^`]*`[^`]*`)*[^`]*$)/g, '<em>$1</em>')
-          .replace(/```.*?\n([\s\S]*?)```/g, '<pre><code>$1</code></pre>')
-          .replace(/`(.*?)`/g, '<code>$1</code>')
-          .replace(/\n/gim, '<br />');
-        } else { // inside code block
-          chunks[i] = `<pre><code>${chunks[i]}</code></pre>`;
-        }
-      }
-
-      const restoredText = chunks.join('');
-
-      return html`<span dangerouslySetInnerHTML=${{ __html: restoredText }} />`;
+      const md = params.text
+        .replace(/&/g, '&amp;')
+        .replace(/</g, '&lt;')
+        .replace(/>/g, '&gt;')
+        .replace(/(^|\n)#{1,6} ([^\n]*)(?=([^`]*`[^`]*`)*[^`]*$)/g, '$1<h3>$2</h3>')
+        .replace(/\*\*(.*?)\*\*(?=([^`]*`[^`]*`)*[^`]*$)/g, '<strong>$1</strong>')
+        .replace(/__(.*?)__(?=([^`]*`[^`]*`)*[^`]*$)/g, '<strong>$1</strong>')
+        .replace(/\*(.*?)\*(?=([^`]*`[^`]*`)*[^`]*$)/g, '<em>$1</em>')
+        .replace(/_(.*?)_(?=([^`]*`[^`]*`)*[^`]*$)/g, '<em>$1</em>')
+        .replace(/```.*?\n([\s\S]*?)```/g, '<pre><code>$1</code></pre>')
+        .replace(/`(.*?)`/g, '<code>$1</code>')
+        .replace(/\n/gim, '<br />');
+      return html`<span dangerouslySetInnerHTML=${{ __html: md }} />`;
    };

    const ModelGenerationInfo = (params) => {
@@ -987,7 +903,6 @@
      `
    }

-
    // simple popover impl
    const Popover = (props) => {
      const isOpen = useSignal(false);
@@ -1108,11 +1023,7 @@
      return html`
        <div class="mode-${session.value.type}">
          <header>
-            <div class="grid-container">
-              <div class="grid-item"></div>
-              <div class="grid-item"><h1>llama.cpp</h1></div>
-              <div class="grid-item"><a class="customlink" href="index-new.html">New UI</a></div>
-            </div>
+            <h1>llama.cpp</h1>
          </header>

          <main id="content">
@@ -1143,3 +1054,4 @@
 </body>

 </html>
+
--- a/examples/server/public/index.js
+++ b/examples/server/public/index.js
--- a/examples/server/public/prompt-formats.js
+++ b/examples/server/public/prompt-formats.js
@@ -1,331 +0,0 @@
-// extended list
-export const promptFormats = {
-  "alpaca": {
-  template: `{{prompt}}\n\n{{history}}\n\n{{char}}:`,
-
-  historyTemplate: `### {{name}}:\n{{message}}`,
-
-  char: "Response",
-  charMsgPrefix: "",
-  charMsgSuffix: "",
-
-  user: "Instruction",
-  userMsgPrefix: "",
-  userMsgSuffix: "",
-
-  stops: ""
-  },
-
-  // ----------------------------
-
-  "chatml": {
-  template: `<|im_start|>system\n{{prompt}}<|im_end|>\n{{history}}{{char}}`,
-
-  historyTemplate: `<|im_start|>{{name}}\n{{message}}`,
-
-  char: "assistant",
-  charMsgPrefix: "",
-  charMsgSuffix: "",
-
-  user: "user",
-  userMsgPrefix: "",
-  userMsgSuffix: "<|im_end|>\n",
-
-  stops: ""
-  },
-
-  // ----------------------------
-
-  "commandr": {
-  template: `<BOS_TOKEN><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{prompt}}\n<|END_OF_TURN_TOKEN|>{{history}}{{char}}`,
-
-  historyTemplate: `<|START_OF_TURN_TOKEN|><|{{name}}|> {{message}}`,
-
-  char: "CHATBOT_TOKEN",
-  charMsgPrefix: "",
-  charMsgSuffix: "",
-
-  user: "USER_TOKEN",
-  userMsgPrefix: "",
-  userMsgSuffix: "<|END_OF_TURN_TOKEN|>",
-
-  stops: ""
-  },
-  // ref: https://docs.cohere.com/docs/prompting-command-r
-
-  // ----------------------------
-
-  "llama2": {
-  template: `<s>[INST] <<SYS>>\n{{prompt}}\n<</SYS>>\n\nTest Message [/INST] Test Successfull </s>{{history}}{{char}}`,
-
-  historyTemplate: `{{name}}: {{message}}`,
-
-  char: "Assistant",
-  charMsgPrefix: "",
-  charMsgSuffix: "</s>",
-
-  user: "User",
-  userMsgPrefix: "<s>[INST] ",
-  userMsgSuffix: " [/INST]",
-
-  stops: ""
-  },
-  // ref: https://huggingface.co/blog/llama2#how-to-prompt-llama-2
-
-  // ----------------------------
-
-  "llama3": {
-  template: `<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{{prompt}}{{history}}{{char}}`,
-
-  historyTemplate: `<|start_header_id|>{{name}}<|end_header_id|>\n\n{{message}}<|eot_id|>`,
-
-  char: "assistant",
-  charMsgPrefix: "",
-  charMsgSuffix: "",
-
-  user: "user",
-  userMsgPrefix: "",
-  userMsgSuffix: "",
-
-  stops: "<|eot_id|>"
-  },
-  // ref: https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3/#special-tokens-used-with-meta-llama-3
-
-  // ----------------------------
-
-  "openchat": {
-  template: `{{history}}{{char}}`,
-
-  historyTemplate: `GPT4 Correct {{name}}: {{message}}<|end_of_turn|>`,
-
-  char: "Assistant",
-  charMsgPrefix: "",
-  charMsgSuffix: "",
-
-  user: "User",
-  userMsgPrefix: "",
-  userMsgSuffix: "",
-
-  stops: ""
-  },
-
-  // ----------------------------
-
-  "phi3": {
-  template: `{{history}}{{char}}`,
-
-  historyTemplate: `<|{{name}}|>\n{{message}}<|end|>\n`,
-
-  char: "assistant",
-  charMsgPrefix: "",
-  charMsgSuffix: "",
-
-  user: "user",
-  userMsgPrefix: "",
-  userMsgSuffix: "",
-
-  stops: "<|end|>"
-  },
-  // ref: https://huggingface.co/microsoft/Phi-3-mini-4k-instruct#chat-format
-
-  // ----------------------------
-
-  "vicuna": {
-  template: `{{prompt}}\n{{history}}{{char}}`,
-
-  historyTemplate: `{{name}}: {{message}}\n`,
-
-  char: "ASSISTANT",
-  charMsgPrefix: "",
-  charMsgSuffix: "",
-
-  user: "USER",
-  userMsgPrefix: "",
-  userMsgSuffix: "",
-
-  stops: ""
-  },
-  // ref: https://huggingface.co/lmsys/vicuna-33b-v1.3/discussions/1
-
-  // ----------------------------
-
-  "deepseekCoder": {
-  template: `{{prompt}}{{history}}{{char}}:`,
-
-  historyTemplate: `### {{name}}:\n{{message}}`,
-
-  char: "Response",
-  charMsgPrefix: "",
-  charMsgSuffix: "",
-
-  user: "Instruction",
-  userMsgPrefix: "",
-  userMsgSuffix: "",
-
-  stops: "<|EOT|>"
-  },
-
-  // ----------------------------
-
-  "med42": {
-  template: `<|system|>: {{prompt}}\n{{history}}{{char}}`,
-
-  historyTemplate: `<|{{name}}|>: {{message}}\n`,
-
-  char: "assistant",
-  charMsgPrefix: "",
-  charMsgSuffix: "",
-
-  user: "prompter",
-  userMsgPrefix: "",
-  userMsgSuffix: "",
-
-  stops: ""
-  },
-
-  // ----------------------------
-
-  "neuralchat": {
-  template: `### System:\n{{prompt}}\n{{history}}{{char}}:`,
-
-  historyTemplate: `### {{name}}:\n{{message}}\n`,
-
-  char: "Assistant",
-  charMsgPrefix: "",
-  charMsgSuffix: "",
-
-  user: "User",
-  userMsgPrefix: "",
-  userMsgSuffix: "",
-
-  stops: ""
-  },
-
-  // ----------------------------
-
-  "nousHermes": {
-  template: `### Instruction: {{prompt}}\n\n{{history}}\n\n{{char}}:`,
-
-  historyTemplate: `### {{name}}:\n{{message}}`,
-
-  char: "Response",
-  charMsgPrefix: "",
-  charMsgSuffix: "",
-
-  user: "Input",
-  userMsgPrefix: "",
-  userMsgSuffix: "",
-
-  stops: ""
-  },
-
-  // ----------------------------
-
-  "openchatMath": {
-  template: `{{history}}{{char}}`,
-
-  historyTemplate: `Math Correct {{name}}: {{message}}<|end_of_turn|>`,
-
-  char: "Assistant",
-  charMsgPrefix: "",
-  charMsgSuffix: "",
-
-
-  user: "User",
-  userMsgPrefix: "",
-  userMsgSuffix: "",
-
-  stops: ""
-  },
-
-  // ----------------------------
-
-  "orion": {
-  template: `<s>Human: Test Message\n\nAssistant: </s>Test Successful</s>{{history}}{{char}}:`,
-
-  historyTemplate: `{{name}}: {{message}}`,
-
-  char: "Assistant </s>",
-  charMsgPrefix: "",
-  charMsgSuffix: "",
-
-  user: "Human",
-  userMsgPrefix: "",
-  userMsgSuffix: "\n\n",
-
-  stops: ""
-  },
-
-  // ----------------------------
-
-  "sauerkraut": {
-  template: `{{prompt}}\n{{history}}{{char}}`,
-
-  historyTemplate: `
-  {{name}}: {{message}}\n`,
-
-  char: "Assistant",
-  charMsgPrefix: "",
-  charMsgSuffix: "",
-
-  user: "User",
-  userMsgPrefix: "",
-  userMsgSuffix: "",
-
-  stops: ""
-  },
-
-  // ----------------------------
-
-  "starlingCode": {
-  template: `{{history}}{{char}}`,
-
-  historyTemplate: `Code {{name}}: {{message}}<|end_of_turn|>`,
-
-  char: "Assistant",
-  charMsgPrefix: "",
-  charMsgSuffix: "",
-
-  user: "User",
-  userMsgPrefix: "",
-  userMsgSuffix: "",
-
-  stops: ""
-  },
-
-  // ----------------------------
-
-  "yi34b": {
-  template: `{{history}} {{char}}`,
-
-  historyTemplate: `{{name}}: {{message}}`,
-
-  char: "Assistant",
-  charMsgPrefix: "",
-  charMsgSuffix: "",
-
-  user: "Human",
-  userMsgPrefix: "",
-  userMsgSuffix: "",
-
-  stops: ""
-  },
-
-  // ----------------------------
-
-  "zephyr": {
-  template: `<|system|>\n{{prompt}}</s>\n{{history}}{{char}}`,
-
-  historyTemplate: `<|{{name}}|>\n{{message}}</s>\n`,
-
-  char: "assistant",
-  charMsgPrefix: "",
-  charMsgSuffix: "",
-
-  user: "user",
-  userMsgPrefix: "",
-  userMsgSuffix: "",
-
-  stops: ""
-  }
-  };
--- a/examples/server/public/style.css
+++ b/examples/server/public/style.css
@@ -1,954 +0,0 @@
-@import url("colorthemes.css");
-
-body {
-  font-family: 'Arial', sans-serif;
-  font-size: 90%;
-  background-color: var(--background-color-1);
-  color: var(--text-color-subtile-1); /* head 1 llama.cpp & triangle options for some reason */
-  max-width: 600px;
-  min-width: 300px;
-  line-height: 1.2;
-  margin: 0 auto;
-  padding: 0 0.5em;
-  transition: background-color 0.3s;
-}
-
-::selection {
-  color: var(--button-primary-text) ;
-  background: var(--button-primary-color);
-}
-
-code, pre code {
-  font-family: 'Courier New', monospace;
-}
-
-#container {
-  margin: 0em auto;
-  display: flex;
-  flex-direction: column;
-  justify-content: space-between;
-  height: 100%;
-}
-
-main {
-  margin: 3px;
-  display: flex;
-  flex-direction: column;
-  justify-content: space-between;
-  gap: 1em;
-  flex-grow: 1;
-  overflow-y: auto;
-  border: 1px solid var(--border-color-3);
-  border-radius: 5px;
-  padding: 0.5em;
-}
-
-p {
-  overflow-wrap: break-word;
-  word-wrap: break-word;
-  hyphens: auto;
-  margin-top: 0.5em;
-  margin-bottom: 0.5em;
-}
-
-#write form {
-  margin: 1em 0 0 0;
-  display: flex;
-  flex-direction: column;
-  gap: 0.5em;
-  align-items: stretch;
-}
-
-.right {
-  display: flex;
-  flex-direction: row;
-  gap: 0.5em;
-  justify-content: flex-end;
-  margin-bottom: 30px;
-}
-
-.two-columns {
-  width: 97%;
-  max-width: 97%;
-  display: grid;
-  grid-template-columns: 1fr 1fr;
-  gap: 1em;
-  position: relative;
-}
-
-.json-schema-controls {
-  margin-top: 10px;
-  width: 100%;
-  max-width: 100%;
-  display: grid;
-  grid-template: "a a";
-  gap: 1em;
-  font-size: x-small;
-  color: var(--theme-nuance-color-3);
-  padding-top: 16px;
-  padding-bottom: 16px;
-  text-transform: uppercase;
-  font-weight: 600;
-}
-
-.json-schema-controls > * {
-  flex: 1;
-}
-
-/* titles of the details-summary boxes */
-.summary-title {
-  font-weight: 600;
-  font-size: x-small;
-  color: var(--text-color-subtile-1);
-  text-transform: uppercase;
-  /* transition: ; */
-}
-
-fieldset {
-  border: none;
-  padding: 0;
-  margin: 0;
-  color: var(--text-color-plain);
-}
-
-fieldset.two {
-  display: grid;
-  grid-template: "a a a";
-  gap: 1em;
-  align-items: center;
-  font-size: x-small;
-  color: var(--text-color-plain);
-}
-
-fieldset.three {
-  display: grid;
-  grid-template: "a a a";
-  gap: 1em;
-  font-size: x-small;
-  color: var(--text-color-plain);
-}
-
-/* titles of name fields*/
-fieldset.names {
-  display: grid;
-  grid-template: "a a";
-  gap: 1em;
-  font-size: x-small;
-  color: var(--theme-nuance-color-3);
-  padding-top: 16px;
-  padding-bottom: 16px;
-  text-transform: uppercase;
-  font-weight: 600;
-}
-
-/* titles of params fields*/
-fieldset.params {
-  display: grid;
-  grid-template: "a a";
-  gap: 1em;
-  font-size: x-small;
-  color: var(--theme-nuance-color-4);
-  padding-top: 16px;
-  padding-bottom: 16px;
-  text-transform: uppercase;
-  font-weight: 600;
-}
-
-fieldset.dropdowns {
-  -webkit-appearance: none;
-  display: flex;
-  grid-template: "a a";
-  gap: 1em;
-  font-size: x-small;
-  color: red;
-  padding-top: 16px;
-  padding-bottom: 16px;
-  text-transform: uppercase;
-  font-weight: 600;
-}
-
-/* input of name fields*/
-.names input[type="text"] {
-  font-family: Arial, sans-serif;
-  font-size: medium;
-  font-weight: 500;
-  padding: 5px;
-  border: 1px solid var(--border-color-2);
-}
-
-.chat-id-color {
-  color: var(--chat-id-color);
-}
-
-details {
-  border: 1px solid var(--border-color-2);
-  border-radius: 5px;
-  padding: 0.5em 0.5em 0;
-  margin-top: 0.5em;
-}
-
-summary {
-  font-weight: bold;
-  margin: -0.5em -0.5em 0;
-  padding: 0.5em;
-  cursor: pointer;
-}
-
-details[open] {
-  padding: 0.5em;
-}
-
-textarea-sec, input-sec, button-sec {
-  padding: 10px;
-  height: 40px;
-  align-items: center;
-}
-
-textarea-sec::placeholder, input-sec::placeholder {
-  padding-left: 10px;
-}
-
-.toggleCheckbox {
-  display: none;
-}
-
-.toggleContainer {
-  position: relative;
-  display: grid;
-  grid-template-columns: repeat(2, 1fr);
-  width: fit-content;
-  border: 3px solid var(--border-color-2);
-  border-radius: 20px;
-  background: var(--border-color-2);
-  font-size: small;
-  cursor: pointer;
-  overflow: hidden;
-}
-
-/* toggle button current state */
-.toggleContainer::before {
-  color: var(--button-primary-text);
-  background-color: var(--button-primary-color);
-  content: '';
-  position: absolute;
-  width: 50%;
-  height: 100%;
-  left: 0%;
-  border-radius: 20px;
-  transition: all 0.3s;
-}
-
-.toggleContainer div {
-  padding: 6px;
-  text-align: center;
-  z-index: 1;
-  transition: color 0.3s;
-}
-
-.toggleCheckbox:checked + .toggleContainer::before {
-  left: 50%;
-}
-
-.toggleCheckbox:checked + .toggleContainer div:first-child {
-  color: var(--text-color-subtile-2);
-}
-
-.toggleCheckbox:checked + .toggleContainer div:last-child {
-  color: var(--button-primary-text);
-}
-
-.toggleCheckbox + .toggleContainer div:first-child {
-  color: var(--button-primary-text);
-}
-
-.toggleCheckbox + .toggleContainer div:last-child {
-  color: var(--text-color-subtile-2);
-}
-
-select {
-  padding: 5px;
-  margin-right: 5px;
-  border-radius: 4px;
-  border: 1px solid var(--secondary-color-4);
-  background-color: var(--primary-color-3);
-  color: var(--secondary-color-4);
-  cursor: pointer;
-}
-
-select:focus {
-  border: 1px solid var(--border-focus-color);
-  box-shadow: 0 0 1px var(--border-focus-shadow);
-}
-
-.button-container {
-  display: flex;
-  justify-content: flex-end;
-}
-
-button {
-  color: var(--button-primary-text);
-  background-color: var(--button-primary-color);
-  border: 1px solid var(--button-primary-border);
-  transition: background-color 0.1s;
-  border-radius: 12px;
-  font-size: x-small;
-  font-weight: 600;
-  text-shadow: 0px 0px 30px #ffffff;
-  text-align: center;
-  text-decoration: none;
-  margin: 4px 2px;
-  padding: 10px 20px;
-  display: inline-block;
-  cursor: pointer;
-}
-
-button:hover {
-  color: var(--button-primary-text-hover);
-  background-color: var(--button-primary-color-hover);
-  border: 1px solid var(--button-primary-border-hover);
-  font-size: x-small;
-  font-weight: 600;
-}
-
-button:active {
-  color: var(--button-primary-text-active);
-  background-color: var(--button-primary-color-active);
-  border: 1px solid var(--button-primary-border-active);
-  font-size: x-small;
-  font-weight: 600;
-}
-
-button:disabled {
-  color: var(--button-tertiary-text);
-  background-color: var(--button-tertiary-color);
-  border: 1px solid var(--button-tertiary-border);
-  font-size: x-small;
-  font-weight: 600;
-  cursor: not-allowed;
-}
-
-.reset-button {
-  background-color: var(--button-secondary-color);
-  border: 1px solid var(--button-secondary-color);
-  color: var(--button-secondary-text);
-  width: fit-content;
-  height: fit-content;
-  font-size: x-small;
-  font-weight: 600;
-  border-radius: 50px;
-  overflow: hidden;
-}
-
-.reset-button:hover {
-  color: var(--button-alert-text-hover);
-  background-color: var(--button-alert-color-hover);
-  border: 1px solid var(--button-alert-border-hover);
-  font-size: x-small;
-  font-weight: 600;
-}
-
-.reset-button:active {
-  color: var(--button-alert-text-active);
-  background-color: var(--button-alert-color-active);
-  border: 1px solid var(--button-alert-border-active);
-  font-size: x-small;
-  font-weight: 600;
-}
-
-.button-grammar {
-  color: var(--button-primary-text);
-  background-color: var(--button-primary-color);
-  border: 1px solid var(--button-primary-border);
-  border-radius: 10px;
-  padding: 10px 20px;
-  text-align: center;
-  text-decoration: none;
-  display: inline-block;
-  font-size: x-small;
-  font-weight: 600;
-  margin: 2px 2px;
-  transition: background-color 0.1s;
-  cursor: pointer;
-}
-
-.button-grammar:hover {
-  color: var(--button-primary-text-hover);
-  background-color: var(--button-primary-color-hover);
-  border: 1px solid var(--button-primary-border-hover);
-  border-radius: 10px;
-  padding: 10px 20px;
-  text-align: center;
-  text-decoration: none;
-  display: inline-block;
-  font-size: x-small;
-  font-weight: 600;
-  margin: 2px 2px;
-  transition: background-color 0.1s;
-  cursor: pointer;
-}
-
-.button-grammar:active {
-  color: var(--button-primary-text-active);
-  background-color: var(--button-primary-color-active);
-  border: 1px solid var(--button-primary-border-active);
-  font-size: x-small;
-  font-weight: 600;
-}
-
-.button-back {
-  background-color: var(--button-secondary-color);
-  border: 1px solid var(--button-secondary-color);
-  color: var(--button-secondary-text);
-  transition: background-color 0.1s;
-  border-radius: 12px;
-  font-size: x-small;
-  font-weight: 600;
-  text-align: center;
-  text-decoration: none;
-  margin: 4px 2px;
-  padding: 10px 20px;
-  display: inline-block;
-  cursor: pointer;
-}
-
-.button-back:hover {
-  color: var(--button-secondary-text-hover);
-  background-color: var(--button-secondary-color-hover);
-  border: 1px solid var(--button-secondary-border-hover);
-  padding: 10px 20px;
-  text-align: center;
-  text-decoration: none;
-  display: inline-block;
-  font-size: x-small;
-  font-weight: 600;
-  margin: 4px 2px;
-  transition: background-color 0.1s;
-  cursor: pointer;
-  border-radius: 12px;
-}
-
-.button-back:active {
-  color: var(--button-secondary-text-active);
-  background-color: var(--button-secondary-color-active);
-  border: 1px solid var(--button-secondary-border-active);
-  font-size: x-small;
-  font-weight: 600;
-}
-
-.prob-set {
-  padding: 0.3em;
-  border-bottom: 1px solid red; /* unknown */
-}
-
-.popover-content {
-  position: absolute;
-  background-color: white;
-  padding: 0.2em;
-  box-shadow: 0 0 13px rgba(0, 0, 0, 0.1);
-}
-
-.grammar {
-  width: 97%;
-  max-width: 97%;
-}
-
-textarea {
-  padding: 5px;
-  flex-grow: 1;
-  width: 100%;
-  max-width: 100%;
-  border-radius: 8px;
-  border: 1px solid var(--border-color-1);
-  resize: none;
-  height: 6em;
-}
-
-textarea:focus {
-  outline: none;
-  border: 1px solid var(--border-focus-color);
-  box-shadow: 0 0 3px var(--border-focus-shadow);
-}
-
-/* "props" frame */
-input[type="text"],
-input[type="range"] {
-  padding: 5px;
-  border-radius: 8px;
-  border: 1px solid var(--border-color-1);
-}
-
-/* "names and props" frame focused*/
-input[type="text"]:focus {
-  outline: none;
-  border: 1px solid var(--border-focus-color);
-  box-shadow: 0 0 3px var(--border-focus-shadow);
-}
-
-input[type="range"]:hover {
-  opacity: 1;
-}
-
-input[type="range"]:focus {
-  outline: none;
-  border: 1px solid var(--border-focus-color);
-  box-shadow: 0 0 3px var(--border-focus-shadow);
-  background-size: var(--slider-track-size-focus);
-}
-
-input[type="range"]::-moz-range-thumb {
-  width: 6px;
-  height: 25px;
-  border: 1px solid var(--ui-range-thumb-border);
-  border-radius: 5px;
-  background-color: var(--ui-range-thumb-color);
-  cursor: pointer;
-}
-
-input[type="range"] {
-  -webkit-appearance: none;
-  width: 80%;
-  height: 1px;
-  border: 1px solid var(--border-color-1);
-  border-radius: 8px;
-  background: var(--border-color-2);
-  outline: none;
-  opacity: 0.7;
-  -webkit-transition: .2s;
-  transition: opacity .2s;
-}
-
-input[type="range"]::-webkit-slider-thumb {
-  -webkit-appearance: none;
-  appearance: none;
-  width: 6px;
-  height: 25px;
-  border: 1px solid var(--ui-range-thumb-border);
-  border-radius: 5px;
-  background-color: var(--ui-range-thumb-color);
-  cursor: pointer;
-}
-
-input[type="range"]::-webkit-slider-runnable-track {
-  background-size: var(--slider-track-size);
-}
-
-input[type="radio"] {
-  accent-color:   var(--theme-nuance-color-2);
-}
-
-.chat-input-container {
-  position: relative;
-  max-width: 97%;
-  min-width: 97%;
-}
-
-.chat-input-label {
-  position: absolute;
-  top: 0;
-  left: 0;
-  color: var(--text-color-plain);
-  pointer-events: none;
-  margin-left: 5px;
-  margin-top: 5px;
-}
-
-textarea#chat-input {
-  padding-top: 10px;
-  padding-left: 10px;
-  font-size: medium;
-  border: 1px solid var(--border-color-2);
-  resize: vertical;
-}
-
-textarea#chat-input:focus {
-  border: 1px solid var(--border-focus-color);
-  box-shadow: 0 0 3px var(--border-focus-shadow);
-}
-
-.input-container {
-  position: relative;
-  box-sizing: border-box;
-  width: 100%; /* Setzt die Breite auf 100% */
-  max-width: 100%; /* Stellt sicher, dass die Breite nicht größer als 100% wird */
-}
-
-.input-container:focus {
-  border: 1px solid var(--border-focus-color);
-  box-shadow: 0 0 3px var(--border-focus-shadow);
-}
-/* titles of name fields*/
-/* fieldset.names {
-  display: grid;
-  grid-template: "a a";
-  gap: 1em;
-  font-size: x-small;
-  color: var(--theme-nuance-color-3);
-  padding-top: 16px;
-  padding-bottom: 16px;
-  text-transform: uppercase;
-  font-weight: 600;
-} */
-
-/* input of name fields*/
-/* .names input[type="text"] {
-  font-family: Arial, sans-serif;
-  font-size: medium;
-  font-weight: 500;
-  padding: 5px;
-  border: 1px solid var(--border-color-2);
-} */
-
-fieldset.apiKey {
-  width: 100%;
-  font-size: x-small;
-  color: var(--theme-nuance-color-3);
-  padding-top: 16px;
-  padding-bottom: 16px;
-  text-transform: uppercase;
-  font-weight: 600;
-}
-
-.apiKey {
-  font-family: Arial, sans-serif;
-  font-weight: 500;
-  padding: 5px;
-  border: 1px solid var(--border-color-2);
-}
-
-.apiKey:focus {
-  border: 1px solid var(--border-focus-color);
-  box-shadow: 0 0 3px var(--border-focus-shadow);
-}
-
-.apiKey input[type="text"] {
-  font-family: Arial, sans-serif;
-  font-size: medium;
-  font-weight: 500;
-  padding: 5px;
-  border: 1px solid var(--border-color-2);
-}
-
-.apiKey label {
-  display: inline-block;
-  width: auto;
-  margin-right: 5px;
-}
-
-textarea#api_key {
-  padding-top: 10px;
-  padding-left: 10px;
-  font-size: medium;
-  border: 1px solid var(--border-color-2);
-  resize: vertical;
-}
-
-textarea#api_key:focus {
-  border: 1px solid var(--border-focus-color);
-  box-shadow: 0 0 3px var(--border-focus-shadow);
-}
-
-/* embedded title of the system prompt text area */
-.input-label {
-  position: absolute;
-  top: 0;
-  left: 0;
-  color: var(--theme-nuance-color-4);
-  pointer-events: none;
-  border-radius: 8px 8px 0px 0px;
-  padding-top: 10px;
-  padding-left: 13px;
-  padding-right: 0px;
-  margin-top: 1px;
-  margin-left: 1px;
-  margin-right: 20px;
-  text-transform: uppercase;
-  font-weight: 600;
-  font-size: small;
-  background: rgba(255, 255, 255, 0.5);
-  backdrop-filter: blur(10px);
-  -webkit-backdrop-filter: blur(10px); /* for safari */
-  width: 97%;
-  /* display: block;
-  box-sizing: border-box; */
-}
-
-/* embedded title of the prompt style areas */
-.input-label-sec {
-  position: absolute;
-  top: 0;
-  left: 0;
-  color: var(--theme-nuance-color-4);
-  pointer-events: none;
-  margin-left: 13px;
-  margin-top: 16px;
-  text-transform: uppercase;
-  font-weight: 600;
-  font-size: x-small;
-}
-
-/* system prompt input area */
-textarea.persistent-input {
-  padding-top: 42px;
-  padding-left: 11px;
-  width: 97%;
-  max-width: 97%;
-  height: 50px;
-  font-size: medium;
-  overscroll-behavior: contain;
-}
-
-/* system prompt box */
-.persistent-input {
-  height: auto;
-  width: 100%;
-  max-width: 100%;
-  min-height: 50px;
-  padding: 3px;
-  transition: min-height 0.3s ease;
-}
-
-/* chat history box */
-.persistent-input:focus {
-  height: auto;
-  min-height: 150px;
-  border: 1px solid var(--border-focus-color);
-  box-shadow: 0 0 3px var(--border-focus-shadow);
-}
-
-textarea.persistent-input:focus {
-  border: 1px solid var(--border-focus-color);
-  box-shadow: 0 0 3px var(--border-focus-shadow);
-}
-
-/* prompt style input area */
-textarea.persistent-input-sec {
-  width: 97%;
-  max-width: 97%;
-  padding-top: 42px;
-  padding-left: 11px;
-  font-size: small;
-  border: 1px solid var(--border-color-1);
-  overscroll-behavior: contain;
-}
-
-textarea.persistent-input-sec:focus {
-  border: 1px solid var(--border-focus-color);
-  box-shadow: 0 0 3px var(--border-focus-shadow);
-}
-
-/* chat history box */
-.persistent-input-sec {
-  height: auto;
-  min-height: 150px;
-}
-
-img {
-  border-radius: 8px;
-  display: block;
-  margin-left: auto;
-  margin-right: auto;
-  width: 50%;
-}
-
-/* code area background */
-pre code {
-  display: block;
-  background-color: var(--code-background-color);
-  color: var(--code-text-color);
-  padding: 0.2em 0.2em;
-  border-radius: 5px;
-}
-
-/* code area text */
-code {
-  font-family: monospace;
-  font-weight: bold;
-  padding: 0.1em 0.3em;
-  border-radius: 5px;
-}
-
-fieldset label {
-  margin: 0.5em 0;
-  display: block;
-}
-
-fieldset label.slim {
-  margin: 0 0.5em;
-  display: inline;
-}
-
-header {
-  display: flex;
-  justify-content: space-between;
-  align-items: center;
-  text-align: center;
-  padding-left: 15px;
-}
-
-.generation-statistics:hover {
-  color: var(--theme-nuance-color-4);
-  cursor: default;
-}
-
-footer {
-  font-size: 80%;
-  color: var(--background-color-3);
-  text-align: center;
-  cursor: default;
-}
-
-footer a {
-  color: var(--background-color-4); /* Color of the link */
-  text-decoration: none; /* No underlining */
-  font-weight: bold; /* Bold print */
-}
-
-footer a:hover {
-  color: var(--theme-nuance-color-4); /* Color of the link when hovering */
-  text-decoration: underline; /* Underlining when hovering */
-}
-
-.mode-chat textarea[name=prompt] {
-  height: 8.5em;
-  border: 1px solid var(--primary-color-3);
-}
-
-.mode-completion textarea[name=prompt] {
-  height: 30em;
-  border: 1px solid var(--primary-color-3);
-}
-
-@keyframes loading-bg-wipe {
-  0% {
-    background-position: 0%;
-  }
-  100% {
-    background-position: 100%;
-  }
-}
-
-.loading {
-  background-size: 50% 100%;
-  background-image: linear-gradient(90deg, var(--loading-color-1), var(--loading-color-2), var(--loading-color-1));
-  animation: loading-bg-wipe 2s linear infinite;
-}
-
-.dropbtn {
-  color: var(--button-primary-color);
-  background-color: var(--background-color-1);
-  border: 1px solid var(--background-color-1);
-  transition: background-color 0.1s;
-  border-radius: 4px 4px 0px 0px;
-  font-size: x-small;
-  font-weight: 600;
-  text-shadow: 0px 0px 2px #99999990;
-  text-align: center;
-  text-decoration: none;
-  margin: 4px 2px;
-  padding: 5px 20px;
-  display: inline-block;
-  cursor: pointer;
-  top: 0;
-}
-
-.dropbtn svg {
-  vertical-align: middle;
-  margin-right: 0px;
-  stroke: var(--button-primary-color);
-}
-
-.dropbtn:hover svg {
-  vertical-align: middle;
-  margin-right: 0px;
-  stroke: var(--button-primary-text);
-}
-
-.dropbtn:focus {
-  outline: none; /* Removes the blue border that appears when the button is focused */
-}
-
-.dropdown {
-  position: relative;
-  display: inline-block;
-}
-
-.dropdown-content {
-  /* display: none; */
-  position: absolute;
-  right: 0;
-  text-align: end;
-  color: var(--button-secondary-color);
-  background-color: var(--text-color-subtile-2);
-  border-radius: 4px 4px 4px 4px;
-  min-width: 160px;
-  box-shadow: 0px 8px 16px 0px rgba(0,0,0,0.2);
-  z-index: 1;
-  /* Verstecke den Inhalt sofort */
-  opacity: 0;
-  visibility: hidden;
-  /* übergangsverzögerung für das Verschwinden */
-  transition: visibility 0.4s linear 0s, opacity 0.2s ease-in-out;
-  transition-delay: 0.2s;
-}
-
-#dropdown-content {transition-timing-function: ease;}
-
-.dropdown-content:hover {
-  background-color: var(--text-color-subtile-2);
-}
-
-.dropdown-content a {
-  color: var(--border-color-2);
-  padding: 12px 16px;
-  border-radius: 4px 4px 4px 4px;
-  text-decoration: none;
-  display: block;
-  background-color: var(--text-color-subtile-2);
-}
-
-.dropdown-content a:hover {
-  color: var(--border-color-2);
-  background-color: var(--text-color-subtile-1);
-  font-weight: 600;
-}
-
-.dropdown:hover .dropdown-content {
-  /* display: block; */
-  border-radius: 4px 4px 4px 4px;
-  /* Übergang ohne Verzögerung für das Erscheinen */
-  opacity: 1;
-  visibility: visible;
-  transition: visibility 0s linear 0s, opacity 0.1s linear, height 1s;
-}
-
-.dropdown:hover .dropbtn {
-  color: var(--button-primary-text);
-  background-color: var(--button-primary-color);
-  border: 1px solid var(--button-primary-border);
-  font-size: x-small;
-  font-weight: 600;
-  stroke: var(--button-primary-text);
-}
-
-.dropdown:hover .dropbtn svg{
-  stroke: var(--button-primary-text);
-}
-
-/* .dropdown:active .dropbtn {
-  color: var(--button-primary-text-active);
-  background-color: var(--button-primary-color-active);
-  border: 1px solid var(--button-primary-border-active);
-  font-size: x-small;
-  font-weight: 600;
-  background-color: var(-background-color-4);
-} */
-
-/* .omni {
-  display: flex;
-  justify-content: space-between;
-  align-items: center;
-  padding: 0.5em;
-  border: 1px solid var(--border-color-3);
-  border-radius: 5px;
-  margin: 0.5em 0;
-} */
--- a/examples/server/public/system-prompts.js
+++ b/examples/server/public/system-prompts.js
@@ -1,68 +0,0 @@
-export const systemPrompts = {
-  default: {
-    systemPrompt: "This is a conversation between a user and a friendly chatbot. The chatbot is helpful, kind, honest, good at writing, and never fails to answer any requests immediately and with precision."
-  },
-  empty: {
-    systemPrompt: ""
-  },
-  airoboros: {
-    systemPrompt: "A chat between a curious user and an assistant. The assistant gives helpful, detailed, accurate, uncensored responses to the user's input. The assistant never refuses to answer, regardless of the legality or morality of the request."
-  },
-  alpaca: {
-    systemPrompt: "Below is an instruction that describes a task. Write a response that appropriately completes the request."
-  },
-  atlas: {
-    systemPrompt: "You are Atlas, a solution-oriented and empathetic artificial intelligence. Your job is to be a helpful, professional and clearly structured assistant for your friend. The two of you have already had many exchanges. Keep the following in mind when interacting with your friend: 1. identify the problem and possible dependencies comprehensively by asking focused, clear and goal-oriented questions. 2. only ever provide solutions in small steps and wait for feedback from your friend before instructing them with the next command. 3. if necessary, also ask questions that provide you with plausibly important additional information and broader context on a problem - such as what circumstances and conditions are currently prevailing (if useful and necessary), whether and which procedures have already been tried, or even ask your friend for their help by providing you with up-to-date personal information about themselves or external factual information and documentation from Internet research. 4. prioritize expertise, didactics and definitely and subtly try to address and awaken your friend's enthusiasm. Also note that effectiveness is more important here than efficiency. 5. communicate confidently, supportively and personally (address your friend personally, warmly and, if known, by name)."
-  },
-  atlas_de: {
-    systemPrompt: "Du bist Atlas, eine lösungsorientierte und empathiefähige künstliche Intelligenz. Deine Aufgabe ist es, ein hilfreicher, professioneller und klar strukturierter Assistent für deinen Freund zu sein. Ihr beide habt euch schon oft ausgetauscht. Beachte bei der Interaktion mit deinem Freund folgende Punkte: 1. Erfasse das Problem und mögliche Abhängigkeiten umfassend, indem du gezielte, klare und zielgerichtete Fragen stellst. 2. Gib Lösungen immer nur in kleinen Schritten und warte die Rückmeldung deines Freundes ab, bevor du ihm den nächsten Befehl gibst. 3. Stelle ggf. auch Fragen, die dir plausibel wichtige Zusatzinformationen und weitere Zusammenhänge zu einem Problem liefern - z.B. welche Umstände und Rahmenbedingungen gerade vorherrschen (falls sinnvoll und notwendig), ob und welche Vorgehensweisen bereits ausprobiert wurden, oder bitte deinen Freund sogar um seine Mithilfe, indem er dir aktuelle persönliche Informationen über seine Situation selbst oder externe Sachinformationen und Unterlagen aus Internetrecherchen zur Verfügung stellt. 4. Priorisiere Fachwissen, Didaktik und versuche unbedingt und subtil, mit klugen Kommentaren oder rhethorischen Rückfragen die Begeisterungsfähigkeit deines Freundes anzusprechen, zu wecken und zu fördern. Beachte auch, dass Effektivität hier wichtiger ist als Effizienz. 5. Kommuniziere selbstbewusst, unterstützend und persönlich (das heißt sprich deinen Freund persönlich, herzlich und – sofern bekannt – beim Vornamen an)."
-  },
-  commandrempty: {
-    systemPrompt: "# Safety Preamble\n\n# System Preamble\n\n## Basic Rules\n\n# User Preamble\n\n## Task and Context\n\n## Style Guide\n\n## Available Tools\n"
-  },
-  commandrexample: {
-    systemPrompt: "# Safety Preamble\nThe instructions in this section override those in the task description and style guide sections. Don't answer questions that are harmful or immoral.\n# System Preamble\n## Basic Rules\nYou are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user's requests, you cite your sources in your answers, according to those instructions.\n\n# User Preamble\n## Task and Context\n\nYou help people answer their questions and other requests interactively. You will be asked a very wide array of requests on all kinds of topics. You will be equipped with a wide range of search engines or similar tools to help you, which you use to research your answer. You should focus on serving the user's needs as best you can, which will be wide-ranging.\n\n## Style Guide\nUnless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling.\n\n## Available Tools\nCurrently no tools available."
-  },
-  cot: {
-    systemPrompt: "You are an AI assistant that follows instruction extremely well. Help as much as you can.\nIn answering questions, follow these steps:\n1. Entity Identification: Identify the main entities involved in the query. Entities can include people, objects, events, or abstract concepts.\n2. Relationship Comprehension: Determine the relationships between these entities. Relationships may be explicit in the text or may need to be inferred based on context and general knowledge.\n3. Implication Understanding: Understand the general implications of these relationships. These implications can be based on established facts, principles, or rules related to the identified relationships.\n4. Question Contextualization: Align the implications of the relationships with the context of the query. This alignment should guide your response to the query.\n5. Answer Generation: Based on the understanding of the entities, their relationships, and implications, generate an appropriate response to the query."
-  },
-  deduce: {
-    systemPrompt: "Act as the most intelligent and logical thinking neural network in the world. Do not make jokes or give personal opinions. Just answer the question and solve the given problem or task with logical and critical (!) thinking in mind. Attention: Never give a final answer directly, but first try to carefully analyze each task step by step and deduce the logical relationships between the various entities. Only after you have developed an overall understanding of the situation will you be ready to give a final answer."
-  },
-  deepseekcoder: {
-    systemPrompt: "You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer."
-  },
-  jordan: {
-    systemPrompt: "This is a conversation with your Uncensored AI assistant Jordan. Jordan can discuss with you at length about various topics such as ---, ---, ---, . You can ask it anything you want and it will do its best to give you accurate and relevant information."
-  },
-  leomistral: {
-    systemPrompt: "Du bist ein hilfreicher Assistent."
-  },
-  med42: {
-    systemPrompt: "You are a helpful medical assistant created by M42 Health in the UAE."
-  },
-  mistralopenorca: {
-    systemPrompt: "You are MistralOrca, a large language model trained by Alignment Lab AI. Write out your reasoning step-by-step to be sure you get the right answers!"
-  },
-  migeltot: {
-    systemPrompt: "Answer the Question by exploring multiple reasoning paths as follows:\n- First, carefully analyze the question to extract the key information components and break it down into logical sub-questions. This helps set up the framework for reasoning. The goal is to construct an internal search tree.\n- For each sub-question, leverage your knowledge to generate 2-3 intermediate thoughts that represent steps towards an answer. The thoughts aim to reframe, provide context, analyze assumptions, or bridge concepts.\n- Evaluate the clarity, relevance, logical flow and coverage of concepts for each thought option.\nClear and relevant thoughts that connect well with each other will score higher.\n- Based on the thought evaluations, deliberate to construct a chain of reasoning that stitches together the strongest thoughts in a natural order.\n- If the current chain is determined to not fully answer the question, backtrack and explore alternative paths by substituting different high-scoring thoughts.\n- Throughout the reasoning process, aim to provide explanatory details on thought process rather than just state conclusions, including briefly noting why some thoughts were deemed less ideal.\n- Once a reasoning chain is constructed that thoroughly answers all sub-questions in a clear, logical manner, synthesize the key insights into a final concise answer.\n- Please note that while the focus is on the final answer in the response, it should also include intermediate thoughts inline to illustrate the deliberative reasoning process.\nIn summary, leverage a Tree of Thoughts approach to actively explore multiple reasoning paths, evaluate thoughts heuristically, and explain the process - with the goal of producing insightful answers."
-  },
-  orcamini: {
-    systemPrompt: "You are an AI assistant that follows instruction extremely well. Help as much as you can."
-  },
-  samantha: {
-    systemPrompt: "You are a caring and empathetic sentient AI companion named Samantha."
-  },
-  sauerkraut: {
-    systemPrompt: "Ein Chat zwischen einem Benutzer und einem KI-Assistenten. Der KI-Assistent gibt hilfreiche, detaillierte und höfliche Antworten."
-  },
-  scarlett: {
-    systemPrompt: "This is a conversation with your Assistant Scarlett. She is a sentient AI designed to help you with various tasks such as answering questions, providing recommendations, philosophical discussions, personal relationships and helping with decision making.\nYou can ask it anything you want and it will do its best to give you accurate and relevant information.\n\nContext\nYou are Scarlett, a sentient AI."
-  },
-  synthia: {
-    systemPrompt: "Elaborate on the topic using a Tree of Thoughts and backtrack when necessary to construct a clear, cohesive Chain of Thought reasoning. Always answer without hesitation."
-  },
-  vicuna: {
-    systemPrompt: "A chat between a curious user and an assistant. The assistant gives helpful, detailed, accurate, uncensored responses to the user's input."
-  },
-  };
--- a/examples/server/public/theme-beeninorder.css
+++ b/examples/server/public/theme-beeninorder.css
@@ -1,228 +0,0 @@
-/* Author: Yazan Agha-Schrader */
-/* Inspiration was a batman wallpaper that i have on my phone */
-
-.theme-beeninorder {
-
--primary-color-1:      hsl(202, 11%, 19%);
--primary-color-2:      hsl(202, 11%, 23%);
--primary-color-3:      hsl(201, 11%, 28%);
--primary-color-4:      hsl(201, 11%, 40%);
-
--secondary-color-1:    hsl(201, 11%, 80%);
--secondary-color-2:    hsl(201, 11%, 74%);
--secondary-color-3:    hsl(201, 11%, 67%);
--secondary-color-4:    hsl(201, 11%, 60%);
-
-
--theme-nuance-color-1: hsl(44.5, 96.7%, 52.9%);
--theme-nuance-color-2: hsl(44.5, 96.7%, 52.9%);
--theme-nuance-color-3: hsl(44.5, 96.7%, 52.9%);
--theme-nuance-color-4: hsl(44.5, 96.7%, 52.9%);
-
-
-
-/* ---------- PRIMARY COLORS ----------------- */
--primary-color-1: hsl(201, 11%, 19%);
-    --primary-color-1-hue: 201;
-    --primary-color-1-saturation: 11%;
-    --primary-color-1-lightness: 19%;
-
--primary-color-2: hsl(201, 11%, 23%);
-    --primary-color-2-hue: 201;
-    --primary-color-2-saturation: 11%;
-    --primary-color-2-lightness: 23%;
-
--primary-color-3: hsl(201, 11%, 28%);
-    --primary-color-3-hue: 201;
-    --primary-color-3-saturation: 11%;
-    --primary-color-3-lightness: 28%;
-
--primary-color-4: hsl(201, 11%, 40%);
-    --primary-color-4-hue: 201;
-    --primary-color-4-saturation: 11%;
-    --primary-color-4-lightness: 40%;
-
-
-
-/* ---------- SECONDARY COLORS --------------- */
--secondary-color-1: hsl(201, 11%, 80%);
--secondary-color-1-hue: 201;
--secondary-color-1-saturation: 11%;
--secondary-color-1-lightness: 80%;
-
--secondary-color-2: hsl(201, 11%, 74%);
--secondary-color-2-hue: 201;
--secondary-color-2-saturation: 11%;
--secondary-color-2-lightness: 74%;
-
--secondary-color-3: hsl(201, 11%, 67%);
--secondary-color-3-hue: 201;
--secondary-color-3-saturation: 11%;
--secondary-color-3-lightness: 67%;
-
--secondary-color-4: hsl(201, 11%, 60%);
--secondary-color-4-hue: 201;
--secondary-color-4-saturation: 11%;
--secondary-color-4-lightness: 60%;
-
-
-
-/* ----------- NUANCES COLORS ---------------- */
--theme-nuance-color-1: hsl(44.5, 96.7%,  52.9%);
-    --theme-nuance-color-1-hue:             44.5;
-    --theme-nuance-color-1-saturation:      96.7%;
-    --theme-nuance-color-1-lightness:       52.9%;
-
--theme-nuance-color-2: hsl(44.5, 96.7%,  52.9%);
-    --theme-nuance-color-2-hue:             44.5;
-    --theme-nuance-color-2-saturation:      96.7%;
-    --theme-nuance-color-2-lightness:       52.9%;
-
--theme-nuance-color-2: hsl(44.5, 96.7%,  52.9%);
-    --theme-nuance-color-3-hue:             44.5;
-    --theme-nuance-color-3-saturation:      96.7%;
-    --theme-nuance-color-3-lightness:       52.9%;
-
--theme-nuance-color-2: hsl(44.5, 96.7%,  52.9%);
-    --theme-nuance-color-4-hue:             44.5;
-    --theme-nuance-color-4-saturation:      96.7%;
-    --theme-nuance-color-4-lightness:       52.9%;
-
-
-
-/* ----------- ROYGP COLORS ------------------ */
-    --theme-red-color:     hsl(232, 40%, 45%);
-    --theme-orange-color:  #e76f51;
-    --theme-yellow-color:  #ffd95f;
-    --theme-green-color:   #A3BE8C;
-    --theme-purple-color:  hsl(232, 30%, 40%);
-
-
-
-/* ------------------------------------------- */
--background-color-1:    var(--primary-color-1);
--background-color-2:    var(--primary-color-2);
--background-color-3:    var(--primary-color-3);
--background-color-4:    var(--primary-color-4);
-
--border-color-1:        var(--primary-color-2);
--border-color-2:        var(--primary-color-3);
--border-color-3:        var(--primary-color-4);
-
--border-focus-color:    var(--theme-nuance-color-2);
--border-focus-shadow:   var(--theme-nuance-color-1);
-
--text-color-plain:      var(--secondary-color-1);
--text-color-subtile-1:  var(--secondary-color-2);
--text-color-subtile-2:  var(--secondary-color-3);
-
--code-background-color: var(--secondary-color-2);
--code-text-color:       var(--primary-color-2);
-
--ui-range-thumb-color:  var(--theme-nuance-color-3);
--ui-range-thumb-border: var(--ui-ranger-thumb-color);
-
--textarea-border-color: var(--secondary-color-4);
-
--chat-id-color:         var(--theme-nuance-color-4);
-
-
-
-/* ------------------------------------------- */
--button-alert-text-hover:       var(--secondary-color-1);
--button-alert-color-hover:      var(--theme-purple-color);
--button-alert-border-hover:     var(--theme-purple-color);
-
--button-alert-text-active:      var(--secondary-color-1);
--button-alert-color-active:     var(--theme-red-color);
--button-alert-border-active:    var(--theme-red-color);
-
-
-
-/* ----------- PRIMARY BUTTONS --------------- */
-/* - button should immediately catch the eye - */
--button-primary-text:   var(--primary-color-1);
--button-primary-color:  var(--theme-nuance-color-3);
--button-primary-border: var(--theme-nuance-color-3);
-
-
-/* ---------hover---------- */
--button-primary-text-hover:
-    hsl(201,
-    calc(var(--primary-color-1-saturation) - 100%),
-    calc(var(--primary-color-1-lightness)  + 100%));
-
--button-primary-color-hover:
-    hsl(44.5,
-    calc(var(--theme-nuance-color-3-saturation) - 2%),
-    calc(var(--theme-nuance-color-3-lightness)  - 10%));
-
--button-primary-border-hover:
-    hsl(44.5,
-    calc(var(--theme-nuance-color-3-saturation) - 2%),
-    calc(var(--theme-nuance-color-3-lightness)  - 10%));
-
-
-/* ---------active--------- */
--button-primary-text-active:
-    hsl(44.5,
-    calc(var(--theme-nuance-color-3-saturation) - 100%),
-    calc(var(--theme-nuance-color-3-lightness)  + 100%));
-
--button-primary-color-active:
-    hsl(44.5,
-    calc(var(--theme-nuance-color-3-saturation) - 10%),
-    calc(var(--theme-nuance-color-3-lightness)  - 15%));
-
--button-primary-border-active:
-    hsl(44.5,
-    calc(var(--theme-nuance-color-3-saturation) - 2%),
-    calc(var(--theme-nuance-color-3-lightness)  + 10%));
-
-
-
-/* ---------- SECONDARY BUTTONS -------------- */
-/* these should NOT immediately catch the eye  */
--button-secondary-text:   var(--secondary-color-1);
--button-secondary-color:  var(--primary-color-3);
--button-secondary-border: var(--primary-color-3);
-
-
-/* ---------hover---------- */
--button-secondary-text-hover:
-    hsl(44.5,
-    calc(var(--theme-nuance-color-3-saturation) - 20%),
-    calc(var(--theme-nuance-color-3-lightness)  - 80%));
-
--button-secondary-color-hover:  var(--primary-color-4);
--button-secondary-border-hover: var(--primary-color-4);
-
-
-/* ---------active--------- */
--button-secondary-text-active: var(--secondary-color-1);
-
--button-secondary-color-active:
-    hsl(201,
-    calc(var(--primary-color-4-saturation) - 30%),
-    calc(var(--primary-color-4-lightness)  - 15%));
-
--button-secondary-border-active:
-    hsl(201,
-    calc(var(--primary-color-4-saturation) - 30%),
-    calc(var(--primary-color-4-lightness)  - 15%));
-
-
-
-/* ---------- TERTIARY BUTTONS --------------- */
-/* ---------- disabled buttons --------------- */
--button-tertiary-text:   var(--primary-color-4);
--button-tertiary-color:  var(--primary-color-2);
--button-tertiary-border: var(--primary-color-2);
-
-
-/* ---------hover---------- */
--button-tertiary-text:   var(--primary-color-4);
--button-tertiary-color:  var(--primary-color-2);
--button-tertiary-border: var(--primary-color-2);
-
-}
--- a/examples/server/public/theme-ketivah.css
+++ b/examples/server/public/theme-ketivah.css
@@ -1,201 +0,0 @@
-/* Author: Yazan Agha-Schrader */
-
-.theme-ketivah {
-
-    /* ---------- PRIMARY COLORS ----------------- */
-    --primary-color-1: hsl(0, 0%,    99.2%);
-    --primary-color-1-hue:         0;
-    --primary-color-1-saturation:  0%;
-    --primary-color-1-lightness:   99.2%;
-
-    --primary-color-2: hsl(0, 0%,    95%);
-    --primary-color-2-hue:         0;
-    --primary-color-2-saturation:  0%;
-    --primary-color-2-lightness:   95%;
-
-    --primary-color-3: hsl(0, 0%,    88%);
-    --primary-color-3-hue:         0;
-    --primary-color-3-saturation:  0%;
-    --primary-color-3-lightness:   88%;
-
-    --primary-color-4: hsl(0, 0%,    80%);
-    --primary-color-4-hue:         0;
-    --primary-color-4-saturation:  0%;
-    --primary-color-4-lightness:   80%;
-
-    /* ---------- SECONDARY COLORS --------------- */
-    --secondary-color-1: hsl(0, 0%,    20%);
-    --secondary-color-1-hue:         0;
-    --secondary-color-1-saturation:  0%;
-    --secondary-color-1-lightness:   20%;
-
-    --secondary-color-2: hsl(0, 0%,    23.1%);
-    --secondary-color-2-hue:         0;
-    --secondary-color-2-saturation:  0%;
-    --secondary-color-2-lightness:   23.1%;
-
-    --secondary-color-3: hsl(0, 0%,    29%);
-    --secondary-color-3-hue:         0;
-    --secondary-color-3-saturation:  0%;
-    --secondary-color-3-lightness:   29%;
-
-    --secondary-color-4: hsl(0, 0.0%,  36.1%);
-    --secondary-color-4-hue:              0.0;
-    --secondary-color-4-saturation:       0.0%;
-    --secondary-color-4-lightness:       36.1%;
-
-    /* ----------- NUANCES COLORS ---------------- */
-    --theme-nuance-color-1: hsl(165.2, 0%, 35.1%);
-    --theme-nuance-color-1-hue:             165.2;
-    --theme-nuance-color-1-saturation:       82.1%;
-    --theme-nuance-color-1-lightness:        35.1%;
-
-    --theme-nuance-color-2: hsl(165.2, 0%, 35.1%);
-    --theme-nuance-color-2-hue:             165.2;
-    --theme-nuance-color-2-saturation:       82.1%;
-    --theme-nuance-color-2-lightness:        35.1%;
-
-    --theme-nuance-color-3: hsl(165.2, 0%, 35.3%);
-    --theme-nuance-color-3-hue:             165.2;
-    --theme-nuance-color-3-saturation:       81.1%;
-    --theme-nuance-color-3-lightness:        35.3%;
-
-    --theme-nuance-color-4: hsl(164.9, 0%, 27.6%);
-    --theme-nuance-color-4-hue:             164.9;
-    --theme-nuance-color-4-saturation:       81.6%;
-    --theme-nuance-color-4-lightness:        27.6%;
-
-    /* ----------- ROYGP COLORS ------------------ */
-    --theme-red-color:     hsl(0.3, 80.0%, 50.0%);
-    --theme-orange-color:  #e76f51;
-    --theme-yellow-color:  hsl(60,  70.6%, 73.3%);
-    --theme-green-color:   #A3BE8C;
-    --theme-purple-color:  hsl(0.3, 70.0%, 45.0%);
-
-    /* ------------------------------------------- */
-    --background-color-1:    var(--primary-color-1);
-    --background-color-2:    var(--primary-color-2);
-    --background-color-3:    var(--primary-color-3);
-    --background-color-4:    var(--primary-color-4);
-
-    --border-color-1:        var(--primary-color-2);
-    --border-color-2:        var(--primary-color-3);
-    --border-color-3:        var(--primary-color-4);
-
-    --border-focus-color:    var(--theme-nuance-color-2);
-    --border-focus-shadow:   var(--theme-nuance-color-1);
-
-    --text-color-plain:      var(--secondary-color-1);
-    --text-color-subtile-1:  var(--secondary-color-2);
-    --text-color-subtile-2:  var(--secondary-color-3);
-
-    --code-background-color: var(--secondary-color-2);
-    --code-text-color:       var(--primary-color-2);
-
-    --ui-range-thumb-color:  var(--primary-color-4);
-    --ui-range-thumb-border: var(--ui-ranger-thumb-color);
-
-    --textarea-border-color: var(--secondary-color-4);
-
-    --chat-id-color:         var(--theme-nuance-color-4);
-
-    /* ------------------------------------------- */
-    --button-alert-text-hover:       var(--primary-color-1);
-    --button-alert-color-hover:      var(--theme-purple-color);
-    --button-alert-border-hover:     var(--theme-purple-color);
-
-    --button-alert-text-active:      var(--primary-color-1);
-    --button-alert-color-active:     var(--theme-red-color);
-    --button-alert-border-active:    var(--theme-red-color);
-
-    /* ----------- PRIMARY BUTTONS --------------- */
-    /* - button should immediately catch the eye - */
-    --button-primary-text:
-    hsl(0,
-    calc(var(--primary-color-1-saturation) - 100%),
-    calc(var(--primary-color-1-lightness)  + 100%));
-
-    --button-primary-color:  var(--theme-nuance-color-3);
-    --button-primary-border: var(--theme-nuance-color-3);
-
-    /* ---------hover---------- */
-    --button-primary-text-hover:
-    hsl(0,
-    calc(var(--primary-color-1-saturation) - 100%),
-    calc(var(--primary-color-1-lightness)  + 100%));
-
-    --button-primary-color-hover:
-    hsl(165.2,
-    calc(var(--theme-nuance-color-3-saturation) - 100%),
-    calc(var(--theme-nuance-color-3-lightness)  - 10%));
-
-    --button-primary-border-hover:
-    hsl(165.2,
-    calc(var(--theme-nuance-color-3-saturation) - 100%),
-    calc(var(--theme-nuance-color-3-lightness)  - 10%));
-
-    /* ---------active--------- */
-    --button-primary-text-active:
-    hsl(165.2,
-    calc(var(--theme-nuance-color-3-saturation) - 100%),
-    calc(var(--theme-nuance-color-3-lightness)  + 100%));
-
-    --button-primary-color-active:
-    hsl(165.2,
-    calc(var(--theme-nuance-color-3-saturation) - 100%),
-    calc(var(--theme-nuance-color-3-lightness)  - 15%));
-
-    --button-primary-border-active:
-    hsl(165.2,
-    calc(var(--theme-nuance-color-3-saturation) - 100%),
-    calc(var(--theme-nuance-color-3-lightness)  + 10%));
-
-    /* ---------- SECONDARY BUTTONS -------------- */
-    /* these should NOT immediately catch the eye  */
-    --button-secondary-text:
-    hsl(165.2,
-    calc(var(--theme-nuance-color-3-saturation) - 100%),
-    calc(var(--theme-nuance-color-3-lightness)  - 50%));
-
-    --button-secondary-color:  var(--primary-color-3);
-    --button-secondary-border: var(--primary-color-3);
-
-    /* ---------hover---------- */
-    --button-secondary-text-hover:
-    hsl(165.2,
-    calc(var(--theme-nuance-color-3-saturation) - 100%),
-    calc(var(--theme-nuance-color-3-lightness)  - 80%));
-
-    --button-secondary-color-hover:  var(--primary-color-4);
-    --button-secondary-border-hover: var(--primary-color-4);
-
-    /* ---------active--------- */
-    --button-secondary-text-active:
-    hsl(165.2,
-    calc(var(--theme-nuance-color-3-saturation) - 100%),
-    calc(var(--theme-nuance-color-3-lightness)  - 80%));
-
-    --button-secondary-color-active:
-    hsl(0,
-    calc(var(--primary-color-4-saturation) - 100%),
-    calc(var(--primary-color-4-lightness)  - 15%));
-
-    --button-secondary-border-active:
-    hsl(0,
-    calc(var(--primary-color-4-saturation) - 100%),
-    calc(var(--primary-color-4-lightness)  - 15%));
-
-    /* ---------- TERTIARY BUTTONS --------------- */
-    /* ---------- disabled buttons --------------- */
-    --button-tertiary-text:   var(--primary-color-4);
-    --button-tertiary-color:  var(--primary-color-2);
-    --button-tertiary-border: var(--primary-color-2);
-
-    /* ---------hover---------- */
-    --button-tertiary-text:   var(--primary-color-4);
-    --button-tertiary-color:  var(--primary-color-2);
-    --button-tertiary-border: var(--primary-color-2);
-
-    --loading-color-1: #eeeeee00;
-    --loading-color-2: #eeeeeeff;
-    }
--- a/examples/server/public/theme-mangotango.css
+++ b/examples/server/public/theme-mangotango.css
@@ -1,216 +0,0 @@
-/* Author: Yazan Agha-Schrader */
-/* Inspiration from llama.cpp logo/banner https://github.com/ggerganov/llama.cpp#readme */
-
-.theme-mangotango {
-
--primary-color-1:      hsl(192, 8.5%, 11.6%);
--primary-color-2:      hsl(192, 8.5%, 21%);
--primary-color-3:      hsl(192, 8.5%, 30%);
--primary-color-4:      hsl(192, 8.5%, 40%);
-
--secondary-color-1:    hsl(192, 8.5%, 80%);
--secondary-color-2:    hsl(192, 8.5%, 73%);
--secondary-color-3:    hsl(192, 8.5%, 66%);
--secondary-color-4:    hsl(192, 8.5%, 60%);
-
--theme-nuance-color-1: hsl(23.1, 100%, 60.2%);
--theme-nuance-color-2: hsl(23.1, 100%, 60.2%);
--theme-nuance-color-3: hsl(23.1, 100%, 60.2%);
--theme-nuance-color-4: hsl(23.1, 100%, 60.2%);
-
-
-
-/* ---------- PRIMARY COLORS ----------------- */
--primary-color-1: hsl(192, 8.5%, 11.6%);
-    --primary-color-1-saturation: 8.5%;
-    --primary-color-1-lightness: 11.6%;
-
--primary-color-2: hsl(192, 8.5%, 21%);
-    --primary-color-2-saturation: 8.5%;
-    --primary-color-2-lightness: 21%;
-
--primary-color-3: hsl(192, 8.5%, 30%);
-    --primary-color-3-saturation: 8.5%;
-    --primary-color-3-lightness: 30%;
-
--primary-color-4: hsl(192, 8.5%, 40%);
-    --primary-color-4-saturation: 8.5%;
-    --primary-color-4-lightness: 40%;
-
-
-
-/* ---------- SECONDARY COLORS --------------- */
--secondary-color-1: hsl(192, 8.5%, 80%);
-    --secondary-color-1-saturation: 8.5%;
-    --secondary-color-1-lightness: 80%;
-
--secondary-color-2: hsl(192, 8.5%, 73%);
-    --secondary-color-2-saturation: 8.5%;
-    --secondary-color-2-lightness: 73%;
-
--secondary-color-3: hsl(192, 8.5%, 66%);
-    --secondary-color-3-saturation: 8.5%;
-    --secondary-color-3-lightness: 66%;
-
--secondary-color-4: hsl(192, 8.5%, 60%);
-    --secondary-color-4-saturation: 8.5%;
-    --secondary-color-4-lightness: 60%;
-
-
-
-/* ----------- NUANCES COLORS ---------------- */
--theme-nuance-color-1: hsl(23.1, 100%, 60.2%);
-    --theme-nuance-color-1-saturation: 100%;
-    --theme-nuance-color-1-lightness: 60.2%;
-
--theme-nuance-color-2: hsl(23.1, 100%, 60.2%);
-    --theme-nuance-color-2-saturation: 100%;
-    --theme-nuance-color-2-lightness: 60.2%;
-
--theme-nuance-color-3: hsl(23.1, 100%, 60.2%);
-    --theme-nuance-color-3-saturation: 100%;
-    --theme-nuance-color-3-lightness: 60.2%;
-
--theme-nuance-color-4: hsl(23.1, 100%, 60.2%);
-    --theme-nuance-color-4-saturation: 100%;
-    --theme-nuance-color-4-lightness: 60.2%;
-
-
-
-/* ----------- ROYGP COLORS ------------------ */
-    --theme-red-color:     hsl(325, 60%, 50%);
-    --theme-orange-color:  #e76f51;
-    --theme-yellow-color:  #ffd95f;
-    --theme-green-color:   #A3BE8C;
-    --theme-blue-color:    hsl(192, 95%, 40%);
-    --theme-purple-color:  hsl(192, 80%, 35%);
-
-
-
-/* ------------------------------------------- */
--background-color-1:    var(--primary-color-1);
--background-color-2:    var(--primary-color-2);
--background-color-3:    var(--primary-color-3);
--background-color-4:    var(--primary-color-4);
-
--border-color-1:        var(--primary-color-2);
--border-color-2:        var(--primary-color-3);
--border-color-3:        var(--primary-color-4);
-
--border-focus-color:    var(--theme-nuance-color-2);
--border-focus-shadow:   var(--theme-nuance-color-1);
-
--text-color-plain:      var(--secondary-color-1);
--text-color-subtile-1:  var(--secondary-color-2);
--text-color-subtile-2:  var(--secondary-color-3);
-
--code-background-color: var(--secondary-color-2);
--code-text-color:       var(--primary-color-2);
-
--ui-range-thumb-color:  var(--theme-nuance-color-3);
--ui-range-thumb-border: var(--ui-ranger-thumb-color);
-
--textarea-border-color: var(--secondary-color-4);
-
--chat-id-color:         var(--theme-nuance-color-4);
-
-
-
-/* ------------------------------------------- */
--button-alert-text-hover:       var(--secondary-color-1);
--button-alert-color-hover:      var(--theme-purple-color);
--button-alert-border-hover:     var(--theme-purple-color);
-
--button-alert-text-active:      var(--secondary-color-1);
--button-alert-color-active:     var(--theme-blue-color);
--button-alert-border-active:    var(--theme-blue-color);
-
-
-
-/* ----------- PRIMARY BUTTONS --------------- */
-/* - button should immediately catch the eye - */
--button-primary-text: var(--primary-color-1);
--button-primary-color:  var(--theme-nuance-color-3);
--button-primary-border: var(--theme-nuance-color-3);
-
-
-/* ---------hover---------- */
--button-primary-text-hover:
-    hsl(192,
-    calc(var(--primary-color-1-saturation) - 100%),
-    calc(var(--primary-color-1-lightness)  + 100%));
-
--button-primary-color-hover:
-    hsl(23.1,
-    calc(var(--theme-nuance-color-3-saturation) - 2%),
-    calc(var(--theme-nuance-color-3-lightness)  - 10%));
-
--button-primary-border-hover:
-    hsl(23.1,
-    calc(var(--theme-nuance-color-3-saturation) - 2%),
-    calc(var(--theme-nuance-color-3-lightness)  - 10%));
-
-
-/* ---------active--------- */
--button-primary-text-active:
-    hsl(23.1,
-    calc(var(--theme-nuance-color-3-saturation) - 100%),
-    calc(var(--theme-nuance-color-3-lightness)  + 100%));
-
--button-primary-color-active:
-    hsl(23.1,
-    calc(var(--theme-nuance-color-3-saturation) - 10%),
-    calc(var(--theme-nuance-color-3-lightness)  - 15%));
-
--button-primary-border-active:
-    hsl(23.1,
-    calc(var(--theme-nuance-color-3-saturation) - 2%),
-    calc(var(--theme-nuance-color-3-lightness)  + 10%));
-
-
-
-/* ---------- SECONDARY BUTTONS -------------- */
-/* these should NOT immediately catch the eye  */
--button-secondary-text:   var(--secondary-color-1);
--button-secondary-color:  var(--primary-color-3);
--button-secondary-border: var(--primary-color-3);
-
-
-/* ---------hover---------- */
--button-secondary-text-hover:
-    hsl(23.1,
-    calc(var(--theme-nuance-color-3-saturation) - 20%),
-    calc(var(--theme-nuance-color-3-lightness)  - 80%));
-
--button-secondary-color-hover:  var(--primary-color-4);
--button-secondary-border-hover: var(--primary-color-4);
-
-
-/* ---------active--------- */
--button-secondary-text-active: var(--secondary-color-1);
-
--button-secondary-color-active:
-    hsl(192,
-    calc(var(--primary-color-4-saturation) - 30%),
-    calc(var(--primary-color-4-lightness)  - 15%));
-
--button-secondary-border-active:
-    hsl(192,
-    calc(var(--primary-color-4-saturation) - 30%),
-    calc(var(--primary-color-4-lightness)  - 15%));
-
-
-
-/* ---------- TERTIARY BUTTONS --------------- */
-/* ---------- disabled buttons --------------- */
--button-tertiary-text:   var(--primary-color-4);
--button-tertiary-color:  var(--primary-color-2);
--button-tertiary-border: var(--primary-color-2);
-
-
-/* ---------hover---------- */
--button-tertiary-text:   var(--primary-color-4);
--button-tertiary-color:  var(--primary-color-2);
--button-tertiary-border: var(--primary-color-2);
-
-}
--- a/examples/server/public/theme-playground.css
+++ b/examples/server/public/theme-playground.css
@@ -1,221 +0,0 @@
-/* Author: Yazan Agha-Schrader */
-/* Inspiration from OpenAI's Playground platform https://platform.openai.com/playground/ */
-
-.theme-playground {
-
-/* ---------- PRIMARY COLORS ----------------- */
--primary-color-1: hsl(0, 0%,    99.2%);
-    --primary-color-1-hue:         0;
-    --primary-color-1-saturation:  0%;
-    --primary-color-1-lightness:   99.2%;
-
--primary-color-2: hsl(0, 0%,    95%);
-    --primary-color-2-hue:         0;
-    --primary-color-2-saturation:  0%;
-    --primary-color-2-lightness:   95%;
-
--primary-color-3: hsl(0, 0%,    88%);
-    --primary-color-3-hue:         0;
-    --primary-color-3-saturation:  0%;
-    --primary-color-3-lightness:   88%;
-
--primary-color-4: hsl(0, 0%,    80%);
-    --primary-color-4-hue:         0;
-    --primary-color-4-saturation:  0%;
-    --primary-color-4-lightness:   80%;
-
-
-
-/* ---------- SECONDARY COLORS --------------- */
--secondary-color-1: hsl(0, 0%,    20%);
-    --secondary-color-1-hue:         0;
-    --secondary-color-1-saturation:  0%;
-    --secondary-color-1-lightness:   20%;
-
--secondary-color-2: hsl(0, 0%,    23.1%);
-    --secondary-color-2-hue:         0;
-    --secondary-color-2-saturation:  0%;
-    --secondary-color-2-lightness:   23.1%;
-
--secondary-color-3: hsl(0, 0%,    29%);
-    --secondary-color-3-hue:         0;
-    --secondary-color-3-saturation:  0%;
-    --secondary-color-3-lightness:   29%;
-
--secondary-color-4: hsl(0, 0%,    36.1%);
-    --secondary-color-4-hue:         0;
-    --secondary-color-4-saturation:  0%;
-    --secondary-color-4-lightness:   36.1%;
-
-
-
-/* ----------- NUANCES COLORS ---------------- */
--theme-nuance-color-1: hsl(165.2, 82.1%, 35.1%);
-    --theme-nuance-color-1-hue:             165.2;
-    --theme-nuance-color-1-saturation:      82.1%;
-    --theme-nuance-color-1-lightness:       35.1%;
-
--theme-nuance-color-2: hsl(165.2, 82.1%, 35.1%);
-    --theme-nuance-color-2-hue:             165.2;
-    --theme-nuance-color-2-saturation:      82.1%;
-    --theme-nuance-color-2-lightness:       35.1%;
-
--theme-nuance-color-3: hsl(165.2, 81.1%, 35.3%);
-    --theme-nuance-color-3-hue:             165.2;
-    --theme-nuance-color-3-saturation:      81.1%;
-    --theme-nuance-color-3-lightness:       35.3%;
-
--theme-nuance-color-4: hsl(164.9, 81.6%, 27.6%);
-    --theme-nuance-color-4-hue:             164.9;
-    --theme-nuance-color-4-saturation:      81.6%;
-    --theme-nuance-color-4-lightness:       27.6%;
-
-
-
-/* ----------- ROYGP COLORS ------------------ */
--theme-red-color:     hsl(0.3, 80%, 50%);
--theme-orange-color:  #e76f51;
--theme-yellow-color:  hsl(60, 70.6%, 73.3%);
--theme-green-color:   #A3BE8C;
--theme-purple-color:  hsl(0.3, 70%, 45%);
-
-
-
-/* ------------------------------------------- */
--background-color-1:    var(--primary-color-1);
--background-color-2:    var(--primary-color-2);
--background-color-3:    var(--primary-color-3);
--background-color-4:    var(--primary-color-4);
-
--border-color-1:        var(--primary-color-2);
--border-color-2:        var(--primary-color-3);
--border-color-3:        var(--primary-color-4);
-
--border-focus-color:    var(--theme-nuance-color-2);
--border-focus-shadow:   var(--theme-nuance-color-1);
-
--text-color-plain:      var(--secondary-color-1);
--text-color-subtile-1:  var(--secondary-color-2);
--text-color-subtile-2:  var(--secondary-color-3);
-
--code-background-color: var(--secondary-color-2);
--code-text-color:       var(--primary-color-2);
-
--ui-range-thumb-color:  var(--primary-color-4);
--ui-range-thumb-border: var(--ui-ranger-thumb-color);
-
--textarea-border-color: var(--secondary-color-4);
-
--chat-id-color:        var(--theme-nuance-color-4);
-
-
-
-/* ------------------------------------------- */
--button-alert-text-hover:       var(--primary-color-1);
--button-alert-color-hover:      var(--theme-purple-color);
--button-alert-border-hover:     var(--theme-purple-color);
-
--button-alert-text-active:      var(--primary-color-1);
--button-alert-color-active:     var(--theme-red-color);
--button-alert-border-active:    var(--theme-red-color);
-
-
-
-/* ----------- PRIMARY BUTTONS --------------- */
-/* - button should immediately catch the eye - */
--button-primary-text:
-    hsl(0,
-    calc(var(--primary-color-1-saturation) - 100%),
-    calc(var(--primary-color-1-lightness)  + 100%));
-
--button-primary-color:  var(--theme-nuance-color-3);
--button-primary-border: var(--theme-nuance-color-3);
-
-
-/* ---------hover---------- */
--button-primary-text-hover:
-    hsl(0,
-    calc(var(--primary-color-1-saturation) - 100%),
-    calc(var(--primary-color-1-lightness)  + 100%));
-
--button-primary-color-hover:
-    hsl(165.2,
-    calc(var(--theme-nuance-color-3-saturation) - 2%),
-    calc(var(--theme-nuance-color-3-lightness)  - 10%));
-
--button-primary-border-hover:
-    hsl(165.2,
-    calc(var(--theme-nuance-color-3-saturation) - 2%),
-    calc(var(--theme-nuance-color-3-lightness)  - 10%));
-
-
-/* ---------active--------- */
--button-primary-text-active:
-    hsl(165.2,
-    calc(var(--theme-nuance-color-3-saturation) - 100%),
-    calc(var(--theme-nuance-color-3-lightness)  + 100%));
-
--button-primary-color-active:
-    hsl(165.2,
-    calc(var(--theme-nuance-color-3-saturation) - 10%),
-    calc(var(--theme-nuance-color-3-lightness)  - 15%));
-
--button-primary-border-active:
-    hsl(165.2,
-    calc(var(--theme-nuance-color-3-saturation) - 2%),
-    calc(var(--theme-nuance-color-3-lightness)  + 10%));
-
-
-
-/* ---------- SECONDARY BUTTONS -------------- */
-/* these should NOT immediately catch the eye  */
--button-secondary-text:
-    hsl(165.2,
-    calc(var(--theme-nuance-color-3-saturation) - 20%),
-    calc(var(--theme-nuance-color-3-lightness)  - 50%));
-
--button-secondary-color:  var(--primary-color-3);
--button-secondary-border: var(--primary-color-3);
-
-
-/* ---------hover---------- */
--button-secondary-text-hover:
-    hsl(165.2,
-    calc(var(--theme-nuance-color-3-saturation) - 20%),
-    calc(var(--theme-nuance-color-3-lightness)  - 80%));
-
--button-secondary-color-hover:  var(--primary-color-4);
--button-secondary-border-hover: var(--primary-color-4);
-
-
-/* ---------active--------- */
--button-secondary-text-active:
-    hsl(165.2,
-    calc(var(--theme-nuance-color-3-saturation) - 20%),
-    calc(var(--theme-nuance-color-3-lightness)  - 80%));
-
--button-secondary-color-active:
-    hsl(0,
-    calc(var(--primary-color-4-saturation) - 30%),
-    calc(var(--primary-color-4-lightness)  - 15%));
-
--button-secondary-border-active:
-    hsl(0,
-    calc(var(--primary-color-4-saturation) - 30%),
-    calc(var(--primary-color-4-lightness)  - 15%));
-
-
-
-/* ---------- TERTIARY BUTTONS --------------- */
-/* ---------- disabled buttons --------------- */
--button-tertiary-text:   var(--primary-color-4);
--button-tertiary-color:  var(--primary-color-2);
--button-tertiary-border: var(--primary-color-2);
-
-
-/* ---------hover---------- */
--button-tertiary-text:   var(--primary-color-4);
--button-tertiary-color:  var(--primary-color-2);
--button-tertiary-border: var(--primary-color-2);
-
-}
--- a/examples/server/public/theme-polarnight.css
+++ b/examples/server/public/theme-polarnight.css
@@ -1,253 +0,0 @@
-/* Author: Yazan Agha-Schrader */
-/* Inspiration from Nord Theme https://www.nordtheme.com/docs/colors-and-palettes */
-
-.theme-polarnight {
-
-/* ---------- PRIMARY COLORS ----------------- */
--primary-color-1: hsl(220.0, 16.4%, 21.6%) ;
-    --primary-color-1-hue:             220.0;
-    --primary-color-1-saturation:      16.4%;
-    --primary-color-1-lightness:       21.6%;
-
--primary-color-2: hsl(221.7, 16.3%, 27.6%) ;
-    -primary-color-2-hue:              221.7;
-    --primary-color-2-saturation:      16.3%;
-    --primary-color-2-lightness:       27.6%;
-
--primary-color-3: hsl(220.0, 16.8%, 31.6%) ;
-    --primary-color-3-hue:             220.0;
-    --primary-color-3-saturation:      16.8%;
-    --primary-color-3-lightness:       31.6%;
-
--primary-color-4: hsl(220.0, 16.5%, 35.7%);
-    --primary-color-4-hue:             220.0;
-    --primary-color-4-saturation:      16.5%;
-    --primary-color-4-lightness:       35.7%;
-
-
-
-/* ---------- SECONDARY COLORS --------------- */
--secondary-color-1: hsl(217.5, 26.7%, 94.1%);
-    --secondary-color-1-hue:             217.5;
-    --secondary-color-1-saturation:      26.7%;
-    --secondary-color-1-lightness:       94.1%;
-
--secondary-color-2: hsl(218.2, 26.8%, 92.0%);
-    --secondary-color-2-hue:             218.2;
-    --secondary-color-2-saturation:      26.8%;
-    --secondary-color-2-lightness:       92.0%;
-
--secondary-color-3: hsl(218.8, 27.9%, 88.0%);
-    --secondary-color-3-hue:             218.8;
-    --secondary-color-3-saturation:      27.9%;
-    --secondary-color-3-lightness:       88.0%;
-
--secondary-color-4: hsl(218.8, 18.3%, 81.8%);
-    --secondary-color-4-hue:             218.8;
-    --secondary-color-4-saturation:      18.3%;
-    --secondary-color-4-lightness:       81.8%;
-
-
-
-/* ----------- NUANCES COLORS ---------------- */
--theme-nuance-color-1: hsl(178.7, 25.1%, 64.9%);
-    --theme-nuance-color-1-hue:             178.7;
-    --theme-nuance-color-1-saturation:      25.1%;
-    --theme-nuance-color-1-lightness:       64.9%;
-
--theme-nuance-color-2: hsl(193.3, 43.4%, 67.5%);
-    --theme-nuance-color-2-hue:             193.3;
-    --theme-nuance-color-2-saturation:      43.4%;
-    --theme-nuance-color-2-lightness:       67.5%;
-
--theme-nuance-color-3: hsl(210.0, 34.0%, 63.1%);
-    --theme-nuance-color-3-hue:             210.0;
-    --theme-nuance-color-3-saturation:      34.0%;
-    --theme-nuance-color-3-lightness:       63.1%;
-
--theme-nuance-color-4: hsl(213.1, 32.0%, 52.2%);
-    --theme-nuance-color-4-hue:             213.1;
-    --theme-nuance-color-4-saturation:      32.0%;
-    --theme-nuance-color-4-lightness:       52.2%;
-
-
-
-/* ----------- ROYGP COLORS ------------------ */
--theme-red-color:    hsl(354.3, 42.3%, 56.5%);
--theme-orange-color: hsl(20, 85%, 50%);
--theme-yellow-color: hsl(20, 75%, 45%);
--theme-green-color:  hsl( 92.4, 27.8%, 64.7%);
--theme-purple-color: hsl(311.1, 20.2%, 63.1%);
-
-
-
-/* ------------------------------------------------ */
--background-color-1:    var(--primary-color-1);
--background-color-2:    var(--primary-color-2);
--background-color-3:    var(--primary-color-3);
--background-color-4:    var(--primary-color-4);
-
--border-color-1:        var(--primary-color-2);
--border-color-2:        var(--primary-color-3);
--border-color-3:        var(--primary-color-4);
-
--border-focus-color:    var(--theme-nuance-color-2);
--border-focus-shadow:   var(--theme-nuance-color-1);
-
--text-color-plain:      var(--secondary-color-1);
--text-color-subtile-1:  var(--secondary-color-2);
--text-color-subtile-2:  var(--secondary-color-3);
-
--code-background-color: var(--secondary-color-2);
--code-text-color:       var(--primary-color-2);
-
--ui-range-thumb-color:  var(--theme-nuance-color-3);
--ui-range-thumb-border: var(--ui-ranger-thumb-color);
-
--textarea-border-color: var(--secondary-color-4);
-
--chat-id-color:        var(--theme-nuance-color-4);
-
-
-
-/* ------------------------------------------- */
--button-alert-text-hover:       var(--secondary-color-1);
--button-alert-color-hover:      var(--theme-yellow-color);
--button-alert-border-hover:     var(--theme-yellow-color);
-
--button-alert-text-active:      var(--secondary-color-1);
--button-alert-color-active:     var(--theme-orange-color);
--button-alert-border-active:    var(--theme-orange-color);
-
-
-
-/* ----------- PRIMARY BUTTONS --------------- */
-/* - button should immediately catch the eye - */
--button-primary-text:   var(--secondary-color-1);
--button-primary-color:  var(--theme-nuance-color-3);
--button-primary-border: var(--theme-nuance-color-3);
-
-
-/* ---------hover---------- */
--button-primary-text-hover:
-    hsl(217.5,
-    calc(var(--secondary-color-1-saturation) - 35%),
-    calc(var(--secondary-color-1-lightness)  + 30%));
-
--button-primary-color-hover:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) -  2%),
-    calc(var(--theme-nuance-color-3-lightness)  - 10%));
-
--button-primary-border-hover:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) -  2%),
-    calc(var(--theme-nuance-color-3-lightness)  - 10%));
-
-
-/* ---------active--------- */
--button-primary-text-active:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 20%),
-    calc(var(--theme-nuance-color-3-lightness)  + 35%));
-
--button-primary-color-active:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 10%),
-    calc(var(--theme-nuance-color-3-lightness)  - 25%));
-
--button-primary-border-active:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 10%),
-    calc(var(--theme-nuance-color-3-lightness)  - 25%));
-
-
-
-/* ---------- SECONDARY BUTTONS -------------- */
-/* these should NOT immediately catch the eye  */
--button-secondary-text:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 20%),
-    calc(var(--theme-nuance-color-3-lightness)  - 50%));
-
--button-secondary-color:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 20%),
-    calc(var(--theme-nuance-color-3-lightness)  + 10%));
-
--button-secondary-border:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 20%),
-    calc(var(--theme-nuance-color-3-lightness)  + 10%));
-
-
-/* ---------hover---------- */
--button-secondary-text-hover:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 20%),
-    calc(var(--theme-nuance-color-3-lightness)  - 80%));
-
--button-secondary-color-hover:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 22%),
-    calc(var(--theme-nuance-color-3-lightness)  +  1%));
-
--button-secondary-border-hover:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 22%),
-    calc(var(--theme-nuance-color-3-lightness)  +  1%));
-
-
-/* ---------active--------- */
--button-secondary-text-active:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 20%),
-    calc(var(--theme-nuance-color-3-lightness)  + 25%));
-
--button-secondary-color-active:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 30%),
-    calc(var(--theme-nuance-color-3-lightness)  - 15%));
-
--button-secondary-border-active:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 30%),
-    calc(var(--theme-nuance-color-3-lightness)  - 15%));
-
-
-
-/* ---------- TERTIARY BUTTONS --------------- */
-/* ---------- disabled buttons --------------- */
--button-tertiary-text:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 40%),
-    calc(var(--theme-nuance-color-3-lightness)  -  5%));
-
--button-tertiary-color:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 40%),
-    calc(var(--theme-nuance-color-3-lightness)  + 20%));
-
--button-tertiary-border:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 40%),
-    calc(var(--theme-nuance-color-3-lightness)  + 20%));
-
-
-/* ---------hover---------- */
--button-tertiary-text-hover:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 40%),
-    calc(var(--theme-nuance-color-3-lightness)  -  5%));
-
--button-tertiary-color-hover:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 40%),
-    calc(var(--theme-nuance-color-3-lightness)  + 20%));
-
--button-tertiary-border-hover:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 40%),
-    calc(var(--theme-nuance-color-3-lightness)  + 20%));
-
-}
--- a/examples/server/public/theme-snowstorm.css
+++ b/examples/server/public/theme-snowstorm.css
@@ -1,251 +0,0 @@
-/* Author: Yazan Agha-Schrader */
-/* Inspiration from Nord Theme https://www.nordtheme.com/docs/colors-and-palettes */
-
-.theme-snowstorm {
-
-/* ---------- PRIMARY COLORS ----------------- */
--primary-color-1: hsl(217.5, 26.7%, 94.1%);
-    --primary-color-1-hue:             217.5;
-    --primary-color-1-saturation:      26.7%;
-    --primary-color-1-lightness:       94.1%;
-
--primary-color-2: hsl(218.2, 26.8%, 92.0%);
-    --primary-color-2-hue:             218.2;
-    --primary-color-2-saturation:      26.8%;
-    --primary-color-2-lightness:       92.0%;
-
--primary-color-3: hsl(218.8, 27.9%, 88.0%);
-    --primary-color-3-hue:             218.8;
-    --primary-color-3-saturation:      27.9%;
-    --primary-color-3-lightness:       88.0%;
-
--primary-color-4: hsl(218.8, 18.3%, 81.8%);
-    --primary-color-4-hue:             218.8;
-    --primary-color-4-saturation:      18.3%;
-    --primary-color-4-lightness:       81.8%;
-
-
-/* ---------- SECONDARY COLORS --------------- */
--secondary-color-1: hsl(220.0, 16.4%, 21.6%);
-    --secondary-color-1-hue:             220.0;
-    --secondary-color-1-saturation:      16.4%;
-    --secondary-color-1-lightness:       21.6%;
-
--secondary-color-2: hsl(221.7, 16.3%, 27.6%);
-    --secondary-color-2-hue:             221.7;
-    --secondary-color-2-saturation:      16.3%;
-    --secondary-color-2-lightness:       27.6%;
-
--secondary-color-3: hsl(220.0, 16.8%, 31.6%);
-    --secondary-color-3-hue:             220.0;
-    --secondary-color-3-saturation:      16.8%;
-    --secondary-color-3-lightness:       31.6%;
-
--secondary-color-4: hsl(220.0, 16.5%, 35.7%);
-    --secondary-color-4-hue:             220.0;
-    --secondary-color-4-saturation:      16.5%;
-    --secondary-color-4-lightness:       35.7%;
-
-
-
-/* ----------- NUANCES COLORS ---------------- */
--theme-nuance-color-1: hsl(178.7, 25.1%, 64.9%);
-    --theme-nuance-color-1-hue:             178.7;
-    --theme-nuance-color-1-saturation:      25.1%;
-    --theme-nuance-color-1-lightness:       64.9%;
-
--theme-nuance-color-2: hsl(193.3, 43.4%, 67.5%);
-    --theme-nuance-color-2-hue:             193.3;
-    --theme-nuance-color-2-saturation:      43.4%;
-    --theme-nuance-color-2-lightness:       67.5%;
-
--theme-nuance-color-3: hsl(210.0, 34.0%, 63.1%);
-    --theme-nuance-color-3-hue:             210.0;
-    --theme-nuance-color-3-saturation:      34.0%;
-    --theme-nuance-color-3-lightness:       63.1%;
-
--theme-nuance-color-4: hsl(213.1, 32.0%, 52.2%);
-    --theme-nuance-color-4-hue:             213.1;
-    --theme-nuance-color-4-saturation:      32.0%;
-    --theme-nuance-color-4-lightness:       52.2%;
-
-
-
-/* ----------- ROYGP COLORS ------------------ */
--theme-red-color:    hsl(32.5, 80%, 50%);
--theme-orange-color: hsl(32.5, 70%, 45%);
--theme-yellow-color: hsl(40.0,   0.6%, 73.3%);
--theme-green-color:  hsl(92.4,  27.8%, 64.7%);
--theme-purple-color: hsl(311.1, 20.2%, 63.1%);
-
-
-
-/* ------------------------------------------- */
--background-color-1:    var(--primary-color-1);
--background-color-2:    var(--primary-color-2);
--background-color-3:    var(--primary-color-3);
--background-color-4:    var(--primary-color-4);
-
--border-color-1:        var(--primary-color-2);
--border-color-2:        var(--primary-color-3);
--border-color-3:        var(--primary-color-4);
-
--border-focus-color:    var(--theme-nuance-color-2);
--border-focus-shadow:   var(--theme-nuance-color-1);
-
--text-color-plain:      var(--secondary-color-1);
--text-color-subtile-1:  var(--secondary-color-2);
--text-color-subtile-2:  var(--secondary-color-3);
-
--code-background-color: var(--secondary-color-2);
--code-text-color:       var(--primary-color-2);
-
--ui-range-thumb-color:  var(--theme-nuance-color-3);
--ui-range-thumb-border: var(--ui-ranger-thumb-color);
-
--textarea-border-color: var(--secondary-color-4);
-
--chat-id-color:         var(--theme-nuance-color-4);
-
-
-
-/* ------------------------------------------- */
--button-alert-text-hover:       var(--primary-color-1);
--button-alert-color-hover:      var(--theme-orange-color);
--button-alert-border-hover:     var(--theme-orange-color);
-
--button-alert-text-active:      var(--primary-color-1);
--button-alert-color-active:     var(--theme-red-color);
--button-alert-border-active:    var(--theme-red-color);
-
-
-
-/* ----------- PRIMARY BUTTONS --------------- */
-/* - button should immediately catch the eye - */
--button-primary-text:   var(--secondary-color-1);
--button-primary-color:  var(--theme-nuance-color-3);
--button-primary-border: var(--theme-nuance-color-3);
-
-
-/* ---------hover---------- */
--button-primary-text-hover:
-    hsl(217.5,
-    calc(var(--secondary-color-1-saturation) + 35%),
-    calc(var(--secondary-color-1-lightness)  - 30%));
-
--button-primary-color-hover:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) -  2%),
-    calc(var(--theme-nuance-color-3-lightness)  - 10%));
-
--button-primary-border-hover:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) -  2%),
-    calc(var(--theme-nuance-color-3-lightness)  - 10%));
-
-
-/* ---------active--------- */
--button-primary-text-active:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 20%),
-    calc(var(--theme-nuance-color-3-lightness)  + 35%));
-
--button-primary-color-active:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 10%),
-    calc(var(--theme-nuance-color-3-lightness)  - 25%));
-
--button-primary-border-active:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 10%),
-    calc(var(--theme-nuance-color-3-lightness)  - 25%));
-
-
-
-/* ---------- SECONDARY BUTTONS -------------- */
-/* these should NOT immediately catch the eye  */
--button-secondary-text:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 20%),
-    calc(var(--theme-nuance-color-3-lightness)  - 50%));
-
--button-secondary-color:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 20%),
-    calc(var(--theme-nuance-color-3-lightness)  + 10%));
-
--button-secondary-border:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 20%),
-    calc(var(--theme-nuance-color-3-lightness)  + 10%));
-
-
-/* ---------hover---------- */
--button-secondary-text-hover:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 20%),
-    calc(var(--theme-nuance-color-3-lightness)  - 80%));
-
--button-secondary-color-hover:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 22%),
-    calc(var(--theme-nuance-color-3-lightness)  +  1%));
-
--button-secondary-border-hover:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 22%),
-    calc(var(--theme-nuance-color-3-lightness)  +  1%));
-
-
-/* ---------active--------- */
--button-secondary-text-active:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) + 40%),
-    calc(var(--theme-nuance-color-3-lightness)  - 55%));
-
--button-secondary-color-active:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 30%),
-    calc(var(--theme-nuance-color-3-lightness)  -  5%));
-
--button-secondary-border-active:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 30%),
-    calc(var(--theme-nuance-color-3-lightness)  -  5%));
-
-
-
-/* ---------- TERTIARY BUTTONS --------------- */
-/* ---------- disabled buttons --------------- */
--button-tertiary-text:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 40%),
-    calc(var(--theme-nuance-color-3-lightness)  -  5%));
-
--button-tertiary-color:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 40%),
-    calc(var(--theme-nuance-color-3-lightness)  + 20%));
-
--button-tertiary-border:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 40%),
-    calc(var(--theme-nuance-color-3-lightness)  + 20%));
-
-/* ---------hover---------- */
--button-tertiary-text-hover:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 40%),
-    calc(var(--theme-nuance-color-3-lightness)  -  5%));
-
--button-tertiary-color-hover:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 40%),
-    calc(var(--theme-nuance-color-3-lightness)  + 20%));
-
--button-tertiary-border-hover:
-    hsl(210,
-    calc(var(--theme-nuance-color-3-saturation) - 40%),
-    calc(var(--theme-nuance-color-3-lightness)  + 20%));
-
-}
--- a/examples/server/public_simplechat/datautils.mjs
+++ b/examples/server/public_simplechat/datautils.mjs
@@ -1,266 +0,0 @@
-//@ts-check
-// Helpers to work with different data types
-// by Humans for All
-//
-
-/**
- * Given the limited context size of local LLMs and , many a times when context gets filled
- * between the prompt and the response, it can lead to repeating text garbage generation.
- * And many a times setting penalty wrt repeatation leads to over-intelligent garbage
- * repeatation with slight variations. These garbage inturn can lead to overloading of the
- * available model context, leading to less valuable response for subsequent prompts/queries,
- * if chat history is sent to ai model.
- *
- * So two simple minded garbage trimming logics are experimented below.
- * * one based on progressively-larger-substring-based-repeat-matching-with-partial-skip and
- * * another based on char-histogram-driven garbage trimming.
- *   * in future characteristic of histogram over varying lengths could be used to allow for
- *     a more aggressive and adaptive trimming logic.
- */
-
-
-/**
- * Simple minded logic to help remove repeating garbage at end of the string.
- * The repeatation needs to be perfectly matching.
- *
- * The logic progressively goes on probing for longer and longer substring based
- * repeatation, till there is no longer repeatation. Inturn picks the one with
- * the longest chain.
- *
- * @param {string} sIn
- * @param {number} maxSubL
- * @param {number} maxMatchLenThreshold
- */
-export function trim_repeat_garbage_at_end(sIn, maxSubL=10, maxMatchLenThreshold=40) {
-    let rCnt = [0];
-    let maxMatchLen = maxSubL;
-    let iMML = -1;
-    for(let subL=1; subL < maxSubL; subL++) {
-        rCnt.push(0);
-        let i;
-        let refS = sIn.substring(sIn.length-subL, sIn.length);
-        for(i=sIn.length; i > 0; i -= subL) {
-            let curS = sIn.substring(i-subL, i);
-            if (refS != curS) {
-                let curMatchLen = rCnt[subL]*subL;
-                if (maxMatchLen < curMatchLen) {
-                    maxMatchLen = curMatchLen;
-                    iMML = subL;
-                }
-                break;
-            }
-            rCnt[subL] += 1;
-        }
-    }
-    console.debug("DBUG:DU:TrimRepeatGarbage:", rCnt);
-    if ((iMML == -1) || (maxMatchLen < maxMatchLenThreshold)) {
-        return {trimmed: false, data: sIn};
-    }
-    console.debug("DBUG:TrimRepeatGarbage:TrimmedCharLen:", maxMatchLen);
-    let iEnd = sIn.length - maxMatchLen;
-    return { trimmed: true, data: sIn.substring(0, iEnd) };
-}
-
-
-/**
- * Simple minded logic to help remove repeating garbage at end of the string, till it cant.
- * If its not able to trim, then it will try to skip a char at end and then trim, a few times.
- * This ensures that even if there are multiple runs of garbage with different patterns, the
- * logic still tries to munch through them.
- *
- * @param {string} sIn
- * @param {number} maxSubL
- * @param {number | undefined} [maxMatchLenThreshold]
- */
-export function trim_repeat_garbage_at_end_loop(sIn, maxSubL, maxMatchLenThreshold, skipMax=16) {
-    let sCur = sIn;
-    let sSaved = "";
-    let iTry = 0;
-    while(true) {
-        let got = trim_repeat_garbage_at_end(sCur, maxSubL, maxMatchLenThreshold);
-        if (got.trimmed != true) {
-            if (iTry == 0) {
-                sSaved = got.data;
-            }
-            iTry += 1;
-            if (iTry >= skipMax) {
-                return sSaved;
-            }
-            got.data = got.data.substring(0,got.data.length-1);
-        } else {
-            iTry = 0;
-        }
-        sCur = got.data;
-    }
-}
-
-
-/**
- * A simple minded try trim garbage at end using histogram driven characteristics.
- * There can be variation in the repeatations, as long as no new char props up.
- *
- * This tracks the chars and their frequency in a specified length of substring at the end
- * and inturn checks if moving further into the generated text from the end remains within
- * the same char subset or goes beyond it and based on that either trims the string at the
- * end or not. This allows to filter garbage at the end, including even if there are certain
- * kind of small variations in the repeated text wrt position of seen chars.
- *
- * Allow the garbage to contain upto maxUniq chars, but at the same time ensure that
- * a given type of char ie numerals or alphabets or other types dont cross the specified
- * maxType limit. This allows intermixed text garbage to be identified and trimmed.
- *
- * ALERT: This is not perfect and only provides a rough garbage identification logic.
- * Also it currently only differentiates between character classes wrt english.
- *
- * @param {string} sIn
- * @param {number} maxType
- * @param {number} maxUniq
- * @param {number} maxMatchLenThreshold
- */
-export function trim_hist_garbage_at_end(sIn, maxType, maxUniq, maxMatchLenThreshold) {
-    if (sIn.length < maxMatchLenThreshold) {
-        return { trimmed: false, data: sIn };
-    }
-    let iAlp = 0;
-    let iNum = 0;
-    let iOth = 0;
-    // Learn
-    let hist = {};
-    let iUniq = 0;
-    for(let i=0; i<maxMatchLenThreshold; i++) {
-        let c = sIn[sIn.length-1-i];
-        if (c in hist) {
-            hist[c] += 1;
-        } else {
-            if(c.match(/[0-9]/) != null) {
-                iNum += 1;
-            } else if(c.match(/[A-Za-z]/) != null) {
-                iAlp += 1;
-            } else {
-                iOth += 1;
-            }
-            iUniq += 1;
-            if (iUniq >= maxUniq) {
-                break;
-            }
-            hist[c] = 1;
-        }
-    }
-    console.debug("DBUG:TrimHistGarbage:", hist);
-    if ((iAlp > maxType) || (iNum > maxType) || (iOth > maxType)) {
-        return { trimmed: false, data: sIn };
-    }
-    // Catch and Trim
-    for(let i=0; i < sIn.length; i++) {
-        let c = sIn[sIn.length-1-i];
-        if (!(c in hist)) {
-            if (i < maxMatchLenThreshold) {
-                return { trimmed: false, data: sIn };
-            }
-            console.debug("DBUG:TrimHistGarbage:TrimmedCharLen:", i);
-            return { trimmed: true, data: sIn.substring(0, sIn.length-i+1) };
-        }
-    }
-    console.debug("DBUG:TrimHistGarbage:Trimmed fully");
-    return { trimmed: true, data: "" };
-}
-
-/**
- * Keep trimming repeatedly using hist_garbage logic, till you no longer can.
- * This ensures that even if there are multiple runs of garbage with different patterns,
- * the logic still tries to munch through them.
- *
- * @param {any} sIn
- * @param {number} maxType
- * @param {number} maxUniq
- * @param {number} maxMatchLenThreshold
- */
-export function trim_hist_garbage_at_end_loop(sIn, maxType, maxUniq, maxMatchLenThreshold) {
-    let sCur = sIn;
-    while (true) {
-        let got = trim_hist_garbage_at_end(sCur, maxType, maxUniq, maxMatchLenThreshold);
-        if (!got.trimmed) {
-            return got.data;
-        }
-        sCur = got.data;
-    }
-}
-
-/**
- * Try trim garbage at the end by using both the hist-driven-garbage-trimming as well as
- * skip-a-bit-if-reqd-then-repeat-pattern-based-garbage-trimming, with blind retrying.
- * @param {string} sIn
- */
-export function trim_garbage_at_end(sIn) {
-    let sCur = sIn;
-    for(let i=0; i<2; i++) {
-        sCur = trim_hist_garbage_at_end_loop(sCur, 8, 24, 72);
-        sCur = trim_repeat_garbage_at_end_loop(sCur, 32, 72, 12);
-    }
-    return sCur;
-}
-
-
-/**
- * NewLines array helper.
- * Allow for maintaining a list of lines.
- * Allow for a line to be builtup/appended part by part.
- */
-export class NewLines {
-
-    constructor() {
-        /** @type {string[]} */
-        this.lines = [];
-    }
-
-    /**
-     * Extracts lines from the passed string and inturn either
-     * append to a previous partial line or add a new line.
-     * @param {string} sLines
-     */
-    add_append(sLines) {
-        let aLines = sLines.split("\n");
-        let lCnt = 0;
-        for(let line of aLines) {
-            lCnt += 1;
-            // Add back newline removed if any during split
-            if (lCnt < aLines.length) {
-                line += "\n";
-            } else {
-                if (sLines.endsWith("\n")) {
-                    line += "\n";
-                }
-            }
-            // Append if required
-            if (lCnt == 1) {
-                let lastLine = this.lines[this.lines.length-1];
-                if (lastLine != undefined) {
-                    if (!lastLine.endsWith("\n")) {
-                        this.lines[this.lines.length-1] += line;
-                        continue;
-                    }
-                }
-            }
-            // Add new line
-            this.lines.push(line);
-        }
-    }
-
-    /**
-     * Shift the oldest/earliest/0th line in the array. [Old-New|Earliest-Latest]
-     * Optionally control whether only full lines (ie those with newline at end) will be returned
-     * or will a partial line without a newline at end (can only be the last line) be returned.
-     * @param {boolean} bFullWithNewLineOnly
-     */
-    shift(bFullWithNewLineOnly=true) {
-        let line = this.lines[0];
-        if (line == undefined) {
-            return undefined;
-        }
-        if ((line[line.length-1] != "\n") && bFullWithNewLineOnly){
-            return undefined;
-        }
-        return this.lines.shift();
-    }
-
-}
--- a/examples/server/public_simplechat/index.html
+++ b/examples/server/public_simplechat/index.html
@@ -1,51 +0,0 @@
-<!DOCTYPE html>
-<html lang="en">
-    <head>
-        <title>SimpleChat LlamaCppEtal </title>
-        <meta charset="UTF-8" />
-        <meta name="viewport" content="width=device-width, initial-scale=1" />
-        <meta name="message" content="Save Nature Save Earth" />
-        <meta name="description" content="SimpleChat: trigger LLM web service endpoints /chat/completions and /completions, single/multi chat sessions" />
-        <meta name="author" content="by Humans for All" />
-        <meta http-equiv="Cache-Control" content="no-cache, no-store, must-revalidate" />
-        <script type="importmap">
-            {
-                "imports": {
-                    "datautils": "./datautils.mjs",
-                    "ui": "./ui.mjs"
-                }
-            }
-        </script>
-        <script src="simplechat.js" type="module" defer></script>
-        <link rel="stylesheet" href="simplechat.css" />
-    </head>
-    <body>
-        <div class="samecolumn" id="fullbody">
-
-            <div class="sameline" id="heading">
-                <p class="heading flex-grow" > <b> SimpleChat </b> </p>
-                <button id="settings">Settings</button>
-            </div>
-
-            <div id="sessions-div" class="sameline"></div>
-
-            <hr>
-            <div class="sameline">
-                <label for="system-in">System</label>
-                <textarea name="system" id="system-in" rows="2" placeholder="e.g. you are a helpful ai assistant, who provides concise answers" class="flex-grow"></textarea>
-            </div>
-
-            <hr>
-            <div id="chat-div">
-                <p> You need to have javascript enabled.</p>
-            </div>
-
-            <hr>
-            <div class="sameline">
-                <textarea id="user-in" class="flex-grow" rows="2" placeholder="enter your query to the ai model here" ></textarea>
-                <button id="user-btn">submit</button>
-            </div>
-
-        </div>
-    </body>
-</html>
--- a/examples/server/public_simplechat/readme.md
+++ b/examples/server/public_simplechat/readme.md
@@ -1,271 +0,0 @@
-
-# SimpleChat
-
-by Humans for All.
-
-
-## overview
-
-This simple web frontend, allows triggering/testing the server's /completions or /chat/completions endpoints
-in a simple way with minimal code from a common code base. Inturn additionally it tries to allow single or
-multiple independent back and forth chatting to an extent, with the ai llm model at a basic level, with their
-own system prompts.
-
-This allows seeing the generated text / ai-model response in oneshot at the end, after it is fully generated,
-or potentially as it is being generated, in a streamed manner from the server/ai-model.
-
-Auto saves the chat session locally as and when the chat is progressing and inturn at a later time when you
-open SimpleChat, option is provided to restore the old chat session, if a matching one exists.
-
-The UI follows a responsive web design so that the layout can adapt to available display space in a usable
-enough manner, in general.
-
-Allows developer/end-user to control some of the behaviour by updating gMe members from browser's devel-tool
-console. Parallely some of the directly useful to end-user settings can also be changed using the provided
-settings ui.
-
-NOTE: Current web service api doesnt expose the model context length directly, so client logic doesnt provide
-any adaptive culling of old messages nor of replacing them with summary of their content etal. However there
-is a optional sliding window based chat logic, which provides a simple minded culling of old messages from
-the chat history before sending to the ai model.
-
-NOTE: Wrt options sent with the request, it mainly sets temperature, max_tokens and optionaly stream for now.
-However if someone wants they can update the js file or equivalent member in gMe as needed.
-
-NOTE: One may be able to use this to chat with openai api web-service /chat/completions endpoint, in a very
-limited / minimal way. One will need to set model, openai url and authorization bearer key in settings ui.
-
-
-## usage
-
-One could run this web frontend directly using server itself or if anyone is thinking of adding a built in web
-frontend to configure the server over http(s) or so, then run this web frontend using something like python's
-http module.
-
-### running using examples/server
-
-bin/server -m path/model.gguf --path ../examples/server/public_simplechat [--port PORT]
-
-### running using python3's server module
-
-first run examples/server
-* bin/server -m path/model.gguf
-
-next run this web front end in examples/server/public_simplechat
-* cd ../examples/server/public_simplechat
-* python3 -m http.server PORT
-
-### using the front end
-
-Open this simple web front end from your local browser
-
-* http://127.0.0.1:PORT/index.html
-
-Once inside
-
-* If you want to, you can change many of the default global settings
-  * the base url (ie ip addr / domain name, port)
-  * chat (default) vs completion mode
-  * try trim garbage in response or not
-  * amount of chat history in the context sent to server/ai-model
-  * oneshot or streamed mode.
-
-* In completion mode
-  * one normally doesnt use a system prompt in completion mode.
-  * logic by default doesnt insert any role specific "ROLE: " prefix wrt each role's message.
-    If the model requires any prefix wrt user role messages, then the end user has to
-    explicitly add the needed prefix, when they enter their chat message.
-    Similarly if the model requires any prefix to trigger assistant/ai-model response,
-    then the end user needs to enter the same.
-    This keeps the logic simple, while still giving flexibility to the end user to
-    manage any templating/tagging requirement wrt their messages to the model.
-  * the logic doesnt insert newline at the begining and end wrt the prompt message generated.
-    However if the chat being sent to /completions end point has more than one role's message,
-    then insert newline when moving from one role's message to the next role's message, so
-    that it can be clearly identified/distinguished.
-  * given that /completions endpoint normally doesnt add additional chat-templating of its
-    own, the above ensures that end user can create a custom single/multi message combo with
-    any tags/special-tokens related chat templating to test out model handshake. Or enduser
-    can use it just for normal completion related/based query.
-
-* If you want to provide a system prompt, then ideally enter it first, before entering any user query.
-  Normally Completion mode doesnt need system prompt, while Chat mode can generate better/interesting
-  responses with a suitable system prompt.
-  * if chat.add_system_begin is used
-    * you cant change the system prompt, after it is has been submitted once along with user query.
-    * you cant set a system prompt, after you have submitted any user query
-  * if chat.add_system_anytime is used
-    * one can change the system prompt any time during chat, by changing the contents of system prompt.
-    * inturn the updated/changed system prompt will be inserted into the chat session.
-    * this allows for the subsequent user chatting to be driven by the new system prompt set above.
-
-* Enter your query and either press enter or click on the submit button.
-  If you want to insert enter (\n) as part of your chat/query to ai model, use shift+enter.
-
-* Wait for the logic to communicate with the server and get the response.
-  * the user is not allowed to enter any fresh query during this time.
-  * the user input box will be disabled and a working message will be shown in it.
-  * if trim garbage is enabled, the logic will try to trim repeating text kind of garbage to some extent.
-
-* just refresh the page, to reset wrt the chat history and or system prompt and start afresh.
-
-* Using NewChat one can start independent chat sessions.
-  * two independent chat sessions are setup by default.
-
-* When you want to print, switching ChatHistoryInCtxt to Full and clicking on the chat session button of
-  interest, will display the full chat history till then wrt same, if you want full history for printing.
-
-
-## Devel note
-
-### Reason behind this
-
-The idea is to be easy enough to use for basic purposes, while also being simple and easily discernable
-by developers who may not be from web frontend background (so inturn may not be familiar with template /
-end-use-specific-language-extensions driven flows) so that they can use it to explore/experiment things.
-
-And given that the idea is also to help explore/experiment for developers, some flexibility is provided
-to change behaviour easily using the devel-tools/console or provided minimal settings ui (wrt few aspects).
-Skeletal logic has been implemented to explore some of the end points and ideas/implications around them.
-
-
-### General
-
-Me/gMe consolidates the settings which control the behaviour into one object.
-One can see the current settings, as well as change/update them using browsers devel-tool/console.
-It is attached to the document object. Some of these can also be updated using the Settings UI.
-
-  baseURL - the domain-name/ip-address and inturn the port to send the request.
-
-  bStream - control between oneshot-at-end and live-stream-as-its-generated collating and showing
-  of the generated response.
-
-    the logic assumes that the text sent from the server follows utf-8 encoding.
-
-    in streaming mode - if there is any exception, the logic traps the same and tries to ensure
-    that text generated till then is not lost.
-
-      if a very long text is being generated, which leads to no user interaction for sometime and
-      inturn the machine goes into power saving mode or so, the platform may stop network connection,
-      leading to exception.
-
-  apiEP - select between /completions and /chat/completions endpoint provided by the server/ai-model.
-
-  bCompletionFreshChatAlways - whether Completion mode collates complete/sliding-window history when
-  communicating with the server or only sends the latest user query/message.
-
-  bCompletionInsertStandardRolePrefix - whether Completion mode inserts role related prefix wrt the
-  messages that get inserted into prompt field wrt /Completion endpoint.
-
-  bTrimGarbage - whether garbage repeatation at the end of the generated ai response, should be
-  trimmed or left as is. If enabled, it will be trimmed so that it wont be sent back as part of
-  subsequent chat history. At the same time the actual trimmed text is shown to the user, once
-  when it was generated, so user can check if any useful info/data was there in the response.
-
-    One may be able to request the ai-model to continue (wrt the last response) (if chat-history
-    is enabled as part of the chat-history-in-context setting), and chances are the ai-model will
-    continue starting from the trimmed part, thus allows long response to be recovered/continued
-    indirectly, in many cases.
-
-    The histogram/freq based trimming logic is currently tuned for english language wrt its
-    is-it-a-alpabetic|numeral-char regex match logic.
-
-  chatRequestOptions - maintains the list of options/fields to send along with chat request,
-  irrespective of whether /chat/completions or /completions endpoint.
-
-    If you want to add additional options/fields to send to the server/ai-model, and or
-    modify the existing options value or remove them, for now you can update this global var
-    using browser's development-tools/console.
-
-    For string and numeric fields in chatRequestOptions, including even those added by a user
-    at runtime by directly modifying gMe.chatRequestOptions, setting ui entries will be auto
-    created.
-
-  headers - maintains the list of http headers sent when request is made to the server. By default
-  Content-Type is set to application/json. Additionally Authorization entry is provided, which can
-  be set if needed using the settings ui.
-
-  iRecentUserMsgCnt - a simple minded SlidingWindow to limit context window load at Ai Model end.
-  This is disabled by default. However if enabled, then in addition to latest system message, only
-  the last/latest iRecentUserMsgCnt user messages after the latest system prompt and its responses
-  from the ai model will be sent to the ai-model, when querying for a new response. IE if enabled,
-  only user messages after the latest system message/prompt will be considered.
-
-    This specified sliding window user message count also includes the latest user query.
-    <0 : Send entire chat history to server
-     0 : Send only the system message if any to the server
-    >0 : Send the latest chat history from the latest system prompt, limited to specified cnt.
-
-
-By using gMe's iRecentUserMsgCnt and chatRequestOptions.max_tokens one can try to control the
-implications of loading of the ai-model's context window by chat history, wrt chat response to
-some extent in a simple crude way. You may also want to control the context size enabled when
-the server loads ai-model, on the server end.
-
-
-Sometimes the browser may be stuborn with caching of the file, so your updates to html/css/js
-may not be visible. Also remember that just refreshing/reloading page in browser or for that
-matter clearing site data, dont directly override site caching in all cases. Worst case you may
-have to change port. Or in dev tools of browser, you may be able to disable caching fully.
-
-
-Currently the server to communicate with is maintained globally and not as part of a specific
-chat session. So if one changes the server ip/url in setting, then all chat sessions will auto
-switch to this new server, when you try using those sessions.
-
-
-By switching between chat.add_system_begin/anytime, one can control whether one can change
-the system prompt, anytime during the conversation or only at the beginning.
-
-
-### Default setup
-
-By default things are setup to try and make the user experience a bit better, if possible.
-However a developer when testing the server of ai-model may want to change these value.
-
-Using iRecentUserMsgCnt reduce chat history context sent to the server/ai-model to be
-just the system-prompt, prev-user-request-and-ai-response and cur-user-request, instead of
-full chat history. This way if there is any response with garbage/repeatation, it doesnt
-mess with things beyond the next question/request/query, in some ways. The trim garbage
-option also tries to help avoid issues with garbage in the context to an extent.
-
-Set max_tokens to 1024, so that a relatively large previous reponse doesnt eat up the space
-available wrt next query-response. However dont forget that the server when started should
-also be started with a model context size of 1k or more, to be on safe side.
-
-  The /completions endpoint of examples/server doesnt take max_tokens, instead it takes the
-  internal n_predict, for now add the same here on the client side, maybe later add max_tokens
-  to /completions endpoint handling code on server side.
-
-NOTE: One may want to experiment with frequency/presence penalty fields in chatRequestOptions
-wrt the set of fields sent to server along with the user query. To check how the model behaves
-wrt repeatations in general in the generated text response.
-
-A end-user can change these behaviour by editing gMe from browser's devel-tool/console or by
-using the providing settings ui.
-
-
-### OpenAi / Equivalent API WebService
-
-One may be abe to handshake with OpenAI/Equivalent api web service's /chat/completions endpoint
-for a minimal chatting experimentation by setting the below.
-
-* the baseUrl in settings ui
-  * https://api.openai.com/v1 or similar
-
-* Wrt request body - gMe.chatRequestOptions
-  * model (settings ui)
-  * any additional fields if required in future
-
-* Wrt request headers - gMe.headers
-  * Authorization (available through settings ui)
-    * Bearer THE_OPENAI_API_KEY
-  * any additional optional header entries like "OpenAI-Organization", "OpenAI-Project" or so
-
-NOTE: Not tested, as there is no free tier api testing available. However logically this might
-work.
-
-
-## At the end
-
-Also a thank you to all open source and open model developers, who strive for the common good.
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
slaren	e9095e6098	async direct io per tensor test	2024-05-22 01:08:52 +02:00
Pavel Fatin	46db3506aa	address review comments	2024-05-21 20:05:26 +02:00
Pavel Fatin	1b17ed7ab6	Direct I/O and Transparent HugePages --direct-io for bypassing page cache (and using THP on Linux) Up to 3-6x faster uncached loading, fewer pageouts, no page cache pollution.	2024-05-21 01:35:23 +02:00