cmake : fix typo

JSON: [key] -> .at(key), assert() -> GGML_ASSERT (#7143)
Revert "llava : add support for moondream vision language model (#6899)"
2026-04-23 16:37:33 +03:00 · 2024-05-08 16:24:02 -04:00 · 2024-05-08 21:53:08 +02:00 · 2024-05-08 22:14:39 +03:00 · 2024-05-08 22:12:06 +03:00 · 2024-05-08 22:08:10 +03:00
117 changed files with 5461 additions and 1449 deletions
--- a/.flake8
+++ b/.flake8
@@ -1,3 +1,17 @@
 [flake8]
 max-line-length = 125
-ignore = W503
+ignore = E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503
+exclude =
+    # Do not traverse examples
+    examples,
+    # Do not include package initializers
+    __init__.py,
+    # No need to traverse our git directory
+    .git,
+    # There's no value in checking cache directories
+    __pycache__,
+    # No need to include the build path
+    build,
+    # This contains builds that we don't want to check
+    dist  # This is generated with `python build .` for package releases
+# max-complexity = 10
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -52,7 +52,19 @@ jobs:
            ftype: q4_0
            pr_comment_enabled: "true"

-    if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.head_ref == 'master' || github.ref_name == 'master' || github.event.push.ref == 'refs/heads/master' }}
+    if: |
+      inputs.gpu-series == 'Standard_NC4as_T4_v3'
+      || (
+        github.event_name == 'schedule'
+        && github.ref_name == 'master'
+        && github.repository_owner == 'ggerganov'
+      )
+      || github.event_name == 'pull_request_target'
+      || (
+        github.event_name == 'push'
+        && github.event.ref == 'refs/heads/master'
+        && github.repository_owner == 'ggerganov'
+      )
    steps:
      - name: Clone
        id: checkout
--- a/.github/workflows/python-lint.yml
+++ b/.github/workflows/python-lint.yml
@@ -20,5 +20,4 @@ jobs:
      - name: flake8 Lint
        uses: py-actions/flake8@v2
        with:
-            ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503"
-            exclude: "examples/*,examples/*/**,*/**/__init__.py,convert-hf-to-gguf-update.py"
+            plugins: "flake8-no-print"
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -3,13 +3,14 @@
 exclude: prompts/.*.txt
 repos:
 - repo: https://github.com/pre-commit/pre-commit-hooks
-  rev: v3.2.0
+  rev: v4.6.0
  hooks:
  - id: trailing-whitespace
  - id: end-of-file-fixer
  - id: check-yaml
  - id: check-added-large-files
 - repo: https://github.com/PyCQA/flake8
-  rev: 6.0.0
+  rev: 7.0.0
  hooks:
  -   id: flake8
+      additional_dependencies: [flake8-no-print]
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -103,6 +103,8 @@ set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for
 set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
                                             "llama: max. batch size for using peer access")
 option(LLAMA_CUDA_NO_PEER_COPY               "llama: do not use peer to peer copies"            OFF)
+option(LLAMA_CUDA_NO_VMM                     "llama: do not try to use CUDA VMM"                OFF)
+
 option(LLAMA_CURL                            "llama: use libcurl to download model from an URL" OFF)
 option(LLAMA_HIPBLAS                         "llama: use hipBLAS"                               OFF)
 option(LLAMA_HIP_UMA                         "llama: use HIP unified memory architecture"       OFF)
@@ -409,6 +411,9 @@ if (LLAMA_CUDA)
        if (LLAMA_CUDA_FORCE_MMQ)
            add_compile_definitions(GGML_CUDA_FORCE_MMQ)
        endif()
+        if (LLAMA_CUDA_NO_VMM)
+            add_compile_definitions(GGML_CUDA_NO_VMM)
+        endif()
        add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
        add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
        if (DEFINED LLAMA_CUDA_DMMV_Y)
@@ -425,7 +430,7 @@ if (LLAMA_CUDA)

        if (LLAMA_STATIC)
            if (WIN32)
-                # As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library
+                # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
                set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
            else ()
                set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
@@ -434,7 +439,11 @@ if (LLAMA_CUDA)
            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
        endif()

-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver)
+        if (LLAMA_CUDA_NO_VMM)
+            # No VMM requested, no need to link directly with the cuda driver lib (libcuda.so)
+        else()
+            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
+        endif()

    if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
        # 52 == lowest CUDA 12 standard
--- a/3
+++ b/3
@@ -77,11 +77,10 @@ test: $(TEST_TARGETS)
 			./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
 			./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
 			./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
-			./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \
-			./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-llm.gguf; \
 			./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
 			./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \
 			./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-refact.gguf; \
 		elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \
 			continue; \
 		elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
--- a/README.md
+++ b/README.md
@@ -20,7 +20,8 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ### Hot topics

- **BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920**
+- **Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021**
+- BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920
 - MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387
 - Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404
 - Fix major bug in Metal batched inference https://github.com/ggerganov/llama.cpp/pull/6225
@@ -139,7 +140,6 @@ Typically finetunes of the base models below are supported as well.
 - [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)
 - [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
 - [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
- [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)

 **HTTP server**

@@ -712,6 +712,8 @@ Building the program with BLAS support may lead to some performance improvements

 To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.

+Note: `convert.py` does not support LLaMA 3; use `convert-hf-to-gguf.py` with LLaMA 3 models downloaded from Hugging Face instead.
+
 ```bash
 # obtain the official LLaMA model weights and place them in ./models
 ls ./models
@@ -933,17 +935,25 @@ If your issue is with model generation quality, then please at least scan the fo

 ### Android

+#### Build on Android using Termux
+[Termux](https://github.com/termux/termux-app#installation) lets you build and run `llama.cpp` directly on an Android device (no root required).
+```
+apt update && apt upgrade -y
+apt install git make cmake
+```
+
+It's recommended to move your model inside the `~/` directory for best performance:
+```
+cd storage/downloads
+mv model.gguf ~/
+```
+
+[Get the code](https://github.com/ggerganov/llama.cpp#get-the-code) & [follow the Linux build instructions](https://github.com/ggerganov/llama.cpp#build) to build `llama.cpp`.
+
 #### Building the Project using Android NDK
-You can easily run `llama.cpp` on Android device with [termux](https://termux.dev/).
-
-First, install the essential packages for termux:
-```
-pkg install clang wget git cmake
-```
-Second, obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake:
-
-You can execute the following commands on your computer to avoid downloading the NDK to your mobile. Of course, you can also do this in Termux.
+Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.

+Execute the following commands on your computer to avoid downloading the NDK to your mobile device. Alternatively, you can do this in Termux:
 ```
 $ mkdir build-android
 $ cd build-android
@@ -951,7 +961,9 @@ $ export NDK=<your_ndk_directory>
 $ cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
 $ make
 ```
-Install [termux](https://termux.dev/) on your device and run `termux-setup-storage` to get access to your SD card.
+
+Install [Termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (on Android 11+, run the command twice).
+
 Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission:

 (Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`)
@@ -973,53 +985,10 @@ $cd /data/data/com.termux/files/home/bin
 $./main -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml
 ```

-Here is a demo of an interactive session running on Pixel 5 phone:
+Here's a demo of an interactive session running on Pixel 5 phone:

 https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4

-#### Building the Project using Termux (F-Droid)
-Termux from F-Droid offers an alternative route to execute the project on an Android device. This method empowers you to construct the project right from within the terminal, negating the requirement for a rooted device or SD Card.
-
-Outlined below are the directives for installing the project using OpenBLAS and CLBlast. This combination is specifically designed to deliver peak performance on recent devices that feature a GPU.
-
-If you opt to utilize OpenBLAS, you'll need to install the corresponding package.
-```
-apt install libopenblas
-```
-
-Subsequently, if you decide to incorporate CLBlast, you'll first need to install the requisite OpenCL packages:
-```
-apt install ocl-icd opencl-headers opencl-clhpp clinfo
-```
-
-In order to compile CLBlast, you'll need to first clone the respective Git repository, which can be found at this URL: https://github.com/CNugteren/CLBlast. Alongside this, clone this repository into your home directory. Once this is done, navigate to the CLBlast folder and execute the commands detailed below:
-```
-cmake .
-make
-cp libclblast.so* $PREFIX/lib
-cp ./include/clblast.h ../llama.cpp
-```
-
-Following the previous steps, navigate to the LlamaCpp directory. To compile it with OpenBLAS and CLBlast, execute the command provided below:
-```
-cp /data/data/com.termux/files/usr/include/openblas/cblas.h .
-cp /data/data/com.termux/files/usr/include/openblas/openblas_config.h .
-make LLAMA_CLBLAST=1 //(sometimes you need to run this command twice)
-```
-
-Upon completion of the aforementioned steps, you will have successfully compiled the project. To run it using CLBlast, a slight adjustment is required: a command must be issued to direct the operations towards your device's physical GPU, rather than the virtual one. The necessary command is detailed below:
-```
-GGML_OPENCL_PLATFORM=0
-GGML_OPENCL_DEVICE=0
-export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH
-```
-
-(Note: some Android devices, like the Zenfone 8, need the following command instead - "export LD_LIBRARY_PATH=/system/vendor/lib64:$LD_LIBRARY_PATH". Source: https://www.reddit.com/r/termux/comments/kc3ynp/opencl_working_in_termux_more_in_comments/ )
-
-For easy and swift re-execution, consider documenting this final part in a .sh script file. This will enable you to rerun the process with minimal hassle.
-
-Place your desired model into the `~/llama.cpp/models/` directory and execute the `./main (...)` script.
-
 ### Docker

 #### Prerequisites
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -160,9 +160,8 @@ function gg_run_test_scripts_debug {

    set -e

-    # TODO: too slow, run on dedicated node
-   #(cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
-   #(cd ./examples/quantize   && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+    (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+    (cd ./examples/quantize   && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log

    set +e
 }
@@ -695,8 +694,10 @@ test $ret -eq 0 && gg_run ctest_release
 if [ -z ${GG_BUILD_LOW_PERF} ]; then
    test $ret -eq 0 && gg_run embd_bge_small

-    test $ret -eq 0 && gg_run test_scripts_debug
-    test $ret -eq 0 && gg_run test_scripts_release
+    if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
+        test $ret -eq 0 && gg_run test_scripts_debug
+        test $ret -eq 0 && gg_run test_scripts_release
+    fi

    if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
        if [ -z ${GG_BUILD_CUDA} ]; then
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1,4 +1,6 @@
 #include "common.h"
+// Change JSON_ASSERT from assert() to GGML_ASSERT:
+#define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"
 #include "json-schema-to-grammar.h"
 #include "llama.h"
@@ -76,7 +78,7 @@ int32_t get_num_physical_cores() {
    // enumerate the set of thread siblings, num entries is num cores
    std::unordered_set<std::string> siblings;
    for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) {
-        std::ifstream thread_siblings("/sys/devices/system/cpu"
+        std::ifstream thread_siblings("/sys/devices/system/cpu/cpu"
            + std::to_string(cpu) + "/topology/thread_siblings");
        if (!thread_siblings.is_open()) {
            break; // no more cpus
@@ -911,6 +913,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        params.instruct = true;
        return true;
    }
+    if (arg == "-cnv" || arg == "--conversation") {
+        params.conversation = true;
+        return true;
+    }
    if (arg == "-cml" || arg == "--chatml") {
        params.chatml = true;
        return true;
@@ -1417,6 +1423,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  --version             show version and build info\n");
    printf("  -i, --interactive     run in interactive mode\n");
    printf("  --interactive-first   run in interactive mode and wait for input right away\n");
+    printf("  -cnv, --conversation  run in conversation mode (does not print special tokens and suffix/prefix)\n");
    printf("  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
    printf("  -cml, --chatml        run in chatml mode (use with ChatML-compatible models)\n");
    printf("  --multiline-input     allows you to write or paste multiple lines without ending each in '\\'\n");
@@ -1964,18 +1971,18 @@ static bool llama_download_file(const std::string & url, const std::string & pat
            try {
                metadata_in >> metadata;
                fprintf(stderr, "%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
-                if (metadata.contains("url") && metadata["url"].is_string()) {
-                    auto previous_url = metadata["url"].get<std::string>();
+                if (metadata.contains("url") && metadata.at("url").is_string()) {
+                    auto previous_url = metadata.at("url").get<std::string>();
                    if (previous_url != url) {
                        fprintf(stderr, "%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
                        return false;
                    }
                }
-                if (metadata.contains("etag") && metadata["etag"].is_string()) {
-                    etag = metadata["etag"];
+                if (metadata.contains("etag") && metadata.at("etag").is_string()) {
+                    etag = metadata.at("etag");
                }
-                if (metadata.contains("lastModified") && metadata["lastModified"].is_string()) {
-                    last_modified = metadata["lastModified"];
+                if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
+                    last_modified = metadata.at("lastModified");
                }
            } catch (const nlohmann::json::exception & e) {
                fprintf(stderr, "%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
--- a/common/common.h
+++ b/common/common.h
@@ -140,6 +140,7 @@ struct gpt_params {
    bool random_prompt     = false; // do not randomize prompt if none provided
    bool use_color         = false; // use color to distinguish generations and inputs
    bool interactive       = false; // interactive mode
+    bool conversation      = false; // conversation mode (does not print special tokens and suffix/prefix)
    bool chatml            = false; // chatml mode (used for models trained on chatml syntax)
    bool prompt_cache_all  = false; // save user input and generations to prompt cache
    bool prompt_cache_ro   = false; // open the prompt cache read-only and do not update it
--- a/common/json-schema-to-grammar.h
+++ b/common/json-schema-to-grammar.h
@@ -1,4 +1,8 @@
 #pragma once
+
+#include "ggml.h"
+// Change JSON_ASSERT from assert() to GGML_ASSERT:
+#define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"

 std::string json_schema_to_grammar(const nlohmann::ordered_json& schema);
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -35,6 +35,8 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_

    result->prev.resize(params.n_prev);

+    result->n_considered = 0;
+
    llama_sampling_set_rng_seed(result, params.seed);

    return result;
@@ -64,6 +66,7 @@ void llama_sampling_reset(llama_sampling_context * ctx) {

    std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
    ctx->cur.clear();
+    ctx->n_considered = 0;
 }

 void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) {
@@ -253,6 +256,8 @@ static llama_token llama_sampling_sample_impl(
        }
    }

+    ctx_sampling->n_considered = cur_p.size;
+
    return id;
 }

--- a/common/sampling.h
+++ b/common/sampling.h
@@ -81,6 +81,7 @@ struct llama_sampling_context {
    // TODO: replace with ring-buffer
    std::vector<llama_token>      prev;
    std::vector<llama_token_data> cur;
+    size_t n_considered;

    std::mt19937 rng;
 };
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python3
+
 # This script downloads the tokenizer models of the specified models from Huggingface and
 # generates the get_vocab_base_pre() function for convert-hf-to-gguf.py
 #
@@ -21,6 +23,7 @@
 # TODO: automate the update of convert-hf-to-gguf.py
 #

+import logging
 import os
 import requests
 import sys
@@ -28,12 +31,18 @@ import json

 from hashlib import sha256
 from enum import IntEnum, auto
+from transformers import AutoTokenizer
+
+logging.basicConfig(level=logging.DEBUG)
+logger = logging.getLogger("convert-hf-to-gguf-update")
+

 class TOKENIZER_TYPE(IntEnum):
    SPM = auto()
    BPE = auto()
    WPM = auto()

+
 # TODO: this string has to exercise as much pre-tokenizer functionality as possible
 #       will be updated with time - contributions welcome
 chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
@@ -41,36 +50,43 @@ chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶‍
 if len(sys.argv) == 2:
    token = sys.argv[1]
 else:
-    print("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
+    logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
    sys.exit(1)

 # TODO: add models here, base models preferred
 models = [
-        { "name": "llama-spm",      "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
-        { "name": "llama-bpe",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
-        { "name": "phi-3",          "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
-        { "name": "deepseek-llm",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
-        { "name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
-        { "name": "falcon",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
-        { "name": "bert-bge",       "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
-        { "name": "mpt",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
-        { "name": "starcoder",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
-        { "name": "gpt-2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
-        ]
+    {"name": "llama-spm",      "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
+    {"name": "llama-bpe",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
+    {"name": "phi-3",          "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
+    {"name": "deepseek-llm",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
+    {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
+    {"name": "falcon",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
+    {"name": "bert-bge",       "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
+    {"name": "mpt",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
+    {"name": "starcoder",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
+    {"name": "gpt-2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
+    {"name": "refact",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
+    {"name": "command-r",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
+    {"name": "qwen2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
+    {"name": "olmo",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
+    {"name": "dbrx",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
+]

 # make directory "models/tokenizers" if it doesn't exist
 if not os.path.exists("models/tokenizers"):
    os.makedirs("models/tokenizers")

+
 def download_file_with_auth(url, token, save_path):
    headers = {"Authorization": f"Bearer {token}"}
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        with open(save_path, 'wb') as f:
            f.write(response.content)
-        print(f"File {save_path} downloaded successfully")
+        logger.info(f"File {save_path} downloaded successfully")
    else:
-        print(f"Failed to download file. Status code: {response.status_code}")
+        logger.info(f"Failed to download file. Status code: {response.status_code}")
+

 # download the tokenizer models
 for model in models:
@@ -81,10 +97,10 @@ for model in models:
    if not os.path.exists(f"models/tokenizers/{name}"):
        os.makedirs(f"models/tokenizers/{name}")
    else:
-        print(f"Directory models/tokenizers/{name} already exists - skipping")
+        logger.info(f"Directory models/tokenizers/{name} already exists - skipping")
        continue

-    print(f"Downloading {name} to models/tokenizers/{name}")
+    logger.info(f"Downloading {name} to models/tokenizers/{name}")

    url = f"{repo}/raw/main/config.json"
    save_path = f"models/tokenizers/{name}/config.json"
@@ -94,6 +110,14 @@ for model in models:
    save_path = f"models/tokenizers/{name}/tokenizer.json"
    download_file_with_auth(url, token, save_path)

+    # if the downloaded file is smaller than 1KB, it is likely an LFS pointer rather than the real file - download the resolved file instead
+    if os.path.getsize(save_path) < 1024:
+        # remove the file
+        os.remove(save_path)
+        url = f"{repo}/resolve/main/tokenizer.json"
+        save_path = f"models/tokenizers/{name}/tokenizer.json"
+        download_file_with_auth(url, token, save_path)
+
    if tokt == TOKENIZER_TYPE.SPM:
        url = f"{repo}/resolve/main/tokenizer.model"
        save_path = f"models/tokenizers/{name}/tokenizer.model"
@@ -115,80 +139,84 @@ for model in models:
        continue

    # create the tokenizer
-    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")

    chktok = tokenizer.encode(chktxt)
    chkhsh = sha256(str(chktok).encode()).hexdigest()

-    print(f"model: {name}")
-    print(f"tokt: {tokt}")
-    print(f"repo: {model['repo']}")
-    print(f"chktok: {chktok}")
-    print(f"chkhsh: {chkhsh}")
+    logger.info(f"model: {name}")
+    logger.info(f"tokt: {tokt}")
+    logger.info(f"repo: {model['repo']}")
+    logger.info(f"chktok: {chktok}")
+    logger.info(f"chkhsh: {chkhsh}")

    # print the "pre_tokenizer" content from the tokenizer.json
    with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
        cfg = json.load(f)
+        normalizer = cfg["normalizer"]
+        logger.info("normalizer: " + json.dumps(normalizer, indent=4))
        pre_tokenizer = cfg["pre_tokenizer"]
-        print("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
+        logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))

-    print(f"\n")
+    logger.info("")

    src_ifs += f"        if chkhsh == \"{chkhsh}\":\n"
    src_ifs += f"            # ref: {model['repo']}\n"
    src_ifs += f"            res = \"{name}\"\n"

-src_func = ""
-src_func +=  "    def get_vocab_base_pre(self, tokenizer) -> str:\n"
-src_func +=  "        # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that\n"
-src_func +=  "        # is specific for the BPE pre-tokenizer used by the model\n"
-src_func +=  "        # we will use this unique identifier to write a \"tokenizer.ggml.pre\" entry in the GGUF file which we can\n"
-src_func +=  "        # use in llama.cpp to implement the same pre-tokenizer\n"
-src_func +=  "\n"
-src_func += f"        chktxt = {repr(chktxt)}\n"
-src_func +=  "\n"
-src_func +=  "        chktok = tokenizer.encode(chktxt)\n"
-src_func +=  "        chkhsh = sha256(str(chktok).encode()).hexdigest()\n"
-src_func +=  "\n"
-src_func +=  "        print(f\"chktok: {chktok}\")\n"
-src_func +=  "        print(f\"chkhsh: {chkhsh}\")\n"
-src_func +=  "\n"
-src_func +=  "        res = None\n"
-src_func +=  "\n"
-src_func +=  "        # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script\n"
-src_func +=  "        #       or pull the latest version of the model from Huggingface\n"
-src_func +=  "        #       don't edit the hashes manually!\n"
-src_func += f"{src_ifs}\n"
-src_func +=  "        if res is None:\n"
-src_func +=  "            print(\"\\n\")\n"
-src_func +=  "            print(\"**************************************************************************************\")\n"
-src_func +=  "            print(\"** WARNING: The BPE pre-tokenizer was not recognized!\")\n"
-src_func +=  "            print(\"**          There are 2 possible reasons for this:\")\n"
-src_func +=  "            print(\"**          - the model has not been added to convert-hf-to-gguf-update.py yet\")\n"
-src_func +=  "            print(\"**          - the pre-tokenization config has changed upstream\")\n"
-src_func +=  "            print(\"**          Check your model files and convert-hf-to-gguf-update.py and update them accordingly.\")\n"
-src_func +=  "            print(\"** ref:     https://github.com/ggerganov/llama.cpp/pull/6920\")\n"
-src_func +=  "            print(\"**\")\n"
-src_func +=  "            print(f\"** chkhsh:  {chkhsh}\")\n"
-src_func +=  "            print(\"**************************************************************************************\")\n"
-src_func +=  "            print(\"\\n\")\n"
-src_func +=  "            raise NotImplementedError(\"BPE pre-tokenizer was not recognized - update get_vocab_base_pre()\")\n"
-src_func +=  "\n"
-src_func +=  "        print(f\"tokenizer.ggml.pre: {res}\")\n"
-src_func +=  "        print(f\"chkhsh: {chkhsh}\")\n"
-src_func +=  "\n"
-src_func +=  "        return res\n"
+src_func = f"""
+    def get_vocab_base_pre(self, tokenizer) -> str:
+        # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
+        # is specific for the BPE pre-tokenizer used by the model
+        # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
+        # use in llama.cpp to implement the same pre-tokenizer

-print(src_func)
+        chktxt = {repr(chktxt)}

-print("\n")
-print("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
-print("\n")
+        chktok = tokenizer.encode(chktxt)
+        chkhsh = sha256(str(chktok).encode()).hexdigest()
+
+        logger.debug(f"chktok: {{chktok}}")
+        logger.debug(f"chkhsh: {{chkhsh}}")
+
+        res = None
+
+        # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script
+        #       or pull the latest version of the model from Huggingface
+        #       don't edit the hashes manually!
+{src_ifs}
+        if res is None:
+            logger.warning("\\n")
+            logger.warning("**************************************************************************************")
+            logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
+            logger.warning("**          There are 2 possible reasons for this:")
+            logger.warning("**          - the model has not been added to convert-hf-to-gguf-update.py yet")
+            logger.warning("**          - the pre-tokenization config has changed upstream")
+            logger.warning("**          Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
+            logger.warning("** ref:     https://github.com/ggerganov/llama.cpp/pull/6920")
+            logger.warning("**")
+            logger.warning(f"** chkhsh:  {{chkhsh}}")
+            logger.warning("**************************************************************************************")
+            logger.warning("\\n")
+            raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
+
+        logger.debug(f"tokenizer.ggml.pre: {{repr(res)}}")
+        logger.debug(f"chkhsh: {{chkhsh}}")
+
+        return res
+"""
+
+print(src_func) # noqa: NP100
+
+logger.info("\n")
+logger.info("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
+logger.info("\n")

 # generate tests for each tokenizer model

 tests = [
+    "ied 4 ½ months",
+    "Führer",
    "",
    " ",
    "  ",
@@ -250,7 +278,6 @@ for model in models:
    tokt = model["tokt"]

    # create the tokenizer
-    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")

    with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
@@ -265,15 +292,15 @@ for model in models:
                f.write(f" {r}")
            f.write("\n")

-    print(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")
+    logger.info(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")

 # generate commands for creating vocab files

-print("\nRun the following commands to generate the vocab files for testing:\n")
+logger.info("\nRun the following commands to generate the vocab files for testing:\n")

 for model in models:
    name = model["name"]

-    print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only")
+    print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") # noqa: NP100

-print("\n")
+logger.info("\n")
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -2,6 +2,7 @@

 from __future__ import annotations

+import logging
 import argparse
 import contextlib
 import json
@@ -26,6 +27,8 @@ import gguf

 from convert import LlamaHfVocab, permute

+logger = logging.getLogger("hf-to-gguf")
+

 ###### MODEL DEFINITIONS ######

@@ -76,7 +79,7 @@ class Model(ABC):

    def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
        for part_name in self.part_names:
-            print(f"gguf: loading model part '{part_name}'")
+            logger.info(f"gguf: loading model part '{part_name}'")
            ctx: ContextManager[Any]
            if self.is_safetensors:
                from safetensors import safe_open
@@ -95,42 +98,42 @@ class Model(ABC):

        if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
            self.gguf_writer.add_context_length(n_ctx)
-            print(f"gguf: context length = {n_ctx}")
+            logger.info(f"gguf: context length = {n_ctx}")

        n_embd = self.find_hparam(["hidden_size", "n_embd"])
        self.gguf_writer.add_embedding_length(n_embd)
-        print(f"gguf: embedding length = {n_embd}")
+        logger.info(f"gguf: embedding length = {n_embd}")

        if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None:
            self.gguf_writer.add_feed_forward_length(n_ff)
-            print(f"gguf: feed forward length = {n_ff}")
+            logger.info(f"gguf: feed forward length = {n_ff}")

        n_head = self.find_hparam(["num_attention_heads", "n_head"])
        self.gguf_writer.add_head_count(n_head)
-        print(f"gguf: head count = {n_head}")
+        logger.info(f"gguf: head count = {n_head}")

        if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
            self.gguf_writer.add_head_count_kv(n_head_kv)
-            print(f"gguf: key-value head count = {n_head_kv}")
+            logger.info(f"gguf: key-value head count = {n_head_kv}")

        if (rope_theta := self.hparams.get("rope_theta")) is not None:
            self.gguf_writer.add_rope_freq_base(rope_theta)
-            print(f"gguf: rope theta = {rope_theta}")
+            logger.info(f"gguf: rope theta = {rope_theta}")
        if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
            self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
-            print(f"gguf: rms norm epsilon = {f_rms_eps}")
+            logger.info(f"gguf: rms norm epsilon = {f_rms_eps}")
        if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None:
            self.gguf_writer.add_layer_norm_eps(f_norm_eps)
-            print(f"gguf: layer norm epsilon = {f_norm_eps}")
+            logger.info(f"gguf: layer norm epsilon = {f_norm_eps}")
        if (n_experts := self.hparams.get("num_local_experts")) is not None:
            self.gguf_writer.add_expert_count(n_experts)
-            print(f"gguf: expert count = {n_experts}")
+            logger.info(f"gguf: expert count = {n_experts}")
        if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
            self.gguf_writer.add_expert_used_count(n_experts_used)
-            print(f"gguf: experts used count = {n_experts_used}")
+            logger.info(f"gguf: experts used count = {n_experts_used}")

        self.gguf_writer.add_file_type(self.ftype)
-        print(f"gguf: file type = {self.ftype}")
+        logger.info(f"gguf: file type = {self.ftype}")

    def write_tensors(self):
        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
@@ -151,8 +154,7 @@ class Model(ABC):
            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
+                raise ValueError(f"Can not map tensor {name!r}")

            n_dims = len(data.shape)
            data_dtype = data.dtype
@@ -169,7 +171,7 @@ class Model(ABC):
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

-            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+            logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)

@@ -274,8 +276,8 @@ class Model(ABC):
        chktok = tokenizer.encode(chktxt)
        chkhsh = sha256(str(chktok).encode()).hexdigest()

-        print(f"chktok: {chktok}")
-        print(f"chkhsh: {chkhsh}")
+        logger.debug(f"chktok: {chktok}")
+        logger.debug(f"chkhsh: {chkhsh}")

        res = None

@@ -306,24 +308,39 @@ class Model(ABC):
        if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
            # ref: https://huggingface.co/openai-community/gpt2
            res = "gpt-2"
+        if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
+            # ref: https://huggingface.co/smallcloudai/Refact-1_6-base
+            res = "refact"
+        if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
+            # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
+            res = "command-r"
+        if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
+            # ref: https://huggingface.co/Qwen/Qwen1.5-7B
+            res = "qwen2"
+        if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
+            # ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf
+            res = "olmo"
+        if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e":
+            # ref: https://huggingface.co/databricks/dbrx-instruct
+            res = "dbrx"

        if res is None:
-            print("\n")
-            print("**************************************************************************************")
-            print("** WARNING: The BPE pre-tokenizer was not recognized!")
-            print("**          There are 2 possible reasons for this:")
-            print("**          - the model has not been added to convert-hf-to-gguf-update.py yet")
-            print("**          - the pre-tokenization config has changed upstream")
-            print("**          Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
-            print("** ref:     https://github.com/ggerganov/llama.cpp/pull/6920")
-            print("**")
-            print(f"** chkhsh:  {chkhsh}")
-            print("**************************************************************************************")
-            print("\n")
+            logger.warning("\n")
+            logger.warning("**************************************************************************************")
+            logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
+            logger.warning("**          There are 2 possible reasons for this:")
+            logger.warning("**          - the model has not been added to convert-hf-to-gguf-update.py yet")
+            logger.warning("**          - the pre-tokenization config has changed upstream")
+            logger.warning("**          Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
+            logger.warning("** ref:     https://github.com/ggerganov/llama.cpp/pull/6920")
+            logger.warning("**")
+            logger.warning(f"** chkhsh:  {chkhsh}")
+            logger.warning("**************************************************************************************")
+            logger.warning("\n")
            raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")

-        print(f"tokenizer.ggml.pre: {res}")
-        print(f"chkhsh: {chkhsh}")
+        logger.debug(f"tokenizer.ggml.pre: {repr(res)}")
+        logger.debug(f"chkhsh: {chkhsh}")

        return res

@@ -439,9 +456,7 @@ class Model(ABC):

        if vocab_size > len(tokens):
            pad_count = vocab_size - len(tokens)
-            print(
-                f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]"
-            )
+            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
            for i in range(1, pad_count + 1):
                tokens.append(f"[PAD{i}]")
                scores.append(-1000.0)
@@ -553,7 +568,7 @@ class BloomModel(Model):
                    ),
                    axis=0,
                )
-                print("re-format attention.linear_qkv.weight")
+                logger.info("re-format attention.linear_qkv.weight")
            elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name):
                qkv_bias = data.reshape((n_head, 3, n_embed // n_head))
                data = np.concatenate(
@@ -564,13 +579,12 @@ class BloomModel(Model):
                    ),
                    axis=0,
                )
-                print("re-format attention.linear_qkv.bias")
+                logger.info("re-format attention.linear_qkv.bias")

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
+                raise ValueError(f"Can not map tensor {name!r}")

            n_dims = len(data.shape)
            data_dtype = data.dtype
@@ -587,13 +601,13 @@ class BloomModel(Model):
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

-            print(f"=> {new_name}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
+            logger.info(f"=> {new_name}, shape = {data.shape}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)

            if not has_lm_head and name == "word_embeddings.weight":
                self.gguf_writer.add_tensor("output.weight", data)
-                print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")
+                logger.info(f"{name} => output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")  # single f-string: logging %-formats extra positional args into the first one


@Model.register("MPTForCausalLM")
@@ -653,8 +667,7 @@ class MPTModel(Model):
            else:
                new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
+                raise ValueError(f"Can not map tensor {name!r}")

            n_dims = len(data.shape)
            data_dtype = data.dtype
@@ -671,7 +684,7 @@ class MPTModel(Model):
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

-            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+            logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)

@@ -697,8 +710,7 @@ class OrionModel(Model):
        elif "model_max_length" in self.hparams:
            ctx_length = self.hparams["model_max_length"]
        else:
-            print("gguf: can not find ctx length parameter.")
-            sys.exit()
+            raise ValueError("gguf: can not find ctx length parameter.")

        self.gguf_writer.add_file_type(self.ftype)
        self.gguf_writer.add_name(self.dir_model.name)
@@ -736,8 +748,7 @@ class OrionModel(Model):
            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
+                raise ValueError(f"Can not map tensor {name!r}")

            n_dims = len(data.shape)
            data_dtype = data.dtype
@@ -754,7 +765,7 @@ class OrionModel(Model):
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

-            print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+            logger.info(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)


@@ -779,8 +790,7 @@ class BaichuanModel(Model):
        elif "model_max_length" in self.hparams:
            ctx_length = self.hparams["model_max_length"]
        else:
-            print("gguf: can not find ctx length parameter.")
-            sys.exit()
+            raise ValueError("gguf: can not find ctx length parameter.")

        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_source_hf_repo(hf_repo)
@@ -809,7 +819,7 @@ class BaichuanModel(Model):

        for i in range(block_count):
            if (w := model_kv.get(f"model.layers.{i}.self_attn.W_pack.weight")) is not None:
-                print(f"Unpacking and permuting layer {i}")
+                logger.info(f"Unpacking and permuting layer {i}")
                model_kv[f"model.layers.{i}.self_attn.q_proj.weight"] = \
                    self._reverse_hf_permute_part(w, 0, head_count, head_count)
                model_kv[f"model.layers.{i}.self_attn.k_proj.weight"] = \
@@ -834,8 +844,7 @@ class BaichuanModel(Model):
            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
+                raise ValueError(f"Can not map tensor {name!r}")

            n_dims = len(data.shape)
            data_dtype = data.dtype
@@ -852,7 +861,7 @@ class BaichuanModel(Model):
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

-            print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+            logger.info(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)

    def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
@@ -937,8 +946,7 @@ class XverseModel(Model):
        elif "model_max_length" in self.hparams:
            ctx_length = self.hparams["model_max_length"]
        else:
-            print("gguf: can not find ctx length parameter.")
-            sys.exit()
+            raise ValueError("gguf: can not find ctx length parameter.")

        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_source_hf_repo(hf_repo)
@@ -987,8 +995,7 @@ class XverseModel(Model):
            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
+                raise ValueError(f"Can not map tensor {name!r}")

            n_dims = len(data.shape)
            data_dtype = data.dtype
@@ -1005,7 +1012,7 @@ class XverseModel(Model):
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

-            print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+            logger.info(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)

    def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
@@ -1092,8 +1099,7 @@ class FalconModel(Model):
            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
+                raise ValueError(f"Can not map tensor {name!r}")

            n_dims = len(data.shape)
            data_dtype = data.dtype
@@ -1110,7 +1116,7 @@ class FalconModel(Model):
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

-            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+            logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)

@@ -1197,8 +1203,7 @@ class RefactModel(Model):
            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight",))
            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
+                raise ValueError(f"Can not map tensor {name!r}")

            n_dims = len(data.shape)
            data_dtype = data.dtype
@@ -1215,7 +1220,7 @@ class RefactModel(Model):
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

-            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+            logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)

@@ -1264,10 +1269,9 @@ class PersimmonModel(Model):
            data = data_torch.to(torch.float32).squeeze().numpy()
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
+                raise ValueError(f"Can not map tensor {name!r}")
            n_dims = len(data.shape)
-            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+            logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)


@@ -1332,8 +1336,7 @@ class StableLMModel(Model):
            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
+                raise ValueError(f"Can not map tensor {name!r}")

            n_dims = len(data.shape)
            data_dtype = data.dtype
@@ -1350,7 +1353,7 @@ class StableLMModel(Model):
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2:
                data = data.astype(np.float16)

-            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+            logger.debug(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)

@@ -1366,8 +1369,7 @@ class StableLMModel(Model):
            merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight"
            new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
+                raise ValueError(f"Can not map tensor {name!r}")
            if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
                data = data.astype(np.float32)

@@ -1375,7 +1377,7 @@ class StableLMModel(Model):
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2:
                data = data.astype(np.float16)

-            print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
+            logger.debug(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)

@@ -1427,7 +1429,7 @@ class LlamaModel(Model):
        experts = dict()
        for name, data_torch in self.get_tensors():
            # we don't need these
-            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
+            if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
                continue

            old_dtype = data_torch.dtype
@@ -1480,10 +1482,9 @@ class LlamaModel(Model):

                            new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
                            if new_name is None:
-                                print(f"Can not map tensor {name!r}")
-                                sys.exit()
+                                raise ValueError(f"Can not map tensor {name!r}")

-                            print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
+                            logger.info(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")

                            self.gguf_writer.add_tensor(new_name, data)
                continue
@@ -1491,8 +1492,7 @@ class LlamaModel(Model):
            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
+                raise ValueError(f"Can not map tensor {name!r}")

            n_dims = len(data.shape)
            data_dtype = data.dtype
@@ -1509,7 +1509,7 @@ class LlamaModel(Model):
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

-            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+            logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)

@@ -1584,10 +1584,9 @@ class GrokModel(Model):

                            new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
                            if new_name is None:
-                                print(f"Can not map tensor {name!r}")
-                                sys.exit()
+                                raise ValueError(f"Can not map tensor {name!r}")

-                            print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
+                            logger.info(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")

                            self.gguf_writer.add_tensor(new_name, data)
                continue
@@ -1595,8 +1594,7 @@ class GrokModel(Model):
            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
+                raise ValueError(f"Can not map tensor {name!r}")

            n_dims = len(data.shape)
            data_dtype = data.dtype
@@ -1613,7 +1611,7 @@ class GrokModel(Model):
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

-            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+            logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)

@@ -1646,7 +1644,7 @@ class DbrxModel(Model):
        self.gguf_writer.add_layer_norm_eps(1e-5)

        self.gguf_writer.add_file_type(self.ftype)
-        print(f"gguf: file type = {self.ftype}")
+        logger.info(f"gguf: file type = {self.ftype}")

    def write_tensors(self):
        block_count = self.hparams.get("n_layers")
@@ -1689,8 +1687,7 @@ class DbrxModel(Model):
            # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15
            new_name = tensor_map.get_name(name if not experts else name + ".weight", try_suffixes=(".weight",))
            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
+                raise ValueError(f"Can not map tensor {name!r}")

            n_dims = len(data.shape)
            data_dtype = data.dtype
@@ -1698,8 +1695,7 @@ class DbrxModel(Model):
            # Most of the codebase that takes in 1D tensors only handles F32 tensors
            # and most of the outputs tensors are F32.
            if data_dtype != np.float32 and n_dims == 1:
-                print(f"Can not map tensor {name!r}: all 1D tensors must be F32")
-                sys.exit()
+                raise ValueError(f"Can not map tensor {name!r}: all 1D tensors must be F32")

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
@@ -1709,7 +1705,7 @@ class DbrxModel(Model):
            if self.ftype == 1 and data_dtype == np.float32 and n_dims > 1:
                data = data.astype(np.float16)

-            print(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
+            logger.debug(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)

@@ -1771,8 +1767,7 @@ class MiniCPMModel(Model):
            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
+                raise ValueError(f"Can not map tensor {name!r}")

            n_dims = len(data.shape)
            data_dtype = data.dtype
@@ -1789,7 +1784,7 @@ class MiniCPMModel(Model):
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

-            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+            logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)

@@ -1855,8 +1850,7 @@ class QwenModel(Model):
            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
+                raise ValueError(f"Can not map tensor {name!r}")

            n_dims = len(data.shape)
            data_dtype = data.dtype
@@ -1873,7 +1867,7 @@ class QwenModel(Model):
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

-            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+            logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)


@@ -1950,10 +1944,9 @@ class Qwen2MoeModel(Model):

                            new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
                            if new_name is None:
-                                print(f"Can not map tensor {name!r}")
-                                sys.exit()
+                                raise ValueError(f"Can not map tensor {name!r}")

-                            print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
+                            logger.debug(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")

                            self.gguf_writer.add_tensor(new_name, data)
                continue
@@ -1961,8 +1954,7 @@ class Qwen2MoeModel(Model):
            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
+                raise ValueError(f"Can not map tensor {name!r}")

            n_dims = len(data.shape)
            data_dtype = data.dtype
@@ -1979,7 +1971,7 @@ class Qwen2MoeModel(Model):
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

-            print(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
+            logger.debug(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)

@@ -2024,8 +2016,7 @@ class GPT2Model(Model):
            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
+                raise ValueError(f"Can not map tensor {name!r}")

            n_dims = len(data.shape)
            data_dtype = data.dtype
@@ -2042,13 +2033,13 @@ class GPT2Model(Model):
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

-            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+            logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)

            # note: GPT2 output is tied to (same as) wte in original model
            if new_name == "token_embd.weight":
-                print(f"output.weight, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+                logger.info(f"output.weight, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
                self.gguf_writer.add_tensor("output.weight", data)


@@ -2087,8 +2078,7 @@ class Phi3MiniModel(Model):
        tokenizer_path = self.dir_model / 'tokenizer.model'

        if not tokenizer_path.is_file():
-            print(f'Error: Missing {tokenizer_path}', file=sys.stderr)
-            sys.exit(1)
+            raise ValueError(f'Error: Missing {tokenizer_path}')

        tokenizer = SentencePieceProcessor(str(tokenizer_path))

@@ -2126,7 +2116,7 @@ class Phi3MiniModel(Model):
                for key in added_tokens_json:
                    token_id = added_tokens_json[key]
                    if (token_id >= vocab_size):
-                        print(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                        logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
                        continue

                    tokens[token_id] = key.encode("utf-8")
@@ -2208,8 +2198,7 @@ class PlamoModel(Model):
            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
+                raise ValueError(f"Can not map tensor {name!r}")

            # shuffle for broadcasting of gqa in ggml_mul_mat
            if new_name.endswith("attn_q.weight"):
@@ -2240,7 +2229,7 @@ class PlamoModel(Model):
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

-            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+            logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)

@@ -2286,8 +2275,7 @@ class CodeShellModel(Model):
            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
+                raise ValueError(f"Can not map tensor {name!r}")

            n_dims = len(data.shape)
            data_dtype = data.dtype
@@ -2304,13 +2292,13 @@ class CodeShellModel(Model):
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

-            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+            logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)

            if not has_lm_head and name == "transformer.wte.weight":
                self.gguf_writer.add_tensor("output.weight", data)
-                print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")
+                logger.info(f"{name} => output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")


@Model.register("InternLM2ForCausalLM")
@@ -2332,7 +2320,7 @@ class InternLM2Model(Model):
        toktypes: list[int] = []

        if not tokenizer_path.is_file():
-            print(f'Error: Missing {tokenizer_path}', file=sys.stderr)
+            logger.error(f'Error: Missing {tokenizer_path}')
            sys.exit(1)

        sentencepiece_model = model.ModelProto()
@@ -2349,7 +2337,7 @@ class InternLM2Model(Model):
            if text == b"\x00":
                # (TODO): fixme
                # Hack here and replace the \x00 characters.
-                print(f"InternLM2 convert token '{text}' to '🐉'!")
+                logger.debug(f"InternLM2 convert token '{text}' to '🐉'!")
                text = "🐉"

            toktype = SentencePieceTokenTypes.NORMAL
@@ -2390,7 +2378,7 @@ class InternLM2Model(Model):
            # TODO: this is a hack, should be fixed
            #       https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048
            special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer)
-            print(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \
+            logger.warning(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \
 in chat mode so that the conversation can end normally.")

        special_vocab.add_to_gguf(self.gguf_writer)
@@ -2435,8 +2423,7 @@ in chat mode so that the conversation can end normally.")
        # map tensor names
        new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
        if new_name is None:
-            print(f"Can not map tensor {name!r}")
-            sys.exit()
+            raise ValueError(f"Can not map tensor {name!r}")

        n_dims = len(data.shape)
        data_dtype = data.dtype
@@ -2453,7 +2440,7 @@ in chat mode so that the conversation can end normally.")
        if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
            data = data.astype(np.float16)

-        print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+        logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
        self.gguf_writer.add_tensor(new_name, data)

    def write_tensors(self):
@@ -2564,8 +2551,7 @@ class BertModel(Model):
            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
+                raise ValueError(f"Can not map tensor {name!r}")

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
@@ -2585,7 +2571,7 @@ class BertModel(Model):
                # if f32 desired, convert any float16 to float32
                new_dtype = np.float32

-            print(f"{new_name}, n_dims = {n_dims}, {data_torch.dtype} --> {new_dtype}")
+            logger.info(f"{new_name}, n_dims = {n_dims}, {data_torch.dtype} --> {new_dtype}")

            if data.dtype != new_dtype:
                data = data.astype(new_dtype)
@@ -2664,7 +2650,7 @@ class GemmaModel(Model):
            # lm_head is not used in llama.cpp, while autoawq will include this tensor in model
            # To prevent errors, skip loading lm_head.weight.
            if name == "lm_head.weight":
-                print(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
+                logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
                continue

            old_dtype = data_torch.dtype
@@ -2681,8 +2667,7 @@ class GemmaModel(Model):
            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
+                raise ValueError(f"Can not map tensor {name!r}")

            n_dims = len(data.shape)
            data_dtype = data.dtype
@@ -2693,7 +2678,7 @@ class GemmaModel(Model):
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

-            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+            logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)

@@ -2721,7 +2706,7 @@ class MambaModel(Model):
        else:
            # Use the GPT-NeoX tokenizer when no tokenizer files are present
            tokenizer_path = Path(sys.path[0]) / "models" / "ggml-vocab-gpt-neox.gguf"
-            print(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'")
+            logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'")
            neox_reader = gguf.GGUFReader(tokenizer_path, "r")

            field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL)
@@ -2793,17 +2778,16 @@ class MambaModel(Model):
            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
+                raise ValueError(f"Can not map tensor {name!r}")

            if name.endswith(".A_log"):
-                print("A_log --> A ==> " + new_name)
+                logger.debug("A_log --> A ==> " + new_name)
                data_torch = -torch.exp(data_torch)

            # assuming token_embd.weight is seen before output.weight
            if tok_embd is not None and new_name == output_name:
                if torch.equal(tok_embd, data_torch):
-                    print(f"{output_name} is equivalent to {tok_embd_name}, omitting")
+                    logger.debug(f"{output_name} is equivalent to {tok_embd_name}, omitting")
                    continue
            if new_name == tok_embd_name:
                tok_embd = data_torch
@@ -2826,7 +2810,7 @@ class MambaModel(Model):
            if self.ftype == 1 and data_dtype == np.float32 and new_weight_name.endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2:
                data = data.astype(np.float16)

-            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+            logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)

@@ -2856,8 +2840,9 @@ class OlmoModel(Model):
    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        self.gguf_writer.add_layer_norm_eps(1e-5)
-        if "clip_qkv" in self.hparams is not None:
-            self.gguf_writer.add_clamp_kqv(self.hparams["clip_qkv"])
+        clip_qkv = self.hparams.get("clip_qkv")
+        if clip_qkv is not None:
+            self.gguf_writer.add_clamp_kqv(clip_qkv)

    # Same as super class, but permuting q_proj, k_proj
    # Copied from: LlamaModel
@@ -2885,8 +2870,7 @@ class OlmoModel(Model):
            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
+                raise ValueError(f"Can not map tensor {name!r}")

            n_dims = len(data.shape)
            data_dtype = data.dtype
@@ -2903,7 +2887,7 @@ class OlmoModel(Model):
            if self.ftype == 1 and data_dtype == np.float32 and n_dims == 2:
                data = data.astype(np.float16)

-            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+            logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)

@@ -2936,6 +2920,7 @@ def parse_args() -> argparse.Namespace:
    )
    parser.add_argument("--use-temp-file", action="store_true", help="use the tempfile library while processing (helpful when running out of memory, process killed)")
    parser.add_argument("--model-name", type=str, default=None, help="name of the model")
+    parser.add_argument("--verbose", action="store_true", help="increase output verbosity")

    return parser.parse_args()

@@ -2943,6 +2928,8 @@ def parse_args() -> argparse.Namespace:
 def main() -> None:
    args = parse_args()

+    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
+
    dir_model = args.model

    if args.awq_path:
@@ -2951,15 +2938,15 @@ def main() -> None:
        tmp_model_path = args.model / "weighted_model"
        dir_model = tmp_model_path
        if tmp_model_path.is_dir():
-            print(f"{tmp_model_path} exists as a weighted model.")
+            logger.info(f"{tmp_model_path} exists as a weighted model.")
        else:
            tmp_model_path.mkdir(parents=True, exist_ok=True)
-            print("Saving new weighted model ...")
+            logger.info("Saving new weighted model ...")
            add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path))
-            print(f"Saved weighted model at {tmp_model_path}.")
+            logger.info(f"Saved weighted model at {tmp_model_path}.")

    if not dir_model.is_dir():
-        print(f'Error: {args.model} is not a directory', file=sys.stderr)
+        logger.error(f'Error: {args.model} is not a directory')
        sys.exit(1)

    ftype_map = {
@@ -2973,7 +2960,7 @@ def main() -> None:
        # output in the same directory as the model by default
        fname_out = dir_model / f'ggml-model-{args.outtype}.gguf'

-    print(f"Loading model: {dir_model.name}")
+    logger.info(f"Loading model: {dir_model.name}")

    hparams = Model.load_hparams(dir_model)

@@ -2981,20 +2968,20 @@ def main() -> None:
        model_class = Model.from_model_architecture(hparams["architectures"][0])
        model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file)

-        print("Set model parameters")
+        logger.info("Set model parameters")
        model_instance.set_gguf_parameters()

-        print("Set model tokenizer")
+        logger.info("Set model tokenizer")
        model_instance.set_vocab()

        if args.vocab_only:
-            print(f"Exporting model vocab to '{fname_out}'")
+            logger.info(f"Exporting model vocab to '{fname_out}'")
            model_instance.write_vocab()
        else:
-            print(f"Exporting model to '{fname_out}'")
+            logger.info(f"Exporting model to '{fname_out}'")
            model_instance.write()

-        print(f"Model successfully exported to '{fname_out}'")
+        logger.info(f"Model successfully exported to '{fname_out}'")


 if __name__ == '__main__':
--- a/convert-llama-ggml-to-gguf.py
+++ b/convert-llama-ggml-to-gguf.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 from __future__ import annotations

+import logging
 import argparse
 import os
 import struct
@@ -14,6 +15,8 @@ if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf

+logger = logging.getLogger("ggml-to-gguf")
+

 class GGMLFormat(IntEnum):
    GGML = 0
@@ -125,7 +128,6 @@ class Tensor:
        self.start_offset = offset
        self.len_bytes = n_bytes
        offset += n_bytes
-        # print(n_dims, name_len, dtype, self.dims, self.name, pad)
        return offset - orig_offset


@@ -175,7 +177,7 @@ class GGMLModel:
        offset += self.validate_header(data, offset)
        hp = Hyperparameters()
        offset += hp.load(data, offset)
-        print(f'* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}')
+        logger.info(f'* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}')
        self.validate_conversion(hp.ftype)
        vocab = Vocab(load_scores = self.file_format > GGMLFormat.GGML)
        offset += vocab.load(data, offset, hp.n_vocab)
@@ -215,12 +217,12 @@ class GGMLToGGUF:
                    if float(hp.n_head) / float(x) == gqa:
                        n_kv_head = x
                assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param"
-                print(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}')
+                logger.info(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}')
        self.n_kv_head = n_kv_head
        self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer)

    def save(self):
-        print('* Preparing to save GGUF file')
+        logger.info('* Preparing to save GGUF file')
        gguf_writer = gguf.GGUFWriter(
            self.cfg.output,
            gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA],
@@ -230,11 +232,11 @@ class GGMLToGGUF:
        if self.special_vocab is not None:
            self.special_vocab.add_to_gguf(gguf_writer)
        self.add_tensors(gguf_writer)
-        print("    gguf: write header")
+        logger.info("    gguf: write header")
        gguf_writer.write_header_to_file()
-        print("    gguf: write metadata")
+        logger.info("    gguf: write metadata")
        gguf_writer.write_kv_data_to_file()
-        print("    gguf: write tensors")
+        logger.info("    gguf: write tensors")
        gguf_writer.write_tensors_to_file()
        gguf_writer.close()

@@ -250,7 +252,7 @@ class GGMLToGGUF:
            name = cfg.name if cfg.name is not None else cfg.input.name
        except UnicodeDecodeError:
            name = None
-        print('* Adding model parameters and KV items')
+        logger.info('* Adding model parameters and KV items')
        if name is not None:
            gguf_writer.add_name(name)
        gguf_writer.add_description(desc)
@@ -287,7 +289,7 @@ class GGMLToGGUF:
        toktypes = []
        if self.vocab_override is not None:
            vo = self.vocab_override
-            print('* Adding vocab item(s)')
+            logger.info('* Adding vocab item(s)')
            for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
                tokens.append(vbytes)
                scores.append(score)
@@ -299,7 +301,7 @@ class GGMLToGGUF:
            if len(toktypes) > 0:
                gguf_writer.add_token_types(toktypes)
            return
-        print(f'* Adding {hp.n_vocab} vocab item(s)')
+        logger.info(f'* Adding {hp.n_vocab} vocab item(s)')
        assert len(self.model.vocab.items) >= 3, 'Cannot handle unexpectedly short model vocab'
        for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items):
            tt = 1 # Normal
@@ -334,7 +336,7 @@ class GGMLToGGUF:
    def add_tensors(self, gguf_writer):
        tensor_map = self.name_map
        data = self.data
-        print(f'* Adding {len(self.model.tensors)} tensor(s)')
+        logger.info(f'* Adding {len(self.model.tensors)} tensor(s)')
        for tensor in self.model.tensors:
            name = str(tensor.name, 'UTF-8')
            mapped_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
@@ -344,7 +346,6 @@ class GGMLToGGUF:
                temp = tempdims[1]
                tempdims[1] = tempdims[0]
                tempdims[0] = temp
-            # print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}')
            gguf_writer.add_tensor(
                mapped_name,
                data[tensor.start_offset:tensor.start_offset + tensor.len_bytes],
@@ -401,33 +402,35 @@ def handle_args():
                        help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
    parser.add_argument("--vocabtype", default="spm,hfft",
                        help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm,hfft)")
+    parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
    return parser.parse_args()


 def main():
    cfg = handle_args()
-    print(f'* Using config: {cfg}')
-    print('\n=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===\n')
+    logging.basicConfig(level=logging.DEBUG if cfg.verbose else logging.INFO)
+    logger.info(f'* Using config: {cfg}')
+    logger.warning('=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===')
    if cfg.model_metadata_dir is None and (cfg.gqa == 1 or cfg.eps == '5.0e-06'):
-        print('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".')
+        logger.info('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".')
    data = np.memmap(cfg.input, mode = 'r')
    model = GGMLModel()
-    print('* Scanning GGML input file')
+    logger.info('* Scanning GGML input file')
    offset = model.load(data, 0)  # noqa
-    print(f'* GGML model hyperparameters: {model.hyperparameters}')
+    logger.info(f'* GGML model hyperparameters: {model.hyperparameters}')
    vocab_override = None
    params_override = None
    special_vocab = None
    if cfg.model_metadata_dir is not None:
        (params_override, vocab_override, special_vocab) = handle_metadata(cfg, model.hyperparameters)
-        print('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.')
-        print(f'* Overriding params: {params_override}')
-        print(f'* Overriding vocab: {vocab_override}')
-        print(f'* Special vocab: {special_vocab}')
+        logger.info('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.')
+        logger.info(f'* Overriding params: {params_override}')
+        logger.info(f'* Overriding vocab: {vocab_override}')
+        logger.info(f'* Special vocab: {special_vocab}')
    else:
-        print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
+        logger.warning('=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===')
        if model.file_format == GGMLFormat.GGML:
-            print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
+            logger.info('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
    converter = GGMLToGGUF(
        model, data, cfg,
        params_override = params_override,
@@ -435,7 +438,7 @@ def main():
        special_vocab = special_vocab
    )
    converter.save()
-    print(f'* Successful completion. Output saved to: {cfg.output}')
+    logger.info(f'* Successful completion. Output saved to: {cfg.output}')


 if __name__ == '__main__':
--- a/convert-lora-to-ggml.py
+++ b/convert-lora-to-ggml.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 from __future__ import annotations

+import logging
 import json
 import os
 import struct
@@ -15,6 +16,9 @@ if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
 import gguf

+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger("lora-to-gguf")
+
 NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}


@@ -48,11 +52,9 @@ def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_ty

 if __name__ == '__main__':
    if len(sys.argv) < 2:
-        print(f"Usage: python {sys.argv[0]} <path> [arch]")
-        print(
-            "Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
-        )
-        print(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)")
+        logger.info(f"Usage: python {sys.argv[0]} <path> [arch]")
+        logger.info("Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'")
+        logger.info(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)")
        sys.exit(1)

    input_json = os.path.join(sys.argv[1], "adapter_config.json")
@@ -70,7 +72,7 @@ if __name__ == '__main__':
    arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"

    if arch_name not in gguf.MODEL_ARCH_NAMES.values():
-        print(f"Error: unsupported architecture {arch_name}")
+        logger.error(f"Error: unsupported architecture {arch_name}")
        sys.exit(1)

    arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)]
@@ -80,21 +82,21 @@ if __name__ == '__main__':
        params = json.load(f)

    if params["peft_type"] != "LORA":
-        print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
+        logger.error(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
        sys.exit(1)

    if params["fan_in_fan_out"] is True:
-        print("Error: param fan_in_fan_out is not supported")
+        logger.error("Error: param fan_in_fan_out is not supported")
        sys.exit(1)

    if params["bias"] is not None and params["bias"] != "none":
-        print("Error: param bias is not supported")
+        logger.error("Error: param bias is not supported")
        sys.exit(1)

    # TODO: these seem to be layers that have been trained but without lora.
    # doesn't seem widely used but eventually should be supported
    if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0:
-        print("Error: param modules_to_save is not supported")
+        logger.error("Error: param modules_to_save is not supported")
        sys.exit(1)

    with open(output_path, "wb") as fout:
@@ -125,13 +127,13 @@ if __name__ == '__main__':
                suffix = k[-len(lora_suffixes[0]):]
                k = k[: -len(lora_suffixes[0])]
            else:
-                print(f"Error: unrecognized tensor name {orig_k}")
+                logger.error(f"Error: unrecognized tensor name {orig_k}")
                sys.exit(1)

            tname = name_map.get_name(k)
            if tname is None:
-                print(f"Error: could not map tensor name {orig_k}")
-                print(" Note: the arch parameter must be specified if the model is not llama")
+                logger.error(f"Error: could not map tensor name {orig_k}")
+                logger.error(" Note: the arch parameter must be specified if the model is not llama")
                sys.exit(1)

            if suffix == ".lora_A.weight":
@@ -141,8 +143,8 @@ if __name__ == '__main__':
            else:
                assert False

-            print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
+            logger.info(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
            write_tensor_header(fout, tname, t.shape, t.dtype)
            t.tofile(fout)

-    print(f"Converted {input_json} and {input_model} to {output_path}")
+    logger.info(f"Converted {input_json} and {input_model} to {output_path}")
--- a/convert-persimmon-to-gguf.py
+++ b/convert-persimmon-to-gguf.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 from __future__ import annotations

+import logging
 import argparse
 import os
 import sys
@@ -14,6 +15,8 @@ if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf

+logger = logging.getLogger("persimmon-to-gguf")
+

 def _flatten_dict(dct, tensors, prefix=None):
    assert isinstance(dct, dict)
@@ -30,9 +33,9 @@ def _flatten_dict(dct, tensors, prefix=None):

 def _get_sentencepiece_tokenizer_info(dir_model: Path):
    tokenizer_path = dir_model / 'adept_vocab.model'
-    print('gguf: getting sentencepiece tokenizer from', tokenizer_path)
+    logger.info(f'getting sentencepiece tokenizer from {tokenizer_path}')
    tokenizer = SentencePieceProcessor(str(tokenizer_path))
-    print('gguf: adding tokens')
+    logger.info('adding tokens')
    tokens: list[bytes] = []
    scores: list[float] = []
    toktypes: list[int] = []
@@ -67,8 +70,10 @@ def main():
    parser.add_argument("--outfile",             type=Path, help="path to write to; default: based on input")
    parser.add_argument("--ckpt-path",           type=Path, help="path to persimmon checkpoint .pt file")
    parser.add_argument("--model-dir",           type=Path, help="directory containing model e.g. 8b_chat_model_release")
-    parser.add_argument("--adept-inference-dir", type=str, help="path to adept-inference code directory")
+    parser.add_argument("--adept-inference-dir", type=str,  help="path to adept-inference code directory")
+    parser.add_argument("--verbose",  action="store_true",  help="increase output verbosity")
    args = parser.parse_args()
+    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
    sys.path.append(str(args.adept_inference_dir))
    persimmon_model = torch.load(args.ckpt_path)
    hparams = persimmon_model['args']
@@ -107,7 +112,7 @@ def main():
    gguf_writer.add_eos_token_id(71013)

    tensor_map = gguf.get_tensor_name_map(arch, block_count)
-    print(tensor_map)
+    logger.info(tensor_map)
    for name in tensors.keys():
        data_torch = tensors[name]
        if name.endswith(".self_attention.rotary_emb.inv_freq"):
@@ -117,22 +122,21 @@ def main():
        data = data_torch.to(torch.float32).squeeze().numpy()
        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
        if new_name is None:
-            print("Can not map tensor '" + name + "'")
-            sys.exit()
+            raise ValueError(f"Can not map tensor '{name}'")
+
        n_dims = len(data.shape)
-        print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+        logger.debug(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
        gguf_writer.add_tensor(new_name, data)
-    print("gguf: write header")
+    logger.info("gguf: write header")
    gguf_writer.write_header_to_file()
-    print("gguf: write metadata")
+    logger.info("gguf: write metadata")
    gguf_writer.write_kv_data_to_file()
-    print("gguf: write tensors")
+    logger.info("gguf: write tensors")
    gguf_writer.write_tensors_to_file()

    gguf_writer.close()

-    print(f"gguf: model successfully exported to '{args.outfile}'")
-    print("")
+    logger.info(f"gguf: model successfully exported to '{args.outfile}'")


 if __name__ == '__main__':
--- a/convert.py
+++ b/convert.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 from __future__ import annotations

+import logging
 import argparse
 import concurrent.futures
 import enum
@@ -35,6 +36,8 @@ import gguf
 if TYPE_CHECKING:
    from typing_extensions import Self, TypeAlias

+logger = logging.getLogger("convert")
+
 if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
    faulthandler.register(signal.SIGUSR1)

@@ -643,7 +646,6 @@ class LlamaHfVocab(Vocab):


 def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
-    # print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
    if n_head_kv is not None and n_head != n_head_kv:
        n_head = n_head_kv
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
@@ -1033,12 +1035,12 @@ def check_vocab_size(params: Params, vocab: BaseVocab, pad_vocab: bool = False)

    # Check for a vocab size mismatch
    if params.n_vocab == vocab.vocab_size:
-        print("Ignoring added_tokens.json since model matches vocab size without it.")
+        logger.warning("Ignoring added_tokens.json since model matches vocab size without it.")
        return

    if pad_vocab and params.n_vocab > vocab.vocab_size:
        pad_count = params.n_vocab - vocab.vocab_size
-        print(
+        logger.debug(
            f"Padding vocab with {pad_count} token(s) - <dummy00001> through <dummy{pad_count:05}>"
        )
        for i in range(1, pad_count + 1):
@@ -1166,7 +1168,7 @@ class OutputFile:
            elapsed = time.time() - start
            size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
            padi = len(str(len(model)))
-            print(
+            logger.info(
                f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
            )
            self.gguf.write_tensor_data(ndarray)
@@ -1281,12 +1283,12 @@ def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) ->
    # HF models permut or pack some of the tensors, so we need to undo that
    for i in itertools.count():
        if f"model.layers.{i}.self_attn.q_proj.weight" in model:
-            print(f"Permuting layer {i}")
+            logger.debug(f"Permuting layer {i}")
            tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head)
            tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv)
            # tmp[f"model.layers.{i}.self_attn.v_proj.weight"] =              model[f"model.layers.{i}.self_attn.v_proj.weight"]
        elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
-            print(f"Unpacking and permuting layer {i}")
+            logger.debug(f"Unpacking and permuting layer {i}")
            tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head)
            tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head, params.n_head_kv)
            tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy        (model[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
@@ -1299,15 +1301,15 @@ def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) ->
        tensor_type, name_new = tmap.get_type_and_name(name, try_suffixes = (".weight", ".bias")) or (None, None)
        if name_new is None:
            if skip_unknown:
-                print(f"Unexpected tensor name: {name} - skipping")
+                logger.warning(f"Unexpected tensor name: {name} - skipping")
                continue
            raise ValueError(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)")

        if tensor_type in should_skip:
-            print(f"skipping tensor {name_new}")
+            logger.debug(f"skipping tensor {name_new}")
            continue

-        print(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}")
+        logger.debug(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}")
        out[name_new] = lazy_tensor

    return out
@@ -1372,7 +1374,7 @@ def load_some_model(path: Path) -> ModelPlus:
    paths = find_multifile_paths(path)
    models_plus: list[ModelPlus] = []
    for path in paths:
-        print(f"Loading model file {path}")
+        logger.info(f"Loading model file {path}")
        models_plus.append(lazy_load_file(path))

    model_plus = merge_multifile_models(models_plus)
@@ -1413,7 +1415,7 @@ class VocabFactory:
        else:
            raise FileNotFoundError(f"Could not find a tokenizer matching any of {vocab_types}")

-        print(f"Loaded vocab file {vocab.fname_tokenizer!r}, type {vocab.name!r}")
+        logger.info(f"Loaded vocab file {vocab.fname_tokenizer!r}, type {vocab.name!r}")
        return vocab

    def load_vocab(self, vocab_types: list[str] | None, model_parent_path: Path) -> tuple[BaseVocab, gguf.SpecialVocab]:
@@ -1438,19 +1440,19 @@ def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
    }[file_type]
    ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf"
    if ret in model_paths:
-        sys.stderr.write(
+        logger.error(
            f"Error: Default output path ({ret}) would overwrite the input. "
-            "Please explicitly specify a path using --outfile.\n")
+            "Please explicitly specify a path using --outfile.")
        sys.exit(1)
    return ret


 def do_dump_model(model_plus: ModelPlus) -> None:
-    print(f"model_plus.paths = {model_plus.paths!r}")
-    print(f"model_plus.format = {model_plus.format!r}")
-    print(f"model_plus.vocab = {model_plus.vocab!r}")
+    print(f"model_plus.paths = {model_plus.paths!r}") # noqa: NP100
+    print(f"model_plus.format = {model_plus.format!r}") # noqa: NP100
+    print(f"model_plus.vocab = {model_plus.vocab!r}") # noqa: NP100
    for name, lazy_tensor in model_plus.model.items():
-        print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}")
+        print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}") # noqa: NP100


 def main(args_in: list[str] | None = None) -> None:
@@ -1473,8 +1475,18 @@ def main(args_in: list[str] | None = None) -> None:
    parser.add_argument("--big-endian",   action="store_true",    help="model is executed on big endian machine")
    parser.add_argument("--pad-vocab",    action="store_true",    help="add pad tokens when model vocab expects more than tokenizer metadata provides")
    parser.add_argument("--skip-unknown", action="store_true",    help="skip unknown tensor names instead of failing")
+    parser.add_argument("--verbose",      action="store_true",    help="increase output verbosity")

    args = parser.parse_args(args_in)
+
+    if args.verbose:
+        logging.basicConfig(level=logging.DEBUG)
+    elif args.dump_single or args.dump:
+        # Avoid printing anything besides the dump output
+        logging.basicConfig(level=logging.WARNING)
+    else:
+        logging.basicConfig(level=logging.INFO)
+
    if args.no_vocab and args.vocab_only:
        raise ValueError("--vocab-only does not make sense with --no-vocab")

@@ -1491,29 +1503,32 @@ def main(args_in: list[str] | None = None) -> None:
    if args.dump:
        do_dump_model(model_plus)
        return
+
    endianess = gguf.GGUFEndian.LITTLE
    if args.big_endian:
        endianess = gguf.GGUFEndian.BIG

-    params = Params.load(model_plus)
-    if params.n_ctx == -1:
-        if args.ctx is None:
-            msg = """\
-                The model doesn't have a context size, and you didn't specify one with --ctx
-                Please specify one with --ctx:
-                 - LLaMA v1: --ctx 2048
-                 - LLaMA v2: --ctx 4096"""
-            parser.error(textwrap.dedent(msg))
-        params.n_ctx = args.ctx
+    params = None
+    if args.pad_vocab or not args.vocab_only:
+        params = Params.load(model_plus)
+        if params.n_ctx == -1:
+            if args.ctx is None:
+                msg = """\
+                    The model doesn't have a context size, and you didn't specify one with --ctx
+                    Please specify one with --ctx:
+                     - LLaMA v1: --ctx 2048
+                     - LLaMA v2: --ctx 4096"""
+                parser.error(textwrap.dedent(msg))
+            params.n_ctx = args.ctx

-    if args.outtype:
-        params.ftype = {
-            "f32": GGMLFileType.AllF32,
-            "f16": GGMLFileType.MostlyF16,
-            "q8_0": GGMLFileType.MostlyQ8_0,
-        }[args.outtype]
+        if args.outtype:
+            params.ftype = {
+                "f32": GGMLFileType.AllF32,
+                "f16": GGMLFileType.MostlyF16,
+                "q8_0": GGMLFileType.MostlyQ8_0,
+            }[args.outtype]

-    print(f"params = {params}")
+        logger.info(f"params = {params}")

    model_parent_path = model_plus.paths[0].parent
    vocab_path = Path(args.vocab_dir or args.model or model_parent_path)
@@ -1526,17 +1541,27 @@ def main(args_in: list[str] | None = None) -> None:
        if not args.outfile:
            raise ValueError("need --outfile if using --vocab-only")
        outfile = args.outfile
+        if params is None:
+            params = Params(
+                n_vocab    = vocab.vocab_size,
+                n_embd     = 1,
+                n_layer    = 1,
+                n_ctx      = 1,
+                n_ff       = 1,
+                n_head     = 1,
+                n_head_kv  = 1,
+                f_norm_eps = 1e-5,
+            )
        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
                                    endianess=endianess, pad_vocab=args.pad_vocab)
-        print(f"Wrote {outfile}")
+        logger.info(f"Wrote {outfile}")
        return

    if model_plus.vocab is not None and args.vocab_dir is None and not args.no_vocab:
        vocab = model_plus.vocab

-    print(f"Vocab info: {vocab}")
-    print(f"Special vocab info: {special_vocab}")
-
+    logger.info(f"Vocab info: {vocab}")
+    logger.info(f"Special vocab info: {special_vocab}")
    model   = model_plus.model
    model   = convert_model_names(model, params, args.skip_unknown)
    ftype   = pick_output_type(model, args.outtype)
@@ -1544,11 +1569,11 @@ def main(args_in: list[str] | None = None) -> None:
    outfile = args.outfile or default_outfile(model_plus.paths, ftype)

    params.ftype = ftype
-    print(f"Writing {outfile}, format {ftype}")
+    logger.info(f"Writing {outfile}, format {ftype}")

    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
                         concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab)
-    print(f"Wrote {outfile}")
+    logger.info(f"Wrote {outfile}")


 if __name__ == '__main__':
--- a/docs/BLIS.md
+++ b/docs/BLIS.md
@@ -23,7 +23,7 @@ Install BLIS:
 sudo make install
 ```

-We recommend using openmp since it's easier to modify the cores been used.
+We recommend using openmp since it's easier to modify the cores being used.

 ### llama.cpp compilation

--- a/docs/HOWTO-add-model.md
+++ b/docs/HOWTO-add-model.md
@@ -96,9 +96,9 @@ NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorc

 This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`.

-Have a look to existing implementation like `build_llama`, `build_dbrx` or `build_bert`.
+Have a look at existing implementations like `build_llama`, `build_dbrx` or `build_bert`.

-When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support of missing backend operations can be added in another PR.
+When implementing a new graph, please note that the underlying `ggml` backends might not support all of its operations; support for missing backend operations can be added in another PR.

 Note: to debug the inference graph: you can use [eval-callback](../examples/eval-callback).

--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -575,7 +575,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
    GGML_ASSERT(tokens_input->type == GGML_TYPE_I32);

    auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
-        if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16) {
+        if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16 || a->type == GGML_TYPE_BF16) {
            return ggml_add_cast(ctx, a, b, GGML_TYPE_F32);
        } else if (a->type == GGML_TYPE_F32) {
            return ggml_add(ctx, a, b);
--- a/examples/gguf-split/gguf-split.cpp
+++ b/examples/gguf-split/gguf-split.cpp
@@ -32,6 +32,7 @@ struct split_params {
    int n_split_tensors = 128;
    std::string input;
    std::string output;
+    bool no_tensor_first_split = false;
    bool dry_run = false;
 };

@@ -49,6 +50,7 @@ static void split_print_usage(const char * executable) {
    printf("  --merge                 merge multiple GGUF to a single GGUF\n");
    printf("  --split-max-tensors     max tensors in each split (default: %d)\n", default_params.n_split_tensors);
    printf("  --split-max-size N(M|G) max size per split\n");
+    printf("  --no-tensor-first-split do not add tensors to the first split (disabled by default)\n");
    printf("  --dry-run               only print out a split plan and exit, without writing any new files\n");
    printf("\n");
 }
@@ -100,6 +102,10 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p
            arg_found = true;
            params.dry_run = true;
        }
+        if (arg == "--no-tensor-first-split") {
+            arg_found = true;
+            params.no_tensor_first_split = true;
+        }

        if (is_op_set) {
            throw std::invalid_argument("error: either --split or --merge can be specified, but not both");
@@ -200,10 +206,10 @@ struct split_strategy {
        // because we need to know list of tensors for each file in advance, we will build all the ctx_out for all output splits
        int i_split = -1;
        struct gguf_context * ctx_out = NULL;
-        auto new_ctx_out = [&]() {
+        auto new_ctx_out = [&](bool allow_no_tensors) {
            i_split++;
            if (ctx_out != NULL) {
-                if (gguf_get_n_tensors(ctx_out) == 0) {
+                if (gguf_get_n_tensors(ctx_out) == 0 && !allow_no_tensors) {
                    fprintf(stderr, "error: one of splits have 0 tensors. Maybe size or tensors limit is too small\n");
                    exit(EXIT_FAILURE);
                }
@@ -220,7 +226,12 @@ struct split_strategy {
        };

        // initialize ctx_out for the first split
-        new_ctx_out();
+        new_ctx_out(false);
+
+        // skip first split if no_tensor_first_split is set
+        if (params.no_tensor_first_split) {
+            new_ctx_out(true);
+        }

        // process tensors one by one
        size_t curr_tensors_size = 0; // current size by counting only tensors size (without metadata)
@@ -230,7 +241,7 @@ struct split_strategy {
            size_t n_bytes = GGML_PAD(ggml_nbytes(t), GGUF_DEFAULT_ALIGNMENT);
            size_t next_tensors_size = curr_tensors_size + n_bytes;
            if (should_split(i, next_tensors_size)) {
-                new_ctx_out();
+                new_ctx_out(false);
                curr_tensors_size = n_bytes;
            } else {
                curr_tensors_size = next_tensors_size;
--- a/examples/gguf-split/tests.sh
+++ b/examples/gguf-split/tests.sh
@@ -55,15 +55,15 @@ $MAIN --model $WORK_PATH/ggml-model-merge.gguf --random-prompt --n-predict 32
 echo PASS
 echo

-# 4. Split with no tensor in metadata
-#$SPLIT --split-max-tensors 32 --no-tensor-in-metadata $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-32-tensors
-#echo PASS
-#echo
+# 4. Split with no tensors in the first split
+$SPLIT --split-max-tensors 32 --no-tensor-first-split $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-32-tensors
+echo PASS
+echo

 # 4b. Test the sharded model is loading properly
-#$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00006.gguf --random-prompt --n-predict 32
-#echo PASS
-#echo
+$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --random-prompt --n-predict 32
+echo PASS
+echo

 # 5. Merge
 #$SPLIT --merge $WORK_PATH/ggml-model-split-32-tensors-00001-of-00006.gguf $WORK_PATH/ggml-model-merge-2.gguf
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -19,6 +19,7 @@

 struct Stats {
    std::vector<float> values;
+    std::vector<int> counts;
    int ncall = 0;
 };

@@ -121,12 +122,10 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
        auto & e = m_stats[wname];

        ++e.ncall;
-        // NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger
-        //       using the following line, we can correct for that if needed by replacing the line above with:
-        //if (idx == t->src[0]->ne[0] - 1) ++e.ncall;

        if (e.values.empty()) {
            e.values.resize(src1->ne[0]*n_as, 0);
+            e.counts.resize(src1->ne[0]*n_as, 0);
        }
        else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
            fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
@@ -153,6 +152,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *

                    for (int j = 0; j < (int)src1->ne[0]; ++j) {
                        e.values[e_start + j] += x[j]*x[j];
+                        e.counts[e_start + j]++;
                    }
                }
            }
@@ -170,6 +170,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
        auto& e = m_stats[wname];
        if (e.values.empty()) {
            e.values.resize(src1->ne[0], 0);
+            e.counts.resize(src1->ne[0], 0);
        }
        else if (e.values.size() != (size_t)src1->ne[0]) {
            fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
@@ -183,6 +184,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
            const float * x = data + row * src1->ne[0];
            for (int j = 0; j < (int)src1->ne[0]; ++j) {
                e.values[j] += x[j]*x[j];
+                e.counts[j]++;
            }
        }
        if (e.ncall > m_last_call) {
@@ -222,7 +224,13 @@ void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) co
        out.write((const char *) &p.second.ncall, sizeof(p.second.ncall));
        int nval = p.second.values.size();
        out.write((const char *) &nval, sizeof(nval));
-        if (nval > 0) out.write((const char *) p.second.values.data(), nval * sizeof(float));
+        if (nval > 0) {
+            std::vector<float> tmp(nval);
+            for (int i = 0; i < nval; i++) {
+                tmp[i] = (p.second.values[i] / static_cast<float>(p.second.counts[i])) * static_cast<float>(p.second.ncall);
+            }
+            out.write((const char*)tmp.data(), nval*sizeof(float));
+        }
    }

    // Write the number of call the matrix was computed with
@@ -270,14 +278,28 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
            imatrix_data = {};
            return false;
        }
-        e.values.resize(nval);
-        in.read((char*)e.values.data(), nval*sizeof(float));
+
+        // When re-called from load_imatrix() with add set, this will already be created.
+        if (e.values.empty()) {
+            e.values.resize(nval, 0);
+            e.counts.resize(nval, 0);
+        }
+
+        std::vector<float> tmp(nval);
+        in.read((char*)tmp.data(), nval*sizeof(float));
        if (in.fail()) {
            printf("%s: failed reading data for entry %d\n",__func__,i);
            imatrix_data = {};
            return false;
        }
-        e.ncall = ncall;
+
+        // Recreate the state as expected by save_imatrix(), and correct for weighted sum.
+        for (int i = 0; i < nval; i++) {
+            e.values[i] += tmp[i];
+            e.counts[i] += ncall;
+        }
+        e.ncall += ncall;
+
    }
    return true;
 }
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -178,6 +178,7 @@ struct cmd_params {
    std::vector<std::vector<float>> tensor_split;
    std::vector<bool> use_mmap;
    std::vector<bool> embeddings;
+    ggml_numa_strategy numa;
    int reps;
    bool verbose;
    output_formats output_format;
@@ -200,6 +201,7 @@ static const cmd_params cmd_params_defaults = {
    /* tensor_split  */ {std::vector<float>(llama_max_devices(), 0.0f)},
    /* use_mmap      */ {true},
    /* embeddings    */ {false},
+    /* numa          */ GGML_NUMA_STRATEGY_DISABLED,
    /* reps          */ 5,
    /* verbose       */ false,
    /* output_format */ MARKDOWN
@@ -224,6 +226,7 @@ static void print_usage(int /* argc */, char ** argv) {
    printf("  -nkvo, --no-kv-offload <0|1>        (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
    printf("  -fa, --flash-attn <0|1>             (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
    printf("  -mmp, --mmap <0|1>                  (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
+    printf("  --numa <distribute|isolate|numactl> (default: disabled)\n");
    printf("  -embd, --embeddings <0|1>           (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
    printf("  -ts, --tensor-split <ts0/ts1/..>    (default: 0)\n");
    printf("  -r, --repetitions <n>               (default: %d)\n", cmd_params_defaults.reps);
@@ -396,6 +399,17 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
            }
            auto p = split<bool>(argv[i], split_delim);
            params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
+        } else if (arg == "--numa") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            } else {
+                std::string value(argv[i]);
+                /**/ if (value == "distribute" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
+                else if (value == "isolate")                    { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
+                else if (value == "numactl")                    { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
+                else { invalid_param = true; break; }
+            }
        } else if (arg == "-fa" || arg == "--flash-attn") {
            if (++i >= argc) {
                invalid_param = true;
@@ -1215,6 +1229,7 @@ int main(int argc, char ** argv) {
        llama_log_set(llama_null_log_callback, NULL);
    }
    llama_backend_init();
+    llama_numa_init(params.numa);

    // initialize printer
    std::unique_ptr<printer> p;
--- a/examples/llava/README.md
+++ b/examples/llava/README.md
@@ -56,7 +56,7 @@ python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-pa
 python ./convert.py ../llava-v1.5-7b --skip-unknown
 ```

-Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` directory.
+Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory.

 ## LLaVA 1.6 gguf conversion
 1) First clone a LLaVA 1.6 model:
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -104,7 +104,6 @@ static std::string format(const char * fmt, ...) {
 #define TN_POS_EMBD        "%s.position_embd.weight"
 #define TN_CLASS_EMBD      "v.class_embd"
 #define TN_PATCH_EMBD      "v.patch_embd.weight"
-#define TN_PATCH_BIAS      "v.patch_embd.bias"
 #define TN_ATTN_K          "%s.blk.%d.attn_k.%s"
 #define TN_ATTN_Q          "%s.blk.%d.attn_q.%s"
 #define TN_ATTN_V          "%s.blk.%d.attn_v.%s"
@@ -426,7 +425,6 @@ struct clip_vision_model {
    // embeddings
    struct ggml_tensor * class_embedding;
    struct ggml_tensor * patch_embeddings;
-    struct ggml_tensor * patch_bias;
    struct ggml_tensor * position_embeddings;

    struct ggml_tensor * pre_ln_w;
@@ -503,11 +501,6 @@ struct clip_ctx {
    bool use_gelu = false;
    int32_t ftype = 1;

-    bool has_class_embedding = true;
-    bool has_pre_norm = true;
-    bool has_post_norm = false;
-    bool has_patch_bias = false;
-
    struct gguf_context * ctx_gguf;
    struct ggml_context * ctx_data;

@@ -533,7 +526,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
    const int patch_size           = hparams.patch_size;
    const int num_patches          = ((image_size / patch_size) * (image_size / patch_size));
    const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side);
-    const int num_positions        = num_patches + (ctx->has_class_embedding ? 1 : 0);
+    const int num_positions        = num_patches + 1;
    const int hidden_size          = hparams.hidden_size;
    const int n_head               = hparams.n_head;
    const int d_head               = hidden_size / n_head;
@@ -564,23 +557,16 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
    inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
    inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));

-    if (ctx->has_patch_bias) {
-        // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
-        inp = ggml_add(ctx0, inp, model.patch_bias);
-    }
-
    // concat class_embeddings and patch_embeddings
-    struct ggml_tensor * embeddings = inp;
-    if (ctx->has_class_embedding) {
-        embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
-        embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
-                embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
-        embeddings = ggml_acc(ctx0, embeddings, inp,
-                embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
-    }
+    struct ggml_tensor * embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
    ggml_set_name(embeddings, "embeddings");
    ggml_set_input(embeddings);

+    embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
+            embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
+
+    embeddings = ggml_acc(ctx0, embeddings, inp,
+            embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);

    struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
    ggml_set_name(positions, "positions");
@@ -590,7 +576,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
        ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));

    // pre-layernorm
-    if (ctx->has_pre_norm) {
+    {
        embeddings = ggml_norm(ctx0, embeddings, eps);
        ggml_set_name(embeddings, "pre_ln");

@@ -678,14 +664,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
        embeddings = cur;
    }

-    // post-layernorm
-    if (ctx->has_post_norm) {
-        embeddings = ggml_norm(ctx0, embeddings, eps);
-        ggml_set_name(embeddings, "post_ln");
-
-        embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
-    }
-
    // llava projector
    {
        embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
@@ -1170,39 +1148,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {

        }

-        try {
-            vision_model.class_embedding  = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
-            new_clip->has_class_embedding = true;
-        } catch (const std::exception& e) {
-            new_clip->has_class_embedding = false;
-        }
-
-        try {
-            vision_model.pre_ln_w  = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
-            vision_model.pre_ln_b  = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
-            new_clip->has_pre_norm = true;
-        } catch (std::exception & e) {
-            new_clip->has_pre_norm = false;
-        }
-
-        try {
-            vision_model.post_ln_w  = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "weight"));
-            vision_model.post_ln_b  = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "bias"));
-            new_clip->has_post_norm = true;
-        } catch (std::exception & e) {
-            new_clip->has_post_norm = false;
-        }
-
-        try {
-            vision_model.patch_bias = get_tensor(new_clip->ctx_data, TN_PATCH_BIAS);
-            new_clip->has_patch_bias = true;
-        } catch (std::exception & e) {
-            new_clip->has_patch_bias = false;
-        }
-
        try {
            vision_model.patch_embeddings    = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
+            vision_model.class_embedding     = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
            vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
+            vision_model.pre_ln_w            = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
+            vision_model.pre_ln_b            = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
        } catch(const std::exception& e) {
            LOG_TEE("%s: failed to load vision model tensors\n", __func__);
        }
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -143,7 +143,7 @@ The `--ctx-size` option allows you to set the size of the prompt context used by

 ### Extended Context Size

-Some fine-tuned models have extended the context length by scaling RoPE. For example, if the original pre-trained model have a context length (max sequence length) of 4096 (4k) and the fine-tuned model have 32k. That is a scaling factor of 8, and should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8.
+Some fine-tuned models have extended the context length by scaling RoPE. For example, if the original pre-trained model has a context length (max sequence length) of 4096 (4k) and the fine-tuned model has 32k, that is a scaling factor of 8, and it should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8.

 -   `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model.

@@ -286,7 +286,7 @@ These options help improve the performance and memory usage of the LLaMA models.

 -   `--numa distribute`: Pin an equal proportion of the threads to the cores on each NUMA node. This will spread the load amongst all cores on the system, utilitizing all memory channels at the expense of potentially requiring memory to travel over the slow links between nodes.
 -   `--numa isolate`: Pin all threads to the NUMA node that the program starts on. This limits the number of cores and amount of memory that can be used, but guarantees all memory access remains local to the NUMA node.
-   `--numa numactl`: Pin threads to the CPUMAP that is passed to the program by starting it with the numactl utility. This is the most flexible mode, and allow arbitraty core usage patterns, for example a map that uses all the cores on one NUMA nodes, and just enough cores on a second node to saturate the inter-node memory bus.
+-   `--numa numactl`: Pin threads to the CPUMAP that is passed to the program by starting it with the numactl utility. This is the most flexible mode, and allows arbitrary core usage patterns, for example a map that uses all the cores on one NUMA node, and just enough cores on a second node to saturate the inter-node memory bus.

 These flags attempt optimizations that help on some systems with non-uniform memory access. This currently consists of one of the above strategies, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop_caches' as root.

--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -362,6 +362,9 @@ int main(int argc, char ** argv) {
        params.interactive_first = true;
        params.antiprompt.emplace_back("<|im_start|>user\n");
    }
+    else if (params.conversation) {
+        params.interactive_first = true;
+    }

    // enable interactive mode if interactive start is specified
    if (params.interactive_first) {
@@ -733,7 +736,7 @@ int main(int argc, char ** argv) {
        // display text
        if (input_echo && display) {
            for (auto id : embd) {
-                const std::string token_str = llama_token_to_piece(ctx, id);
+                const std::string token_str = llama_token_to_piece(ctx, id, !params.conversation);
                printf("%s", token_str.c_str());

                if (embd.size() > 1) {
@@ -796,7 +799,7 @@ int main(int argc, char ** argv) {

            // deal with end of generation tokens in interactive mode
            if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) {
-                LOG("found EOS token\n");
+                LOG("found an EOG token\n");

                if (params.interactive) {
                    if (!params.antiprompt.empty()) {
@@ -816,7 +819,7 @@ int main(int argc, char ** argv) {
            if (n_past > 0 && is_interacting) {
                LOG("waiting for user input\n");

-                if (params.instruct || params.chatml) {
+                if (params.conversation || params.instruct || params.chatml) {
                    printf("\n> ");
                }

@@ -826,7 +829,7 @@ int main(int argc, char ** argv) {
                }

                std::string buffer;
-                if (!params.input_prefix.empty()) {
+                if (!params.input_prefix.empty() && !params.conversation) {
                    LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
                    printf("%s", params.input_prefix.c_str());
                }
@@ -850,7 +853,7 @@ int main(int argc, char ** argv) {
                // Entering a empty line lets the user pass control back
                if (buffer.length() > 1) {
                    // append input suffix if any
-                    if (!params.input_suffix.empty()) {
+                    if (!params.input_suffix.empty() && !params.conversation) {
                        LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
                        printf("%s", params.input_suffix.c_str());
                    }
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -46,7 +46,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
    { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
    { "Q6_K",   LLAMA_FTYPE_MOSTLY_Q6_K,   " 5.15G, +0.0008 ppl @ LLaMA-v1-7B", },
    { "Q8_0",   LLAMA_FTYPE_MOSTLY_Q8_0,   " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
-    { "F16",    LLAMA_FTYPE_MOSTLY_F16,    "13.00G              @ 7B", },
+    { "F16",    LLAMA_FTYPE_MOSTLY_F16,    "14.00G, -0.0020 ppl @ Mistral-7B", },
+    { "BF16",   LLAMA_FTYPE_MOSTLY_BF16,   "14.00G, -0.0050 ppl @ Mistral-7B", },
    { "F32",    LLAMA_FTYPE_ALL_F32,       "26.00G              @ 7B", },
    // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
    { "COPY",   LLAMA_FTYPE_ALL_F32,       "only copy tensors, no quantizing", },
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -62,6 +62,18 @@ page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/
 - `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name.  Default: template taken from model's metadata. We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
 - `--log-disable`: Output logs to stdout only, not to `llama.log`. Default: enabled
 - `--log-format FORMAT`: Define the log output to FORMAT: json or text Default: `json`
+- `--rope-scaling` : RoPE scaling method. Defaults to linear unless otherwise specified by the model. Options are `none`, `linear`, `yarn`
+- `--rope-freq-base N` : RoPE frequency base (default: loaded from model)
+- `--rope-freq-scale N`: RoPE frequency scaling factor, expands context by a factor of 1/N (e.g. 0.25)
+- `--yarn-ext-factor N` : YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)
+- `--yarn-attn-factor N` : YaRN: scale sqrt(t) or attention magnitude (default: 1.0)
+- `--yarn-beta-slow N`: YaRN: high correction dim or alpha (default: 1.0)
+- `--yarn-beta-fast N`: YaRN: low correction dim or beta (default: 32.0)
+- `--pooling` : Pooling type for embeddings, use model default if unspecified. Options are `none`, `mean`, `cls`
+- `-dt N`, `--defrag-thold N`: KV cache defragmentation threshold (default: -1.0, < 0 = disabled)
+- `-fa`, `--flash-attn` : enable flash attention (default: disabled).
+- `-ctk TYPE`, `--cache-type-k TYPE` : KV cache data type for K (default: `f16`, options `f32`, `f16`, `q8_0`, `q4_0`, `q4_1`, `iq4_nl`, `q5_0`, or `q5_1`)
+- `-ctv TYPE`, `--cache-type-v TYPE` : KV cache type for V (default `f16`, see `-ctk` for options)

 **If compiled with `LLAMA_SERVER_SSL=ON`**
 - `--ssl-key-file FNAME`: path to file a PEM-encoded SSL private key
@@ -260,7 +272,7 @@ node index.js

    `logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings, e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does. Default: `[]`

-    `n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token. Default: `0`
+    `n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token given the sampling settings. Note that for temperature < 0 the tokens are sampled greedily but token probabilities are still being calculated via a simple softmax of the logits without considering any other sampler settings. Default: `0`

    `min_keep`: If greater than 0, force samplers to return N possible tokens at minimum. Default: `0`

@@ -319,7 +331,7 @@ Notice that each `probs` is an array of length `n_probs`.

    `content`: Set the text to tokenize.

-    Note that a special `BOS` token is never inserted.
+    `add_special`: Boolean indicating if special tokens, i.e. `BOS`, should be inserted.  Default: `false`

 - **POST** `/detokenize`: Convert tokens to text.

--- a/examples/server/public/favicon.ico
+++ b/examples/server/public/favicon.ico
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -12,6 +12,8 @@
 // increase max payload length to allow use of larger context size
 #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
 #include "httplib.h"
+// Change JSON_ASSERT from assert() to GGML_ASSERT:
+#define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"

 // auto generated files (update with ./deps.sh)
@@ -859,7 +861,7 @@ struct server_context {
        slot.sparams.min_keep          = json_value(data, "min_keep",          default_sparams.min_keep);

        // process "json_schema" and "grammar"
-        if (data.contains("json_schema") && !data["json_schema"].is_null() && data.contains("grammar") && !data["grammar"].is_null()) {
+        if (data.contains("json_schema") && !data.at("json_schema").is_null() && data.contains("grammar") && !data.at("grammar").is_null()) {
            send_error(task, "Either \"json_schema\" or \"grammar\" can be specified, but not both", ERROR_TYPE_INVALID_REQUEST);
            return false;
        } else if (data.contains("json_schema") && !data.contains("grammar")) {
@@ -1383,9 +1385,10 @@ struct server_context {
            if (!slot.params.stream && slot.stopped_word) {
                const std::vector<llama_token> stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false);

+                size_t safe_offset = std::min(slot.generated_token_probs.size(), stop_word_toks.size());
                probs = std::vector<completion_token_output>(
                        slot.generated_token_probs.begin(),
-                        slot.generated_token_probs.end() - stop_word_toks.size());
+                        slot.generated_token_probs.end() - safe_offset);
            } else {
                probs = std::vector<completion_token_output>(
                        slot.generated_token_probs.begin(),
@@ -1511,7 +1514,7 @@ struct server_context {
        // add subtasks
        for (int i = 0; i < prompt_count; i++) {
            json subtask_data = multiprompt_task.data;
-            subtask_data["prompt"] = subtask_data["prompt"][i];
+            subtask_data["prompt"] = subtask_data.at("prompt")[i];

            // subtasks inherit everything else (infill mode, embedding mode, etc.)
            request_completion(subtask_ids[i], id_multi, subtask_data, multiprompt_task.infill, multiprompt_task.embedding);
@@ -1531,7 +1534,7 @@ struct server_context {
                    }

                    if (task.data.contains("system_prompt")) {
-                        system_prompt_set(task.data["system_prompt"]);
+                        system_prompt_set(task.data.at("system_prompt"));

                        for (server_slot & slot : slots) {
                            slot.n_past    = 0;
@@ -1643,7 +1646,7 @@ struct server_context {
                } break;
            case SERVER_TASK_TYPE_SLOT_SAVE:
                {
-                    int id_slot = task.data["id_slot"];
+                    int id_slot = task.data.at("id_slot");
                    server_slot * slot = get_slot(id_slot);
                    if (slot == nullptr) {
                        send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
@@ -1653,8 +1656,8 @@ struct server_context {
                    const size_t token_count = slot->cache_tokens.size();
                    const int64_t t_start = ggml_time_us();

-                    std::string filename = task.data["filename"];
-                    std::string filepath = task.data["filepath"];
+                    std::string filename = task.data.at("filename");
+                    std::string filepath = task.data.at("filepath");

                    const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), token_count);

@@ -1678,7 +1681,7 @@ struct server_context {
                } break;
            case SERVER_TASK_TYPE_SLOT_RESTORE:
                {
-                    int id_slot = task.data["id_slot"];
+                    int id_slot = task.data.at("id_slot");
                    server_slot * slot = get_slot(id_slot);
                    if (slot == nullptr) {
                        send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
@@ -1687,8 +1690,8 @@ struct server_context {

                    const int64_t t_start = ggml_time_us();

-                    std::string filename = task.data["filename"];
-                    std::string filepath = task.data["filepath"];
+                    std::string filename = task.data.at("filename");
+                    std::string filepath = task.data.at("filepath");

                    slot->cache_tokens.resize(slot->n_ctx);
                    size_t token_count = 0;
@@ -1720,7 +1723,7 @@ struct server_context {
                } break;
            case SERVER_TASK_TYPE_SLOT_ERASE:
                {
-                    int id_slot = task.data["id_slot"];
+                    int id_slot = task.data.at("id_slot");
                    server_slot * slot = get_slot(id_slot);
                    if (slot == nullptr) {
                        send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
@@ -2265,17 +2268,31 @@ struct server_context {
                llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
                result.tok = id;

-                const int32_t n_probs = slot.sparams.n_probs;
-                if (slot.sparams.temp <= 0 && n_probs > 0) {
-                    // for llama_sample_token_greedy we need to sort candidates
-                    llama_sample_softmax(ctx, &cur_p);
-                }
+                const size_t n_probs = std::min(cur_p.size, (size_t) slot.sparams.n_probs);
+                if (n_probs > 0) {
+                    const size_t n_considered = slot.ctx_sampling->n_considered;

-                for (size_t i = 0; i < std::min(cur_p.size, (size_t) n_probs); ++i) {
-                    result.probs.push_back({
-                        cur_p.data[i].id,
-                        cur_p.data[i].p
-                    });
+                    // Make sure at least n_probs top tokens are at the front of the vector:
+                    if (slot.sparams.temp == 0.0f && n_probs > n_considered) {
+                        llama_sample_top_k(ctx, &cur_p, n_probs, 0);
+                    }
+
+                    if (slot.sparams.temp == 0.0f) {
+                        // With greedy sampling the probabilities have possibly not been calculated.
+                        for (size_t i = 0; i < n_probs; ++i) {
+                            result.probs.push_back({
+                                cur_p.data[i].id,
+                                i == 0 ? 1.0f : 0.0f
+                            });
+                        }
+                    } else {
+                        for (size_t i = 0; i < n_probs; ++i) {
+                            result.probs.push_back({
+                                cur_p.data[i].id,
+                                i >= n_considered ? 0.0f : cur_p.data[i].p // Tokens filtered out due to e.g. top_k have 0 probability.
+                            });
+                        }
+                    }
                }

                if (!process_token(result, slot)) {
@@ -3121,8 +3138,8 @@ int main(int argc, char ** argv) {
                    server_task_result result = ctx_server.queue_results.recv(task.id);
                    ctx_server.queue_results.remove_waiting_task_id(task.id);

-                    const int n_idle_slots       = result.data["idle"];
-                    const int n_processing_slots = result.data["processing"];
+                    const int n_idle_slots       = result.data.at("idle");
+                    const int n_processing_slots = result.data.at("processing");

                    json health = {
                        {"status",           "ok"},
@@ -3132,7 +3149,7 @@ int main(int argc, char ** argv) {

                    res.status = 200; // HTTP OK
                    if (sparams.slots_endpoint && req.has_param("include_slots")) {
-                        health["slots"] = result.data["slots"];
+                        health["slots"] = result.data.at("slots");
                    }

                    if (n_idle_slots == 0) {
@@ -3176,7 +3193,7 @@ int main(int argc, char ** argv) {
        server_task_result result = ctx_server.queue_results.recv(task.id);
        ctx_server.queue_results.remove_waiting_task_id(task.id);

-        res.set_content(result.data["slots"].dump(), "application/json");
+        res.set_content(result.data.at("slots").dump(), "application/json");
        res.status = 200; // HTTP OK
    };

@@ -3203,32 +3220,32 @@ int main(int argc, char ** argv) {

        json data = result.data;

-        const uint64_t n_prompt_tokens_processed = data["n_prompt_tokens_processed"];
-        const uint64_t t_prompt_processing       = data["t_prompt_processing"];
+        const uint64_t n_prompt_tokens_processed = data.at("n_prompt_tokens_processed");
+        const uint64_t t_prompt_processing       = data.at("t_prompt_processing");

-        const uint64_t n_tokens_predicted  = data["n_tokens_predicted"];
-        const uint64_t t_tokens_generation = data["t_tokens_generation"];
+        const uint64_t n_tokens_predicted  = data.at("n_tokens_predicted");
+        const uint64_t t_tokens_generation = data.at("t_tokens_generation");

-        const int32_t kv_cache_used_cells = data["kv_cache_used_cells"];
+        const int32_t kv_cache_used_cells = data.at("kv_cache_used_cells");

        // metrics definition: https://prometheus.io/docs/practices/naming/#metric-names
        json all_metrics_def = json {
            {"counter", {{
                    {"name",  "prompt_tokens_total"},
                    {"help",  "Number of prompt tokens processed."},
-                    {"value",  (uint64_t) data["n_prompt_tokens_processed_total"]}
+                    {"value",  (uint64_t) data.at("n_prompt_tokens_processed_total")}
            }, {
                    {"name",  "prompt_seconds_total"},
                    {"help",  "Prompt process time"},
-                    {"value",  (uint64_t) data["t_prompt_processing_total"] / 1.e3}
+                    {"value",  (uint64_t) data.at("t_prompt_processing_total") / 1.e3}
            }, {
                    {"name",  "tokens_predicted_total"},
                    {"help",  "Number of generation tokens processed."},
-                    {"value",  (uint64_t) data["n_tokens_predicted_total"]}
+                    {"value",  (uint64_t) data.at("n_tokens_predicted_total")}
            }, {
                    {"name",  "tokens_predicted_seconds_total"},
                    {"help",  "Predict process time"},
-                    {"value",  (uint64_t) data["t_tokens_generation_total"] / 1.e3}
+                    {"value",  (uint64_t) data.at("t_tokens_generation_total") / 1.e3}
            }}},
            {"gauge", {{
                    {"name",  "prompt_tokens_seconds"},
@@ -3245,15 +3262,15 @@ int main(int argc, char ** argv) {
            },{
                    {"name",  "kv_cache_tokens"},
                    {"help",  "KV-cache tokens."},
-                    {"value",  (uint64_t) data["kv_cache_tokens_count"]}
+                    {"value",  (uint64_t) data.at("kv_cache_tokens_count")}
            },{
                    {"name",  "requests_processing"},
                    {"help",  "Number of request processing."},
-                    {"value",  (uint64_t) data["processing"]}
+                    {"value",  (uint64_t) data.at("processing")}
            },{
                    {"name",  "requests_deferred"},
                    {"help",  "Number of request deferred."},
-                    {"value",  (uint64_t) data["deferred"]}
+                    {"value",  (uint64_t) data.at("deferred")}
            }}}
        };

@@ -3264,8 +3281,8 @@ int main(int argc, char ** argv) {
            const auto & metrics_def = el.value();

            for (const auto & metric_def : metrics_def) {
-                const std::string name = metric_def["name"];
-                const std::string help = metric_def["help"];
+                const std::string name = metric_def.at("name");
+                const std::string help = metric_def.at("help");

                auto value = json_value(metric_def, "value", 0.);
                prometheus << "# HELP llamacpp:" << name << " " << help  << "\n"
@@ -3274,7 +3291,7 @@ int main(int argc, char ** argv) {
            }
        }

-        const int64_t t_start = data["t_start"];
+        const int64_t t_start = data.at("t_start");
        res.set_header("Process-Start-Time-Unix", std::to_string(t_start));

        res.set_content(prometheus.str(), "text/plain; version=0.0.4");
@@ -3283,7 +3300,7 @@ int main(int argc, char ** argv) {

    const auto handle_slots_save = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
        json request_data = json::parse(req.body);
-        std::string filename = request_data["filename"];
+        std::string filename = request_data.at("filename");
        if (!validate_file_name(filename)) {
            res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
            return;
@@ -3313,7 +3330,7 @@ int main(int argc, char ** argv) {

    const auto handle_slots_restore = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
        json request_data = json::parse(req.body);
-        std::string filename = request_data["filename"];
+        std::string filename = request_data.at("filename");
        if (!validate_file_name(filename)) {
            res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
            return;
@@ -3632,7 +3649,8 @@ int main(int argc, char ** argv) {

        std::vector<llama_token> tokens;
        if (body.count("content") != 0) {
-            tokens = ctx_server.tokenize(body["content"], false);
+            const bool add_special = json_value(body, "add_special", false);
+            tokens = ctx_server.tokenize(body.at("content"), add_special);
        }
        const json data = format_tokenizer_response(tokens);
        return res.set_content(data.dump(), "application/json; charset=utf-8");
@@ -3644,7 +3662,7 @@ int main(int argc, char ** argv) {

        std::string content;
        if (body.count("tokens") != 0) {
-            const std::vector<llama_token> tokens = body["tokens"];
+            const std::vector<llama_token> tokens = body.at("tokens");
            content = tokens_to_str(ctx_server.ctx, tokens.cbegin(), tokens.cend());
        }

@@ -3667,10 +3685,10 @@ int main(int argc, char ** argv) {
        json prompt;
        if (body.count("input") != 0) {
            is_openai = true;
-            prompt = body["input"];
+            prompt = body.at("input");
        } else if (body.count("content") != 0) {
            // with "content", we only support single prompt
-            prompt = std::vector<std::string>{body["content"]};
+            prompt = std::vector<std::string>{body.at("content")};
        } else {
            res_error(res, format_error_response("\"input\" or \"content\" must be provided", ERROR_TYPE_INVALID_REQUEST));
            return;
@@ -3689,7 +3707,7 @@ int main(int argc, char ** argv) {
            if (!result.error) {
                if (result.data.count("results")) {
                    // result for multi-task
-                    responses = result.data["results"];
+                    responses = result.data.at("results");
                } else {
                    // result for single task
                    responses = std::vector<json>{result.data};
--- a/examples/server/tests/features/server.feature
+++ b/examples/server/tests/features/server.feature
@@ -7,6 +7,7 @@ Feature: llama.cpp server
    And   a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
    And   a model file test-model.gguf
    And   a model alias tinyllama-2
+    And   BOS token is 1
    And   42 as server seed
      # KV Cache corresponds to the total amount of tokens
      # that can be stored across all independent sequences: #4130
@@ -91,7 +92,18 @@ Feature: llama.cpp server
    """
    What is the capital of France ?
    """
-    Then tokens can be detokenize
+    Then tokens can be detokenized
+    And  tokens do not begin with BOS
+
+  Scenario: Tokenize w/ BOS
+    Given adding special tokens
+    When  tokenizing:
+    """
+    What is the capital of Germany?
+    """
+    Then  tokens begin with BOS
+    Given first token is removed
+    Then  tokens can be detokenized

  Scenario: Models available
    Given available models
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -376,6 +376,11 @@ def step_seed(context, seed):
        context.seed.append(seed)


+@step('BOS token is {bos:d}')
+def step_bos_token(context, bos):
+    context.bos = bos
+
+
@step('a prefix prompt')
 def step_prompt_prefix(context):
    context.prompt_prefix = context_text(context)
@@ -656,21 +661,29 @@ async def all_embeddings_are_generated(context):
        assert_embeddings(context.tasks_result.pop().pop())


+@step('adding special tokens')
+def step_tokenize_set_add_special(context):
+    context.tokenize_add_special = True
+
+
@step('tokenizing')
@async_run_until_complete
 async def step_tokenize(context):
    context.tokenized_text = context_text(context)
    async with aiohttp.ClientSession() as session:
+        tokenize_args = {
+            "content": context.tokenized_text,
+        }
+        if getattr(context, 'tokenize_add_special', None) is not None:
+            tokenize_args['add_special'] = context.tokenize_add_special
        async with session.post(f'{context.base_url}/tokenize',
-                                json={
-                                    "content": context.tokenized_text,
-                                }) as response:
+                                json=tokenize_args) as response:
            assert response.status == 200
            tokenize_json = await response.json()
            context.tokens = tokenize_json['tokens']


-@step('tokens can be detokenize')
+@step('tokens can be detokenized')
@async_run_until_complete
 async def step_detokenize(context):
    assert len(context.tokens) > 0
@@ -685,6 +698,21 @@ async def step_detokenize(context):
            assert context.tokenized_text == detokenize_json['content'].strip()


+@step('tokens begin with BOS')
+def step_tokens_begin_with_bos(context):
+    assert context.tokens[0] == context.bos  # first token must equal the BOS id stored in context.bos
+
+
+@step('tokens do not begin with BOS')
+def step_tokens_do_not_begin_with_bos(context):
+    assert context.tokens[0] != context.bos  # first token must differ from the BOS id stored in context.bos
+
+
+@step('first token is removed')
+def step_remove_first_token(context):
+    context.tokens = context.tokens[1:]  # drop the leading token; the rest stays in context.tokens
+
+
@step('an OPTIONS request is sent from {origin}')
@async_run_until_complete
 async def step_options_request(context, origin):
--- a/examples/server/themes/README.md
+++ b/examples/server/themes/README.md
@@ -0,0 +1,5 @@
+# LLaMA.cpp Server Themes
+
+A directory of sample "public" directories (themes) for the server UI. To try any of these, add `--path` to your run, like `server --path=wild`.
+
+![image](wild/wild.png)
--- a/examples/server/themes/buttons-top/README.md
+++ b/examples/server/themes/buttons-top/README.md
@@ -0,0 +1,7 @@
+# LLaMA.cpp Server Buttons Top Theme
+
+Simple tweaks to the UI. Chat buttons at the top of the page instead of bottom so you can hit Stop instead of chasing it down the page.
+
+To use simply run server with `--path=themes/buttons-top`
+
+![image](buttons_top.png)
--- a/examples/server/themes/buttons-top/buttons_top.png
+++ b/examples/server/themes/buttons-top/buttons_top.png
--- a/examples/server/themes/buttons-top/favicon.ico
+++ b/examples/server/themes/buttons-top/favicon.ico
--- a/examples/server/themes/buttons-top/index.html
+++ b/examples/server/themes/buttons-top/index.html
--- a/examples/server/themes/wild/README.md
+++ b/examples/server/themes/wild/README.md
@@ -0,0 +1,5 @@
+# LLaMA.cpp Server Wild Theme
+
+Simple tweaks to the UI. To use simply run server with `--path=themes/wild`
+
+![image](wild.png)
--- a/examples/server/themes/wild/favicon.ico
+++ b/examples/server/themes/wild/favicon.ico
--- a/examples/server/themes/wild/index.html
+++ b/examples/server/themes/wild/index.html
--- a/examples/server/themes/wild/llama_cpp.png
+++ b/examples/server/themes/wild/llama_cpp.png
--- a/examples/server/themes/wild/llamapattern.png
+++ b/examples/server/themes/wild/llamapattern.png
--- a/examples/server/themes/wild/wild.png
+++ b/examples/server/themes/wild/wild.png
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -3,6 +3,8 @@
 #include "llama.h"
 #include "common.h"

+// Change JSON_ASSERT from assert() to GGML_ASSERT:
+#define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"

 #include <string>
@@ -49,18 +51,18 @@ extern bool server_log_json;
 #define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
 #define LOG_INFO(   MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)

-static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra);
+static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra);

 template <typename T>
-static T json_value(const json &body, const std::string &key, const T &default_value) {
+static T json_value(const json & body, const std::string & key, const T & default_value) {
    // Fallback null to default value
-    if (body.contains(key) && !body.at(key).is_null()){
+    if (body.contains(key) && !body.at(key).is_null()) {
        try {
-            return body.value(key, default_value);
-        }
-        catch (nlohmann::json_abi_v3_11_3::detail::type_error const&){
-            std::string message = "Wrong type supplied for parameter '" + key + "'. Expected '" + typeid(default_value).name() + "', using default value.";
-            server_log("WARN", __func__, __LINE__, message.c_str(), body);
+            return body.at(key);
+        } catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const &) {
+            std::stringstream ss;
+            ss << "Wrong type supplied for parameter '" << key << "'. Expected '" << json(default_value).type_name() << "', using default value.";
+            LOG_WARNING(ss.str().c_str(), body);
            return default_value;
        }
    } else {
@@ -68,16 +70,16 @@ static T json_value(const json &body, const std::string &key, const T &default_v
    }
 }

-static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) {
+static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra) {
    std::stringstream ss_tid;
    ss_tid << std::this_thread::get_id();
-    json log = nlohmann::ordered_json{
+    json log = json{
        {"tid",       ss_tid.str()},
        {"timestamp", time(nullptr)},
    };

    if (server_log_json) {
-        log.merge_patch( {
+        log.merge_patch({
            {"level",    level},
            {"function", function},
            {"line",     line},
@@ -98,7 +100,7 @@ static inline void server_log(const char *level, const char *function, int line,
        }
        std::stringstream ss;
        ss << buf << " |";
-        for (const auto& el : log.items())
+        for (const auto & el : log.items())
        {
            const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace);
            ss << " " << el.key() << "=" << value;
@@ -373,11 +375,11 @@ static json oaicompat_completion_params_parse(
    llama_params["top_p"]             = json_value(body,   "top_p",             1.0);

    // Apply chat template to the list of messages
-    llama_params["prompt"] = format_chat(model, chat_template, body["messages"]);
+    llama_params["prompt"] = format_chat(model, chat_template, body.at("messages"));

    // Handle "stop" field
-    if (body.contains("stop") && body["stop"].is_string()) {
-        llama_params["stop"] = json::array({body["stop"].get<std::string>()});
+    if (body.contains("stop") && body.at("stop").is_string()) {
+        llama_params["stop"] = json::array({body.at("stop").get<std::string>()});
    } else {
        llama_params["stop"] = json_value(body, "stop", json::array());
    }
--- a/examples/sycl/README.md
+++ b/examples/sycl/README.md
@@ -1,6 +1,6 @@
 # llama.cpp/example/sycl

-This example program provide the tools for llama.cpp for SYCL on Intel GPU.
+This example program provides the tools for llama.cpp for SYCL on Intel GPU.

 ## Tool

--- a/flake.lock
+++ b/flake.lock
@@ -5,11 +5,11 @@
        "nixpkgs-lib": "nixpkgs-lib"
      },
      "locked": {
-        "lastModified": 1712014858,
-        "narHash": "sha256-sB4SWl2lX95bExY2gMFG5HIzvva5AVMJd4Igm+GpZNw=",
+        "lastModified": 1714641030,
+        "narHash": "sha256-yzcRNDoyVP7+SCNX0wmuDju1NUCt8Dz9+lyUXEI0dbI=",
        "owner": "hercules-ci",
        "repo": "flake-parts",
-        "rev": "9126214d0a59633752a136528f5f3b9aa8565b7d",
+        "rev": "e5d10a24b66c3ea8f150e47dfdb0416ab7c3390e",
        "type": "github"
      },
      "original": {
@@ -20,11 +20,11 @@
    },
    "nixpkgs": {
      "locked": {
-        "lastModified": 1714076141,
-        "narHash": "sha256-Drmja/f5MRHZCskS6mvzFqxEaZMeciScCTFxWVLqWEY=",
+        "lastModified": 1714635257,
+        "narHash": "sha256-4cPymbty65RvF1DWQfc+Bc8B233A1BWxJnNULJKQ1EY=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "7bb2ccd8cdc44c91edba16c48d2c8f331fb3d856",
+        "rev": "63c3a29ca82437c87573e4c6919b09a24ea61b0f",
        "type": "github"
      },
      "original": {
@@ -36,20 +36,14 @@
    },
    "nixpkgs-lib": {
      "locked": {
-        "dir": "lib",
-        "lastModified": 1711703276,
-        "narHash": "sha256-iMUFArF0WCatKK6RzfUJknjem0H9m4KgorO/p3Dopkk=",
-        "owner": "NixOS",
-        "repo": "nixpkgs",
-        "rev": "d8fe5e6c92d0d190646fb9f1056741a229980089",
-        "type": "github"
+        "lastModified": 1714640452,
+        "narHash": "sha256-QBx10+k6JWz6u7VsohfSw8g8hjdBZEf8CFzXH1/1Z94=",
+        "type": "tarball",
+        "url": "https://github.com/NixOS/nixpkgs/archive/50eb7ecf4cd0a5756d7275c8ba36790e5bd53e33.tar.gz"
      },
      "original": {
-        "dir": "lib",
-        "owner": "NixOS",
-        "ref": "nixos-unstable",
-        "repo": "nixpkgs",
-        "type": "github"
+        "type": "tarball",
+        "url": "https://github.com/NixOS/nixpkgs/archive/50eb7ecf4cd0a5756d7275c8ba36790e5bd53e33.tar.gz"
      }
    },
    "root": {
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -113,7 +113,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
    for (int id = 0; id < info.device_count; ++id) {
        int device_vmm = 0;

-#if !defined(GGML_USE_HIPBLAS)
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
        CUdevice device;
        CU_CHECK(cuDeviceGet(&device, id));
        CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
@@ -259,7 +259,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
 };

 // pool with virtual memory
-#if !defined(GGML_USE_HIPBLAS)
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
 struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
    static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB

@@ -356,7 +356,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
-#endif // !defined(GGML_USE_HIPBLAS)
+#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)

 std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
-#if !defined(GGML_USE_HIPBLAS)
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
    if (ggml_cuda_info().devices[device].vmm) {
        return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
    }
--- a/ggml-impl.h
+++ b/ggml-impl.h
@@ -17,6 +17,83 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))

+/**
+ * Converts brain16 to float32.
+ *
+ * The bfloat16 floating point format has the following structure:
+ *
+ *       ┌sign
+ *       │
+ *       │   ┌exponent
+ *       │   │
+ *       │   │      ┌mantissa
+ *       │   │      │
+ *       │┌──┴───┐┌─┴───┐
+ *     0b0000000000000000 brain16
+ *
+ * Since bf16 has the same number of exponent bits as a 32bit float,
+ * encoding and decoding numbers becomes relatively straightforward.
+ *
+ *       ┌sign
+ *       │
+ *       │   ┌exponent
+ *       │   │
+ *       │   │      ┌mantissa
+ *       │   │      │
+ *       │┌──┴───┐┌─┴───────────────────┐
+ *     0b00000000000000000000000000000000 IEEE binary32
+ *
+ * For comparison, the standard fp16 format has fewer exponent bits.
+ *
+ *       ┌sign
+ *       │
+ *       │  ┌exponent
+ *       │  │
+ *       │  │    ┌mantissa
+ *       │  │    │
+ *       │┌─┴─┐┌─┴──────┐
+ *     0b0000000000000000 IEEE binary16
+ *
+ * @see IEEE 754-2008
+ */
+static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
+    union {
+        float f;
+        uint32_t i;
+    } u;
+    u.i = (uint32_t)h.bits << 16;
+    return u.f;
+}
+
+/**
+ * Converts float32 to brain16.
+ *
+ * This function is binary identical to AMD Zen4 VCVTNEPS2BF16.
+ * Subnormals shall be flushed to zero, and NANs will be quiet.
+ * This code should vectorize nicely if using modern compilers.
+ */
+static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
+    ggml_bf16_t h;
+    union {
+        float f;
+        uint32_t i;
+    } u;
+    u.f = s;
+    if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
+        h.bits = (u.i >> 16) | 64; /* force to quiet */
+        return h;
+    }
+    if (!(u.i & 0x7f800000)) { /* subnormal */
+        h.bits = (u.i & 0x80000000) >> 16; /* flush to zero */
+        return h;
+    }
+    h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
+    return h;
+}
+
+#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
+#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
+
 #ifdef __cplusplus
 extern "C" {
 #endif
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -265,11 +265,20 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){

 static void * ggml_metal_host_malloc(size_t n) {
    void * data = NULL;
+
+#if TARGET_OS_OSX
+    kern_return_t err = vm_allocate((vm_map_t) mach_task_self(), (void *) &data, n, VM_FLAGS_ANYWHERE);
+    if (err != KERN_SUCCESS) {
+        GGML_METAL_LOG_ERROR("%s: error: vm_allocate failed\n", __func__);
+        return NULL;
+    }
+#else
    const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
    if (result != 0) {
        GGML_METAL_LOG_ERROR("%s: error: posix_memalign failed\n", __func__);
        return NULL;
    }
+#endif

    return data;
 }
@@ -803,7 +812,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
        case GGML_OP_DIAG_MASK_INF:
        case GGML_OP_GET_ROWS:
            {
-                return op->ne[3] == 1;
+                return op->src[0]->type != GGML_TYPE_BF16 && op->ne[3] == 1;
            }
        default:
            return false;
@@ -2840,7 +2849,11 @@ GGML_CALL static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_
    ggml_backend_metal_free_device();

    if (ctx->owned) {
+#if TARGET_OS_OSX
+        vm_deallocate((vm_map_t)mach_task_self(), (vm_address_t)ctx->all_data, ctx->all_size);
+#else
        free(ctx->all_data);
+#endif
    }

    free(ctx);
@@ -2944,14 +2957,16 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buff
    ctx->owned = true;
    ctx->n_buffers = 1;

-    ctx->buffers[0].data = ctx->all_data;
-    ctx->buffers[0].size = size;
-    ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data
-                    length:size_aligned
-                    options:MTLResourceStorageModeShared
-                    deallocator:nil];
+    if (ctx->all_data != NULL) {
+        ctx->buffers[0].data = ctx->all_data;
+        ctx->buffers[0].size = size;
+        ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data
+                        length:size_aligned
+                        options:MTLResourceStorageModeShared
+                        deallocator:nil];
+    }

-    if (ctx->buffers[0].metal == nil) {
+    if (ctx->all_data == NULL || ctx->buffers[0].metal == nil) {
        GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
        free(ctx);
        ggml_backend_metal_free_device();
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -2175,7 +2175,7 @@ kernel void kernel_flash_attn_ext_f16(

    const short D4 = D/4;
    const short D8 = D/8;
-    const short Q8 = Q/8;
+  //const short Q8 = Q/8;
    const short NW = N_SIMDWIDTH;
    const short SH = (C + Q); // shared memory per simdgroup in (half)

--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -12450,6 +12450,24 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
    const size_t nb = nbytes/ggml_type_size(type);

    switch (type) {
+        case GGML_TYPE_BF16:
+            {
+                int nans = 0;
+                int infs = 0;
+                const unsigned short * f = (const unsigned short *) data;
+                for (size_t i = 0; i < nb; ++i) {
+                    nans += (f[i] & 0x7fff) > 0x7f80;
+                    infs += (f[i] & 0x7fff) == 0x7f80;
+                }
+                if (nans) {
+                    fprintf(stderr, "%s: found %d NaNs in row of %zu BF16 values\n", __func__, nans, nb);
+                    return false;
+                }
+                if (infs) {
+                    fprintf(stderr, "%s: found %d infinities in row of %zu BF16 values\n", __func__, infs, nb);
+                    return false;
+                }
+            } break;
        case GGML_TYPE_F16:
            {
                const ggml_fp16_t * f = (const ggml_fp16_t *) data;
--- a/ggml.c
+++ b/ggml.c
--- a/ggml.h
+++ b/ggml.h
@@ -326,14 +326,20 @@ extern "C" {
    // get ggml_status name string
    GGML_API GGML_CALL const char * ggml_status_to_string(enum ggml_status status);

+    // ieee 754-2008 half-precision float16
+    // todo: make this not an integral type
    typedef uint16_t ggml_fp16_t;
+    GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t);
+    GGML_API ggml_fp16_t ggml_fp32_to_fp16(float);
+    GGML_API void        ggml_fp16_to_fp32_row(const ggml_fp16_t *, float *, int64_t);
+    GGML_API void        ggml_fp32_to_fp16_row(const float *, ggml_fp16_t *, int64_t);

-    // convert FP16 <-> FP32
-    GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t x);
-    GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
-
-    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n);
-    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n);
+    // google brain half-precision bfloat16
+    typedef struct { uint16_t bits; } ggml_bf16_t;
+    GGML_API ggml_bf16_t ggml_fp32_to_bf16(float);
+    GGML_API float       ggml_bf16_to_fp32(ggml_bf16_t);  // consider just doing << 16
+    GGML_API void        ggml_bf16_to_fp32_row(const ggml_bf16_t *, float *, int64_t);
+    GGML_API void        ggml_fp32_to_bf16_row(const float *, ggml_bf16_t *, int64_t);

    struct ggml_object;
    struct ggml_context;
@@ -370,6 +376,7 @@ extern "C" {
        GGML_TYPE_I64     = 27,
        GGML_TYPE_F64     = 28,
        GGML_TYPE_IQ1_M   = 29,
+        GGML_TYPE_BF16    = 30,
        GGML_TYPE_COUNT,
    };

@@ -410,6 +417,7 @@ extern "C" {
        GGML_FTYPE_MOSTLY_IQ2_S   = 21, // except 1d tensors
        GGML_FTYPE_MOSTLY_IQ4_XS  = 22, // except 1d tensors
        GGML_FTYPE_MOSTLY_IQ1_M   = 23, // except 1d tensors
+        GGML_FTYPE_MOSTLY_BF16    = 24, // except 1d tensors
    };

    // available tensor operations:
--- a/ggml_vk_generate_shaders.py
+++ b/ggml_vk_generate_shaders.py
@@ -1,11 +1,14 @@
 #!/usr/bin/env python

+import logging
 import argparse
 import asyncio
 import os
 import sys
 from tempfile import gettempdir, NamedTemporaryFile

+logger = logging.getLogger("ggml-vk-generate-shaders")
+
 shader_f32 = """
 #define FLOAT_TYPE float
 """
@@ -2498,7 +2501,7 @@ async def string_to_spv(name, code, defines, fp16=True):

        stdout, stderr = await proc.communicate()

-        print(" ".join(cmd))
+        logger.info(" ".join(cmd))

        if proc.returncode:
            raise RuntimeError(f"{name=} {f.name=} {stdout=} {stderr=}")
@@ -2507,7 +2510,7 @@ async def string_to_spv(name, code, defines, fp16=True):

        cmd.extend([f"-D{key}={value}" for key, value in defines.items()])
        code_with_lines = "\n".join([f"{i + 1}: {line}" for i, line in enumerate(preprocessed_code.splitlines())])
-        print(f"ERROR compiling {name}\n\n{code_with_lines}\n\n{error}")
+        logger.error(f"cannot compile {name}\n\n{code_with_lines}\n\n{error}")
        f.close()
        os.remove(f.name)
        sys.exit(proc.returncode)
@@ -2520,7 +2523,7 @@ async def string_to_spv(name, code, defines, fp16=True):


 async def main():
-    print("ggml_vulkan: Generating and compiling shaders to SPIR-V")
+    logger.info("ggml_vulkan: Generating and compiling shaders to SPIR-V")

    tasks = []

@@ -2768,9 +2771,12 @@ if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="GGML Vulkan Shader Generator")

    parser.add_argument("--glslc", help="Path to glslc")
+    parser.add_argument("--verbose", action="store_true", help="increase output verbosity")

    args = parser.parse_args()

+    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
+
    if args.glslc:
        GLSLC = args.glslc

--- a/gguf-py/examples/reader.py
+++ b/gguf-py/examples/reader.py
@@ -1,8 +1,10 @@
 #!/usr/bin/env python3
+import logging
 import sys
 from pathlib import Path
 from gguf.gguf_reader import GGUFReader

+logger = logging.getLogger("reader")

 sys.path.insert(0, str(Path(__file__).parent.parent))

@@ -18,28 +20,28 @@ def read_gguf_file(gguf_file_path):
    reader = GGUFReader(gguf_file_path)

    # List all key-value pairs in a columnized format
-    print("Key-Value Pairs:")
+    print("Key-Value Pairs:") # noqa: NP100
    max_key_length = max(len(key) for key in reader.fields.keys())
    for key, field in reader.fields.items():
        value = field.parts[field.data[0]]
-        print(f"{key:{max_key_length}} : {value}")
-    print("----")
+        print(f"{key:{max_key_length}} : {value}") # noqa: NP100
+    print("----") # noqa: NP100

    # List all tensors
-    print("Tensors:")
+    print("Tensors:") # noqa: NP100
    tensor_info_format = "{:<30} | Shape: {:<15} | Size: {:<12} | Quantization: {}"
-    print(tensor_info_format.format("Tensor Name", "Shape", "Size", "Quantization"))
-    print("-" * 80)
+    print(tensor_info_format.format("Tensor Name", "Shape", "Size", "Quantization")) # noqa: NP100
+    print("-" * 80) # noqa: NP100
    for tensor in reader.tensors:
        shape_str = "x".join(map(str, tensor.shape))
        size_str = str(tensor.n_elements)
        quantization_str = tensor.tensor_type.name
-        print(tensor_info_format.format(tensor.name, shape_str, size_str, quantization_str))
+        print(tensor_info_format.format(tensor.name, shape_str, size_str, quantization_str)) # noqa: NP100


 if __name__ == '__main__':
    if len(sys.argv) < 2:
-        print("Usage: reader.py <path_to_gguf_file>")
+        logger.info("Usage: reader.py <path_to_gguf_file>")
        sys.exit(1)
    gguf_file_path = sys.argv[1]
    read_gguf_file(gguf_file_path)
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -1,6 +1,5 @@
 from __future__ import annotations

-import sys
 from enum import Enum, IntEnum, auto
 from typing import Any

@@ -818,6 +817,7 @@ class GGMLQuantizationType(IntEnum):
    I64     = 27
    F64     = 28
    IQ1_M   = 29
+    BF16    = 30


 class GGUFEndian(IntEnum):
@@ -854,8 +854,7 @@ class GGUFValueType(IntEnum):
            return GGUFValueType.INT32
        # TODO: need help with 64-bit types in Python
        else:
-            print("Unknown type:", type(val))
-            sys.exit()
+            raise ValueError(f"Unknown type: {type(val)}")


 # Note: Does not support GGML_QKK_64
@@ -890,6 +889,7 @@ GGML_QUANT_SIZES = {
    GGMLQuantizationType.I64:     (1, 8),
    GGMLQuantizationType.F64:     (1, 8),
    GGMLQuantizationType.IQ1_M:   (256, QK_K // 8 + QK_K // 16  + QK_K // 32),
+    GGMLQuantizationType.BF16:    (1, 2),
 }


--- a/gguf-py/gguf/gguf_reader.py
+++ b/gguf-py/gguf/gguf_reader.py
@@ -4,6 +4,7 @@
 #
 from __future__ import annotations

+import logging
 import os
 from collections import OrderedDict
 from typing import Any, Literal, NamedTuple, TypeVar, Union
@@ -27,6 +28,7 @@ from gguf.constants import (
    GGUFValueType,
 )

+logger = logging.getLogger(__name__)

 READER_SUPPORTED_VERSIONS = [2, GGUF_VERSION]

@@ -142,7 +144,7 @@ class GGUFReader:
            # TODO: add option to generate error on duplicate keys
            # raise KeyError(f'Duplicate {field.name} already in list at offset {field.offset}')

-            print(f'Warning: Duplicate key {field.name} at offset {field.offset}')
+            logger.warning(f'Duplicate key {field.name} at offset {field.offset}')
            self.fields[field.name + '_{}'.format(field.offset)] = field
        else:
            self.fields[field.name] = field
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -1,5 +1,6 @@
 from __future__ import annotations

+import logging
 import os
 import shutil
 import struct
@@ -24,6 +25,8 @@ from .constants import (
    TokenType,
 )

+logger = logging.getLogger(__name__)
+

 class WriterState(Enum):
    EMPTY   = auto()
@@ -67,7 +70,7 @@ class GGUFWriter:
        self.use_temp_file = use_temp_file
        self.temp_file = None
        self.tensors = []
-        print("gguf: This GGUF file is for {0} Endian only".format(
+        logger.info("gguf: This GGUF file is for {0} Endian only".format(
            "Big" if self.endianess == GGUFEndian.BIG else "Little",
        ))
        self.state = WriterState.EMPTY
--- a/gguf-py/gguf/vocab.py
+++ b/gguf-py/gguf/vocab.py
@@ -1,13 +1,15 @@
 from __future__ import annotations

+import logging
 import json
 import os
-import sys
 from pathlib import Path
 from typing import Any, Callable

 from .gguf_writer import GGUFWriter

+logger = logging.getLogger(__name__)
+

 class SpecialVocab:
    merges: list[str]
@@ -40,38 +42,29 @@ class SpecialVocab:
    def add_to_gguf(self, gw: GGUFWriter, quiet: bool = False) -> None:
        if self.merges:
            if not quiet:
-                print(f'gguf: Adding {len(self.merges)} merge(s).')
+                logger.info(f'Adding {len(self.merges)} merge(s).')
            gw.add_token_merges(self.merges)
        elif self.load_merges:
-            print(
-                'gguf: WARNING: Adding merges requested but no merges found, output may be non-functional.',
-                file = sys.stderr,
-            )
+            logger.warning('Adding merges requested but no merges found, output may be non-functional.')
        for typ, tokid in self.special_token_ids.items():
            id_handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None)
            if id_handler is None:
-                print(
-                    f'gguf: WARNING: No handler for special token type {typ} with id {tokid} - skipping',
-                    file = sys.stderr,
-                )
+                logger.warning(f'No handler for special token type {typ} with id {tokid} - skipping')
                continue
            if not quiet:
-                print(f'gguf: Setting special token type {typ} to {tokid}')
+                logger.info(f'Setting special token type {typ} to {tokid}')
            id_handler(tokid)
        for typ, value in self.add_special_token.items():
            add_handler: Callable[[bool], None] | None = getattr(gw, f'add_add_{typ}_token', None)
            if add_handler is None:
-                print(
-                    f'gguf: WARNING: No handler for add_{typ}_token with value {value} - skipping',
-                    file = sys.stderr,
-                )
+                logger.warning(f'No handler for add_{typ}_token with value {value} - skipping')
                continue
            if not quiet:
-                print(f'gguf: Setting add_{typ}_token to {value}')
+                logger.info(f'Setting add_{typ}_token to {value}')
            add_handler(value)
        if self.chat_template is not None:
            if not quiet:
-                print(f'gguf: Setting chat_template to {self.chat_template}')
+                logger.info(f'Setting chat_template to {self.chat_template}')
            gw.add_chat_template(self.chat_template)

    def _load(self, path: Path) -> None:
@@ -99,10 +92,7 @@ class SpecialVocab:
                    continue
                parts = line.split(None, 3)
                if len(parts) != 2:
-                    print(
-                        f'gguf: WARNING: {merges_file.name}: Line {line_num}: Entry malformed, ignoring',
-                        file = sys.stderr,
-                    )
+                    logger.warning(f'{merges_file.name}: Line {line_num}: Entry malformed, ignoring')
                    continue
                merges.append(f'{parts[0]} {parts[1]}')
        self.merges = merges
@@ -118,10 +108,7 @@ class SpecialVocab:
                return
            self.special_token_ids[typ] = tid
            return
-        print(
-            f'gguf: WARNING: Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping',
-            file = sys.stderr,
-        )
+        logger.warning(f'Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping')

    def _try_load_from_tokenizer_json(self, path: Path) -> bool:
        tokenizer_file = path / 'tokenizer.json'
@@ -144,10 +131,7 @@ class SpecialVocab:
        if chat_template is None or isinstance(chat_template, (str, list)):
            self.chat_template = chat_template
        else:
-            print(
-                f'gguf: WARNING: Bad type for chat_template field in {tokenizer_config_file!r} - ignoring',
-                file = sys.stderr
-            )
+            logger.warning(f'Bad type for chat_template field in {tokenizer_config_file!r} - ignoring')
        for typ in self.special_token_types:
            add_entry = tokenizer_config.get(f'add_{typ}_token')
            if isinstance(add_entry, bool):
--- a/gguf-py/scripts/gguf-convert-endian.py
+++ b/gguf-py/scripts/gguf-convert-endian.py
@@ -1,9 +1,11 @@
 #!/usr/bin/env python3
 from __future__ import annotations

+import logging
 import argparse
 import os
 import sys
+from tqdm import tqdm
 from pathlib import Path

 import numpy as np
@@ -14,6 +16,8 @@ if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent /

 import gguf

+logger = logging.getLogger("gguf-convert-endian")
+

 def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None:
    if np.uint32(1) == np.uint32(1).newbyteorder("<"):
@@ -29,11 +33,11 @@ def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None
    else:
        file_endian = host_endian
    order = host_endian if args.order == "native" else args.order
-    print(f"* Host is {host_endian.upper()} endian, GGUF file seems to be {file_endian.upper()} endian")
+    logger.info(f"* Host is {host_endian.upper()} endian, GGUF file seems to be {file_endian.upper()} endian")
    if file_endian == order:
-        print(f"* File is already {order.upper()} endian. Nothing to do.")
+        logger.info(f"* File is already {order.upper()} endian. Nothing to do.")
        sys.exit(0)
-    print("* Checking tensors for conversion compatibility")
+    logger.info("* Checking tensors for conversion compatibility")
    for tensor in reader.tensors:
        if tensor.tensor_type not in (
            gguf.GGMLQuantizationType.F32,
@@ -41,51 +45,64 @@ def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None
            gguf.GGMLQuantizationType.Q8_0,
        ):
            raise ValueError(f"Cannot handle type {tensor.tensor_type.name} for tensor {repr(tensor.name)}")
-    print(f"* Preparing to convert from {file_endian.upper()} to {order.upper()}")
+    logger.info(f"* Preparing to convert from {file_endian.upper()} to {order.upper()}")
    if args.dry_run:
        return
-    print("\n*** Warning *** Warning *** Warning **")
-    print("* This conversion process may damage the file. Ensure you have a backup.")
+    logger.warning("*** Warning *** Warning *** Warning **")
+    logger.warning("* This conversion process may damage the file. Ensure you have a backup.")
    if order != host_endian:
-        print("* Requested endian differs from host, you will not be able to load the model on this machine.")
-    print("* The file will be modified immediately, so if conversion fails or is interrupted")
-    print("* the file will be corrupted. Enter exactly YES if you are positive you want to proceed:")
+        logger.warning("* Requested endian differs from host, you will not be able to load the model on this machine.")
+    logger.warning("* The file will be modified immediately, so if conversion fails or is interrupted")
+    logger.warning("* the file will be corrupted. Enter exactly YES if you are positive you want to proceed:")
    response = input("YES, I am sure> ")
    if response != "YES":
-        print("You didn't enter YES. Okay then, see ya!")
+        logger.warning("You didn't enter YES. Okay then, see ya!")
        sys.exit(0)
-    print(f"\n* Converting fields ({len(reader.fields)})")
+    logger.info(f"* Converting fields ({len(reader.fields)})")
    for idx, field in enumerate(reader.fields.values()):
-        print(f"- {idx:4}: Converting field {repr(field.name)}, part count: {len(field.parts)}")
+        logger.info(f"- {idx:4}: Converting field {repr(field.name)}, part count: {len(field.parts)}")
        for part in field.parts:
            part.byteswap(inplace=True)
-    print(f"\n* Converting tensors ({len(reader.tensors)})")
-    for idx, tensor in enumerate(reader.tensors):
-        print(
-            f"  - {idx:4}: Converting tensor {repr(tensor.name)}, type={tensor.tensor_type.name}, "
-            f"elements={tensor.n_elements}... ",
-            end="",
+    logger.info(f"* Converting tensors ({len(reader.tensors)})")
+
+    for idx, tensor in enumerate(pbar := tqdm(reader.tensors, desc="Converting tensor")):
+        log_message = (
+            f"Converting tensor {repr(tensor.name)}, "
+            f"type={tensor.tensor_type.name}, "
+            f"elements={tensor.n_elements} "
        )
-        tensor_type = tensor.tensor_type
+
+        # Byte-swap each part of the tensor's field
        for part in tensor.field.parts:
            part.byteswap(inplace=True)
-        if tensor_type != gguf.GGMLQuantizationType.Q8_0:
+
+        # Byte-swap tensor data if necessary
+        if tensor.tensor_type == gguf.GGMLQuantizationType.Q8_0:
+            # Handle Q8_0 tensor blocks (block_q8_0)
+            # Specific handling of block_q8_0 is required.
+            # Each block_q8_0 consists of an f16 delta (scaling factor) followed by 32 int8 quantizations.
+
+            block_size = 34 # 34 bytes = <f16 delta scaling factor> + 32 * <int8 quant>
+
+            n_blocks = len(tensor.data) // block_size
+            for block_num in (inner_pbar := tqdm(range(n_blocks), desc="Byte-swapping Blocks", leave=False)):
+                block_offs = block_num * block_size
+
+                # Byte-Swap f16 sized delta field
+                delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16)
+                delta.byteswap(inplace=True)
+
+                # Byte-Swap Q8 weights
+                if block_num % 100000 == 0:
+                    inner_pbar.set_description(f"Byte-swapping Blocks [{(n_blocks - block_num) // n_blocks}]")
+
+        else:
+            # Handle other tensor types
            tensor.data.byteswap(inplace=True)
-            print()
-            continue
-        # A Q8_0 block consists of a f16 delta followed by 32 int8 quants, so 34 bytes
-        block_size = 34
-        n_blocks = len(tensor.data) // block_size
-        for block_num in range(n_blocks):
-            block_offs = block_num * block_size
-            # I know I said f16, but it doesn't matter here - any simple 16 bit type works.
-            delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16)
-            delta.byteswap(inplace=True)
-            if block_num % 100000 == 0:
-                print(f"[{(n_blocks - block_num) // 1000}K]", end="")
-                sys.stdout.flush()
-        print()
-    print("* Completion")
+
+        pbar.set_description(log_message)
+
+    logger.info("* Completion")


 def main() -> None:
@@ -102,8 +119,13 @@ def main() -> None:
        "--dry-run", action="store_true",
        help="Don't actually change anything",
    )
+    parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
+
    args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"])
-    print(f'* Loading: {args.model}')
+
+    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
+
+    logger.info(f'* Loading: {args.model}')
    reader = gguf.GGUFReader(args.model, 'r' if args.dry_run else 'r+')
    convert_byteorder(reader, args)

--- a/gguf-py/scripts/gguf-dump.py
+++ b/gguf-py/scripts/gguf-dump.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 from __future__ import annotations

+import logging
 import argparse
 import os
 import sys
@@ -15,6 +16,8 @@ if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent /

 from gguf import GGUFReader, GGUFValueType  # noqa: E402

+logger = logging.getLogger("gguf-dump")
+

 def get_file_host_endian(reader: GGUFReader) -> tuple[str, str]:
    host_endian = 'LITTLE' if np.uint32(1) == np.uint32(1).newbyteorder("<") else 'BIG'
@@ -29,8 +32,8 @@ def get_file_host_endian(reader: GGUFReader) -> tuple[str, str]:
 # please see the comments in the modify_gguf.py example.
 def dump_metadata(reader: GGUFReader, args: argparse.Namespace) -> None:
    host_endian, file_endian = get_file_host_endian(reader)
-    print(f'* File is {file_endian} endian, script is running on a {host_endian} endian host.')
-    print(f'\n* Dumping {len(reader.fields)} key/value pair(s)')
+    print(f'* File is {file_endian} endian, script is running on a {host_endian} endian host.')  # noqa: NP100
+    print(f'* Dumping {len(reader.fields)} key/value pair(s)')  # noqa: NP100
    for n, field in enumerate(reader.fields.values(), 1):
        if not field.types:
            pretty_type = 'N/A'
@@ -39,20 +42,21 @@ def dump_metadata(reader: GGUFReader, args: argparse.Namespace) -> None:
            pretty_type = '[' * nest_count + str(field.types[-1].name) + ']' * nest_count
        else:
            pretty_type = str(field.types[-1].name)
-        print(f'  {n:5}: {pretty_type:10} | {len(field.data):8} | {field.name}', end = '')
+
+        log_message = f'  {n:5}: {pretty_type:10} | {len(field.data):8} | {field.name}'
        if len(field.types) == 1:
            curr_type = field.types[0]
            if curr_type == GGUFValueType.STRING:
-                print(' = {0}'.format(repr(str(bytes(field.parts[-1]), encoding='utf8')[:60])), end = '')
+                log_message += ' = {0}'.format(repr(str(bytes(field.parts[-1]), encoding='utf8')[:60]))
            elif field.types[0] in reader.gguf_scalar_to_np:
-                print(' = {0}'.format(field.parts[-1][0]), end = '')
-        print()
+                log_message += ' = {0}'.format(field.parts[-1][0])
+        print(log_message)  # noqa: NP100
    if args.no_tensors:
        return
-    print(f'\n* Dumping {len(reader.tensors)} tensor(s)')
+    print(f'* Dumping {len(reader.tensors)} tensor(s)')  # noqa: NP100
    for n, tensor in enumerate(reader.tensors, 1):
        prettydims = ', '.join('{0:5}'.format(d) for d in list(tensor.shape) + [1] * (4 - len(tensor.shape)))
-        print(f'  {n:5}: {tensor.n_elements:10} | {prettydims} | {tensor.tensor_type.name:7} | {tensor.name}')
+        print(f'  {n:5}: {tensor.n_elements:10} | {prettydims} | {tensor.tensor_type.name:7} | {tensor.name}')  # noqa: NP100


 def dump_metadata_json(reader: GGUFReader, args: argparse.Namespace) -> None:
@@ -103,10 +107,17 @@ def main() -> None:
    parser.add_argument("--no-tensors", action="store_true", help="Don't dump tensor metadata")
    parser.add_argument("--json",       action="store_true", help="Produce JSON output")
    parser.add_argument("--json-array", action="store_true", help="Include full array values in JSON output (long)")
+    parser.add_argument("--verbose",    action="store_true", help="increase output verbosity")
+
    args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"])
+
+    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
+
    if not args.json:
-        print(f'* Loading: {args.model}')
+        logger.info(f'* Loading: {args.model}')
+
    reader = GGUFReader(args.model, 'r')
+
    if args.json:
        dump_metadata_json(reader, args)
    else:
--- a/gguf-py/scripts/gguf-set-metadata.py
+++ b/gguf-py/scripts/gguf-set-metadata.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+import logging
 import argparse
 import os
 import sys
@@ -10,6 +11,8 @@ if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent /

 from gguf import GGUFReader  # noqa: E402

+logger = logging.getLogger("gguf-set-metadata")
+

 def minimal_example(filename: str) -> None:
    reader = GGUFReader(filename, 'r+')
@@ -41,36 +44,33 @@ def minimal_example(filename: str) -> None:
 def set_metadata(reader: GGUFReader, args: argparse.Namespace) -> None:
    field = reader.get_field(args.key)
    if field is None:
-        print(f'! Field {repr(args.key)} not found', file = sys.stderr)
+        logger.error(f'! Field {repr(args.key)} not found')
        sys.exit(1)
    # Note that field.types is a list of types. This is because the GGUF
    # format supports arrays. For example, an array of UINT32 would
    # look like [GGUFValueType.ARRAY, GGUFValueType.UINT32]
    handler = reader.gguf_scalar_to_np.get(field.types[0]) if field.types else None
    if handler is None:
-        print(
-            f'! This tool only supports changing simple values, {repr(args.key)} has unsupported type {field.types}',
-            file = sys.stderr,
-        )
+        logger.error(f'! This tool only supports changing simple values, {repr(args.key)} has unsupported type {field.types}')
        sys.exit(1)
    current_value = field.parts[field.data[0]][0]
    new_value = handler(args.value)
-    print(f'* Preparing to change field {repr(args.key)} from {current_value} to {new_value}')
+    logger.info(f'* Preparing to change field {repr(args.key)} from {current_value} to {new_value}')
    if current_value == new_value:
-        print(f'- Key {repr(args.key)} already set to requested value {current_value}')
+        logger.info(f'- Key {repr(args.key)} already set to requested value {current_value}')
        sys.exit(0)
    if args.dry_run:
        sys.exit(0)
    if not args.force:
-        print('*** Warning *** Warning *** Warning **')
-        print('* Changing fields in a GGUF file can make it unusable. Proceed at your own risk.')
-        print('* Enter exactly YES if you are positive you want to proceed:')
+        logger.warning('*** Warning *** Warning *** Warning **')
+        logger.warning('* Changing fields in a GGUF file can make it unusable. Proceed at your own risk.')
+        logger.warning('* Enter exactly YES if you are positive you want to proceed:')
        response = input('YES, I am sure> ')
        if response != 'YES':
-            print("You didn't enter YES. Okay then, see ya!")
+            logger.info("You didn't enter YES. Okay then, see ya!")
            sys.exit(0)
    field.parts[field.data[0]][0] = new_value
-    print('* Field changed. Successful completion.')
+    logger.info('* Field changed. Successful completion.')


 def main() -> None:
@@ -80,8 +80,13 @@ def main() -> None:
    parser.add_argument("value",     type=str,            help="Metadata value to set")
    parser.add_argument("--dry-run", action="store_true", help="Don't actually change anything")
    parser.add_argument("--force",   action="store_true", help="Change the field without confirmation")
+    parser.add_argument("--verbose",      action="store_true",    help="increase output verbosity")
+
    args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"])
-    print(f'* Loading: {args.model}')
+
+    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
+
+    logger.info(f'* Loading: {args.model}')
    reader = GGUFReader(args.model, 'r' if args.dry_run else 'r+')
    set_metadata(reader, args)

--- a/grammars/README.md
+++ b/grammars/README.md
@@ -51,7 +51,7 @@ single-line ::= [^\n]+ "\n"`

 ## Sequences and Alternatives

-The order of symbols in a sequence matter. For example, in `"1. " move " " move "\n"`, the `"1. "` must come before the first `move`, etc.
+The order of symbols in a sequence matters. For example, in `"1. " move " " move "\n"`, the `"1. "` must come before the first `move`, etc.

 Alternatives, denoted by `|`, give different sequences that are acceptable. For example, in `move ::= pawn | nonpawn | castle`, `move` can be a `pawn` move, a `nonpawn` move, or a `castle`.

--- a/llama.cpp
+++ b/llama.cpp
@@ -3175,6 +3175,7 @@ struct llama_model_loader {
            switch (type_max) {
                case GGML_TYPE_F32:     ftype = LLAMA_FTYPE_ALL_F32;        break;
                case GGML_TYPE_F16:     ftype = LLAMA_FTYPE_MOSTLY_F16;     break;
+                case GGML_TYPE_BF16:    ftype = LLAMA_FTYPE_MOSTLY_BF16;    break;
                case GGML_TYPE_Q4_0:    ftype = LLAMA_FTYPE_MOSTLY_Q4_0;    break;
                case GGML_TYPE_Q4_1:    ftype = LLAMA_FTYPE_MOSTLY_Q4_1;    break;
                case GGML_TYPE_Q5_0:    ftype = LLAMA_FTYPE_MOSTLY_Q5_0;    break;
@@ -3666,6 +3667,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
    switch (ftype) {
        case LLAMA_FTYPE_ALL_F32:     return "all F32";
        case LLAMA_FTYPE_MOSTLY_F16:  return "F16";
+        case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
        case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
        case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
        case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
@@ -4383,6 +4385,21 @@ static void llm_load_vocab(
            } else if (
                    tokenizer_pre == "gpt-2") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
+            } else if (
+                    tokenizer_pre == "refact") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT;
+            } else if (
+                tokenizer_pre == "command-r") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
+            } else if (
+                tokenizer_pre == "qwen2") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+            } else if (
+                tokenizer_pre == "olmo") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
+            } else if (
+                tokenizer_pre == "dbrx") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
            } else {
                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
            }
@@ -6120,6 +6137,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
            || !(
                model.ftype == LLAMA_FTYPE_ALL_F32 ||
                model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
+                model.ftype == LLAMA_FTYPE_MOSTLY_BF16 ||
                model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
                model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
            )
@@ -11952,7 +11970,7 @@ static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id
 static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
    GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
    GGML_ASSERT(llama_is_byte_token(vocab, id));
-    const auto& token_data = vocab.id_to_token.at(id);
+    const auto & token_data = vocab.id_to_token.at(id);
    switch (llama_vocab_get_type(vocab)) {
        case LLAMA_VOCAB_TYPE_SPM: {
            auto buf = token_data.text.substr(3, 2);
@@ -12188,6 +12206,7 @@ struct llm_tokenizer_bpe {
            case LLAMA_VOCAB_TYPE_BPE:
                switch (vocab.type_pre) {
                    case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
+                    case LLAMA_VOCAB_PRE_TYPE_DBRX:
                        word_collection = unicode_regex_split(text, {
                            // original regex from tokenizer.json
                            //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
@@ -12212,14 +12231,13 @@ struct llm_tokenizer_bpe {
                            "\\s?\\p{L}+",
                            "\\s?\\p{P}+",
                            "[一-龥ࠀ-一가-퟿]+",
-                            "\\p{N}+",
+                            "\\p{N}",
                        });
                        break;
                    case LLAMA_VOCAB_PRE_TYPE_FALCON:
                        word_collection = unicode_regex_split(text, {
                            "[\\p{P}\\$\\+<=>\\^~\\|]+",
                            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
-                            "\\p{N}+",
                            "[0-9][0-9][0-9]",
                        });
                        break;
@@ -12235,11 +12253,26 @@ struct llm_tokenizer_bpe {
                        });
                        break;
                    case LLAMA_VOCAB_PRE_TYPE_STARCODER:
+                    case LLAMA_VOCAB_PRE_TYPE_REFACT:
+                    case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
+                        word_collection = unicode_regex_split(text, {
+                            "\\p{N}",
+                            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                        });
+                        break;
                    case LLAMA_VOCAB_PRE_TYPE_GPT2:
+                    case LLAMA_VOCAB_PRE_TYPE_OLMO:
                        word_collection = unicode_regex_split(text, {
                            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                        });
                        break;
+                    case LLAMA_VOCAB_PRE_TYPE_QWEN2:
+                        word_collection = unicode_regex_split(text, {
+                            // original regex from tokenizer.json
+                            // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+                            "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                        });
+                        break;
                    default:
                        // default regex for BPE tokenization pre-processing
                        word_collection = unicode_regex_split(text, {
@@ -14142,13 +14175,16 @@ static void llama_tensor_dequantize_internal(
        if (qtype.to_float == NULL) {
            throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
        }
-    } else if (tensor->type != GGML_TYPE_F16) {
+    } else if (tensor->type != GGML_TYPE_F16 &&
+               tensor->type != GGML_TYPE_BF16) {
        throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type)));
    }

    if (nthread < 2) {
        if (tensor->type == GGML_TYPE_F16) {
            ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements);
+        } else if (tensor->type == GGML_TYPE_BF16) {
+            ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
        } else if (ggml_is_quantized(tensor->type)) {
            qtype.to_float(tensor->data, f32_output, nelements);
        } else {
@@ -14157,7 +14193,14 @@ static void llama_tensor_dequantize_internal(
        return;
    }

-    size_t block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
+    size_t block_size;
+    if (tensor->type == GGML_TYPE_F16 ||
+        tensor->type == GGML_TYPE_BF16) {
+        block_size = 1;
+    } else {
+        block_size = (size_t)ggml_blck_size(tensor->type);
+    }
+
    size_t block_size_bytes = ggml_type_size(tensor->type);

    GGML_ASSERT(nelements % block_size == 0);
@@ -14176,6 +14219,8 @@ static void llama_tensor_dequantize_internal(
        auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
            if (typ == GGML_TYPE_F16) {
                ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
+            } else if (typ == GGML_TYPE_BF16) {
+                ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels);
            } else {
                qtype.to_float(inbuf, outbuf, nels);
            }
@@ -14536,6 +14581,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break;
        case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break;
        case LLAMA_FTYPE_MOSTLY_F16:  default_type = GGML_TYPE_F16;  break;
+        case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
        case LLAMA_FTYPE_ALL_F32:     default_type = GGML_TYPE_F32;  break;

        // K-quants
@@ -17466,9 +17512,10 @@ int32_t llama_tokenize(

 static std::string llama_decode_text(const std::string & text) {
    std::string decoded_text;
-    auto unicode_sequences = unicode_cpts_from_utf8(text);
-    for (auto & unicode_sequence : unicode_sequences) {
-        decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(unicode_sequence));
+
+    const auto cpts = unicode_cpts_from_utf8(text);
+    for (const auto cpt : cpts) {
+        decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(cpt));
    }

    return decoded_text;
--- a/llama.h
+++ b/llama.h
@@ -79,6 +79,11 @@ extern "C" {
        LLAMA_VOCAB_PRE_TYPE_MPT            = 5,
        LLAMA_VOCAB_PRE_TYPE_STARCODER      = 6,
        LLAMA_VOCAB_PRE_TYPE_GPT2           = 7,
+        LLAMA_VOCAB_PRE_TYPE_REFACT         = 8,
+        LLAMA_VOCAB_PRE_TYPE_COMMAND_R      = 9,
+        LLAMA_VOCAB_PRE_TYPE_QWEN2          = 10,
+        LLAMA_VOCAB_PRE_TYPE_OLMO           = 11,
+        LLAMA_VOCAB_PRE_TYPE_DBRX           = 12,
    };

    // note: these values should be synchronized with ggml_rope
@@ -134,6 +139,7 @@ extern "C" {
        LLAMA_FTYPE_MOSTLY_IQ2_M         = 29, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_IQ4_XS        = 30, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_IQ1_M         = 31, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_BF16          = 32, // except 1d tensors

        LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
    };
@@ -171,7 +177,7 @@ extern "C" {
        bool sorted;
    } llama_token_data_array;

-    typedef bool (*llama_progress_callback)(float progress, void *ctx);
+    typedef bool (*llama_progress_callback)(float progress, void * user_data);

    // Input data for llama_decode
    // A llama_batch object can contain input about one or many sequences
--- a/models/ggml-vocab-bert-bge.gguf.inp
+++ b/models/ggml-vocab-bert-bge.gguf.inp
@@ -1,3 +1,7 @@
+ied 4 ½ months
+__ggml_vocab_test__
+Führer
+__ggml_vocab_test__

 __ggml_vocab_test__
 
--- a/models/ggml-vocab-bert-bge.gguf.out
+++ b/models/ggml-vocab-bert-bge.gguf.out
@@ -1,3 +1,5 @@
+ 29464 2094 1018 1092 2706
+ 11865 17875



--- a/models/ggml-vocab-command-r.gguf
+++ b/models/ggml-vocab-command-r.gguf
--- a/models/ggml-vocab-command-r.gguf.inp
+++ b/models/ggml-vocab-command-r.gguf.inp
@@ -0,0 +1,106 @@
+ied 4 ½ months
+__ggml_vocab_test__
+Führer
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+ 
+__ggml_vocab_test__
+  
+__ggml_vocab_test__
+   
+__ggml_vocab_test__
+	
+__ggml_vocab_test__
+
+
+__ggml_vocab_test__
+
+
+
+__ggml_vocab_test__
+
+
+
+
+__ggml_vocab_test__
+	
+
+__ggml_vocab_test__
+Hello world
+__ggml_vocab_test__
+ Hello world
+__ggml_vocab_test__
+Hello World
+__ggml_vocab_test__
+ Hello World
+__ggml_vocab_test__
+ Hello World!
+__ggml_vocab_test__
+Hello, world!
+__ggml_vocab_test__
+ Hello, world!
+__ggml_vocab_test__
+ this is 🦙.cpp
+__ggml_vocab_test__
+w048 7tuijk dsdfhu
+__ggml_vocab_test__
+нещо на Български
+__ggml_vocab_test__
+កាន់តែពិសេសអាចខលចេញ
+__ggml_vocab_test__
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+ Hello
+__ggml_vocab_test__
+  Hello
+__ggml_vocab_test__
+   Hello
+__ggml_vocab_test__
+    Hello
+__ggml_vocab_test__
+    Hello
+    Hello
+__ggml_vocab_test__
+ (
+__ggml_vocab_test__
+
+ =
+__ggml_vocab_test__
+' era
+__ggml_vocab_test__
+Hello, y'all! How are you 😁 ?我想在apple工作1314151天～
+__ggml_vocab_test__
+3
+__ggml_vocab_test__
+33
+__ggml_vocab_test__
+333
+__ggml_vocab_test__
+3333
+__ggml_vocab_test__
+33333
+__ggml_vocab_test__
+333333
+__ggml_vocab_test__
+3333333
+__ggml_vocab_test__
+33333333
+__ggml_vocab_test__
+333333333
+__ggml_vocab_test__
+
+ 
+
+ 
+
+
+ 	 		 	
+  
+   
+    
+     
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
+__ggml_vocab_test__
--- a/models/ggml-vocab-command-r.gguf.out
+++ b/models/ggml-vocab-command-r.gguf.out
@@ -0,0 +1,43 @@
+ 2536 228 27 228 22957 6983
+ 45 193433
+
+ 228
+ 1667
+ 1742
+ 205
+ 206
+ 2126
+ 11516
+ 34777
+ 28339 3845
+ 46609 3845
+ 28339 3930
+ 46609 3930
+ 46609 3930 8
+ 28339 19 3845 8
+ 46609 19 3845 8
+ 2075 1801 11254 107 255 21 19317
+ 94 23 27 31 228 30 21213 20752 39267 6405 9980
+ 4929 40071 2196 3236 8750 1764 37097 41168
+ 38111 230 174833 38111 249 86325 241 38111 245 86325 232 38111 252 38111 123 38111 261 165 24629 38111 261 38111 103 174833 38111 235 38111 231 38111 257 38111 235 165 24629 38111 239
+ 2226 256 230 1737 18258 16 80503 122 35927 2226 242 112 57462 1737 54457 223165 106230 2096 16 48389 1737 10203 109160 1875 2222 2517 3342 12523 16
+ 28339
+ 46609
+ 228 46609
+ 1667 46609
+ 1742 46609
+ 1742 46609 1856 46609
+ 1737
+ 206 1857
+ 14 4515
+ 28339 19 1770 14 1954 8 4070 1955 1933 80503 231 5691 12081 13336 2648 29325 14315 24 26 24 27 24 28 24 5123 18372
+ 26
+ 26 26
+ 26 26 26
+ 26 26 26 26
+ 26 26 26 26 26
+ 26 26 26 26 26 26
+ 26 26 26 26 26 26 26
+ 26 26 26 26 26 26 26 26
+ 26 26 26 26 26 26 26 26 26
+ 127731 51628 205 57788 18494 97469 126134 206 2226 256 230 1737 18258 16 80503 122 35927 2226 242 112 57462 1737 54457 223165 106230 2096 16 48389 11254 107 255 2226 107 255 228 26 228 26 26 228 26 26 26 228 26 26 26 26 228 26 26 26 26 26 228 26 26 26 26 26 26 228 26 26 26 26 26 26 26 228 26 26 26 26 26 26 26 26 228 26 21 26 228 26 2271 26 228 26 3834 26 182018 230 174833 38111 249 86325 241 38111 245 86325 232 38111 252 38111 123 38111 261 165 24629 38111 261 38111 103 174833 38111 235 188568 231 5691 12081 13336 2648 29325 14315 24 26 24 27 24 28 24 5123 18372 8391 158343 3512 40071 2196 3236 8750 1764 37097 41168 29721 32797 25646 3802 4975 4975 116167 57178 10251 154048 27292 1767 5125 2632 2155 91 2378 1919 1914 2782 19 2155 3354 1933 5470 38 2155 52 2068 5470 1767 4961 3059 1894 19 2155 43 1933 3026 2725 23186 38 2930 14 20676 1671 14 83 51
--- a/models/ggml-vocab-deepseek-coder.gguf.inp
+++ b/models/ggml-vocab-deepseek-coder.gguf.inp
@@ -1,3 +1,7 @@
+ied 4 ½ months
+__ggml_vocab_test__
+Führer
+__ggml_vocab_test__

 __ggml_vocab_test__
 
--- a/models/ggml-vocab-deepseek-coder.gguf.out
+++ b/models/ggml-vocab-deepseek-coder.gguf.out
@@ -1,3 +1,5 @@
+ 1050 207 19 207 19192 4217
+ 37 32009 71 6247

 207
 243
--- a/models/ggml-vocab-deepseek-llm.gguf.inp
+++ b/models/ggml-vocab-deepseek-llm.gguf.inp
@@ -1,3 +1,7 @@
+ied 4 ½ months
+__ggml_vocab_test__
+Führer
+__ggml_vocab_test__

 __ggml_vocab_test__
 
--- a/models/ggml-vocab-deepseek-llm.gguf.out
+++ b/models/ggml-vocab-deepseek-llm.gguf.out
@@ -1,3 +1,5 @@
+ 1052 207 19 207 19109 4223
+ 37 100014 71 6245

 207
 243
--- a/models/ggml-vocab-falcon.gguf.inp
+++ b/models/ggml-vocab-falcon.gguf.inp
@@ -1,3 +1,7 @@
+ied 4 ½ months
+__ggml_vocab_test__
+Führer
+__ggml_vocab_test__

 __ggml_vocab_test__
 
--- a/models/ggml-vocab-falcon.gguf.out
+++ b/models/ggml-vocab-falcon.gguf.out
@@ -1,3 +1,5 @@
+ 878 204 31 3068 133 2137
+ 28611 132 30042

 204
 258
--- a/models/ggml-vocab-gpt-2.gguf.inp
+++ b/models/ggml-vocab-gpt-2.gguf.inp
@@ -1,3 +1,7 @@
+ied 4 ½ months
+__ggml_vocab_test__
+Führer
+__ggml_vocab_test__

 __ggml_vocab_test__
 
--- a/models/ggml-vocab-gpt-2.gguf.out
+++ b/models/ggml-vocab-gpt-2.gguf.out
@@ -1,3 +1,5 @@
+ 798 604 25208 1933
+ 37 9116 71 11751

 220
 220 220
--- a/models/ggml-vocab-llama-bpe.gguf.inp
+++ b/models/ggml-vocab-llama-bpe.gguf.inp
@@ -1,3 +1,7 @@
+ied 4 ½ months
+__ggml_vocab_test__
+Führer
+__ggml_vocab_test__

 __ggml_vocab_test__
 
--- a/models/ggml-vocab-llama-bpe.gguf.out
+++ b/models/ggml-vocab-llama-bpe.gguf.out
@@ -1,3 +1,5 @@
+ 1142 220 19 220 27154 4038
+ 37 51853 261

 220
 256
--- a/models/ggml-vocab-llama-spm.gguf.inp
+++ b/models/ggml-vocab-llama-spm.gguf.inp
@@ -1,3 +1,7 @@
+ied 4 ½ months
+__ggml_vocab_test__
+Führer
+__ggml_vocab_test__

 __ggml_vocab_test__
 
--- a/models/ggml-vocab-llama-spm.gguf.out
+++ b/models/ggml-vocab-llama-spm.gguf.out
@@ -1,3 +1,5 @@
+ 474 287 29871 29946 29871 30226 7378
+ 383 4000 261

 259
 1678
--- a/models/ggml-vocab-mpt.gguf.inp
+++ b/models/ggml-vocab-mpt.gguf.inp
@@ -1,3 +1,7 @@
+ied 4 ½ months
+__ggml_vocab_test__
+Führer
+__ggml_vocab_test__

 __ggml_vocab_test__
 
--- a/models/ggml-vocab-mpt.gguf.out
+++ b/models/ggml-vocab-mpt.gguf.out
@@ -1,3 +1,5 @@
+ 728 577 24142 2607
+ 39 26288 6554

 209
 50276
--- a/models/ggml-vocab-phi-3.gguf
+++ b/models/ggml-vocab-phi-3.gguf
--- a/models/ggml-vocab-phi-3.gguf.inp
+++ b/models/ggml-vocab-phi-3.gguf.inp
@@ -1,3 +1,7 @@
+ied 4 ½ months
+__ggml_vocab_test__
+Führer
+__ggml_vocab_test__

 __ggml_vocab_test__
 
--- a/models/ggml-vocab-phi-3.gguf.out
+++ b/models/ggml-vocab-phi-3.gguf.out
@@ -1,3 +1,5 @@
+ 474 287 29871 29946 29871 30226 7378
+ 383 4000 261

 259
 1678
--- a/models/ggml-vocab-qwen2.gguf
+++ b/models/ggml-vocab-qwen2.gguf
--- a/models/ggml-vocab-qwen2.gguf.inp
+++ b/models/ggml-vocab-qwen2.gguf.inp
@@ -0,0 +1,106 @@
+ied 4 ½ months
+__ggml_vocab_test__
+Führer
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+ 
+__ggml_vocab_test__
+  
+__ggml_vocab_test__
+   
+__ggml_vocab_test__
+	
+__ggml_vocab_test__
+
+
+__ggml_vocab_test__
+
+
+
+__ggml_vocab_test__
+
+
+
+
+__ggml_vocab_test__
+	
+
+__ggml_vocab_test__
+Hello world
+__ggml_vocab_test__
+ Hello world
+__ggml_vocab_test__
+Hello World
+__ggml_vocab_test__
+ Hello World
+__ggml_vocab_test__
+ Hello World!
+__ggml_vocab_test__
+Hello, world!
+__ggml_vocab_test__
+ Hello, world!
+__ggml_vocab_test__
+ this is 🦙.cpp
+__ggml_vocab_test__
+w048 7tuijk dsdfhu
+__ggml_vocab_test__
+нещо на Български
+__ggml_vocab_test__
+កាន់តែពិសេសអាចខលចេញ
+__ggml_vocab_test__
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+ Hello
+__ggml_vocab_test__
+  Hello
+__ggml_vocab_test__
+   Hello
+__ggml_vocab_test__
+    Hello
+__ggml_vocab_test__
+    Hello
+    Hello
+__ggml_vocab_test__
+ (
+__ggml_vocab_test__
+
+ =
+__ggml_vocab_test__
+' era
+__ggml_vocab_test__
+Hello, y'all! How are you 😁 ?我想在apple工作1314151天～
+__ggml_vocab_test__
+3
+__ggml_vocab_test__
+33
+__ggml_vocab_test__
+333
+__ggml_vocab_test__
+3333
+__ggml_vocab_test__
+33333
+__ggml_vocab_test__
+333333
+__ggml_vocab_test__
+3333333
+__ggml_vocab_test__
+33333333
+__ggml_vocab_test__
+333333333
+__ggml_vocab_test__
+
+ 
+
+ 
+
+
+ 	 		 	
+  
+   
+    
+     
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
+__ggml_vocab_test__
--- a/models/ggml-vocab-qwen2.gguf.out
+++ b/models/ggml-vocab-qwen2.gguf.out
@@ -0,0 +1,43 @@
+ 1122 220 19 220 26062 3951
+ 37 50753 261
+
+ 220
+ 256
+ 262
+ 197
+ 198
+ 271
+ 1406
+ 1572
+ 9707 1879
+ 21927 1879
+ 9707 4337
+ 21927 4337
+ 21927 4337 0
+ 9707 11 1879 0
+ 21927 11 1879 0
+ 419 374 11162 99 247 13 10821
+ 86 15 19 23 220 22 83 1963 41808 11472 2940 16739
+ 78762 14144 1456 13073 63471 33594 3038 133178 79012
+ 146394 97529 241 44258 233 146568 44258 224 147603 20879 115 146280 44258 223 146280 147272 97529 227 147805 148301 147270 44258 223 146848
+ 145836 320 8252 8 26525 114 378 235 149921 30543 320 35673 99066 97534 8 25521 227 320 3243 42365 429 702 1181 1828 3950 8
+ 9707
+ 21927
+ 220 21927
+ 256 21927
+ 262 21927
+ 262 21927 198 262 21927
+ 320
+ 198 284
+ 6 11385
+ 9707 11 379 64848 0 2585 525 498 26525 223 937 104100 18493 22377 99257 16 18 16 19 16 20 16 35727 21216
+ 18
+ 18 18
+ 18 18 18
+ 18 18 18 18
+ 18 18 18 18 18
+ 18 18 18 18 18 18
+ 18 18 18 18 18 18 18
+ 18 18 18 18 18 18 18 18
+ 18 18 18 18 18 18 18 18 18
+ 198 4710 14731 65497 7847 1572 2303 78672 10947 145836 320 8252 8 26525 114 378 235 149921 30543 320 35673 99066 97534 8 25521 227 11162 99 247 149955 220 18 220 18 18 220 18 18 18 220 18 18 18 18 220 18 18 18 18 18 220 18 18 18 18 18 18 220 18 18 18 18 18 18 18 220 18 18 18 18 18 18 18 18 220 18 13 18 220 18 496 18 220 18 1112 18 220 146394 97529 241 44258 233 146568 44258 224 147603 20879 115 146280 44258 223 146280 147272 97529 227 144534 937 104100 18493 22377 99257 16 18 16 19 16 20 16 35727 21216 55460 53237 18658 14144 1456 13073 63471 33594 3038 133178 79012 3355 4605 4605 13874 13874 73594 3014 3014 28149 17085 2928 26610 7646 358 3003 1012 364 83 813 566 594 1052 11 364 787 498 2704 30 364 44 537 2704 358 3278 1281 432 11 364 35 498 1075 1045 15243 30 1205 6 42612 264 63866 43
--- a/models/ggml-vocab-refact.gguf
+++ b/models/ggml-vocab-refact.gguf
--- a/models/ggml-vocab-refact.gguf.inp
+++ b/models/ggml-vocab-refact.gguf.inp
@@ -0,0 +1,106 @@
+ied 4 ½ months
+__ggml_vocab_test__
+Führer
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+ 
+__ggml_vocab_test__
+  
+__ggml_vocab_test__
+   
+__ggml_vocab_test__
+	
+__ggml_vocab_test__
+
+
+__ggml_vocab_test__
+
+
+
+__ggml_vocab_test__
+
+
+
+
+__ggml_vocab_test__
+	
+
+__ggml_vocab_test__
+Hello world
+__ggml_vocab_test__
+ Hello world
+__ggml_vocab_test__
+Hello World
+__ggml_vocab_test__
+ Hello World
+__ggml_vocab_test__
+ Hello World!
+__ggml_vocab_test__
+Hello, world!
+__ggml_vocab_test__
+ Hello, world!
+__ggml_vocab_test__
+ this is 🦙.cpp
+__ggml_vocab_test__
+w048 7tuijk dsdfhu
+__ggml_vocab_test__
+нещо на Български
+__ggml_vocab_test__
+កាន់តែពិសេសអាចខលចេញ
+__ggml_vocab_test__
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+ Hello
+__ggml_vocab_test__
+  Hello
+__ggml_vocab_test__
+   Hello
+__ggml_vocab_test__
+    Hello
+__ggml_vocab_test__
+    Hello
+    Hello
+__ggml_vocab_test__
+ (
+__ggml_vocab_test__
+
+ =
+__ggml_vocab_test__
+' era
+__ggml_vocab_test__
+Hello, y'all! How are you 😁 ?我想在apple工作1314151天～
+__ggml_vocab_test__
+3
+__ggml_vocab_test__
+33
+__ggml_vocab_test__
+333
+__ggml_vocab_test__
+3333
+__ggml_vocab_test__
+33333
+__ggml_vocab_test__
+333333
+__ggml_vocab_test__
+3333333
+__ggml_vocab_test__
+33333333
+__ggml_vocab_test__
+333333333
+__ggml_vocab_test__
+
+ 
+
+ 
+
+
+ 	 		 	
+  
+   
+    
+     
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
+__ggml_vocab_test__
--- a/models/ggml-vocab-refact.gguf.out
+++ b/models/ggml-vocab-refact.gguf.out
@@ -0,0 +1,43 @@
+ 4833 225 38 225 143 140 17723
+ 56 2006 3935 265
+
+ 225
+ 261
+ 264
+ 202
+ 203
+ 478
+ 2831
+ 15773
+ 8279 5788
+ 12000 5788
+ 8279 10896
+ 12000 10896
+ 12000 10896 19
+ 8279 30 5788 19
+ 12000 30 5788 19
+ 458 438 5945 118 252 32 3766
+ 105 34 38 42 225 41 102 1707 12530 10180 1479 8278
+ 39862 8372 1039 9446 40242 13852 2053 8949 12531 1520 10700
+ 14574 227 14574 133 14574 246 30457 238 14574 242 30457 229 14574 249 14574 134 14574 258 30457 228 14574 258 14574 114 14574 133 14574 232 14574 228 14574 254 14574 232 30457 228 14574 236
+ 3807 253 227 308 4382 27 18458 133 46113 44967 123 13868 308 12565 19775 33071 40824 733 27 41889 308 2585 22680 688 1401 2819 4369 2404 27
+ 8279
+ 12000
+ 225 12000
+ 261 12000
+ 264 12000
+ 264 12000 284 12000
+ 308
+ 203 280
+ 25 34666
+ 8279 30 533 25 464 19 4971 884 844 18458 228 1018 4982 13368 2909 9513 17827 35 37 35 38 35 39 35 11873 47838
+ 37
+ 37 37
+ 37 37 37
+ 37 37 37 37
+ 37 37 37 37 37
+ 37 37 37 37 37 37
+ 37 37 37 37 37 37 37
+ 37 37 37 37 37 37 37 37
+ 37 37 37 37 37 37 37 37 37
+ 334 719 8878 202 10885 4222 16104 28570 203 3807 253 227 308 4382 27 18458 133 46113 44967 123 13868 308 12565 19775 33071 40824 733 27 41889 5945 118 252 3807 118 252 225 37 225 37 37 225 37 37 37 225 37 37 37 37 225 37 37 37 37 37 225 37 37 37 37 37 37 225 37 37 37 37 37 37 37 225 37 37 37 37 37 37 37 37 225 37 32 37 225 37 497 37 225 37 1179 37 225 14574 227 14574 133 14574 246 30457 238 14574 242 30457 229 14574 249 14574 134 14574 258 30457 228 14574 258 14574 114 14574 133 14574 232 36628 228 1018 4982 13368 2909 9513 17827 35 37 35 38 35 39 35 11873 47838 20921 16623 13028 8372 1039 9446 40242 13852 2053 8949 12531 1520 10700 5881 9592 13299 914 31753 31359 9163 3202 35472 10397 439 4763 2583 330 102 1455 938 1182 2017 30 330 613 844 3654 49 330 63 646 3654 439 4621 1930 561 30 330 54 844 2124 1629 35993 49 2688 25 7709 312 25 94 62
--- a/models/ggml-vocab-starcoder.gguf.inp
+++ b/models/ggml-vocab-starcoder.gguf.inp
@@ -1,3 +1,7 @@
+ied 4 ½ months
+__ggml_vocab_test__
+Führer
+__ggml_vocab_test__

 __ggml_vocab_test__
 
--- a/models/ggml-vocab-starcoder.gguf.out
+++ b/models/ggml-vocab-starcoder.gguf.out
@@ -1,3 +1,5 @@
+ 4850 244 57 244 162 159 17722
+ 75 2022 3943 284

 244
 280
--- a/requirements/requirements-convert.txt
+++ b/requirements/requirements-convert.txt
@@ -1,5 +1,5 @@
 numpy~=1.24.4
 sentencepiece~=0.1.98
-transformers>=4.35.2,<5.0.0
+transformers>=4.40.1,<5.0.0
 gguf>=0.1.0
 protobuf>=4.21.0,<5.0.0
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Jared Van Bortel	494f70f939	cmake : fix typo	2024-05-08 16:24:02 -04:00
Johannes Gäßler	c12452c7ae	JSON: [key] -> .at(key), assert() -> GGML_ASSERT (#7143 )	2024-05-08 21:53:08 +02:00
Georgi Gerganov	9da243b36a	Revert "llava : add support for moondream vision language model (#6899 )" This reverts commit `46e12c4692`.	2024-05-08 22:14:39 +03:00
JohnnyB	bd1871fa2b	server : add themes + favicon (#6848 ) * Added themes support with two sample themes and a favicon. * Newline * Newline * Newline * Trailing whitespace * Increased opacity for contrast * Increase opacity. Check actions cancelled for some other priority job and I can't seem to manually re-run them, so MOAR OPACITY * Opacity action trigger. Trying to re-trigger the cancelled action. * One more opacity adjustment This Actions pipeline is failing for random issues. * Delete examples/server/themes/buttons_top/completion.js This will be served from the static string built-in to server. * Delete examples/server/themes/buttons_top/index.js This will be served from the static string built-in to server. * Delete examples/server/themes/wild/completion.js This will be served from the static string built-in to server. * Delete examples/server/themes/buttons_top/json-schema-to-grammar.mjs This will be served from the static string built-in to server. * Delete examples/server/themes/wild/index.js This will be served from the static string built-in to server. * Delete examples/server/themes/wild/json-schema-to-grammar.mjs This will be served from the static string built-in to server. * Replaced underscore.	2024-05-08 22:12:06 +03:00
Gilad S	26458af1d6	metal : use `vm_allocate` instead of `posix_memalign` on macOS (#7078 ) * fix: use `malloc` instead of `posix_memalign` in `ggml-metal.m` to make it not crash Electron proccesses * fix: typo * fix: use `vm_allocate` instead of `posix_memalign` * fix: don't call `newBufferWithBytesNoCopy` with `NULL` when `ggml_metal_host_malloc` returns `NULL` * fix: use `vm_allocate` only on macOS	2024-05-08 22:08:10 +03:00
Dawid Potocki	83330d8cd6	main : add --conversation / -cnv flag (#7108 )	2024-05-08 17:32:32 +03:00
Eve	465263d0cf	sgemm : AVX Q4_0 and Q8_0 (#6891 ) * basic avx implementation * style * combine denibble with load * reduce 256 to 128 (and back!) conversions * sse load * Update sgemm.cpp * oops oops	2024-05-08 17:29:23 +03:00
Johan	911b3900dd	server : add_special option for tokenize endpoint (#7059 )	2024-05-08 15:27:58 +03:00
20kdc	ad211edef5	convert.py : --vocab-only generates false but valid params (#7027 ) An example of how this might be used in the style of baby-llama will be attached with this PR.	2024-05-08 15:22:32 +03:00
Ren Xuancheng	229ffff872	llama : add BPE pre-tokenization for Qwen2 (#7114 ) * Add BPE pre-tokenization for Qwen2. * minor : fixes --------- Co-authored-by: Ren Xuancheng <17811943+jklj077@users.noreply.github.com> Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2024-05-08 15:06:43 +03:00
Xuan Son Nguyen	1fd9c1741d	clean up json_value & server_log (#7142 )	2024-05-08 13:24:14 +02:00
DAN™	4cd621c26d	convert : add BPE pre-tokenization for DBRX (#7132 ) * Add BPE pre-tokenization for DBRX. * Add vocab GGUFs. * Remove test. * Remove GGUFs.	2024-05-08 13:43:23 +03:00
Georgi Gerganov	7e0b6a7b3b	py : also print the normalizers	2024-05-08 12:47:07 +03:00
Brian	acdce3cdef	compare-llama-bench.py: add missing basicConfig (#7138 ) * compare-llama-bench.py: add missing basicConfig * compare-llama-bench.py: Add line break between error message and print_help() * Add regular print() markdown table	2024-05-08 10:54:39 +02:00
Justine Tunney	3855416027	ggml : introduce bfloat16 support (#6412 ) * Introduce bfloat16 support Many models on Hugging Face (e.g. Mistral, TinyLLaMA) use bfloat16 as their canonical floating point format. ┌sign │ │ ┌exponent │ │ │ │ ┌mantissa │ │ │ │┌──┴───┐┌─┴───┐ 0b0000000000000000 brain16 This encoding has the same number of exponent bits as float32. That makes conversion relatively straightforward, even in the absence of hardware support. For example, converting brain16 to binary32 means simply shifting 16 bits to the left. ┌sign │ │ ┌exponent │ │ │ │ ┌mantissa │ │ │ │┌──┴───┐┌─┴───────────────────┐ 0b00000000000000000000000000000000 IEEE binary32 The issue is that converting bf16 to fp16 can result in information loss. Only 13% of bf16 numbers can be precisely represented in fp16 which in practice ends up being 99.71% of Mistral 7b v0.2's weights however there is currently no way other than fp32 to get the others ┌sign │ │ ┌exponent │ │ │ │ ┌mantissa │ │ │ │┌─┴─┐┌─┴──────┐ 0b0000000000000000 IEEE binary16 This change fixes that, by adding a bf16 data type to GGML. Support for CPU inference has been implemented along with optimizations for the AVX2, AVX512, and AVX512BF16 ISAs. Perplexity on Mistral 7b 0.2 improves somewhere around -0.0024 to -0.0046 compared to using fp16 * Remove GGML code that's not needed * Minimize the GGML API surface area for BF16 * Remove bf16 luts * Make the GGML header look nicer * Fix documentation * Apply ggerganov's fixes for test-backend-ops * Add BF16 code for new ggml_validate_row_data() function	2024-05-08 09:30:09 +03:00
Georgi Gerganov	c0e6fbf8c3	metal : fix unused warning	2024-05-08 09:14:50 +03:00
Jeximo	c780e75305	Further tidy on Android instructions README.md (#7077 ) * Further tidy on Android instructions README.md Fixed some logic when following readme direction * Clean up redundent information A new user arriving will see simple directions on llama.cpp homepage * corrected puncuation Period after cmake, colon after termux * re-word for clarity method seems to be more correct, instead of alternative in this context * Organized required packages per build type building llama.cpp with NDK on a pc doesn't require installing clang, cmake, git, or wget in termux. * README.md corrected title * fix trailing whitespace	2024-05-08 02:26:43 +02:00
jukofyork	48b2f9c1fc	Fixed save_imatrix to match old behaviour for MoE (#7099 ) * Fixed save_imatrix to match old behaviour for MoE This fix is simple and clear, but unnecessarily doubles the memory overhead.. * Fixed missing idx variable * Unconditionally increment ncall Co-authored-by: slaren <slarengh@gmail.com> * Fixed 2 bugs in save_imatrix() - Fixed segfault bug because the counts vector needed to be created. - Fixed pre-existing bug didn't actually add to the counts for "--combine" option. * ncall needs summing too * Trailing whitespace --------- Co-authored-by: slaren <slarengh@gmail.com>	2024-05-08 02:24:16 +02:00
Johannes Gäßler	af0a5b6163	server: fix incorrectly reported token probabilities (#7125 ) * server: normalize token probabilities * fix temperature == 0.0f	2024-05-07 23:07:58 +02:00
nopperl	b6aa670203	Fix OLMo HF to GGUF conversion (#6910 )	2024-05-07 21:39:43 +02:00
Kyle Mistele	260b7c6529	server : update readme with undocumented options (#7013 )	2024-05-07 21:44:29 +03:00
Georgi Gerganov	53d6c52e22	readme : update hot topics	2024-05-07 21:43:13 +03:00
RhinoDevel	3af34c1d1b	main : update log text (EOS to EOG) (#7104 ) * Update log text (EOS to EOG) The log text "found EOS" is no longer always correct, here, because there is now an is-EOG check that also returns true for EOT. * Improve log msg. further by using "an" instead of "some". As suggested, to avoid misunderstanding (no multiple EOG tokens found, just one).	2024-05-07 20:51:31 +03:00
omahs	04976db7a8	docs: fix typos (#7124 ) * fix typo * fix typos * fix typo * fix typos * fix typo * fix typos	2024-05-07 18:20:33 +03:00
Georgi Gerganov	947d3ad27d	ci : add GG_BUILD_EXTRA_TESTS_0 env (#7098 ) * ci : add GG_BUILD_EXTRA_TESTS_0 env ggml-ci * Update run.sh ggml-ci	2024-05-07 11:08:49 +03:00
William Tambellini	858f6b73f6	Add an option to build without CUDA VMM (#7067 ) Add an option to build ggml cuda without CUDA VMM resolves https://github.com/ggerganov/llama.cpp/issues/6889 https://forums.developer.nvidia.com/t/potential-nvshmem-allocated-memory-performance-issue/275416/4	2024-05-06 20:12:14 +02:00
Georgi Gerganov	b3a995b416	flake.lock: Update (#7079 ) Flake lock file updates: • Updated input 'flake-parts': 'github:hercules-ci/flake-parts/9126214d0a59633752a136528f5f3b9aa8565b7d?narHash=sha256-sB4SWl2lX95bExY2gMFG5HIzvva5AVMJd4Igm%2BGpZNw%3D' (2024-04-01) → 'github:hercules-ci/flake-parts/e5d10a24b66c3ea8f150e47dfdb0416ab7c3390e?narHash=sha256-yzcRNDoyVP7%2BSCNX0wmuDju1NUCt8Dz9%2BlyUXEI0dbI%3D' (2024-05-02) • Updated input 'flake-parts/nixpkgs-lib': 'github:NixOS/nixpkgs/d8fe5e6c92d0d190646fb9f1056741a229980089?dir=lib&narHash=sha256-iMUFArF0WCatKK6RzfUJknjem0H9m4KgorO/p3Dopkk%3D' (2024-03-29) → '`50eb7ecf4c`.tar.gz?narHash=sha256-QBx10%2Bk6JWz6u7VsohfSw8g8hjdBZEf8CFzXH1/1Z94%3D' (2024-05-02) • Updated input 'nixpkgs': 'github:NixOS/nixpkgs/7bb2ccd8cdc44c91edba16c48d2c8f331fb3d856?narHash=sha256-Drmja/f5MRHZCskS6mvzFqxEaZMeciScCTFxWVLqWEY%3D' (2024-04-25) → 'github:NixOS/nixpkgs/63c3a29ca82437c87573e4c6919b09a24ea61b0f?narHash=sha256-4cPymbty65RvF1DWQfc%2BBc8B233A1BWxJnNULJKQ1EY%3D' (2024-05-02) Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>	2024-05-06 08:36:06 -07:00
Georgi Gerganov	bcdee0daa7	minor : fix trailing whitespace	2024-05-06 09:31:30 +03:00
kunnis	628b299106	Adding support for the --numa argument for llama-bench. (#7080 )	2024-05-05 14:17:47 +02:00
Sigbjørn Skjæret	8f8acc8683	Disable benchmark on forked repo (#7034 ) * Disable benchmark on forked repo * only check owner on schedule event * check owner on push also * more readable as multi-line * ternary won't work * style++ * test++ * enable actions debug * test-- * remove debug * test++ * do debug where we can get logs * test-- * this is driving me crazy * correct github.event usage * remove test condition * correct github.event usage * test++ * test-- * event_name is pull_request_target * test++ * test-- * update ref checks	2024-05-05 13:38:55 +02:00
Lyle Dean	ca36326020	readme : add note that LLaMA 3 is not supported with convert.py (#7065 )	2024-05-05 08:21:46 +03:00
DAN™	889bdd7686	command-r : add BPE pre-tokenization (#7063 ) * Add BPE pre-tokenization for Command-R/R+. * Bump transformers convert requirement. * command-r : add individual digits regex --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2024-05-05 08:19:30 +03:00
Brian	6fbd432211	py : logging and flake8 suppression refactoring (#7081 ) Set one as executable and add basicConfig() to another. Also added noqa tag to test scripts.	2024-05-05 08:07:48 +03:00
Xuan Son Nguyen	842500144e	gguf-split: add --no-tensor-first-split (#7072 )	2024-05-04 18:56:22 +02:00
Jeximo	cf768b7e71	Tidy Android Instructions README.md (#7016 ) * Tidy Android Instructions README.md Remove CLBlast instructions(outdated), added OpenBlas. * don't assume git is installed Added apt install git, so that git clone works * removed OpenBlas Linked to Linux build instructions * fix typo Remove word "run" * correct style Co-authored-by: slaren <slarengh@gmail.com> * correct grammar Co-authored-by: slaren <slarengh@gmail.com> * delete reference to Android API * remove Fdroid reference, link directly to Termux Fdroid is not required Co-authored-by: slaren <slarengh@gmail.com> * Update README.md Co-authored-by: slaren <slarengh@gmail.com> --------- Co-authored-by: slaren <slarengh@gmail.com>	2024-05-04 18:10:15 +02:00
viric	fcd84a0f5a	Fix Linux /sys cpu path to guess number of cores (#7064 )	2024-05-04 15:26:53 +02:00
maor-ps	03fb8a002d	If first token generated from the server is the stop word the server will crash (#7038 ) This will reproduce the issue in llama13b { 'prompt': 'Q: hello world \nA: ', 'stop': ['\n'], 'temperature': 0.0, 'n_predict': 10, 'cache_prompt': True, 'n_probs': 10 }	2024-05-04 11:06:40 +02:00
Georgi Gerganov	92139b90af	tests : add test-tokenizer-0.sh + fix some tokenizers (#7036 ) * tests : add test-tokenizer-0.sh * unicode : add all unicode number ranges * starcoder : fix pre-tokenizer * tests : add test that fails with DeepSeek tokenizers * falcon : fix regex * unicode : regenerate unicode tables * refact : add tokenizer model * lint : fix * tests : disable failing tests ggml-ci * refact : add tests files ggml-ci * convert : print -> logging ggml-ci * lint : fix * unicode : digit -> number * phi-3 : update	2024-05-04 08:32:32 +03:00
Brian	a2ac89d6ef	convert.py : add python logging instead of print() (#6511 ) * convert.py: add python logging instead of print() * convert.py: verbose flag takes priority over dump flag log suppression * convert.py: named instance logging * convert.py: use explicit logger id string * convert.py: convert extra print() to named logger * convert.py: sys.stderr.write --> logger.error * .py: Convert all python scripts to use logging module requirements.txt: remove extra line * flake8: update flake8 ignore and exclude to match ci settings * gh-actions: add flake8-no-print to flake8 lint step * pre-commit: add flake8-no-print to flake8 and also update pre-commit version * convert-hf-to-gguf.py: print() to logger conversion * .py: logging basiconfig refactor to use conditional expression .py: removed commented out logging fixup! .py: logging basiconfig refactor to use conditional expression constant.py: logger.error then exit should be a raise exception instead * .py: Convert logger error and sys.exit() into a raise exception (for atypical error) gguf-convert-endian.py: refactor convert_byteorder() to use tqdm progressbar * verify-checksum-model.py: This is the result of the program, it should be printed to stdout. * compare-llama-bench.py: add blank line for readability during missing repo response * reader.py: read_gguf_file() use print() over logging * convert.py: warning goes to stderr and won't hurt the dump output * gguf-dump.py: dump_metadata() should print to stdout * convert-hf-to-gguf.py: print --> logger.debug or ValueError() * verify-checksum-models.py: use print() for printing table * .py: refactor logging.basicConfig() gguf-py/gguf/.py: use __name__ as logger name Since they will be imported and not run directly. 
python-lint.yml: use .flake8 file instead * constants.py: logger no longer required * convert-hf-to-gguf.py: add additional logging * convert-hf-to-gguf.py: print() --> logger * .py: fix flake8 warnings revert changes to convert-hf-to-gguf.py for get_name() * convert-hf-to-gguf-update.py: use triple quoted f-string instead * .py: accidentally corrected the wrong line *.py: add compilade warning suggestions and style fixes	2024-05-03 22:36:41 +03:00
Daniel Bevenius	433def286e	llama : rename ctx to user_data in progress_callback (#7045 ) * llama : rename ctx to user_data in progress_callback This commit renames the `ctx` parameter to `user_data` in the `llama_progress_callback` typedef. The motivation for this is that other callbacks use `user_data` or `data`, and using `ctx` in this case might be confusing as it could be confused with `llama_context`. --------- Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>	2024-05-03 15:24:30 +02:00
Bartowski	60325fa56f	Remove .attention from skipped tensors to match more accurately (#7051 )	2024-05-03 01:49:09 +02:00