ggml : update comments [no ci]

ggml : remove ggml_cplan + rework ggml_cgraph
ggml-ci
2026-04-30 16:47:31 +03:00 · 2024-09-11 13:16:39 +03:00 · 2024-09-11 13:05:10 +03:00 · 2024-09-11 13:03:18 +03:00 · 2024-09-11 10:07:21 +03:00 · 2024-09-10 16:42:16 +03:00
87 changed files with 3029 additions and 3077 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -23,9 +23,6 @@ env:
  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
  GGML_NLOOP: 3
  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1

 jobs:
  macOS-latest-cmake-arm64:
@@ -378,7 +375,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3

      - name: Dependencies
        id: depends
@@ -404,7 +401,7 @@ jobs:
    continue-on-error: true

    steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v2

      - name: add oneAPI to apt
        shell: bash
@@ -445,7 +442,7 @@ jobs:
    continue-on-error: true

    steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v2

      - name: add oneAPI to apt
        shell: bash
@@ -549,7 +546,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v1

      - name: Dependencies
        id: depends
@@ -579,7 +576,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v1

      - name: Dependencies
        id: depends
@@ -613,7 +610,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v1

      - name: Dependencies
        id: depends
@@ -972,14 +969,14 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3

      - name: Install
        id: depends
        run: |
          $ErrorActionPreference = "Stop"
          write-host "Downloading AMD HIP SDK Installer"
-          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
+          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-23.Q4-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
          write-host "Installing AMD HIP SDK"
          Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
          write-host "Completed AMD HIP SDK installation"
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -20,12 +20,6 @@ on:
    types: [opened, synchronize, reopened]
    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']

-env:
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-  LLAMA_LOG_VERBOSITY: 10
-
 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true
@@ -179,7 +173,6 @@ jobs:
        if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
        run: |
          cd examples/server/tests
-          $env:PYTHONIOENCODING = ":replace"
          behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp

      - name: Slow tests
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -139,16 +139,10 @@ set(LLAMA_BIN_INSTALL_DIR     ${CMAKE_INSTALL_BINDIR}     CACHE PATH "Location o
 # determining _precisely_ which defines are necessary for the llama-config
 # package.
 #
-set(GGML_TRANSIENT_DEFINES)
 get_target_property(GGML_DIRECTORY ggml SOURCE_DIR)
 get_directory_property(GGML_DIR_DEFINES DIRECTORY ${GGML_DIRECTORY} COMPILE_DEFINITIONS)
-if (GGML_DIR_DEFINES)
-    list(APPEND GGML_TRANSIENT_DEFINES ${GGML_DIR_DEFINES})
-endif()
 get_target_property(GGML_TARGET_DEFINES ggml COMPILE_DEFINITIONS)
-if (GGML_TARGET_DEFINES)
-    list(APPEND GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES})
-endif()
+set(GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES} ${GGML_DIR_DEFINES})
 get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)

 set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h)
--- a/Makefile
+++ b/Makefile
@@ -54,7 +54,6 @@ TEST_TARGETS = \
 	tests/test-grammar-parser \
 	tests/test-json-schema-to-grammar \
 	tests/test-llama-grammar \
-	tests/test-log \
 	tests/test-model-load-cancel \
 	tests/test-opt \
 	tests/test-quantize-fns \
@@ -149,14 +148,6 @@ GGML_NO_METAL := 1
 DEPRECATE_WARNING := 1
 endif

-ifdef LLAMA_DISABLE_LOGS
-REMOVE_WARNING := 1
-endif
-
-ifdef LLAMA_SERVER_VERBOSE
-REMOVE_WARNING := 1
-endif
-
 ifndef UNAME_S
 UNAME_S := $(shell uname -s)
 endif
@@ -360,11 +351,19 @@ ifdef LLAMA_SANITIZE_UNDEFINED
 	MK_LDFLAGS  += -fsanitize=undefined -g
 endif

+ifdef LLAMA_SERVER_VERBOSE
+	MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
+endif
+
 ifdef LLAMA_SERVER_SSL
 	MK_CPPFLAGS += -DCPPHTTPLIB_OPENSSL_SUPPORT
 	MK_LDFLAGS += -lssl -lcrypto
 endif

+ifdef LLAMA_DISABLE_LOGS
+	MK_CPPFLAGS += -DLOG_DISABLE_LOGS
+endif # LLAMA_DISABLE_LOGS
+
 # warnings
 WARN_FLAGS = \
 	-Wall \
@@ -435,7 +434,7 @@ endif
 # TODO: probably these flags need to be tweaked on some architectures
 #       feel free to update the Makefile for your architecture and send a pull request or issue

-ifndef RISCV_CROSS_COMPILE
+ifndef RISCV

 ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 	# Use all CPU extensions that are available:
@@ -515,12 +514,7 @@ ifneq ($(filter loongarch64%,$(UNAME_M)),)
 	MK_CXXFLAGS += -mlasx
 endif

-ifneq ($(filter riscv64%,$(UNAME_M)),)
-	MK_CFLAGS   += -march=rv64gcv -mabi=lp64d
-	MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
-endif
-
-else # RISC-V CROSS COMPILATION
+else
 	MK_CFLAGS   += -march=rv64gcv -mabi=lp64d
 	MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
 endif
@@ -932,7 +926,6 @@ OBJ_LLAMA = \
 OBJ_COMMON = \
 	common/common.o \
 	common/arg.o \
-	common/log.o \
 	common/console.o \
 	common/ngram-cache.o \
 	common/sampling.o \
@@ -1029,14 +1022,6 @@ $(info   - LLAMA_NO_CCACHE)
 $(info )
 endif

-ifdef REMOVE_WARNING
-$(info !!! REMOVAL WARNING !!!)
-$(info The following LLAMA_ options have been removed and are no longer supported)
-$(info   - LLAMA_DISABLE_LOGS   (https://github.com/ggerganov/llama.cpp/pull/9418))
-$(info   - LLAMA_SERVER_VERBOSE (https://github.com/ggerganov/llama.cpp/pull/9418))
-$(info )
-endif
-
 #
 # Build libraries
 #
@@ -1178,11 +1163,6 @@ common/arg.o: \
 	common/arg.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

-common/log.o: \
-	common/log.cpp \
-	common/log.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
 common/sampling.o: \
 	common/sampling.cpp \
 	common/sampling.h \
@@ -1361,7 +1341,7 @@ llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

 llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \
-	$(OBJ_ALL)
+	$(OBJ_GGML) $(OBJ_LLAMA)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

@@ -1455,7 +1435,6 @@ llama-server: \
 	examples/server/system-prompts.js.hpp \
 	examples/server/prompt-formats.js.hpp \
 	examples/server/json-schema-to-grammar.mjs.hpp \
-	examples/server/loading.html.hpp \
 	common/json.hpp \
 	common/stb_image.h \
 	$(OBJ_ALL)
@@ -1543,11 +1522,6 @@ tests/test-llama-grammar: tests/test-llama-grammar.cpp \
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-tests/test-log: tests/test-log.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
 tests/test-grammar-parser: tests/test-grammar-parser.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
--- a/README.md
+++ b/README.md
@@ -89,7 +89,6 @@ Typically finetunes of the base models below are supported as well.
 - [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)
 - [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct)
 - [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
-- [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat)

 (instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))

@@ -173,7 +172,6 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
 - [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
 - [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage
-- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with prebuild Mobile and Web platform wrappers and a model example)

 **Infrastructure:**

--- a/ci/run.sh
+++ b/ci/run.sh
@@ -737,9 +737,6 @@ function gg_sum_embd_bge_small {

 ## main

-export LLAMA_LOG_PREFIX=1
-export LLAMA_LOG_TIMESTAMPS=1
-
 if [ -z ${GG_BUILD_LOW_PERF} ]; then
    # Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt
    rm -rf ${SRC}/models-mnt
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -51,23 +51,21 @@ endif()
 set(TARGET common)

 add_library(${TARGET} STATIC
-    arg.cpp
-    arg.h
    base64.hpp
-    common.cpp
    common.h
-    console.cpp
-    console.h
-    json-schema-to-grammar.cpp
-    json.hpp
-    log.cpp
-    log.h
-    ngram-cache.cpp
-    ngram-cache.h
-    sampling.cpp
+    common.cpp
+    arg.h
+    arg.cpp
    sampling.h
-    train.cpp
+    sampling.cpp
+    console.h
+    console.cpp
+    json.hpp
+    json-schema-to-grammar.cpp
    train.h
+    train.cpp
+    ngram-cache.h
+    ngram-cache.cpp
    )

 if (BUILD_SHARED_LIBS)
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1,17 +1,15 @@
 #include "arg.h"

-#include "log.h"
 #include "sampling.h"

 #include <algorithm>
-#include <climits>
-#include <cstdarg>
+#include <string>
+#include <vector>
+#include <set>
 #include <fstream>
 #include <regex>
-#include <set>
-#include <string>
-#include <thread>
-#include <vector>
+#include <cstdarg>
+#include <climits>

 #include "json-schema-to-grammar.h"

@@ -175,6 +173,7 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx
    std::string arg;
    const std::string arg_prefix = "--";
    gpt_params & params = ctx_arg.params;
+    gpt_sampler_params & sparams = params.sparams;

    std::unordered_map<std::string, llama_arg *> arg_to_options;
    for (auto & opt : ctx_arg.options) {
@@ -284,6 +283,10 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx
        params.kv_overrides.back().key[0] = 0;
    }

+    if (sparams.seed == LLAMA_DEFAULT_SEED) {
+        sparams.seed = time(NULL);
+    }
+
    return true;
 }

@@ -385,6 +388,20 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
            exit(0);
        }
    ));
+    add_opt(llama_arg(
+        {"-v", "--verbose"},
+        "print verbose information",
+        [](gpt_params & params) {
+            params.verbosity = 1;
+        }
+    ));
+    add_opt(llama_arg(
+        {"--verbosity"}, "N",
+        format("set specific verbosity level (default: %d)", params.verbosity),
+        [](gpt_params & params, int value) {
+            params.verbosity = value;
+        }
+    ));
    add_opt(llama_arg(
        {"--verbose-prompt"},
        format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"),
@@ -405,7 +422,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
        [](gpt_params & params) {
            params.use_color = true;
        }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
    add_opt(llama_arg(
        {"-t", "--threads"}, "N",
        format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads),
@@ -685,13 +702,6 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
            params.n_keep = value;
        }
    ));
-    add_opt(llama_arg(
-        {"--no-context-shift"},
-        format("disables context shift on inifinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
-        [](gpt_params & params) {
-            params.ctx_shift = false;
-        }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
    add_opt(llama_arg(
        {"--chunks"}, "N",
        format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
@@ -715,14 +725,6 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
            params.prompt = value;
        }
    ));
-    add_opt(llama_arg(
-        {"--no-perf"},
-        format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
-        [](gpt_params & params) {
-            params.no_perf = true;
-            params.sparams.no_perf = true;
-        }
-    ).set_env("LLAMA_ARG_NO_PERF"));
    add_opt(llama_arg(
        {"-f", "--file"}, "FNAME",
        "a file containing the prompt (default: none)",
@@ -821,7 +823,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
        [](gpt_params & params) {
            params.special = true;
        }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
    add_opt(llama_arg(
        {"-cnv", "--conversation"},
        format(
@@ -871,7 +873,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
            params.input_prefix = value;
            params.enable_chat_template = false;
        }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
    add_opt(llama_arg(
        {"--in-suffix"}, "STRING",
        "string to suffix after user inputs with (default: empty)",
@@ -879,7 +881,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
            params.input_suffix = value;
            params.enable_chat_template = false;
        }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
    add_opt(llama_arg(
        {"--no-warmup"},
        "skip warming up the model with an empty run",
@@ -907,7 +909,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
    ).set_sparam());
    add_opt(llama_arg(
        {"-s", "--seed"}, "SEED",
-        format("RNG seed (default: %u, use random seed for %u)", params.sparams.seed, LLAMA_DEFAULT_SEED),
+        format("RNG seed (default: %d, use random seed for < 0)", params.sparams.seed),
        [](gpt_params & params, const std::string & value) {
            params.sparams.seed = std::stoul(value);
        }
@@ -1420,18 +1422,20 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
                params.split_mode = LLAMA_SPLIT_MODE_NONE;
            } else if (arg_next == "layer") {
                params.split_mode = LLAMA_SPLIT_MODE_LAYER;
-            } else if (arg_next == "row") {
+            }
+            else if (arg_next == "row") {
 #ifdef GGML_USE_SYCL
                fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n");
                exit(1);
 #endif // GGML_USE_SYCL
                params.split_mode = LLAMA_SPLIT_MODE_ROW;
-            } else {
+            }
+            else {
                throw std::invalid_argument("invalid value");
            }
-            if (!llama_supports_gpu_offload()) {
-                fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the split mode has no effect.\n");
-            }
+#ifndef GGML_USE_CUDA_SYCL_VULKAN
+            fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the split mode has no effect.\n");
+#endif // GGML_USE_CUDA_SYCL_VULKAN
        }
    ));
    add_opt(llama_arg(
@@ -1451,14 +1455,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
            }
            for (size_t i = 0; i < llama_max_devices(); ++i) {
                if (i < split_arg.size()) {
-                    params.tensor_split[i] = std::stof(split_arg[i]);
+                        params.tensor_split[i] = std::stof(split_arg[i]);
                } else {
-                    params.tensor_split[i] = 0.0f;
+                        params.tensor_split[i] = 0.0f;
                }
            }
-            if (!llama_supports_gpu_offload()) {
-                fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting a tensor split has no effect.\n");
-            }
+#ifndef GGML_USE_CUDA_SYCL_VULKAN
+            fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n");
+#endif // GGML_USE_CUDA_SYCL_VULKAN
        }
    ));
    add_opt(llama_arg(
@@ -1466,9 +1470,9 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
        format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu),
        [](gpt_params & params, int value) {
            params.main_gpu = value;
-            if (!llama_supports_gpu_offload()) {
-                fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the main GPU has no effect.\n");
-            }
+#ifndef GGML_USE_CUDA_SYCL_VULKAN
+            fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the main GPU has no effect.\n");
+#endif // GGML_USE_CUDA_SYCL_VULKAN
        }
    ));
    add_opt(llama_arg(
@@ -1819,6 +1823,19 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
            params.system_prompt = system_prompt;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    add_opt(llama_arg(
+        {"--log-format"}, "{text, json}",
+        "log output format: json or text (default: json)",
+        [](gpt_params & params, const std::string & value) {
+            if (value == "json") {
+                params.log_json = true;
+            } else if (value == "text") {
+                params.log_json = false;
+            } else {
+                throw std::invalid_argument("invalid value");
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
    add_opt(llama_arg(
        {"--metrics"},
        format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),
@@ -1938,57 +1955,40 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
            else { std::invalid_argument("invalid value"); }
        }
    ).set_examples({LLAMA_EXAMPLE_BENCH}));
+#ifndef LOG_DISABLE_LOGS
+    // TODO: make this look less weird
+    add_opt(llama_arg(
+        {"--log-test"},
+        "Log test",
+        [](gpt_params &) { log_param_single_parse("--log-test"); }
+    ));
    add_opt(llama_arg(
        {"--log-disable"},
        "Log disable",
-        [](gpt_params &) {
-            gpt_log_pause(gpt_log_main());
-        }
+        [](gpt_params &) { log_param_single_parse("--log-disable"); }
+    ));
+    add_opt(llama_arg(
+        {"--log-enable"},
+        "Log enable",
+        [](gpt_params &) { log_param_single_parse("--log-enable"); }
+    ));
+    add_opt(llama_arg(
+        {"--log-new"},
+        "Log new",
+        [](gpt_params &) { log_param_single_parse("--log-new"); }
+    ));
+    add_opt(llama_arg(
+        {"--log-append"},
+        "Log append",
+        [](gpt_params &) { log_param_single_parse("--log-append"); }
    ));
    add_opt(llama_arg(
        {"--log-file"}, "FNAME",
-        "Log to file",
-        [](gpt_params &, const std::string & value) {
-            gpt_log_set_file(gpt_log_main(), value.c_str());
-        }
+        "Log file",
+        [](gpt_params &, const std::string & value) { log_param_pair_parse(false, "--log-file", value); }
    ));
-    add_opt(llama_arg(
-        {"--log-colors"},
-        "Enable colored logging",
-        [](gpt_params &) {
-            gpt_log_set_colors(gpt_log_main(), true);
-        }
-    ).set_env("LLAMA_LOG_COLORS"));
-    add_opt(llama_arg(
-        {"-v", "--verbose", "--log-verbose"},
-        "Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
-        [](gpt_params & params) {
-            params.verbosity = INT_MAX;
-            gpt_log_set_verbosity_thold(INT_MAX);
-        }
-    ));
-    add_opt(llama_arg(
-        {"-lv", "--verbosity", "--log-verbosity"}, "N",
-        "Set the verbosity threshold. Messages with a higher verbosity will be ignored.",
-        [](gpt_params & params, int value) {
-            params.verbosity = value;
-            gpt_log_set_verbosity_thold(value);
-        }
-    ).set_env("LLAMA_LOG_VERBOSITY"));
-    add_opt(llama_arg(
-        {"--log-prefix"},
-        "Enable prefx in log messages",
-        [](gpt_params &) {
-            gpt_log_set_prefix(gpt_log_main(), true);
-        }
-    ).set_env("LLAMA_LOG_PREFIX"));
-    add_opt(llama_arg(
-        {"--log-timestamps"},
-        "Enable timestamps in log messages",
-        [](gpt_params &) {
-            gpt_log_set_timestamps(gpt_log_main(), true);
-        }
-    ).set_env("LLAMA_LOG_TIMESTAMPS"));
+#endif // LOG_DISABLE_LOGS

    return ctx_arg;
 }
+
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -3,7 +3,6 @@
 #endif

 #include "common.h"
-#include "log.h"
 // Change JSON_ASSERT from assert() to GGML_ASSERT:
 #define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"
@@ -26,7 +25,6 @@
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
-#include <thread>

 #if defined(__APPLE__) && defined(__MACH__)
 #include <sys/types.h>
@@ -50,6 +48,7 @@
 #if defined(LLAMA_USE_CURL)
 #include <curl/curl.h>
 #include <curl/easy.h>
+#include <thread>
 #include <future>
 #endif

@@ -57,6 +56,14 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

+#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL))
+#define GGML_USE_CUDA_SYCL
+#endif
+
+#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN)
+#define GGML_USE_CUDA_SYCL_VULKAN
+#endif
+
 #if defined(LLAMA_USE_CURL)
 #ifdef __linux__
 #include <linux/limits.h>
@@ -227,7 +234,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
    }

    if (!SetPriorityClass(GetCurrentProcess(), p)) {
-        LOG_WRN("failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
+        fprintf(stderr, "warn: failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
        return false;
    }

@@ -252,7 +259,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
    }

    if (!setpriority(PRIO_PROCESS, 0, p)) {
-        LOG_WRN("failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
+        fprintf(stderr, "warn: failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
        return false;
    }
    return true;
@@ -285,14 +292,14 @@ void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model)

    if (n_set && n_set < cpuparams.n_threads) {
        // Not enough set bits, may experience performance issues.
-        LOG_WRN("Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
+        fprintf(stderr, "warn: Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
    }
 }

 bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) {
    size_t dash_loc = range.find('-');
    if (dash_loc == std::string::npos) {
-        LOG_ERR("Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
+        fprintf(stderr, "Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
        return false;
    }

@@ -304,7 +311,7 @@ bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THRE
    } else {
        start_i = std::stoull(range.substr(0, dash_loc));
        if (start_i >= GGML_MAX_N_THREADS) {
-            LOG_ERR("Start index out of bounds!\n");
+            fprintf(stderr, "Start index out of bounds!\n");
            return false;
        }
    }
@@ -314,7 +321,7 @@ bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THRE
    } else {
        end_i = std::stoull(range.substr(dash_loc + 1));
        if (end_i >= GGML_MAX_N_THREADS) {
-            LOG_ERR("End index out of bounds!\n");
+            fprintf(stderr, "End index out of bounds!\n");
            return false;
        }
    }
@@ -349,7 +356,7 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
        } else if (c >= 'A' && c <= 'F') {
            id -= 'A' - 10;
        } else {
-            LOG_ERR("Invalid hex character '%c' at position %d\n", c, int32_t(i));
+            fprintf(stderr, "Invalid hex character '%c' at position %d\n", c, int32_t(i));
            return false;
        }

@@ -362,22 +369,6 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
    return true;
 }

-void gpt_init() {
-    llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
-        if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_thold) {
-            gpt_log_add(gpt_log_main(), level, "%s", text);
-        }
-    }, NULL);
-
-#ifdef NDEBUG
-    const char * build_type = "";
-#else
-    const char * build_type = " (debug)";
-#endif
-
-    LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
-}
-
 std::string gpt_params_get_system_info(const gpt_params & params) {
    std::ostringstream os;

@@ -458,94 +449,6 @@ void string_replace_all(std::string & s, const std::string & search, const std::
    s = std::move(builder);
 }

-std::string string_from(bool value) {
-    return value ? "true" : "false";
-}
-
-std::string string_from(const std::vector<int> & values) {
-    std::stringstream buf;
-
-    buf << "[ ";
-    bool first = true;
-    for (auto e : values) {
-        if (first) {
-            first = false;
-        } else {
-            buf << ", ";
-        }
-        buf << std::to_string(e);
-    }
-    buf << " ]";
-
-    return buf.str();
-}
-
-std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
-    std::stringstream buf;
-
-    buf << "[ ";
-
-    bool first = true;
-    for (const auto & token : tokens) {
-        if (!first) {
-            buf << ", ";
-        } else {
-            first = false;
-        }
-
-        auto detokenized = llama_token_to_piece(ctx, token);
-
-        detokenized.erase(
-            std::remove_if(
-                detokenized.begin(),
-                detokenized.end(),
-                [](const unsigned char c) { return !std::isprint(c); }),
-            detokenized.end());
-
-        buf << "'" << detokenized << "'"
-            << ":" << std::to_string(token);
-    }
-
-    buf << " ]";
-
-    return buf.str();
-}
-
-std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch) {
-    std::stringstream buf;
-
-    buf << "[ ";
-
-    bool first = true;
-    for (int i = 0; i < batch.n_tokens; ++i) {
-        if (!first) {
-            buf << ", ";
-        } else {
-            first = false;
-        }
-
-        auto detokenized = llama_token_to_piece(ctx, batch.token[i]);
-
-        detokenized.erase(
-                std::remove_if(
-                    detokenized.begin(),
-                    detokenized.end(),
-                    [](const unsigned char c) { return !std::isprint(c); }),
-                detokenized.end());
-
-        buf << "\n" << std::to_string(i)
-            << ":token '" << detokenized << "'"
-            << ":pos " << std::to_string(batch.pos[i])
-            << ":n_seq_id  " << std::to_string(batch.n_seq_id[i])
-            << ":seq_id " << std::to_string(batch.seq_id[i][0])
-            << ":logits " << std::to_string(batch.logits[i]);
-    }
-
-    buf << " ]";
-
-    return buf.str();
-}
-
 void string_process_escapes(std::string & input) {
    std::size_t input_len = input.length();
    std::size_t output_idx = 0;
@@ -586,7 +489,7 @@ void string_process_escapes(std::string & input) {
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
    const char * sep = strchr(data, '=');
    if (sep == nullptr || sep - data >= 128) {
-        LOG_ERR("%s: malformed KV override '%s'\n", __func__, data);
+        fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data);
        return false;
    }
    llama_model_kv_override kvo;
@@ -609,20 +512,20 @@ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_over
        } else if (std::strcmp(sep, "false") == 0) {
            kvo.val_bool = false;
        } else {
-            LOG_ERR("%s: invalid boolean value for KV override '%s'\n", __func__, data);
+            fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data);
            return false;
        }
    } else if (strncmp(sep, "str:", 4) == 0) {
        sep += 4;
        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
        if (strlen(sep) > 127) {
-            LOG_ERR("%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
+            fprintf(stderr, "%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
            return false;
        }
        strncpy(kvo.val_str, sep, 127);
        kvo.val_str[127] = '\0';
    } else {
-        LOG_ERR("%s: invalid type for KV override '%s'\n", __func__, data);
+        fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data);
        return false;
    }
    overrides.emplace_back(std::move(kvo));
@@ -834,7 +737,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
    }

    if (model == NULL) {
-        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
+        fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
        return iparams;
    }

@@ -842,7 +745,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {

    llama_context * lctx = llama_new_context_with_model(model, cparams);
    if (lctx == NULL) {
-        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
+        fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
        llama_free_model(model);
        return iparams;
    }
@@ -878,7 +781,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
        loaded_la.scale = la.scale;
        loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
        if (loaded_la.adapter == nullptr) {
-            LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
+            fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
            llama_free(lctx);
            llama_free_model(model);
            return iparams;
@@ -890,12 +793,12 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
    }

    if (params.sparams.ignore_eos && llama_token_eos(model) == -1) {
-        LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
+        fprintf(stderr, "%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
        params.sparams.ignore_eos = false;
    }

    if (params.warmup) {
-        LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
+        LOG("warming up the model with an empty run\n");

        std::vector<llama_token> tmp;
        llama_token bos = llama_token_bos(model);
@@ -925,7 +828,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
        }
        llama_kv_cache_clear(lctx);
        llama_synchronize(lctx);
-        llama_perf_context_reset(lctx);
+        llama_perf_reset(lctx, LLAMA_PERF_TYPE_CONTEXT);
    }

    iparams.model   = model;
@@ -1021,7 +924,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
    cparams.cb_eval_user_data = params.cb_eval_user_data;
    cparams.offload_kqv       = !params.no_kv_offload;
    cparams.flash_attn        = params.flash_attn;
-    cparams.no_perf           = params.no_perf;

    cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
    cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
@@ -1047,44 +949,17 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p

 #ifdef LLAMA_USE_CURL

-#define CURL_MAX_RETRY 3
-#define CURL_RETRY_DELAY_SECONDS 2
-
-
 static bool starts_with(const std::string & str, const std::string & prefix) {
    // While we wait for C++20's std::string::starts_with...
    return str.rfind(prefix, 0) == 0;
 }

-static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
-    int remaining_attempts = max_attempts;
-
-    while (remaining_attempts > 0) {
-        LOG_INF("%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
-
-        CURLcode res = curl_easy_perform(curl);
-        if (res == CURLE_OK) {
-            return true;
-        }
-
-        int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
-        LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
-
-        remaining_attempts--;
-        std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
-    }
-
-    LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
-
-    return false;
-}
-
 static bool llama_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {

    // Initialize libcurl
    std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
    if (!curl) {
-        LOG_ERR("%s: error initializing libcurl\n", __func__);
+        fprintf(stderr, "%s: error initializing libcurl\n", __func__);
        return false;
    }

@@ -1125,11 +1000,11 @@ static bool llama_download_file(const std::string & url, const std::string & pat
        if (metadata_in.good()) {
            try {
                metadata_in >> metadata;
-                LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
+                fprintf(stderr, "%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
                if (metadata.contains("url") && metadata.at("url").is_string()) {
                    auto previous_url = metadata.at("url").get<std::string>();
                    if (previous_url != url) {
-                        LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
+                        fprintf(stderr, "%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
                        return false;
                    }
                }
@@ -1140,12 +1015,12 @@ static bool llama_download_file(const std::string & url, const std::string & pat
                    last_modified = metadata.at("lastModified");
                }
            } catch (const nlohmann::json::exception & e) {
-            LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
+                fprintf(stderr, "%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
                return false;
            }
        }
    } else {
-        LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
+        fprintf(stderr, "%s: no previous model file found %s\n", __func__, path.c_str());
    }

    // Send a HEAD request to retrieve the etag and last-modified headers
@@ -1182,8 +1057,9 @@ static bool llama_download_file(const std::string & url, const std::string & pat
        curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
        curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);

-        bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
-        if (!was_perform_successful) {
+        CURLcode res = curl_easy_perform(curl.get());
+        if (res != CURLE_OK) {
+            fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
            return false;
        }

@@ -1193,26 +1069,26 @@ static bool llama_download_file(const std::string & url, const std::string & pat
            // HEAD not supported, we don't know if the file has changed
            // force trigger downloading
            force_download = true;
-            LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
+            fprintf(stderr, "%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
        }
    }

    bool should_download = !file_exists || force_download;
    if (!should_download) {
        if (!etag.empty() && etag != headers.etag) {
-            LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
+            fprintf(stderr, "%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
            should_download = true;
        } else if (!last_modified.empty() && last_modified != headers.last_modified) {
-            LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
+            fprintf(stderr, "%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
            should_download = true;
        }
    }
    if (should_download) {
        std::string path_temporary = path + ".downloadInProgress";
        if (file_exists) {
-            LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
+            fprintf(stderr, "%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
            if (remove(path.c_str()) != 0) {
-                LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
+                fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path.c_str());
                return false;
            }
        }
@@ -1227,7 +1103,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat

        std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
        if (!outfile) {
-            LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str());
+            fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path.c_str());
            return false;
        }

@@ -1258,17 +1134,18 @@ static bool llama_download_file(const std::string & url, const std::string & pat
        };

        // start the download
-        LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
-            llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
-        bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
-        if (!was_perform_successful) {
+        fprintf(stderr, "%s: downloading from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
+                llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
+        auto res = curl_easy_perform(curl.get());
+        if (res != CURLE_OK) {
+            fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
            return false;
        }

        long http_code = 0;
        curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
        if (http_code < 200 || http_code >= 400) {
-            LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
+            fprintf(stderr, "%s: invalid http status code received: %ld\n", __func__, http_code);
            return false;
        }

@@ -1282,10 +1159,10 @@ static bool llama_download_file(const std::string & url, const std::string & pat
            {"lastModified", headers.last_modified}
        });
        std::ofstream(metadata_path) << metadata.dump(4);
-        LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
+        fprintf(stderr, "%s: file metadata saved: %s\n", __func__, metadata_path.c_str());

        if (rename(path_temporary.c_str(), path.c_str()) != 0) {
-            LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
+            fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
            return false;
        }
    }
@@ -1300,7 +1177,7 @@ struct llama_model * llama_load_model_from_url(
        const struct llama_model_params & params) {
    // Basic validation of the model_url
    if (!model_url || strlen(model_url) == 0) {
-        LOG_ERR("%s: invalid model_url\n", __func__);
+        fprintf(stderr, "%s: invalid model_url\n", __func__);
        return NULL;
    }

@@ -1317,7 +1194,7 @@ struct llama_model * llama_load_model_from_url(
        };
        auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params);
        if (!ctx_gguf) {
-            LOG_ERR("\n%s:  failed to load input GGUF from %s\n", __func__, path_model);
+            fprintf(stderr, "\n%s:  failed to load input GGUF from %s\n", __func__, path_model);
            return NULL;
        }

@@ -1337,12 +1214,14 @@ struct llama_model * llama_load_model_from_url(
        // and extract split URL and PATH prefixes
        {
            if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) {
-                LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, path_model, n_split);
+                fprintf(stderr, "\n%s: unexpected model file name: %s"
+                                " n_split=%d\n", __func__, path_model, n_split);
                return NULL;
            }

            if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
-                LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url, n_split);
+                fprintf(stderr, "\n%s: unexpected model url: %s"
+                                " n_split=%d\n", __func__, model_url, n_split);
                return NULL;
            }
        }
@@ -1402,7 +1281,7 @@ struct llama_model * llama_load_model_from_url(
        const char * /*path_model*/,
        const char * /*hf_token*/,
        const struct llama_model_params & /*params*/) {
-    LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
+    fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
    return nullptr;
 }

@@ -1412,7 +1291,7 @@ struct llama_model * llama_load_model_from_hf(
        const char * /*path_model*/,
        const char * /*hf_token*/,
        const struct llama_model_params & /*params*/) {
-    LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
+    fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
    return nullptr;
 }

@@ -1740,13 +1619,13 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
    };
    struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
    if (!ctx_gguf) {
-        LOG_ERR("%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str());
+        fprintf(stderr, "%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str());
        return result;
    }

    int32_t n_tensors = gguf_get_n_tensors(ctx_gguf);
    if (n_tensors == 0) {
-        LOG_WRN("%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
+        fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
    }

    for (int i = 0; i < n_tensors; i++) {
@@ -1764,23 +1643,23 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
            }
        }
        if (layer_idx < 0) {
-            LOG_ERR("%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
+            fprintf(stderr, "%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
            result.n_embd = -1;
            break;
        } else if (layer_idx == 0) {
-            LOG_ERR("%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
+            fprintf(stderr, "%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
            result.n_embd = -1;
            break;
        }

        struct ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
        if (tensor->type != GGML_TYPE_F32) {
-            LOG_ERR("%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str());
+            fprintf(stderr, "%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str());
            result.n_embd = -1;
            break;
        }
        if (ggml_n_dims(tensor) != 1) {
-            LOG_ERR("%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str());
+            fprintf(stderr, "%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str());
            result.n_embd = -1;
            break;
        }
@@ -1788,7 +1667,7 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
        if (result.n_embd == -1) {
            result.n_embd = ggml_nelements(tensor);
        } else if (ggml_nelements(tensor) != result.n_embd) {
-            LOG_ERR("%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
+            fprintf(stderr, "%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
            result.n_embd = -1;
            break;
        }
@@ -1805,7 +1684,7 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
    }

    if (result.n_embd == -1) {
-        LOG_WRN("%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str());
+        fprintf(stderr, "%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str());
        result.data.clear();
    }

@@ -1826,7 +1705,7 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
            break;
        }
        if (result.n_embd != -1 && result.n_embd != cur.n_embd) {
-            LOG_ERR("%s: control vectors in %s does not match previous dimensions\n", __func__, info.fname.c_str());
+            fprintf(stderr, "%s: control vectors in %s does not match previous dimensions\n", __func__, info.fname.c_str());
            result.n_embd = -1;
            break;
        }
@@ -1842,7 +1721,7 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
    }

    if (result.n_embd == -1) {
-        LOG_ERR("%s: no valid control vector files passed\n", __func__);
+        fprintf(stderr, "%s: no valid control vector files passed\n", __func__);
        result.data.clear();
    }

@@ -1933,7 +1812,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
    fprintf(stream, "cpu_has_sve: %s\n",         ggml_cpu_has_sve()         ? "true" : "false");
    fprintf(stream, "cpu_has_f16c: %s\n",        ggml_cpu_has_f16c()        ? "true" : "false");
    fprintf(stream, "cpu_has_fp16_va: %s\n",     ggml_cpu_has_fp16_va()     ? "true" : "false");
-    fprintf(stream, "cpu_has_riscv_v: %s\n",     ggml_cpu_has_riscv_v()     ? "true" : "false");
    fprintf(stream, "cpu_has_wasm_simd: %s\n",   ggml_cpu_has_wasm_simd()   ? "true" : "false");
    fprintf(stream, "cpu_has_blas: %s\n",        ggml_cpu_has_blas()        ? "true" : "false");
    fprintf(stream, "cpu_has_sse3: %s\n",        ggml_cpu_has_sse3()        ? "true" : "false");
--- a/common/common.h
+++ b/common/common.h
@@ -4,9 +4,11 @@

 #include "llama.h"

+#define LOG_NO_FILE_LINE_FUNCTION
+#include "log.h"
+
 #include <string>
 #include <vector>
-#include <sstream>

 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
@@ -122,7 +124,6 @@ struct gpt_sampler_params {
    float   mirostat_eta      = 0.10f; // learning rate
    bool    penalize_nl       = false; // consider newlines as a repeatable token
    bool    ignore_eos        = false;
-    bool    no_perf           = false; // disable performance metrics

    std::vector<enum gpt_sampler_type> samplers = {
        GPT_SAMPLER_TYPE_TOP_K,
@@ -245,8 +246,6 @@ struct gpt_params {
    bool simple_io         = false; // improves compatibility with subprocesses and limited consoles
    bool cont_batching     = true;  // insert new sequences for decoding on-the-fly
    bool flash_attn        = false; // flash attention
-    bool no_perf           = false; // disable performance metrics
-    bool ctx_shift         = true;  // context shift on inifinite text generation

    bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
    bool logits_all        = false; // return logits for all tokens in the batch
@@ -342,10 +341,6 @@ struct gpt_params {
    bool batched_bench_output_jsonl = false;
 };

-// call once at the start of a program if it uses libcommon
-// initializes the logging system and prints info about the build
-void gpt_init();
-
 std::string gpt_params_get_system_info(const gpt_params & params);

 bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
@@ -381,11 +376,6 @@ static std::vector<T> string_split(const std::string & str, char delim) {
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);

-std::string string_from(bool value);
-std::string string_from(const std::vector<int> & values);
-std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
-std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
-
 //
 // Filesystem utils
 //
--- a/common/log.cpp
+++ b/common/log.cpp
@@ -1,401 +0,0 @@
-#include "log.h"
-
-#include <condition_variable>
-#include <cstdarg>
-#include <cstdio>
-#include <mutex>
-#include <sstream>
-#include <thread>
-#include <vector>
-
-int gpt_log_verbosity_thold = LOG_DEFAULT_LLAMA;
-
-void gpt_log_set_verbosity_thold(int verbosity) {
-    gpt_log_verbosity_thold = verbosity;
-}
-
-#define LOG_COL_DEFAULT "\033[0m"
-#define LOG_COL_BOLD    "\033[1m"
-#define LOG_COL_RED     "\033[31m"
-#define LOG_COL_GREEN   "\033[32m"
-#define LOG_COL_YELLOW  "\033[33m"
-#define LOG_COL_BLUE    "\033[34m"
-#define LOG_COL_MAGENTA "\033[35m"
-#define LOG_COL_CYAN    "\033[36m"
-#define LOG_COL_WHITE   "\033[37m"
-
-static int64_t t_us() {
-    return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
-}
-
-// colors
-enum gpt_log_col : int {
-    GPT_LOG_COL_DEFAULT = 0,
-    GPT_LOG_COL_BOLD,
-    GPT_LOG_COL_RED,
-    GPT_LOG_COL_GREEN,
-    GPT_LOG_COL_YELLOW,
-    GPT_LOG_COL_BLUE,
-    GPT_LOG_COL_MAGENTA,
-    GPT_LOG_COL_CYAN,
-    GPT_LOG_COL_WHITE,
-};
-
-// disable colors by default
-static std::vector<const char *> g_col = {
-    "",
-    "",
-    "",
-    "",
-    "",
-    "",
-    "",
-    "",
-    "",
-};
-
-struct gpt_log_entry {
-    enum ggml_log_level level;
-
-    bool prefix;
-
-    int64_t timestamp;
-
-    std::vector<char> msg;
-
-    // signals the worker thread to stop
-    bool is_end;
-
-    void print(FILE * file = nullptr) const {
-        FILE * fcur = file;
-        if (!fcur) {
-            // stderr displays DBG messages only when their verbosity level is not higher than the threshold
-            // these messages will still be logged to a file
-            if (level == GGML_LOG_LEVEL_DEBUG && gpt_log_verbosity_thold < LOG_DEFAULT_DEBUG) {
-                return;
-            }
-
-            fcur = stdout;
-
-            if (level != GGML_LOG_LEVEL_NONE) {
-                fcur = stderr;
-            }
-        }
-
-        if (level != GGML_LOG_LEVEL_NONE && prefix) {
-            if (timestamp) {
-                // [M.s.ms.us]
-                fprintf(fcur, "%s%d.%02d.%03d.%03d%s ",
-                        g_col[GPT_LOG_COL_BLUE],
-                        (int) (timestamp / 1000000 / 60),
-                        (int) (timestamp / 1000000 % 60),
-                        (int) (timestamp / 1000 % 1000),
-                        (int) (timestamp % 1000),
-                        g_col[GPT_LOG_COL_DEFAULT]);
-            }
-
-            switch (level) {
-                case GGML_LOG_LEVEL_INFO:  fprintf(fcur, "%sI %s", g_col[GPT_LOG_COL_GREEN],   g_col[GPT_LOG_COL_DEFAULT]); break;
-                case GGML_LOG_LEVEL_WARN:  fprintf(fcur, "%sW %s", g_col[GPT_LOG_COL_MAGENTA], ""                        ); break;
-                case GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[GPT_LOG_COL_RED],     ""                        ); break;
-                case GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[GPT_LOG_COL_YELLOW],  ""                        ); break;
-                default:
-                    break;
-            }
-        }
-
-        fprintf(fcur, "%s", msg.data());
-
-        if (level == GGML_LOG_LEVEL_WARN || level == GGML_LOG_LEVEL_ERROR || level == GGML_LOG_LEVEL_DEBUG) {
-            fprintf(fcur, "%s", g_col[GPT_LOG_COL_DEFAULT]);
-        }
-
-        fflush(fcur);
-    }
-};
-
-struct gpt_log {
-    // default capacity - will be expanded if needed
-    gpt_log() : gpt_log(256) {}
-
-    gpt_log(size_t capacity) {
-        file = nullptr;
-        prefix = false;
-        timestamps = false;
-        running = false;
-        t_start = t_us();
-
-        // initial message size - will be expanded if longer messages arrive
-        entries.resize(capacity);
-        for (auto & entry : entries) {
-            entry.msg.resize(256);
-        }
-
-        head = 0;
-        tail = 0;
-
-        resume();
-    }
-
-    ~gpt_log() {
-        pause();
-        if (file) {
-            fclose(file);
-        }
-    }
-
-private:
-    std::mutex mtx;
-    std::thread thrd;
-    std::condition_variable cv;
-
-    FILE * file;
-
-    bool prefix;
-    bool timestamps;
-    bool running;
-
-    int64_t t_start;
-
-    // ring buffer of entries
-    std::vector<gpt_log_entry> entries;
-    size_t head;
-    size_t tail;
-
-    // worker thread copies into this
-    gpt_log_entry cur;
-
-public:
-    void add(enum ggml_log_level level, const char * fmt, va_list args) {
-        std::lock_guard<std::mutex> lock(mtx);
-
-        if (!running) {
-            // discard messages while the worker thread is paused
-            return;
-        }
-
-        auto & entry = entries[tail];
-
-        {
-            // cannot use args twice, so make a copy in case we need to expand the buffer
-            va_list args_copy;
-            va_copy(args_copy, args);
-
-#if 1
-            const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args);
-            if (n >= entry.msg.size()) {
-                entry.msg.resize(n + 1);
-                vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args_copy);
-            }
-#else
-            // hack for bolding arguments
-
-            std::stringstream ss;
-            for (int i = 0; fmt[i] != 0; i++) {
-                if (fmt[i] == '%') {
-                    ss << LOG_COL_BOLD;
-                    while (fmt[i] != ' ' && fmt[i] != ')' && fmt[i] != ']' && fmt[i] != 0) ss << fmt[i++];
-                    ss << LOG_COL_DEFAULT;
-                    if (fmt[i] == 0) break;
-                }
-                ss << fmt[i];
-            }
-            const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args);
-            if (n >= entry.msg.size()) {
-                entry.msg.resize(n + 1);
-                vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args_copy);
-            }
-#endif
-        }
-
-        entry.level = level;
-        entry.prefix = prefix;
-        entry.timestamp = 0;
-        if (timestamps) {
-            entry.timestamp = t_us() - t_start;
-        }
-        entry.is_end = false;
-
-        tail = (tail + 1) % entries.size();
-        if (tail == head) {
-            // expand the buffer
-            std::vector<gpt_log_entry> new_entries(2*entries.size());
-
-            size_t new_tail = 0;
-
-            do {
-                new_entries[new_tail] = std::move(entries[head]);
-
-                head     = (head     + 1) % entries.size();
-                new_tail = (new_tail + 1);
-            } while (head != tail);
-
-            head = 0;
-            tail = new_tail;
-
-            for (size_t i = tail; i < new_entries.size(); i++) {
-                new_entries[i].msg.resize(256);
-            }
-
-            entries = std::move(new_entries);
-        }
-
-        cv.notify_one();
-    }
-
-    void resume() {
-        std::lock_guard<std::mutex> lock(mtx);
-
-        if (running) {
-            return;
-        }
-
-        running = true;
-
-        thrd = std::thread([this]() {
-            while (true) {
-                {
-                    std::unique_lock<std::mutex> lock(mtx);
-                    cv.wait(lock, [this]() { return head != tail; });
-
-                    cur = entries[head];
-
-                    head = (head + 1) % entries.size();
-                }
-
-                if (cur.is_end) {
-                    break;
-                }
-
-                cur.print(); // stdout and stderr
-
-                if (file) {
-                    cur.print(file);
-                }
-            }
-        });
-    }
-
-    void pause() {
-        {
-            std::lock_guard<std::mutex> lock(mtx);
-
-            if (!running) {
-                return;
-            }
-
-            running = false;
-
-            // push an entry to signal the worker thread to stop
-            {
-                auto & entry = entries[tail];
-                entry.is_end = true;
-
-                tail = (tail + 1) % entries.size();
-            }
-
-            cv.notify_one();
-        }
-
-        thrd.join();
-    }
-
-    void set_file(const char * path) {
-        pause();
-
-        if (file) {
-            fclose(file);
-        }
-
-        if (path) {
-            file = fopen(path, "w");
-        } else {
-            file = nullptr;
-        }
-
-        resume();
-    }
-
-    void set_colors(bool colors) {
-        pause();
-
-        if (colors) {
-            g_col[GPT_LOG_COL_DEFAULT] = LOG_COL_DEFAULT;
-            g_col[GPT_LOG_COL_BOLD]    = LOG_COL_BOLD;
-            g_col[GPT_LOG_COL_RED]     = LOG_COL_RED;
-            g_col[GPT_LOG_COL_GREEN]   = LOG_COL_GREEN;
-            g_col[GPT_LOG_COL_YELLOW]  = LOG_COL_YELLOW;
-            g_col[GPT_LOG_COL_BLUE]    = LOG_COL_BLUE;
-            g_col[GPT_LOG_COL_MAGENTA] = LOG_COL_MAGENTA;
-            g_col[GPT_LOG_COL_CYAN]    = LOG_COL_CYAN;
-            g_col[GPT_LOG_COL_WHITE]   = LOG_COL_WHITE;
-        } else {
-            for (size_t i = 0; i < g_col.size(); i++) {
-                g_col[i] = "";
-            }
-        }
-
-        resume();
-    }
-
-    void set_prefix(bool prefix) {
-        std::lock_guard<std::mutex> lock(mtx);
-
-        this->prefix = prefix;
-    }
-
-    void set_timestamps(bool timestamps) {
-        std::lock_guard<std::mutex> lock(mtx);
-
-        this->timestamps = timestamps;
-    }
-};
-
-//
-// public API
-//
-
-struct gpt_log * gpt_log_init() {
-    return new gpt_log;
-}
-
-struct gpt_log * gpt_log_main() {
-    static struct gpt_log log;
-
-    return &log;
-}
-
-void gpt_log_pause(struct gpt_log * log) {
-    log->pause();
-}
-
-void gpt_log_resume(struct gpt_log * log) {
-    log->resume();
-}
-
-void gpt_log_free(struct gpt_log * log) {
-    delete log;
-}
-
-void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...) {
-    va_list args;
-    va_start(args, fmt);
-    log->add(level, fmt, args);
-    va_end(args);
-}
-
-void gpt_log_set_file(struct gpt_log * log, const char * file) {
-    log->set_file(file);
-}
-
-void gpt_log_set_colors(struct gpt_log * log, bool colors) {
-    log->set_colors(colors);
-}
-
-void gpt_log_set_prefix(struct gpt_log * log, bool prefix) {
-    log->set_prefix(prefix);
-}
-
-void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps) {
-    log->set_timestamps(timestamps);
-}
--- a/common/log.h
+++ b/common/log.h
@@ -1,90 +1,724 @@
 #pragma once

-#include "ggml.h" // for ggml_log_level
+#include <chrono>
+#include <cstring>
+#include <sstream>
+#include <iostream>
+#include <thread>
+#include <vector>
+#include <algorithm>
+#include <cinttypes>

-#ifndef __GNUC__
-#    define LOG_ATTRIBUTE_FORMAT(...)
-#elif defined(__MINGW32__)
-#    define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
-#else
-#    define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+// --------------------------------
+//
+// Basic usage:
+//
+// --------
+//
+//  The LOG() and LOG_TEE() macros are ready to go by default
+//   they do not require any initialization.
+//
+//  LOGLN() and LOG_TEELN() are variants which automatically
+//   include \n character at the end of the log string.
+//
+//  LOG() behaves exactly like printf, by default writing to a logfile.
+//  LOG_TEE() additionally, prints to the screen too ( mimics Unix tee command ).
+//
+//  Default logfile is named
+//   "llama.log" ( or "llama.<threadID>.log" once log_multilog(true) has been called )
+//  Default LOG_TEE() secondary output target is
+//   stderr
+//
+//  Logs can be dynamically disabled or enabled using functions:
+//   log_disable()
+//  and
+//   log_enable()
+//
+//  A log target can be changed with:
+//   log_set_target( string )
+//    creating and opening, or re-opening a file by string filename
+//  or
+//   log_set_target( FILE* )
+//    allowing to point at stderr, stdout, or any valid FILE* file handler.
+//
+// --------
+//
+// End of Basic usage.
+//
+// --------------------------------
+
+// Specifies a log target.
+//  default uses log_handler() with "llama.log" log file
+//  this can be changed, by defining LOG_TARGET
+//  like so:
+//
+//  #define LOG_TARGET (a valid FILE*)
+//  #include "log.h"
+//
+//  or it can be simply redirected to stdout or stderr
+//  like so:
+//
+//  #define LOG_TARGET stderr
+//  #include "log.h"
+//
+//  The log target can also be redirected to a different function
+//  like so:
+//
+//  #define LOG_TARGET log_handler_different()
+//  #include "log.h"
+//
+//  FILE* log_handler_different()
+//  {
+//      return stderr;
+//  }
+//
+//  or:
+//
+//  #define LOG_TARGET log_handler_another_one("somelog.log")
+//  #include "log.h"
+//
+//  FILE* log_handler_another_one(char*filename)
+//  {
+//      static FILE* logfile = nullptr;
+//      (...)
+//      if( !logfile )
+//      {
+//          fopen(...)
+//      }
+//      (...)
+//      return logfile
+//  }
+//
+#ifndef LOG_TARGET
+    #define LOG_TARGET log_handler()
 #endif

-#define LOG_DEFAULT_DEBUG 1
-#define LOG_DEFAULT_LLAMA 0
+#ifndef LOG_TEE_TARGET
+    #define LOG_TEE_TARGET stderr
+#endif

-// needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower
-// set via gpt_log_set_verbosity()
-extern int gpt_log_verbosity_thold;
+// Utility for synchronizing log configuration state
+//  since std::optional was introduced only in c++17
+enum LogTriState
+{
+    LogTriStateSame,  // leave the current setting untouched
+    LogTriStateFalse, // explicitly switch the setting off
+    LogTriStateTrue   // explicitly switch the setting on
+};

-void gpt_log_set_verbosity_thold(int verbosity); // not thread-safe
+// Utility to obtain "pid" like unique process id and use it when creating log files.
+// Returns the calling thread's id rendered as a string.
+//  The string is built on first use and cached in a function-local static,
+//  so every later call returns the same value.
+inline std::string log_get_pid()
+{
+   static std::string pid;
+   if (pid.empty())
+   {
+       // std::this_thread::get_id() is the most portable way of obtaining a "process id"
+       //  it's not the same as "pid" but is unique enough to solve multiple instances
+       //  trying to write to the same log.
+       std::stringstream ss;
+       ss << std::this_thread::get_id();
+       pid = ss.str();
+   }

-// the gpt_log uses an internal worker thread to print/write log messages
-// when the worker thread is paused, incoming log messages are discarded
-struct gpt_log;
+   return pid;
+}

-struct gpt_log * gpt_log_init();
-struct gpt_log * gpt_log_main(); // singleton, automatically destroys itself on exit
-void             gpt_log_pause (struct gpt_log * log); // pause  the worker thread, not thread-safe
-void             gpt_log_resume(struct gpt_log * log); // resume the worker thread, not thread-safe
-void             gpt_log_free  (struct gpt_log * log);
+// Utility function for generating log file names, optionally with a unique id based on thread id.
+//  invocation with log_filename_generator( "llama", "log" ) creates a string "llama.log",
+//  or "llama.<number>.log" when multilog is enabled, where the number is a runtime id of the current thread.

-LOG_ATTRIBUTE_FORMAT(3, 4)
-void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...);
+#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(LogTriStateSame, log_file_basename, log_file_extension)

-// defaults: file = NULL, colors = false, prefix = false, timestamps = false
-//
-// regular log output:
-//
-//   ggml_backend_metal_log_allocated_size: allocated buffer, size =  6695.84 MiB, ( 6695.91 / 21845.34)
-//   llm_load_tensors: ggml ctx size =    0.27 MiB
-//   llm_load_tensors: offloading 32 repeating layers to GPU
-//   llm_load_tensors: offloading non-repeating layers to GPU
-//
-// with prefix = true, timestamps = true, the log output will look like this:
-//
-//   0.00.035.060 D ggml_backend_metal_log_allocated_size: allocated buffer, size =  6695.84 MiB, ( 6695.91 / 21845.34)
-//   0.00.035.064 I llm_load_tensors: ggml ctx size =    0.27 MiB
-//   0.00.090.578 I llm_load_tensors: offloading 32 repeating layers to GPU
-//   0.00.090.579 I llm_load_tensors: offloading non-repeating layers to GPU
-//
-// I - info    (stdout, V = 0)
-// W - warning (stderr, V = 0)
-// E - error   (stderr, V = 0)
-// D - debug   (stderr, V = LOG_DEFAULT_DEBUG)
-//
+// INTERNAL, DO NOT USE
+// Builds "<basename>.<extension>", or "<basename>.<id>.<extension>" when multilog is on.
+//  multilog           - LogTriStateSame keeps the stored multilog flag, any other value overwrites it
+//  log_file_basename  - first part of the file name
+//  log_file_extension - last part of the file name ( without the dot )
+//  The multilog flag lives in a function-local static, so it persists across calls.
+inline std::string log_filename_generator_impl(LogTriState multilog, const std::string & log_file_basename, const std::string & log_file_extension)
+{
+    static bool _multilog = false;

-void gpt_log_set_file      (struct gpt_log * log, const char * file);       // not thread-safe
-void gpt_log_set_colors    (struct gpt_log * log,       bool   colors);     // not thread-safe
-void gpt_log_set_prefix    (struct gpt_log * log,       bool   prefix);     // whether to output prefix to each log
-void gpt_log_set_timestamps(struct gpt_log * log,       bool   timestamps); // whether to output timestamps in the prefix
+    // Only an explicit True/False request changes the stored flag
+    if (multilog != LogTriStateSame)
+    {
+        _multilog = multilog == LogTriStateTrue;
+    }

-// helper macros for logging
-// use these to avoid computing log arguments if the verbosity of the log is higher than the threshold
-//
-// for example:
-//
-//   LOG_DBG("this is a debug message: %d\n", expensive_function());
-//
-// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > gpt_log_verbosity_thold
-//
+    std::stringstream buf;

-#define LOG_TMPL(level, verbosity, ...) \
-    do { \
-        if ((verbosity) <= gpt_log_verbosity_thold) { \
-            gpt_log_add(gpt_log_main(), (level), __VA_ARGS__); \
-        } \
+    buf << log_file_basename;
+    // The id from log_get_pid() is inserted only when multilog is enabled
+    if (_multilog)
+    {
+        buf << ".";
+        buf << log_get_pid();
+    }
+    buf << ".";
+    buf << log_file_extension;
+
+    return buf.str();
+}
+
+#ifndef LOG_DEFAULT_FILE_NAME
+    #define LOG_DEFAULT_FILE_NAME log_filename_generator("llama", "log")
+#endif
+
+// Utility for turning #define values into string literals
+//  so we can have a define for stderr and
+//  we can print "stderr" instead of literal stderr, etc.
+#define LOG_STRINGIZE1(s) #s
+#define LOG_STRINGIZE(s) LOG_STRINGIZE1(s)
+
+#define LOG_TEE_TARGET_STRING LOG_STRINGIZE(LOG_TEE_TARGET)
+
+// Allows disabling timestamps.
+//  in order to disable, define LOG_NO_TIMESTAMPS
+//  like so:
+//
+//  #define LOG_NO_TIMESTAMPS
+//  #include "log.h"
+//
+#ifndef LOG_NO_TIMESTAMPS
+    #ifndef _MSC_VER
+        #define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
+        #define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
+    #else
+        #define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
+        #define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
+    #endif
+#else
+    #define LOG_TIMESTAMP_FMT "%s"
+    #define LOG_TIMESTAMP_VAL ,""
+#endif
+
+#ifdef LOG_TEE_TIMESTAMPS
+    #ifndef _MSC_VER
+        #define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
+        #define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
+    #else
+        #define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
+        #define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
+    #endif
+#else
+    #define LOG_TEE_TIMESTAMP_FMT "%s"
+    #define LOG_TEE_TIMESTAMP_VAL ,""
+#endif
+
+// Allows disabling file/line/function prefix
+//  in order to disable, define LOG_NO_FILE_LINE_FUNCTION
+//  like so:
+//
+//  #define LOG_NO_FILE_LINE_FUNCTION
+//  #include "log.h"
+//
+#ifndef LOG_NO_FILE_LINE_FUNCTION
+    #ifndef _MSC_VER
+        #define LOG_FLF_FMT "[%24s:%5d][%24s] "
+        #define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
+    #else
+        #define LOG_FLF_FMT "[%24s:%5ld][%24s] "
+        #define LOG_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
+    #endif
+#else
+    #define LOG_FLF_FMT "%s"
+    #define LOG_FLF_VAL ,""
+#endif
+
+#ifdef LOG_TEE_FILE_LINE_FUNCTION
+    #ifndef _MSC_VER
+        #define LOG_TEE_FLF_FMT "[%24s:%5d][%24s] "
+        #define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
+    #else
+        #define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] "
+        #define LOG_TEE_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
+    #endif
+#else
+    #define LOG_TEE_FLF_FMT "%s"
+    #define LOG_TEE_FLF_VAL ,""
+#endif
+
+// INTERNAL, DO NOT USE
+//  USE LOG() INSTEAD
+//
+#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__)
+    #define LOG_IMPL(str, ...)                                                                                      \
+    do {                                                                                                            \
+        if (LOG_TARGET != nullptr)                                                                                  \
+        {                                                                                                           \
+            fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
+            fflush(LOG_TARGET);                                                                                     \
+        }                                                                                                           \
    } while (0)
+#else
+    #define LOG_IMPL(str, ...)                                                                                           \
+    do {                                                                                                                 \
+        if (LOG_TARGET != nullptr)                                                                                       \
+        {                                                                                                                \
+            fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
+            fflush(LOG_TARGET);                                                                                          \
+        }                                                                                                                \
+    } while (0)
+#endif

-#define LOG(...)             LOG_TMPL(GGML_LOG_LEVEL_NONE, 0,         __VA_ARGS__)
-#define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity, __VA_ARGS__)
+// INTERNAL, DO NOT USE
+//  USE LOG_TEE() INSTEAD
+//
+#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__)
+    #define LOG_TEE_IMPL(str, ...)                                                                                                      \
+    do {                                                                                                                                \
+        if (LOG_TARGET != nullptr)                                                                                                      \
+        {                                                                                                                               \
+            fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__);                     \
+            fflush(LOG_TARGET);                                                                                                         \
+        }                                                                                                                               \
+        if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr)                         \
+        {                                                                                                                               \
+            fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL, __VA_ARGS__); \
+            fflush(LOG_TEE_TARGET);                                                                                                     \
+        }                                                                                                                               \
+    } while (0)
+#else
+    #define LOG_TEE_IMPL(str, ...)                                                                                                           \
+    do {                                                                                                                                     \
+        if (LOG_TARGET != nullptr)                                                                                                           \
+        {                                                                                                                                    \
+            fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__);                     \
+            fflush(LOG_TARGET);                                                                                                              \
+        }                                                                                                                                    \
+        if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr)                              \
+        {                                                                                                                                    \
+            fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL "", ##__VA_ARGS__); \
+            fflush(LOG_TEE_TARGET);                                                                                                          \
+        }                                                                                                                                    \
+    } while (0)
+#endif

-#define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  0,                 __VA_ARGS__)
-#define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN,  0,                 __VA_ARGS__)
-#define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, 0,                 __VA_ARGS__)
-#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_DEFAULT_DEBUG, __VA_ARGS__)
+// The extra trailing string argument ( "" or "\n" ) is a trick to bypass the silly
+//  "warning: ISO C++11 requires at least one argument for the "..." in a variadic macro"
+//  so we can have a single macro which can be called just like printf.

-#define LOG_INFV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  verbosity, __VA_ARGS__)
-#define LOG_WRNV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_WARN,  verbosity, __VA_ARGS__)
-#define LOG_ERRV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, verbosity, __VA_ARGS__)
-#define LOG_DBGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, verbosity, __VA_ARGS__)
+// Main LOG macro.
+//  behaves like printf, and supports arguments the exact same way.
+//
+#if !defined(_MSC_VER) || defined(__clang__)
+    #define LOG(...) LOG_IMPL(__VA_ARGS__, "")
+#else
+    #define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "")
+#endif
+
+// Main TEE macro.
+//  does the same as LOG
+//  and
+//  simultaneously writes to LOG_TEE_TARGET ( stderr by default ).
+//
+// Secondary target can be changed just like LOG_TARGET
+//  by defining LOG_TEE_TARGET
+//
+#if !defined(_MSC_VER) || defined(__clang__)
+    #define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
+#else
+    #define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "")
+#endif
+
+// LOG macro variants with auto endline.
+#if !defined(_MSC_VER) || defined(__clang__)
+    #define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
+    #define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
+#else
+    #define LOGLN(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "\n")
+    #define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "\n")
+#endif
+
+// INTERNAL, DO NOT USE
+// Central state holder behind LOG_TARGET: keeps the active log FILE* in function-local
+//  statics and (re)opens it on demand.
+//
+//  change   - when true, the remaining arguments are treated as a configuration request
+//  append   - if not LogTriStateSame, only the append flag used by the next fopen() is updated
+//             and the current handle is returned without any further processing
+//  disable  - LogTriStateTrue disables logging, LogTriStateFalse re-enables it
+//  filename - file to open when no explicit target is given
+//  target   - an already open FILE* ( stdout, stderr, ... ) to log to instead of a file
+//
+//  Returns nullptr while logging is disabled, otherwise the active FILE*
+//  ( stderr when the log file could not be opened ).
+inline FILE *log_handler1_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr)
+{
+    static bool _initialized = false;
+    static bool _append = false;
+    static bool _disabled = filename.empty() && target == nullptr;
+    static std::string log_current_filename{filename};
+    static FILE *log_current_target{target};
+    static FILE *logfile = nullptr;
+
+    if (change)
+    {
+        if (append != LogTriStateSame)
+        {
+            _append = append == LogTriStateTrue;
+            return logfile;
+        }
+
+        if (disable == LogTriStateTrue)
+        {
+            // Disable primary target
+            _disabled = true;
+        }
+        // If previously disabled, only enable, and keep previous target
+        else if (disable == LogTriStateFalse)
+        {
+            _disabled = false;
+        }
+        // Otherwise, process the arguments
+        else if (log_current_filename != filename || log_current_target != target)
+        {
+            _initialized = false;
+        }
+    }
+
+    if (_disabled)
+    {
+        // Log is disabled
+        return nullptr;
+    }
+
+    if (_initialized)
+    {
+        // with fallback in case something went wrong
+        return logfile ? logfile : stderr;
+    }
+
+    // do the (re)initialization
+    if (target != nullptr)
+    {
+        // Never close the standard streams, nor the very handle we are switching to
+        if (logfile != nullptr && logfile != target && logfile != stdout && logfile != stderr)
+        {
+            fclose(logfile);
+        }
+
+        log_current_filename = LOG_DEFAULT_FILE_NAME;
+        log_current_target = target;
+
+        logfile = target;
+    }
+    else
+    {
+        // Release the handle that is about to be replaced, otherwise it would leak
+        //  ( standard streams are never closed )
+        if (logfile != nullptr && logfile != stdout && logfile != stderr)
+        {
+            fclose(logfile);
+        }
+
+        // Remember what is active now, so that later change requests are compared
+        //  against the real current target and not against a stale one
+        log_current_filename = filename;
+        log_current_target = nullptr;
+
+        logfile = fopen(filename.c_str(), _append ? "a" : "w");
+    }
+
+    if (!logfile)
+    {
+        //  Verify whether the file was opened, otherwise fallback to stderr
+        logfile = stderr;
+
+        fprintf(stderr, "Failed to open logfile '%s' with error '%s'\n", filename.c_str(), std::strerror(errno));
+        fflush(stderr);
+
+        // At this point we let the init flag be to true below, and let the target fallback to stderr
+        //  otherwise we would repeatedly fopen() which was already unsuccessful
+    }
+
+    _initialized = true;
+
+    return logfile ? logfile : stderr;
+}
+
+// INTERNAL, DO NOT USE
+// Same as log_handler1_impl(), but with `target` and `filename` swapped in the parameter list
+//  so that a FILE* can be passed without having to spell out a file name first.
+inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME)
+{
+    return log_handler1_impl(change, append, disable, filename, target);
+}
+
+// Disables logs entirely at runtime.
+//  Makes LOG() and LOG_TEE() produce no output,
+//  until enabled back.
+#define log_disable() log_disable_impl()
+
+// INTERNAL, DO NOT USE
+// Sends a "disable" change request to log_handler1_impl() and returns whatever it returns.
+inline FILE *log_disable_impl()
+{
+    return log_handler1_impl(true, LogTriStateSame, LogTriStateTrue);
+}
+
+// Enables logs at runtime.
+#define log_enable() log_enable_impl()
+
+// INTERNAL, DO NOT USE
+// Sends an "enable" change request to log_handler1_impl() and returns the resulting log handle.
+inline FILE *log_enable_impl()
+{
+    return log_handler1_impl(true, LogTriStateSame, LogTriStateFalse);
+}
+
+// Sets target for logs, either by a file name or FILE* pointer (stdout, stderr, or any valid FILE*)
+#define log_set_target(target) log_set_target_impl(target)
+
+// INTERNAL, DO NOT USE
+// Overload taking a file name: requests a switch to the file `filename`.
+inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, LogTriStateSame, filename); }
+// Overload taking an already open stream: requests a switch to `target`.
+inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, LogTriStateSame, target); }
+
+// INTERNAL, DO NOT USE
+// Default LOG_TARGET: queries log_handler1_impl() with all default arguments ( no change request ).
+inline FILE *log_handler() { return log_handler1_impl(); }
+
+// Enable or disable creating separate log files for each run.
+//  can ONLY be invoked BEFORE first log use.
+#define log_multilog(enable) log_filename_generator_impl((enable) ? LogTriStateTrue : LogTriStateFalse, "", "")
+// Enable or disable append mode for log file.
+//  can ONLY be invoked BEFORE first log use.
+#define log_append(enable) log_append_impl(enable)
+// INTERNAL, DO NOT USE
+// Forwards the append on/off request to log_handler1_impl(); the flag is used by the next fopen() there.
+inline FILE *log_append_impl(bool enable)
+{
+    return log_handler1_impl(true, enable ? LogTriStateTrue : LogTriStateFalse, LogTriStateSame);
+}
+
+// Manual smoke test of the logging facility ( run via --log-test ).
+//  Emits numbered messages while toggling enable/disable and switching targets,
+//  so the resulting files and console output can be inspected by hand.
+inline void log_test()
+{
+    log_disable();
+    LOG("01 Hello World to nobody, because logs are disabled!\n");
+    log_enable();
+    LOG("02 Hello World to default output, which is \"%s\" ( Yaaay, arguments! )!\n", LOG_STRINGIZE(LOG_TARGET));
+    LOG_TEE("03 Hello World to **both** default output and " LOG_TEE_TARGET_STRING "!\n");
+    log_set_target(stderr);
+    LOG("04 Hello World to stderr!\n");
+    LOG_TEE("05 Hello World TEE with double printing to stderr prevented!\n");
+    log_set_target(LOG_DEFAULT_FILE_NAME);
+    LOG("06 Hello World to default log file!\n");
+    log_set_target(stdout);
+    LOG("07 Hello World to stdout!\n");
+    log_set_target(LOG_DEFAULT_FILE_NAME);
+    LOG("08 Hello World to default log file again!\n");
+    log_disable();
+    LOG("09 Hello World _1_ into the void!\n");
+    log_enable();
+    LOG("10 Hello World back from the void ( you should not see _1_ in the log or the output )!\n");
+    log_disable();
+    log_set_target("llama.anotherlog.log");
+    LOG("11 Hello World _2_ to nobody, new target was selected but logs are still disabled!\n");
+    log_enable();
+    LOG("12 Hello World this time in a new file ( you should not see _2_ in the log or the output )?\n");
+    log_set_target("llama.yetanotherlog.log");
+    LOG("13 Hello World this time in yet new file?\n");
+    log_set_target(log_filename_generator("llama_autonamed", "log"));
+    LOG("14 Hello World in log with generated filename!\n");
+#ifdef _MSC_VER
+    // Extra coverage for the MSVC-specific variadic macro variants
+    //  NOTE(review): the *LN calls below also carry a literal "\n", so they end with two newlines
+    LOG_TEE("15 Hello msvc TEE without arguments\n");
+    LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test");
+    LOG_TEELN("17 Hello msvc TEELN without arguments\n");
+    LOG_TEELN("18 Hello msvc TEELN with (%d)(%s) arguments\n", 1, "test");
+    LOG("19 Hello msvc LOG without arguments\n");
+    LOG("20 Hello msvc LOG with (%d)(%s) arguments\n", 1, "test");
+    LOGLN("21 Hello msvc LOGLN without arguments\n");
+    LOGLN("22 Hello msvc LOGLN with (%d)(%s) arguments\n", 1, "test");
+#endif
+}
+
+// Handles the stand-alone ( value-less ) logging command line switches.
+//  Returns true when `param` was recognized and acted upon, false otherwise.
+inline bool log_param_single_parse(const std::string & param)
+{
+    bool recognized = true;
+
+    if (param == "--log-test")
+    {
+        log_test();
+    }
+    else if (param == "--log-disable")
+    {
+        log_disable();
+    }
+    else if (param == "--log-enable")
+    {
+        log_enable();
+    }
+    else if (param == "--log-new")
+    {
+        log_multilog(true);
+    }
+    else if (param == "--log-append")
+    {
+        log_append(true);
+    }
+    else
+    {
+        // not a logging switch
+        recognized = false;
+    }
+
+    return recognized;
+}
+
+// Handles logging switches that take a value ( currently only "--log-file <name>" ).
+//  check_but_dont_parse - when true, only report whether `param` is such a switch
+//  param                - the switch itself
+//  next                 - the value following the switch; an empty value falls back to "unnamed"
+//  Returns true when `param` is a recognized pair switch.
+inline bool log_param_pair_parse(bool check_but_dont_parse, const std::string & param, const std::string & next = std::string())
+{
+    if (param != "--log-file")
+    {
+        return false;
+    }
+
+    if (check_but_dont_parse)
+    {
+        // caller only wanted to know whether the switch consumes a value
+        return true;
+    }
+
+    const std::string basename = next.empty() ? "unnamed" : next;
+    log_set_target(log_filename_generator(basename, "log"));
+
+    return true;
+}
+
+// Prints the help text for the logging related command line options to stdout.
+inline void log_print_usage()
+{
+    // One entry per option; the description column starts after the padded parameter name,
+    //  mirroring the layout of the main help ( "  -h, --help            show this ..." )
+    static const char * const usage_lines[] = {
+        "log options:\n",
+        "  --log-test            Run simple logging test\n",
+        "  --log-disable         Disable trace logs\n",
+        "  --log-enable          Enable trace logs\n",
+        "  --log-file            Specify a log filename (without extension)\n",
+        "  --log-new             Create a separate new log file on start. "
+                                "Each log file will have unique name: \"<name>.<ID>.log\"\n",
+        "  --log-append          Don't truncate the old log file.\n",
+        "\n",
+    };
+
+    for (const char * line : usage_lines)
+    {
+        printf("%s", line);
+    }
+}
+
+#define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv)
+
+// INTERNAL, DO NOT USE
+// Writes the command line to the log as a single "Cmd: ..." line.
+//  Arguments containing a space are wrapped in double quotes.
+inline void log_dump_cmdline_impl(int argc, char **argv)
+{
+    std::stringstream cmdline;
+    for (int idx = 0; idx < argc; ++idx)
+    {
+        const std::string arg(argv[idx]);
+        const bool needs_quotes = arg.find(' ') != std::string::npos;
+        const char * quote = needs_quotes ? "\"" : "";
+
+        cmdline << " " << quote << arg << quote;
+    }
+    LOGLN("Cmd:%s", cmdline.str().c_str());
+}
+
+#define log_tostr(var) log_var_to_string_impl(var).c_str()
+
+// Renders a bool as the text "true" or "false".
+inline std::string log_var_to_string_impl(bool var)
+{
+    if (var)
+    {
+        return "true";
+    }
+
+    return "false";
+}
+
+// Identity overload: a string is returned unchanged, so log_tostr() works uniformly on strings too.
+inline std::string log_var_to_string_impl(std::string var)
+{
+    return var;
+}
+
+// Renders a vector of ints as "[ a, b, c ]" ( an empty vector gives "[  ]" ).
+inline std::string log_var_to_string_impl(const std::vector<int> & var)
+{
+    std::stringstream out;
+    out << "[ ";
+    for (size_t idx = 0; idx < var.size(); ++idx)
+    {
+        // separator goes in front of every element except the first one
+        if (idx > 0)
+        {
+            out << ", ";
+        }
+        out << std::to_string(var[idx]);
+    }
+    out << " ]";
+
+    return out.str();
+}
+
+// Formats a token sequence as "[ 'piece':id, 'piece':id, ... ]".
+//  Each token is converted with llama_token_to_piece(ctx, token) and stripped of
+//  non-printable characters before being written.
+template <typename C, typename T>
+inline std::string LOG_TOKENS_TOSTR_PRETTY(const C & ctx, const T & tokens)
+{
+    const auto not_printable = [](const unsigned char c) { return !std::isprint(c); };
+
+    std::stringstream out;
+    out << "[ ";
+
+    // empty before the first element, ", " afterwards
+    const char * separator = "";
+    for (const auto & token : tokens)
+    {
+        out << separator;
+        separator = ", ";
+
+        auto piece = llama_token_to_piece(ctx, token);
+        piece.erase(std::remove_if(piece.begin(), piece.end(), not_printable), piece.end());
+
+        out << "'" << piece << "'" << ":" << std::to_string(token);
+    }
+    out << " ]";
+
+    return out.str();
+}
+
+// Formats a batch as a bracketed list with one line per entry, showing the index,
+//  the detokenized piece ( non-printable characters removed ), pos, n_seq_id,
+//  the first seq_id and the logits flag of every token in the batch.
+template <typename C, typename B>
+inline std::string LOG_BATCH_TOSTR_PRETTY(const C & ctx, const B & batch)
+{
+    const auto not_printable = [](const unsigned char c) { return !std::isprint(c); };
+
+    std::stringstream out;
+    out << "[ ";
+
+    // empty before the first element, ", " afterwards
+    const char * separator = "";
+    for (int i = 0; i < batch.n_tokens; ++i)
+    {
+        out << separator;
+        separator = ", ";
+
+        auto piece = llama_token_to_piece(ctx, batch.token[i]);
+        piece.erase(std::remove_if(piece.begin(), piece.end(), not_printable), piece.end());
+
+        out << "\n" << std::to_string(i);
+        out << ":token '" << piece << "'";
+        out << ":pos " << std::to_string(batch.pos[i]);
+        out << ":n_seq_id  " << std::to_string(batch.n_seq_id[i]);
+        out << ":seq_id " << std::to_string(batch.seq_id[i][0]);
+        out << ":logits " << std::to_string(batch.logits[i]);
+    }
+    out << " ]";
+
+    return out.str();
+}
+
+#ifdef LOG_DISABLE_LOGS
+
+// With LOG_DISABLE_LOGS defined, the file logging macros compile to nothing
+//  and the TEE variants degrade to a plain fprintf(stderr, ...).
+
+#undef LOG
+#define LOG(...) // dummy stub
+#undef LOGLN
+#define LOGLN(...) // dummy stub
+
+#undef LOG_TEE
+#define LOG_TEE(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf
+
+// LOG_TEELN must still terminate the line, exactly like the full implementation does
+#undef LOG_TEELN
+#define LOG_TEELN(...) do { fprintf(stderr, __VA_ARGS__); fprintf(stderr, "\n"); } while (0)
+
+// NOTE(review): the upper-case names stubbed below differ from the lower-case
+//  log_disable() / log_enable() / log_set_target() / log_dump_cmdline() macros
+//  defined in this header - confirm which spelling callers are expected to use.
+#undef LOG_DISABLE
+#define LOG_DISABLE() // dummy stub
+
+#undef LOG_ENABLE
+#define LOG_ENABLE() // dummy stub
+
+#undef LOG_SET_TARGET
+#define LOG_SET_TARGET(...) // dummy stub
+
+#undef LOG_DUMP_CMDLINE
+#define LOG_DUMP_CMDLINE(...) // dummy stub
+
+#endif // LOG_DISABLE_LOGS
--- a/common/ngram-cache.cpp
+++ b/common/ngram-cache.cpp
@@ -2,11 +2,8 @@
 #include "common.h"
 #include "log.h"

-#include <cinttypes>
 #include <cstdint>
-#include <cstdio>
 #include <fstream>
-#include <thread>

 void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
                              std::vector<llama_token> & inp, int nnew, bool print_progress) {
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -142,7 +142,7 @@ std::string gpt_sampler_params::print() const {
 struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
    llama_sampler_chain_params lparams = llama_sampler_chain_default_params();

-    lparams.no_perf = params.no_perf;
+    lparams.no_perf = false; // TODO: control via params

    auto * result = new gpt_sampler {
        /* .params = */ params,
@@ -257,10 +257,10 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
    // TODO: measure grammar performance

    if (gsmpl) {
-        llama_perf_sampler_print(gsmpl->chain);
+        llama_perf_print(gsmpl->chain, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
    }
    if (ctx) {
-        llama_perf_context_print(ctx);
+        llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
    }
 }

@@ -310,10 +310,6 @@ llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context
    return cur_p.data[cur_p.selected].id;
 }

-uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl) {
-    return llama_sampler_get_seed(gsmpl->chain);
-}
-
 // helpers

 llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl) {
@@ -325,7 +321,7 @@ llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl) {
 }

 std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
-    std::string result = "logits ";
+    std::string result = "\tlogits ";

    for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
        const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -60,8 +60,6 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
 //
 llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);

-uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl);
-
 // helpers

 // access the internal list of current candidate tokens
--- a/common/train.cpp
+++ b/common/train.cpp
@@ -1,11 +1,9 @@
 #include "train.h"
 #include "common.h"

-#include <algorithm>
 #include <random>
 #include <sstream>
 #include <functional>
-#include <cstring>

 struct random_normal_distribution {
    std::mt19937 gen;
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -626,9 +626,6 @@ class Model:
        if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae":
            # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct
            res = "exaone"
-        if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085":
-            # ref: https://huggingface.co/microsoft/phi-2
-            res = "phi-2"

        if res is None:
            logger.warning("\n")
@@ -1487,7 +1484,7 @@ class StableLMModel(Model):
                raise ValueError(f"Unprocessed norms: {norms}")


-@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
+@Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
 class LlamaModel(Model):
    model_arch = gguf.MODEL_ARCH.LLAMA

@@ -2774,8 +2771,6 @@ class Rwkv6Model(Model):
        self.gguf_writer.add_tokenizer_model("rwkv")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_types(toktypes)
-        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
-        special_vocab.add_to_gguf(self.gguf_writer)

    def set_gguf_parameters(self):
        block_count = self.hparams["num_hidden_layers"]
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -31,7 +31,6 @@ import re
 import requests
 import sys
 import json
-import shutil

 from hashlib import sha256
 from enum import IntEnum, auto
@@ -98,7 +97,6 @@ models = [
    {'name': "bloom",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", },
    {'name': "gpt3-finnish",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", },
    {"name": "exaone",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
-    {"name": "phi-2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
 ]


@@ -127,27 +125,12 @@ def download_model(model):
    if tokt == TOKENIZER_TYPE.UGM:
        files.append("spiece.model")

-    if os.path.isdir(repo):
-        # If repo is a path on the file system, copy the directory
-        for file in files:
-            src_path = os.path.join(repo, file)
-            dst_path = f"models/tokenizers/{name}/{file}"
-            if os.path.isfile(dst_path):
-                logger.info(f"{name}: File {dst_path} already exists - skipping")
-                continue
-            if os.path.isfile(src_path):
-                shutil.copy2(src_path, dst_path)
-                logger.info(f"{name}: Copied {src_path} to {dst_path}")
-            else:
-                logger.warning(f"{name}: Source file {src_path} does not exist")
-    else:
-        # If repo is a URL, download the files
-        for file in files:
-            save_path = f"models/tokenizers/{name}/{file}"
-            if os.path.isfile(save_path):
-                logger.info(f"{name}: File {save_path} already exists - skipping")
-                continue
-            download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
+    for file in files:
+        save_path = f"models/tokenizers/{name}/{file}"
+        if os.path.isfile(save_path):
+            logger.info(f"{name}: File {save_path} already exists - skipping")
+            continue
+        download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)


 for model in models:
--- a/convert_lora_to_gguf.py
+++ b/convert_lora_to_gguf.py
@@ -363,13 +363,7 @@ if __name__ == '__main__':
                    yield (name, cast(torch.Tensor, LoraTorchTensor(tensor.A, tensor.B)))

            def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-                dest = list(super().modify_tensors(data_torch, name, bid))
-                # some archs may have the same tensor for lm_head and output (tie word embeddings)
-                # in this case, adapters targeting lm_head will fail when using llama-export-lora
-                # therefore, we ignore them for now
-                # see: https://github.com/ggerganov/llama.cpp/issues/9065
-                if name == "lm_head.weight" and len(dest) == 0:
-                    raise ValueError("lm_head is present in adapter, but is ignored in base model")
+                dest = super().modify_tensors(data_torch, name, bid)
                for dest_name, dest_data in dest:
                    assert isinstance(dest_data, LoraTorchTensor)
                    lora_a, lora_b = dest_data.get_lora_A_B()
--- a/examples/baby-llama/baby-llama.cpp
+++ b/examples/baby-llama/baby-llama.cpp
@@ -17,17 +17,6 @@ constexpr float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
 constexpr float rms_norm_eps = 5e-6f;
 #endif

-static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
-
-    if (plan.work_size > 0) {
-        buf.resize(plan.work_size);
-        plan.work_data = buf.data();
-    }
-
-    ggml_graph_compute(graph, &plan);
-}
-
 static struct ggml_tensor * randomize_tensor(
    struct ggml_tensor * tensor, int ndims, const int64_t ne[], float fmin, float fmax
 ) {
@@ -1514,8 +1503,6 @@ int main(int argc, char ** argv) {
    int n_tokens = model.hparams.n_ctx;
    int n_vocab  = model.hparams.n_vocab;

-    std::vector<uint8_t> work_buffer;
-
    for (int ex=0; ex<n_examples; ++ex) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ compute_size,
@@ -1542,7 +1529,10 @@ int main(int argc, char ** argv) {
        struct ggml_tensor * e = square_error_loss(ctx0, targets, logits);

        ggml_build_forward_expand(gf, e);
-        ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
+        ggml_graph_prepare(gf, 1, nullptr);
+        ggml_graph_work_init(gf, nullptr);
+        ggml_graph_compute(gf);
+        ggml_graph_work_free(gf);

        float error_before_opt = ggml_get_f32_1d(e, 0);

@@ -1553,7 +1543,10 @@ int main(int argc, char ** argv) {
        ggml_opt(ctx0, opt_params_lbfgs, e);
        //
        ggml_build_forward_expand(gf, e);
-        ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
+        ggml_graph_prepare(gf, 1, nullptr);
+        ggml_graph_work_init(gf, nullptr);
+        ggml_graph_compute(gf);
+        ggml_graph_work_free(gf);

        float error_after_opt = ggml_get_f32_1d(e, 0);

@@ -1607,7 +1600,10 @@ int main(int argc, char ** argv) {
            struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, gf, tokens_input, sample_ctx, n_past);

            ggml_build_forward_expand(gf, logits);
-            ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
+            ggml_graph_prepare(gf, 1, nullptr);
+            ggml_graph_work_init(gf, nullptr);
+            ggml_graph_compute(gf);
+            ggml_graph_work_free(gf);

            struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
            struct ggml_tensor * probs        = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx);
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -1,17 +1,38 @@
 #include "arg.h"
 #include "common.h"
-#include "log.h"
 #include "llama.h"

 #include <algorithm>
+#include <cmath>
 #include <cstdio>
 #include <string>
 #include <vector>

+// mutates the input string
+static std::vector<int> parse_list(char * p) {
+    std::vector<int> ret;
+
+    char * q = p;
+
+    while (*p) {
+        if (*p == ',') {
+            *p = '\0';
+            ret.push_back(std::atoi(q));
+            q = p + 1;
+        }
+
+        ++p;
+    }
+
+    ret.push_back(std::atoi(q));
+
+    return ret;
+}
+
 static void print_usage(int, char ** argv) {
-    LOG("\nexample usage:\n");
-    LOG("\n    %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
-    LOG("\n");
+    LOG_TEE("\nexample usage:\n");
+    LOG_TEE("\n    %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
+    LOG_TEE("\n");
 }

 int main(int argc, char ** argv) {
@@ -21,8 +42,6 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    gpt_init();
-
    int is_pp_shared = params.is_pp_shared;

    std::vector<int> n_pp = params.n_pp;
@@ -79,7 +98,7 @@ int main(int argc, char ** argv) {

            const int ret = llama_decode(ctx, batch_view);
            if (ret != 0) {
-                LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
+                LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
                return false;
            }

@@ -96,17 +115,17 @@ int main(int argc, char ** argv) {
        }

        if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
-            LOG_ERR("%s: llama_decode() failed\n", __func__);
+            LOG_TEE("%s: llama_decode() failed\n", __func__);
            return 1;
        }
    }

    if (!params.batched_bench_output_jsonl) {
-        LOG("\n");
-        LOG("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
-        LOG("\n");
-        LOG("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
-        LOG("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
+        LOG_TEE("\n");
+        LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
+        LOG_TEE("\n");
+        LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
+        LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
    }

    for (        int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
@@ -136,7 +155,7 @@ int main(int argc, char ** argv) {
                llama_kv_cache_clear(ctx);

                if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
-                    LOG_ERR("%s: llama_decode() failed\n", __func__);
+                    LOG_TEE("%s: llama_decode() failed\n", __func__);
                    return 1;
                }

@@ -158,7 +177,7 @@ int main(int argc, char ** argv) {
                    }

                    if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
-                        LOG_ERR("%s: llama_decode() failed\n", __func__);
+                        LOG_TEE("%s: llama_decode() failed\n", __func__);
                        return 1;
                    }
                }
@@ -176,21 +195,21 @@ int main(int argc, char ** argv) {
                const float speed    = n_kv / t;

                if(params.batched_bench_output_jsonl) {
-                    LOG(
+                    LOG_TEE(
                        "{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"is_pp_shared\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, "
                        "\"pp\": %d, \"tg\": %d, \"pl\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f, \"t\": %f, \"speed\": %f}\n",
                        n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch,
                        pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed
                    );
                } else {
-                    LOG("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
+                    LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
                }
            }
        }
    }

-    LOG("\n");
-    llama_perf_context_print(ctx);
+    LOG_TEE("\n");
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);

    llama_batch_free(batch);

@@ -199,7 +218,7 @@ int main(int argc, char ** argv) {

    llama_backend_free();

-    LOG("\n\n");
+    fprintf(stderr, "\n\n");

    return 0;
 }
--- a/examples/batched.swift/Sources/main.swift
+++ b/examples/batched.swift/Sources/main.swift
@@ -200,8 +200,8 @@ let t_main_end = ggml_time_us()

 print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n\n")

-llama_perf_sampler_print(smpl)
-llama_perf_context_print(context)
+llama_perf_print(UnsafeRawPointer(context), LLAMA_PERF_TYPE_CONTEXT)
+llama_perf_print(UnsafeRawPointer(smpl),    LLAMA_PERF_TYPE_SAMPLER_CHAIN)

 private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
    let utf8Count = text.utf8.count
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -1,6 +1,5 @@
 #include "arg.h"
 #include "common.h"
-#include "log.h"
 #include "llama.h"

 #include <algorithm>
@@ -9,9 +8,9 @@
 #include <vector>

 static void print_usage(int, char ** argv) {
-    LOG("\nexample usage:\n");
-    LOG("\n    %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
-    LOG("\n");
+    LOG_TEE("\nexample usage:\n");
+    LOG_TEE("\n    %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
+    LOG_TEE("\n");
 }

 int main(int argc, char ** argv) {
@@ -24,7 +23,6 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    gpt_init();

    // number of parallel batches
    int n_parallel = params.n_parallel;
@@ -44,7 +42,7 @@ int main(int argc, char ** argv) {
    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);

    if (model == NULL) {
-        LOG_ERR("%s: error: unable to load model\n" , __func__);
+        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
        return 1;
    }

@@ -74,29 +72,31 @@ int main(int argc, char ** argv) {
    llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sparams.seed));

    if (ctx == NULL) {
-        LOG_ERR("%s: error: failed to create the llama_context\n" , __func__);
+        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
        return 1;
    }

    const int n_ctx = llama_n_ctx(ctx);

-    LOG_INF("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
+    LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);

    // make sure the KV cache is big enough to hold all the prompt and generated tokens
    if (n_kv_req > n_ctx) {
-        LOG_ERR("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__,  n_kv_req);
-        LOG_ERR("%s:        either reduce n_parallel or increase n_ctx\n", __func__);
+        LOG_TEE("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__,  n_kv_req);
+        LOG_TEE("%s:        either reduce n_parallel or increase n_ctx\n", __func__);
        return 1;
    }

    // print the prompt token-by-token

-    LOG("\n");
+    fprintf(stderr, "\n");

    for (auto id : tokens_list) {
-        LOG("%s", llama_token_to_piece(ctx, id).c_str());
+        fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
    }

+    fflush(stderr);
+
    // create a llama_batch
    // we use this object to submit token data for decoding
    llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t) n_parallel), 0, n_parallel);
@@ -114,7 +114,7 @@ int main(int argc, char ** argv) {

    if (llama_model_has_encoder(model)) {
        if (llama_encode(ctx, batch)) {
-            LOG_ERR("%s : failed to eval\n", __func__);
+            LOG_TEE("%s : failed to eval\n", __func__);
            return 1;
        }

@@ -131,7 +131,7 @@ int main(int argc, char ** argv) {
    batch.logits[batch.n_tokens - 1] = true;

    if (llama_decode(ctx, batch) != 0) {
-        LOG_ERR("%s: llama_decode() failed\n", __func__);
+        LOG_TEE("%s: llama_decode() failed\n", __func__);
        return 1;
    }

@@ -142,7 +142,7 @@ int main(int argc, char ** argv) {
    //}

    if (n_parallel > 1) {
-        LOG("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
+        LOG_TEE("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
    }

    // main loop
@@ -175,9 +175,9 @@ int main(int argc, char ** argv) {
            // is it an end of generation? -> mark the stream as finished
            if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
                i_batch[i] = -1;
-                LOG("\n");
+                LOG_TEE("\n");
                if (n_parallel > 1) {
-                    LOG_INF("%s: stream %d finished at n_cur = %d", __func__, i, n_cur);
+                    LOG_TEE("%s: stream %d finished at n_cur = %d", __func__, i, n_cur);
                }

                continue;
@@ -185,7 +185,8 @@ int main(int argc, char ** argv) {

            // if there is only one stream, we print immediately to stdout
            if (n_parallel == 1) {
-                LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
+                LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
+                fflush(stdout);
            }

            streams[i] += llama_token_to_piece(ctx, new_token_id);
@@ -207,27 +208,29 @@ int main(int argc, char ** argv) {

        // evaluate the current batch with the transformer model
        if (llama_decode(ctx, batch)) {
-            LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
+            fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
            return 1;
        }
    }

+    LOG_TEE("\n");
+
    if (n_parallel > 1) {
-        LOG("\n");
+        LOG_TEE("\n");

        for (int32_t i = 0; i < n_parallel; ++i) {
-            LOG("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str());
+            LOG_TEE("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str());
        }
    }

    const auto t_main_end = ggml_time_us();

-    LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
+    LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
            __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));

-    LOG("\n");
-    llama_perf_sampler_print(smpl);
-    llama_perf_context_print(ctx);
+    LOG_TEE("\n");
+    llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
+    llama_perf_print(ctx,  LLAMA_PERF_TYPE_CONTEXT);

    fprintf(stderr, "\n");

--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@@ -20,17 +20,6 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

-static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
-
-    if (plan.work_size > 0) {
-        buf.resize(plan.work_size);
-        plan.work_data = buf.data();
-    }
-
-    ggml_graph_compute(graph, &plan);
-}
-
 static float tensor_sum_elements(const ggml_tensor * tensor) {
    double sum = 0;
    if (tensor->type == GGML_TYPE_F32) {
@@ -179,9 +168,8 @@ int main(int argc, char ** argv)  {
    TENSOR_DUMP(m11);
    TENSOR_DUMP(m2);

-    std::vector<uint8_t> work_buffer;
-
-    ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads);
+    ggml_graph_prepare(gf, benchmark_params.n_threads, nullptr);
+    ggml_graph_work_init(gf, nullptr);

    TENSOR_DUMP(ggml_graph_node(gf, 0));

@@ -234,7 +222,7 @@ int main(int argc, char ** argv)  {

        long long int start = ggml_time_us();
        //printf("Running ggml_graph_compute\n");
-        ggml_graph_compute_helper(work_buffer, gf31, benchmark_params.n_threads);
+        ggml_graph_compute(gf31);

        long long int stop = ggml_time_us();
        long long int usec = stop-start;
@@ -267,8 +255,11 @@ int main(int argc, char ** argv)  {
        }

        // Running a different graph computation to make sure we override the CPU cache lines
-        ggml_graph_compute_helper(work_buffer, gf32, benchmark_params.n_threads);
+        ggml_graph_compute(gf32);
    }
+
+    ggml_graph_work_free(gf);
+
    printf("\n");
    printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));
    printf("=====================================================================================\n");
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -9,7 +9,6 @@
 #include <climits>
 #include <cstring>
 #include <cstdarg>
-#include <cinttypes>
 #include <ctime>
 #include <random>
 #include <stdexcept>
@@ -106,43 +105,43 @@ static void alloc_weights(TransformerWeights * w, const Config * p, bool shared_
    const int n_multiqueries = p->n_kv_heads <= 0 || p->n_kv_heads >= p->n_heads ? 1 : p->n_heads / p->n_kv_heads;
    try {
        w->token_embedding_table.resize(p->vocab_size * p->dim);
-        LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
+        LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);

        w->rms_att_weight.resize(p->n_layers * p->dim);
-        LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
+        LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);

        w->rms_ffn_weight.resize(p->n_layers * p->dim);
-        LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
+        LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);

        w->wq.resize(p->n_layers * p->dim * p->dim);
-        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
+        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);

        w->wk.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
-        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
+        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);

        w->wv.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
-        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
+        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);

        w->wo.resize(p->n_layers * p->dim * p->dim);
-        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
+        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);

        w->w1.resize(p->n_layers * p->hidden_dim * p->dim);
-        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
+        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);

        w->w2.resize(p->n_layers * p->hidden_dim * p->dim);
-        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
+        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);

        w->w3.resize(p->n_layers * p->hidden_dim * p->dim);
-        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
+        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);

        w->rms_final_weight.resize(p->dim);
-        LOG_INF("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
+        LOG("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);

        if (shared_weights) {
            w->wcls = {};
        } else {
            w->wcls.resize(p->vocab_size * p->dim);
-            LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
+            LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
        }
    }
    catch (std::length_error &) {
@@ -174,7 +173,7 @@ static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FIL
    fseek(f, 0, SEEK_END);
    auto end = ftell(f);
    if (curr != end) {
-        LOG_ERR("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end =  %ld)\n", __func__, curr, end);
+        LOG("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end =  %ld)\n", __func__, curr, end);
        return 1;
    }

@@ -182,20 +181,20 @@ static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FIL
 }

 static void print_sample_weights(TransformerWeights *w){
-    LOG_INF("----- Quick print of first of the weight vales of all the variables\n");
-    LOG_INF("%f\n", w->token_embedding_table[0]);
-    LOG_INF("%f\n", w->rms_att_weight[0]);
-    LOG_INF("%f\n", w->rms_ffn_weight[0]);
+    LOG("----- Quick print of first of the weight vales of all the variables\n");
+    LOG("%f\n", w->token_embedding_table[0]);
+    LOG("%f\n", w->rms_att_weight[0]);
+    LOG("%f\n", w->rms_ffn_weight[0]);

-    LOG_INF("%f\n", w->wq[0]);
-    LOG_INF("%f\n", w->wk[0]);
-    LOG_INF("%f\n", w->wv[0]);
-    LOG_INF("%f\n", w->wo[0]);
-    LOG_INF("%f\n", w->w1[0]);
-    LOG_INF("%f\n", w->w2[0]);
-    LOG_INF("%f\n", w->w3[0]);
-    LOG_INF("%f\n", w->rms_att_weight[0]);
-    if (!w->wcls.empty()) LOG_INF("%f\n", w->wcls[0]);
+    LOG("%f\n", w->wq[0]);
+    LOG("%f\n", w->wk[0]);
+    LOG("%f\n", w->wv[0]);
+    LOG("%f\n", w->wo[0]);
+    LOG("%f\n", w->w1[0]);
+    LOG("%f\n", w->w2[0]);
+    LOG("%f\n", w->w3[0]);
+    LOG("%f\n", w->rms_att_weight[0]);
+    if (!w->wcls.empty()) LOG("%f\n", w->wcls[0]);
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////

@@ -319,20 +318,20 @@ struct train_params {
 };

 static void print_params(struct my_llama_hparams * params) {
-    LOG_INF("%s: n_vocab:   %u\n", __func__, params->n_vocab);
-    LOG_INF("%s: n_ctx:     %u\n", __func__, params->n_ctx);
-    LOG_INF("%s: n_embd:    %u\n", __func__, params->n_embd);
-    LOG_INF("%s: n_mult:    %u\n", __func__, params->n_mult);
-    LOG_INF("%s: n_head:    %u\n", __func__, params->n_head);
-    LOG_INF("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
-    LOG_INF("%s: n_ff:      %u\n", __func__, params->n_ff);
-    LOG_INF("%s: n_layer:   %u\n", __func__, params->n_layer);
-    LOG_INF("%s: n_rot:     %u\n", __func__, params->n_rot);
+    LOG("%s: n_vocab:   %u\n", __func__, params->n_vocab);
+    LOG("%s: n_ctx:     %u\n", __func__, params->n_ctx);
+    LOG("%s: n_embd:    %u\n", __func__, params->n_embd);
+    LOG("%s: n_mult:    %u\n", __func__, params->n_mult);
+    LOG("%s: n_head:    %u\n", __func__, params->n_head);
+    LOG("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
+    LOG("%s: n_ff:      %u\n", __func__, params->n_ff);
+    LOG("%s: n_layer:   %u\n", __func__, params->n_layer);
+    LOG("%s: n_rot:     %u\n", __func__, params->n_rot);
 }

 static void print_tensor_info(const struct ggml_context * ctx) {
    for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-        LOG_INF("%s: Allocating ", __func__);
+        LOG("%s: Allocating ", __func__);
        int64_t total = 1;
        int i = 0;
        for (; i < ggml_n_dims(t); ++i) {
@@ -527,7 +526,7 @@ static std::string llama_escape_whitespaces(const std::string & text) {

 static void load_vocab(const char * filename, const Config * config, struct llama_vocab * vocab) {
    if (is_ggml_file(filename)) {
-        LOG_INF("%s: Loading vocabulary from gguf file %s\n", __func__, filename);
+        LOG("%s: Loading vocabulary from gguf file %s\n", __func__, filename);
        struct ggml_context * ctx_data = NULL;

        struct gguf_init_params params = {
@@ -575,7 +574,7 @@ static void load_vocab(const char * filename, const Config * config, struct llam
        gguf_free(ctx);
    } else {
        // assume llama2.c vocabulary
-        LOG_INF("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
+        LOG("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
        llama_file file(filename, "rb");
        if (!file.fp) {
            die_fmt("%s: %s", strerror(errno), filename);
@@ -872,25 +871,23 @@ static std::string basename(const std::string &path) {
 }

 int main(int argc, char ** argv) {
-    gpt_init();
-
    struct train_params params = get_default_train_params();
    if (!params_parse(argc, argv, &params)) {
        return 1;
    }
-
+    log_set_target(stdout);
    Config config;
    TransformerWeights weights = {};
    {
-        LOG_INF("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model);
+        LOG("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model);
        FILE * file = fopen(params.fn_llama2c_model, "rb");
        if (!file) {
-            LOG_ERR("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model);
+            LOG("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model);
            return 1;
        }
        // read in the config header
        if (fread(&config, sizeof(Config), 1, file) != 1) {
-            LOG_ERR("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model);
+            LOG("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model);
            return 1;
        }
        auto shared_weights = config.vocab_size > 0;
@@ -899,7 +896,7 @@ int main(int argc, char ** argv) {
        // read in the Transformer weights
        alloc_weights(&weights, &config, shared_weights);
        if (checkpoint_init_weights(&weights, &config, file, shared_weights)) {
-            LOG_ERR("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model);
+            LOG("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model);
            return 1;
        }
        fclose(file);
@@ -932,7 +929,7 @@ int main(int argc, char ** argv) {
    model.name = basename(params.fn_llama2c_model);
    save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model);

-    LOG_INF("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model);
+    LOG("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model);

    ggml_free(model.ctx);
    return 0;
--- a/examples/cvector-generator/cvector-generator.cpp
+++ b/examples/cvector-generator/cvector-generator.cpp
@@ -13,15 +13,14 @@
 #include "ggml-metal.h"
 #endif

-#include <algorithm>
-#include <climits>
 #include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <iostream>
 #include <string>
 #include <tuple>
 #include <vector>
+#include <algorithm>
+#include <iostream>
+#include <fstream>
+#include <climits>


 //////////////////////////////////////////////////
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -1,6 +1,5 @@
 #include "arg.h"
 #include "common.h"
-#include "log.h"
 #include "llama.h"

 #include <ctime>
@@ -40,16 +39,16 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
    llama_kv_cache_clear(ctx);

    // run model
-    LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
+    fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
    if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) {
        // encoder-only model
        if (llama_encode(ctx, batch) < 0) {
-            LOG_ERR("%s : failed to encode\n", __func__);
+            fprintf(stderr, "%s : failed to encode\n", __func__);
        }
    } else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
        // decoder-only model
        if (llama_decode(ctx, batch) < 0) {
-            LOG_ERR("%s : failed to decode\n", __func__);
+            fprintf(stderr, "%s : failed to decode\n", __func__);
        }
    }

@@ -85,12 +84,14 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    gpt_init();
-
    params.embedding = true;
    // For non-causal models, batch size must be equal to ubatch size
    params.n_ubatch = params.n_batch;

+    print_build_info();
+
+    LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
+
    llama_backend_init();
    llama_numa_init(params.numa);

@@ -100,7 +101,7 @@ int main(int argc, char ** argv) {
    llama_model * model = llama_init.model;
    llama_context * ctx = llama_init.context;
    if (model == NULL) {
-        LOG_ERR("%s: unable to load model\n", __func__);
+        fprintf(stderr, "%s: error: unable to load model\n", __func__);
        return 1;
    }

@@ -110,19 +111,19 @@ int main(int argc, char ** argv) {
    const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);

    if (llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
-        LOG_ERR("%s: computing embeddings in encoder-decoder models is not supported\n", __func__);
+        fprintf(stderr, "%s: error: computing embeddings in encoder-decoder models is not supported\n", __func__);
        return 1;
    }

    if (n_ctx > n_ctx_train) {
-        LOG_WRN("%s: warning: model was trained on only %d context tokens (%d specified)\n",
+        fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
                __func__, n_ctx_train, n_ctx);
    }

    // print system information
    {
-        LOG_INF("\n");
-        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+        fprintf(stderr, "\n");
+        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
    }

    // split the prompt into lines
@@ -137,7 +138,7 @@ int main(int argc, char ** argv) {
    for (const auto & prompt : prompts) {
        auto inp = ::llama_tokenize(ctx, prompt, true, false);
        if (inp.size() > n_batch) {
-            LOG_ERR("%s: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
+            fprintf(stderr, "%s: error: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
                    __func__, (long long int) inp.size(), (long long int) n_batch);
            return 1;
        }
@@ -148,20 +149,20 @@ int main(int argc, char ** argv) {
    // it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
    for (auto & inp : inputs) {
        if (inp.empty() || inp.back() != llama_token_sep(model)) {
-            LOG_WRN("%s: last token in the prompt is not SEP\n", __func__);
-            LOG_WRN("%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
+            fprintf(stderr, "%s: warning: last token in the prompt is not SEP\n", __func__);
+            fprintf(stderr, "%s:          'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
        }
    }

    // tokenization stats
    if (params.verbose_prompt) {
        for (int i = 0; i < (int) inputs.size(); i++) {
-            LOG_INF("%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
-            LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
+            fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
+            fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
            for (int j = 0; j < (int) inputs[i].size(); j++) {
-                LOG("%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str());
+                fprintf(stderr, "%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str());
            }
-            LOG("\n\n");
+            fprintf(stderr, "\n\n");
        }
    }

@@ -212,57 +213,57 @@ int main(int argc, char ** argv) {
    batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);

    if (params.embd_out.empty()) {
-        LOG("\n");
+        fprintf(stdout, "\n");

        if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
            for (int j = 0; j < n_embd_count; j++) {
-                LOG("embedding %d: ", j);
+                fprintf(stdout, "embedding %d: ", j);
                for (int i = 0; i < std::min(3, n_embd); i++) {
                    if (params.embd_normalize == 0) {
-                        LOG("%6.0f ", emb[j * n_embd + i]);
+                        fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
                    } else {
-                        LOG("%9.6f ", emb[j * n_embd + i]);
+                        fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
                    }
                }
-                LOG(" ... ");
+                fprintf(stdout, " ... ");
                for (int i = n_embd - 3; i < n_embd; i++) {
                    if (params.embd_normalize == 0) {
-                        LOG("%6.0f ", emb[j * n_embd + i]);
+                        fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
                    } else {
-                        LOG("%9.6f ", emb[j * n_embd + i]);
+                        fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
                    }
                }
-                LOG("\n");
+                fprintf(stdout, "\n");
            }
        } else {
            // print the first part of the embeddings or for a single prompt, the full embedding
            for (int j = 0; j < n_prompts; j++) {
-                LOG("embedding %d: ", j);
+                fprintf(stdout, "embedding %d: ", j);
                for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
                    if (params.embd_normalize == 0) {
-                        LOG("%6.0f ", emb[j * n_embd + i]);
+                        fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
                    } else {
-                        LOG("%9.6f ", emb[j * n_embd + i]);
+                        fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
                    }
                }
-                LOG("\n");
+                fprintf(stdout, "\n");
            }

            // print cosine similarity matrix
            if (n_prompts > 1) {
-                LOG("\n");
-                LOG("cosine similarity matrix:\n\n");
+                fprintf(stdout, "\n");
+                printf("cosine similarity matrix:\n\n");
                for (int i = 0; i < n_prompts; i++) {
-                    LOG("%6.6s ", prompts[i].c_str());
+                    fprintf(stdout, "%6.6s ", prompts[i].c_str());
                }
-                LOG("\n");
+                fprintf(stdout, "\n");
                for (int i = 0; i < n_prompts; i++) {
                    for (int j = 0; j < n_prompts; j++) {
                        float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
-                        LOG("%6.2f ", sim);
+                        fprintf(stdout, "%6.2f ", sim);
                    }
-                    LOG("%1.10s", prompts[i].c_str());
-                    LOG("\n");
+                    fprintf(stdout, "%1.10s", prompts[i].c_str());
+                    fprintf(stdout, "\n");
                }
            }
        }
@@ -271,43 +272,43 @@ int main(int argc, char ** argv) {
    if (params.embd_out == "json" || params.embd_out == "json+" || params.embd_out == "array") {
        const bool notArray = params.embd_out != "array";

-        LOG(notArray ? "{\n  \"object\": \"list\",\n  \"data\": [\n" : "[");
+        fprintf(stdout, notArray ? "{\n  \"object\": \"list\",\n  \"data\": [\n" : "[");
        for (int j = 0;;) { // at least one iteration (one prompt)
-            if (notArray) LOG("    {\n      \"object\": \"embedding\",\n      \"index\": %d,\n      \"embedding\": ",j);
-            LOG("[");
+            if (notArray) fprintf(stdout, "    {\n      \"object\": \"embedding\",\n      \"index\": %d,\n      \"embedding\": ",j);
+            fprintf(stdout, "[");
            for (int i = 0;;) { // at least one iteration (n_embd > 0)
-                LOG(params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
+                fprintf(stdout, params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
                i++;
-                if (i < n_embd) LOG(","); else break;
+                if (i < n_embd) fprintf(stdout, ","); else break;
            }
-            LOG(notArray ? "]\n    }" : "]");
+            fprintf(stdout, notArray ? "]\n    }" : "]");
            j++;
-            if (j < n_embd_count) LOG(notArray ? ",\n" : ","); else break;
+            if (j < n_embd_count) fprintf(stdout, notArray ? ",\n" : ","); else break;
        }
-        LOG(notArray ? "\n  ]" : "]\n");
+        fprintf(stdout, notArray ? "\n  ]" : "]\n");

        if (params.embd_out == "json+" && n_prompts > 1) {
-            LOG(",\n  \"cosineSimilarity\": [\n");
+            fprintf(stdout, ",\n  \"cosineSimilarity\": [\n");
            for (int i = 0;;) { // at least two iteration (n_embd_count > 1)
-                LOG("    [");
+                fprintf(stdout, "    [");
                for (int j = 0;;) { // at least two iteration (n_embd_count > 1)
                    float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
-                    LOG("%6.2f", sim);
+                    fprintf(stdout, "%6.2f", sim);
                    j++;
-                    if (j < n_embd_count) LOG(", "); else break;
+                    if (j < n_embd_count) fprintf(stdout, ", "); else break;
                }
-                LOG(" ]");
+                fprintf(stdout, " ]");
                i++;
-                if (i < n_embd_count) LOG(",\n"); else break;
+                if (i < n_embd_count) fprintf(stdout, ",\n"); else break;
            }
-            LOG("\n  ]");
+            fprintf(stdout, "\n  ]");
        }

-        if (notArray) LOG("\n}\n");
+        if (notArray) fprintf(stdout, "\n}\n");
    }

-    LOG("\n");
-    llama_perf_context_print(ctx);
+    LOG_TEE("\n");
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);

    // clean up
    llama_batch_free(batch);
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -1,11 +1,12 @@
 #include "arg.h"
 #include "common.h"
-#include "log.h"
 #include "llama.h"
 #include "ggml.h"

 #include <cstdio>
+#include <random>
 #include <string>
+#include <tuple>
 #include <vector>

 /**
@@ -31,22 +32,22 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
    GGML_ASSERT(n > 0);
    float sum = 0;
    for (int64_t i3 = 0; i3 < ne[3]; i3++) {
-        LOG("                                     [\n");
+        printf("                                     [\n");
        for (int64_t i2 = 0; i2 < ne[2]; i2++) {
            if (i2 == n && ne[2] > 2*n) {
-                LOG("                                      ..., \n");
+                printf("                                      ..., \n");
                i2 = ne[2] - n;
            }
-            LOG("                                      [\n");
+            printf("                                      [\n");
            for (int64_t i1 = 0; i1 < ne[1]; i1++) {
                if (i1 == n && ne[1] > 2*n) {
-                    LOG("                                       ..., \n");
+                    printf("                                       ..., \n");
                    i1 = ne[1] - n;
                }
-                LOG("                                       [");
+                printf("                                       [");
                for (int64_t i0 = 0; i0 < ne[0]; i0++) {
                    if (i0 == n && ne[0] > 2*n) {
-                        LOG("..., ");
+                        printf("..., ");
                        i0 = ne[0] - n;
                    }
                    size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
@@ -64,16 +65,16 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
                    } else {
                        GGML_ABORT("fatal error");
                    }
-                    LOG("%12.4f", v);
+                    printf("%12.4f", v);
                    sum += v;
-                    if (i0 < ne[0] - 1) LOG(", ");
+                    if (i0 < ne[0] - 1) printf(", ");
                }
-                LOG("],\n");
+                printf("],\n");
            }
-            LOG("                                      ],\n");
+            printf("                                      ],\n");
        }
-        LOG("                                     ]\n");
-        LOG("                                     sum = %f\n", sum);
+        printf("                                     ]\n");
+        printf("                                     sum = %f\n", sum);
    }
 }

@@ -102,11 +103,11 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
        snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
    }

-    LOG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
-         t->name, ggml_type_name(t->type), ggml_op_desc(t),
-         src0->name, ggml_ne_string(src0).c_str(),
-         src1 ? src1_str : "",
-         ggml_ne_string(t).c_str());
+    printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
+           t->name, ggml_type_name(t->type), ggml_op_desc(t),
+           src0->name, ggml_ne_string(src0).c_str(),
+           src1 ? src1_str : "",
+           ggml_ne_string(t).c_str());


    // copy the data from the GPU memory if needed
@@ -132,7 +133,7 @@ static bool run(llama_context * ctx, const gpt_params & params) {
    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);

    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
-        LOG_ERR("%s : failed to eval\n", __func__);
+        fprintf(stderr, "%s : failed to eval\n", __func__);
        return false;
    }

@@ -148,7 +149,7 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    gpt_init();
+    print_build_info();

    llama_backend_init();
    llama_numa_init(params.numa);
@@ -165,15 +166,14 @@ int main(int argc, char ** argv) {
    llama_model * model = llama_init.model;
    llama_context * ctx = llama_init.context;
    if (model == nullptr || ctx == nullptr) {
-        LOG_ERR("%s : failed to init\n", __func__);
+        fprintf(stderr, "%s : failed to init\n", __func__);
        return 1;
    }

    // print system information
    {
-        LOG_INF("\n");
-        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
-        LOG_INF("\n");
+        fprintf(stderr, "\n");
+        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
    }

    bool OK = run(ctx, params);
@@ -181,8 +181,8 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    LOG("\n");
-    llama_perf_context_print(ctx);
+    LOG_TEE("\n");
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);

    llama_free(ctx);
    llama_free_model(model);
--- a/examples/export-lora/export-lora.cpp
+++ b/examples/export-lora/export-lora.cpp
@@ -406,7 +406,7 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    g_verbose = (params.verbosity > 1);
+    g_verbose = (params.verbosity == 1);
    try {
        lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads);
        ctx.run_merge();
--- a/examples/gguf-split/gguf-split.cpp
+++ b/examples/gguf-split/gguf-split.cpp
@@ -152,7 +152,7 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p
        throw std::invalid_argument("error: invalid parameter for argument: " + arg);
    }

-    if (argc - arg_idx != 2) {
+    if (argc - arg_idx < 2) {
        throw std::invalid_argument("error: bad arguments");
    }

@@ -389,17 +389,10 @@ static void gguf_merge(const split_params & split_params) {
    int n_split = 1;
    int total_tensors = 0;

-    // avoid overwriting existing output file
-    if (std::ifstream(split_params.output.c_str())) {
-        fprintf(stderr, "%s: output file %s already exists\n", __func__, split_params.output.c_str());
-        exit(EXIT_FAILURE);
-    }
-
+    auto * ctx_out = gguf_init_empty();
    std::ofstream fout(split_params.output.c_str(), std::ios::binary);
    fout.exceptions(std::ofstream::failbit); // fail fast on write errors

-    auto * ctx_out = gguf_init_empty();
-
    std::vector<uint8_t> read_data;
    std::vector<ggml_context *> ctx_metas;
    std::vector<gguf_context *> ctx_ggufs;
--- a/examples/gritlm/gritlm.cpp
+++ b/examples/gritlm/gritlm.cpp
@@ -158,8 +158,6 @@ int main(int argc, char * argv[]) {
        return 1;
    }

-    gpt_init();
-
    llama_model_params mparams = llama_model_params_from_gpt_params(params);
    llama_context_params cparams = llama_context_params_from_gpt_params(params);

--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -1,6 +1,5 @@
 #include "arg.h"
 #include "common.h"
-#include "log.h"
 #include "llama.h"

 #include <cmath>
@@ -20,12 +19,12 @@
 #endif

 static void print_usage(int, char ** argv) {
-    LOG("\nexample usage:\n");
-    LOG("\n    %s \\\n"
-            "       -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] \\\n"
+    LOG_TEE("\nexample usage:\n");
+    LOG_TEE("\n    %s \\\n"
+            "       -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n"
            "       [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
            "       [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]);
-    LOG("\n");
+    LOG_TEE("\n");
 }

 struct Stats {
@@ -126,10 +125,12 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
            e.counts.resize(src1->ne[0]*n_as, 0);
        }
        else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
-            LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
+            fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
            exit(1); //GGML_ABORT("fatal error");
        }
-        LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
+        if (m_params.verbosity > 1) {
+            printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
+        }
        // loop over all possible experts, regardless if they are used or not in the batch
        for (int ex = 0; ex < n_as; ++ex) {
            size_t e_start = ex*src1->ne[0];
@@ -150,8 +151,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
                        e.values[e_start + j] += x[j]*x[j];
                        e.counts[e_start + j]++;
                        if (!std::isfinite(e.values[e_start + j])) {
-                            LOG("\n");
-                            LOG_ERR("%f detected in %s\n", e.values[e_start + j], wname.c_str());
+                            fprintf(stderr, "%f detected in %s\n", e.values[e_start + j], wname.c_str());
                            exit(1);
                        }
                    }
@@ -174,18 +174,20 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
            e.counts.resize(src1->ne[0], 0);
        }
        else if (e.values.size() != (size_t)src1->ne[0]) {
-            LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
+            fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
            exit(1); //GGML_ABORT("fatal error");
        }
        ++e.ncall;
-        LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
+        if (m_params.verbosity > 1) {
+            printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
+        }
        for (int row = 0; row < (int)src1->ne[1]; ++row) {
            const float * x = data + row * src1->ne[0];
            for (int j = 0; j < (int)src1->ne[0]; ++j) {
                e.values[j] += x[j]*x[j];
                e.counts[j]++;
                if (!std::isfinite(e.values[j])) {
-                    LOG_ERR("%f detected in %s\n", e.values[j], wname.c_str());
+                    fprintf(stderr, "%f detected in %s\n", e.values[j], wname.c_str());
                    exit(1);
                }
            }
@@ -237,17 +239,17 @@ void IMatrixCollector::save_imatrix(int ncall) const {
        }

        if (n_zeros != 0 && is_first) {
-            LOG_INF("\n");
+            fprintf(stderr, "\n");
            is_first = false;
        }

        if (n_zeros == n_all) {
-            LOG_WRN("%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
+            fprintf(stderr, "%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
            continue;
        }

        if (n_zeros > 0) {
-            LOG_WRN("%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
+            fprintf(stderr, "%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
            continue;
        }

@@ -256,7 +258,7 @@ void IMatrixCollector::save_imatrix(int ncall) const {
    }

    if (to_store.size() < m_stats.size()) {
-        LOG_WRN("%s: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
+        fprintf(stderr, "%s: warning: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
    }

    std::ofstream out(fname, std::ios::binary);
@@ -288,20 +290,21 @@ void IMatrixCollector::save_imatrix(int ncall) const {
        out.write(m_params.prompt_file.c_str(), len);
    }

-    LOGV(1, "\n");
-    LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
+    if (m_params.verbosity > 0) {
+        fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
+    }
 }

 bool IMatrixCollector::load_imatrix(const char * fname) {
    std::ifstream in(fname, std::ios::binary);
    if (!in) {
-        LOG_ERR("%s: failed to open %s\n",__func__, fname);
+        printf("%s: failed to open %s\n",__func__, fname);
        return false;
    }
    int n_entries;
    in.read((char*)&n_entries, sizeof(n_entries));
    if (in.fail() || n_entries < 1) {
-        LOG_ERR("%s: no data in file %s\n", __func__, fname);
+        printf("%s: no data in file %s\n", __func__, fname);
        return false;
    }
    for (int i = 0; i < n_entries; ++i) {
@@ -309,7 +312,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
        std::vector<char> name_as_vec(len+1);
        in.read((char *)name_as_vec.data(), len);
        if (in.fail()) {
-            LOG_ERR("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
+            printf("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
            return false;
        }
        name_as_vec[len] = 0;
@@ -320,7 +323,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
        int nval;
        in.read((char *)&nval, sizeof(nval));
        if (in.fail() || nval < 1) {
-            LOG_ERR("%s: failed reading number of values for entry %d\n",__func__,i);
+            printf("%s: failed reading number of values for entry %d\n",__func__,i);
            m_stats = {};
            return false;
        }
@@ -333,7 +336,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
        std::vector<float> tmp(nval);
        in.read((char*)tmp.data(), nval*sizeof(float));
        if (in.fail()) {
-            LOG_ERR("%s: failed reading data for entry %d\n",__func__,i);
+            printf("%s: failed reading data for entry %d\n",__func__,i);
            m_stats = {};
            return false;
        }
@@ -434,25 +437,26 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
    const int n_ctx = llama_n_ctx(ctx);

    auto tim1 = std::chrono::high_resolution_clock::now();
-    LOG_INF("%s: tokenizing the input ..\n", __func__);
+    fprintf(stderr, "%s: tokenizing the input ..\n", __func__);

    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);

    auto tim2 = std::chrono::high_resolution_clock::now();
-    LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
+    fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());

    if (params.i_chunk > 0) {
        if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) {
-            LOG_ERR("%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
+            fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
            return false;
        }
-        LOG_INF("%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
+        fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
        tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx);
    }

    if (int(tokens.size()) < 2*n_ctx) {
-        LOG_ERR("%s: you need at least %d tokens for a context of %d tokens\n", __func__, 2*n_ctx, n_ctx);
-        LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n", __func__, tokens.size());
+        fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n",__func__,2*n_ctx,
+                n_ctx);
+        fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
        return false;
    }

@@ -474,7 +478,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
    double nll = 0.0;
    double nll2 = 0.0;

-    LOG_INF("%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch);
+    fprintf(stderr, "%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch);

    std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);

@@ -510,7 +514,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {

            // TODO: use batch.logits to save computations instead of relying on logits_all == true
            if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
-                LOG_ERR("%s : failed to eval\n", __func__);
+                fprintf(stderr, "%s : failed to eval\n", __func__);
                return false;
            }

@@ -527,29 +531,29 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {

        if (i == 0) {
            const float t_total = std::chrono::duration<float>(t_end - t_start).count();
-            LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
+            fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
            int total_seconds = (int)(t_total * n_chunk);
            if (total_seconds >= 60*60) {
-                LOG("%d hours ", total_seconds / (60*60));
+                fprintf(stderr, "%d hours ", total_seconds / (60*60));
                total_seconds = total_seconds % (60*60);
            }
-            LOG("%.2f minutes\n", total_seconds / 60.0);
+            fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
        }

        if (params.compute_ppl) {
            const int first = n_ctx/2;
-            const auto * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
+            const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
            process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
                    workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
            count += n_ctx - first - 1;

-            LOG("[%d]%.4lf,", i + 1, std::exp(nll / count));
+            printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
            fflush(stdout);

            logits.clear();
        }
    }
-    LOG("\n");
+    printf("\n");

    if (params.compute_ppl) {
        nll2 /= count;
@@ -558,9 +562,9 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
        nll2 -= nll * nll;
        if (nll2 > 0) {
            nll2 = sqrt(nll2/(count-1));
-            LOG("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
+            printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
        } else {
-            LOG("Unexpected negative standard deviation of log(prob)\n");
+            printf("Unexpected negative standard deviation of log(prob)\n");
        }
    }

@@ -572,27 +576,26 @@ int main(int argc, char ** argv) {

    params.n_ctx = 512;
    params.logits_all = true;
+    params.verbosity = 1;

    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) {
        return 1;
    }

-    gpt_init();
-
    params.n_batch = std::min(params.n_batch, params.n_ctx);

    g_collector.set_params(params);

    for (const auto & in_file : params.in_files) {
-        LOG_INF("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
+        printf("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
        if (!g_collector.load_imatrix(in_file.c_str())) {
-            LOG_ERR("%s : failed to load %s\n", __func__, in_file.c_str());
+            fprintf(stderr, "%s : failed to load %s\n", __func__, in_file.c_str());
            return 1;
        }
    }

    if (params.in_files.size() > 1) {
-        LOG_INF("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
+        printf("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
        g_collector.save_imatrix();
    }

@@ -611,20 +614,20 @@ int main(int argc, char ** argv) {
    llama_model * model = llama_init.model;
    llama_context * ctx = llama_init.context;
    if (model == nullptr || ctx == nullptr) {
-        LOG_ERR("%s : failed to init\n", __func__);
+        fprintf(stderr, "%s : failed to init\n", __func__);
        return 1;
    }

    const int n_ctx_train = llama_n_ctx_train(model);
    if (params.n_ctx > n_ctx_train) {
-        LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
+        fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
                __func__, n_ctx_train, params.n_ctx);
    }

    // print system information
    {
-        LOG_INF("\n");
-        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+        fprintf(stderr, "\n");
+        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
    }

    if (!compute_imatrix(ctx, params)) {
@@ -633,8 +636,8 @@ int main(int argc, char ** argv) {

    g_collector.save_imatrix();

-    LOG("\n");
-    llama_perf_context_print(ctx);
+    LOG_TEE("\n");
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);

    llama_free(ctx);
    llama_free_model(model);
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -2,7 +2,6 @@
 #include "common.h"
 #include "console.h"
 #include "sampling.h"
-#include "log.h"
 #include "llama.h"

 #include <cassert>
@@ -56,7 +55,7 @@ static void write_logfile(

    const bool success = fs_create_directory_with_parents(params.logdir);
    if (!success) {
-        LOG_ERR("%s: warning: failed to create logdir %s, cannot write logfile\n",
+        fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
                __func__, params.logdir.c_str());
        return;
    }
@@ -65,7 +64,7 @@ static void write_logfile(
    FILE * logfile = fopen(logfile_path.c_str(), "w");

    if (logfile == NULL) {
-        LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
+        fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
        return;
    }

@@ -94,7 +93,7 @@ static void sigint_handler(int signo) {
            is_interacting = true;
        } else {
            console::cleanup();
-            LOG("\n");
+            printf("\n");
            gpt_perf_print(*g_ctx, *g_smpl);
            write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
            _exit(130);
@@ -111,51 +110,58 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    gpt_init();
-
    auto & sparams = params.sparams;

+#ifndef LOG_DISABLE_LOGS
+    log_set_target(log_filename_generator("infill", "log"));
+    LOG_TEE("Log start\n");
+    log_dump_cmdline(argc, argv);
+#endif // LOG_DISABLE_LOGS
+
    console::init(params.simple_io, params.use_color);
    atexit([]() { console::cleanup(); });

    if (params.logits_all) {
-        LOG_ERR("\n************\n");
-        LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
-        LOG_ERR("************\n\n");
+        printf("\n************\n");
+        printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
+        printf("************\n\n");

        return 0;
    }

    if (params.embedding) {
-        LOG_ERR("\n************\n");
-        LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
-        LOG_ERR("************\n\n");
+        printf("\n************\n");
+        printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
+        printf("************\n\n");

        return 0;
    }

    if (params.n_ctx != 0 && params.n_ctx < 8) {
-        LOG_WRN("%s: minimum context size is 8, using minimum size.\n", __func__);
+        LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
        params.n_ctx = 8;
    }
-
    if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
-        LOG_ERR("\n************\n");
-        LOG_ERR("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
-        LOG_ERR("************\n\n");
+        printf("\n************\n");
+        printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
+        printf("************\n\n");

        return 0;
    }

    if (params.rope_freq_base != 0.0) {
-        LOG_WRN("%s: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
+        LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
    }

    if (params.rope_freq_scale != 0.0) {
-        LOG_WRN("%s: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
+        LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
    }

-    LOG_INF("%s: llama backend init\n", __func__);
+    print_build_info();
+
+    LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
+
+    LOG("%s: llama backend init\n", __func__);
    llama_backend_init();
    llama_numa_init(params.numa);

@@ -168,32 +174,34 @@ int main(int argc, char ** argv) {
    g_smpl = &smpl;

    // load the model and apply lora adapter, if any
-    LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
+    LOG("%s: load the model and apply lora adapter, if any\n", __func__);
    llama_init_result llama_init = llama_init_from_gpt_params(params);

    model = llama_init.model;
    ctx = llama_init.context;

    if (model == NULL) {
-        LOG_ERR("%s: unable to load model\n", __func__);
+        LOG_TEE("%s: error: unable to load model\n", __func__);
        return 1;
    }

    const int n_ctx_train = llama_n_ctx_train(model);
    const int n_ctx = llama_n_ctx(ctx);
-    LOG_DBG("n_ctx: %d\n", n_ctx);
+    LOG("n_ctx: %d\n", n_ctx);

    if (n_ctx > n_ctx_train) {
-        LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
+        LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
+                __func__, n_ctx_train, n_ctx);
    }

    // print system information
    {
-        LOG_INF("\n");
-        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+        LOG_TEE("\n");
+        LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
    }
    const bool add_bos = llama_add_bos_token(model);
    GGML_ASSERT(!llama_add_eos_token(model));
+    LOG("add_bos: %d\n", add_bos);

    std::vector<llama_token> embd_inp;
    std::vector<llama_token> embd_end;
@@ -218,19 +226,18 @@ int main(int argc, char ** argv) {
        embd_inp.push_back(middle_token);
    }

-    LOG_DBG("add_bos: %d\n", add_bos);
-    LOG_DBG("prefix: \"%s\"\n", params.input_prefix.c_str());
-    LOG_DBG("suffix: \"%s\"\n", params.input_suffix.c_str());
-    LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
+    LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix));
+    LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix));
+    LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());

    // Should not run without any tokens
    if (embd_inp.empty()) {
        embd_inp.push_back(llama_token_bos(model));
-        LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
+        LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
    }

    if ((int) embd_inp.size() > n_ctx - 4) {
-        LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
+        LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
        return 1;
    }

@@ -239,8 +246,9 @@ int main(int argc, char ** argv) {
        params.n_keep = (int)embd_inp.size();
    }

-    LOG_INF("inp_pfx: %s\n", string_from(ctx, inp_pfx).c_str());
-    LOG_INF("inp_sfx: %s\n", string_from(ctx, inp_sfx).c_str());
+    LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
+    LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
+

    // enable interactive mode if interactive start is specified
    if (params.interactive_first) {
@@ -248,21 +256,21 @@ int main(int argc, char ** argv) {
    }

    if (params.verbose_prompt) {
-        LOG_INF("\n");
-        LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
-        LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+        LOG_TEE("\n");
+        LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
+        LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
        for (int i = 0; i < (int) embd_inp.size(); i++) {
-            LOG_INF("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
+            LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
        }

        if (params.n_keep > 0) {
-        LOG_INF("%s: static prompt based on n_keep: '", __func__);
+        LOG_TEE("%s: static prompt based on n_keep: '", __func__);
            for (int i = 0; i < params.n_keep; i++) {
-                LOG("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
+                LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
            }
-            LOG("'\n");
+            LOG_TEE("'\n");
        }
-        LOG_INF("\n");
+        LOG_TEE("\n");
    }

    if (params.interactive) {
@@ -279,30 +287,25 @@ int main(int argc, char ** argv) {
        SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
 #endif

-        LOG_INF("%s: interactive mode on.\n", __func__);
+        LOG_TEE("%s: interactive mode on.\n", __func__);

        if (params.input_prefix_bos) {
-            LOG_INF("Input prefix with BOS\n");
+            LOG_TEE("Input prefix with BOS\n");
        }

        if (!params.input_prefix.empty()) {
-            LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str());
+            LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
        }

        if (!params.input_suffix.empty()) {
-            LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
+            LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
        }
    }
-    smpl = gpt_sampler_init(model, sparams);
+    LOG_TEE("sampling: \n%s\n", sparams.print().c_str());
+    LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
+    LOG_TEE("\n\n");

-    LOG_INF("sampler seed: %u\n",     gpt_sampler_get_seed(smpl));
-    LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
-    LOG_INF("sampler chain: %s\n",    gpt_sampler_print(smpl).c_str());
-
-    LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
-
-    LOG("\n");
-    LOG("\n#####  Infill mode  #####\n\n");
+    LOG_TEE("\n#####  Infill mode  #####\n\n");
    if (params.interactive) {
        const char *control_message;
        if (params.multiline_input) {
@@ -313,11 +316,11 @@ int main(int argc, char ** argv) {
                              " - To return control without starting a new line, end your input with '/'.\n"
                              " - If you want to submit another line, end your input with '\\'.\n";
        }
-        LOG("== Running in interactive mode. ==\n");
+        LOG_TEE("== Running in interactive mode. ==\n");
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
-        LOG(       " - Press Ctrl+C to interject at any time.\n");
+        LOG_TEE(       " - Press Ctrl+C to interject at any time.\n");
 #endif
-        LOG(       "%s\n", control_message);
+        LOG_TEE(       "%s\n", control_message);

        is_interacting = params.interactive_first;
    }
@@ -337,6 +340,8 @@ int main(int argc, char ** argv) {

    std::vector<llama_token> embd;

+    smpl = gpt_sampler_init(model, sparams);
+
    while (n_remain != 0 || params.interactive) {
        // predict
        if (!embd.empty()) {
@@ -350,8 +355,9 @@ int main(int argc, char ** argv) {
                embd.resize(max_embd_size);

                console::set_display(console::error);
-                LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
+                printf("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
                console::set_display(console::reset);
+                fflush(stdout);
            }

            // infinite text generation via context swapping
@@ -360,14 +366,14 @@ int main(int argc, char ** argv) {
            // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
            if (n_past + (int) embd.size() > n_ctx) {
                if (params.n_predict == -2) {
-                    LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
+                    LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
                    break;
                }

                const int n_left    = n_past - params.n_keep - 1;
                const int n_discard = n_left/2;

-                LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
+                LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
                    n_past, n_left, n_ctx, params.n_keep, n_discard);

                llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1            , params.n_keep + n_discard + 1);
@@ -375,9 +381,9 @@ int main(int argc, char ** argv) {

                n_past -= n_discard;

-                LOG_DBG("after swap: n_past = %d\n", n_past);
+                LOG("after swap: n_past = %d\n", n_past);

-                LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
+                LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());

            }

@@ -389,16 +395,16 @@ int main(int argc, char ** argv) {
                    n_eval = params.n_batch;
                }

-                LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
+                LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());

                if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
-                    LOG_ERR("%s : failed to eval\n", __func__);
+                    LOG_TEE("%s : failed to eval\n", __func__);
                    return 1;
                }

                n_past += n_eval;

-                LOG_DBG("n_past = %d\n", n_past);
+                LOG("n_past = %d\n", n_past);
            }

        }
@@ -410,7 +416,7 @@ int main(int argc, char ** argv) {

            gpt_sampler_accept(smpl, id, true);

-            // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
+            // LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, smpl->prev.to_vector()).c_str());

            embd.push_back(id);

@@ -420,10 +426,10 @@ int main(int argc, char ** argv) {
            // decrement remaining sampling budget
            --n_remain;

-            LOG_DBG("n_remain: %d\n", n_remain);
+            LOG("n_remain: %d\n", n_remain);
        } else {
            // some user input remains from prompt or interaction, forward it to processing
-            LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
+            LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
            while ((int) embd_inp.size() > n_consumed) {
                embd.push_back(embd_inp[n_consumed]);

@@ -442,7 +448,7 @@ int main(int argc, char ** argv) {
        if (input_echo) {
            for (auto id : embd) {
                const std::string token_str = llama_token_to_piece(ctx, id);
-                LOG("%s", token_str.c_str());
+                printf("%s", token_str.c_str());

                if (embd.size() > 1) {
                    input_tokens.push_back(id);
@@ -451,6 +457,7 @@ int main(int argc, char ** argv) {
                    output_ss << token_str;
                }
            }
+            fflush(stdout);
        }
        // reset color to default if we there is no pending user input
        if (input_echo && (int) embd_inp.size() == n_consumed) {
@@ -463,9 +470,10 @@ int main(int argc, char ** argv) {
            if ((gpt_sampler_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){
                if (is_interacting && !params.interactive_first) {
                    // print an eot token
-                    LOG("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
+                    printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
                }
-                LOG("\n");
+                fflush(stdout);
+                printf("\n");
                console::set_display(console::user_input);
                std::string buffer;
                std::string line;
@@ -521,33 +529,35 @@ int main(int argc, char ** argv) {
                n_remain = params.n_predict;
                n_past = 0;
                n_consumed = 0;
+                // LOG_TEE("took new input\n");
                is_interacting = false;
            }
            // deal with end of generation tokens in interactive mode
            else if (llama_token_is_eog(model, gpt_sampler_last(smpl))) {
-                LOG_DBG("found EOS token\n");
+                LOG("found EOS token\n");

                if (params.interactive) {

                    is_interacting = true;
-                    LOG("\n");
+                    printf("\n");
                    console::set_display(console::user_input);
+                    fflush(stdout);
               }
            }

            if (n_past > 0 && is_interacting && !params.interactive) {
-                LOG_DBG("waiting for user input\n");
+                LOG("waiting for user input\n");

                if (params.input_prefix_bos) {
-                    LOG_DBG("adding input prefix BOS token\n");
+                    LOG("adding input prefix BOS token\n");
                    embd_inp.push_back(llama_token_bos(model));
                }

                std::string buffer;
                if (!params.input_prefix.empty()) {
-                    LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
+                    LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
                    buffer += params.input_prefix;
-                    LOG("%s", buffer.c_str());
+                    printf("%s", buffer.c_str());
                }

                std::string line;
@@ -565,17 +575,17 @@ int main(int argc, char ** argv) {
                if (buffer.length() > 1) {
                    // append input suffix if any
                    if (!params.input_suffix.empty()) {
-                        LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
+                        LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
                        buffer += params.input_suffix;
-                        LOG("%s", params.input_suffix.c_str());
+                        printf("%s", params.input_suffix.c_str());
                    }

-                    LOG_DBG("buffer: '%s'\n", buffer.c_str());
+                    LOG("buffer: '%s'\n", buffer.c_str());

                    const size_t original_size = embd_inp.size();

                    const auto line_inp = ::llama_tokenize(ctx, buffer, false);
-                    LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
+                    LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());

                    embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());

@@ -586,9 +596,9 @@ int main(int argc, char ** argv) {
                    }

                    n_remain -= line_inp.size();
-                    LOG_DBG("n_remain: %d\n", n_remain);
+                    LOG("n_remain: %d\n", n_remain);
                } else {
-                    LOG_DBG("empty line, passing control back\n");
+                    LOG("empty line, passing control back\n");
                }

                input_echo = false; // do not echo this again
@@ -615,10 +625,11 @@ int main(int argc, char ** argv) {
        }
    }
    if (!params.interactive && n_remain <= 0) {
-        LOG("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
+        printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
+        fflush(stdout);
    }

-    LOG("\n");
+    LOG_TEE("\n");
    gpt_perf_print(ctx, smpl);
    write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);

@@ -628,5 +639,9 @@ int main(int argc, char ** argv) {
    gpt_sampler_free(smpl);
    llama_backend_free();

+#ifndef LOG_DISABLE_LOGS
+    LOG_TEE("Log end\n");
+#endif // LOG_DISABLE_LOGS
+
    return 0;
 }
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -1630,7 +1630,7 @@ int main(int argc, char ** argv) {
            fflush(p_err->fout);
        }

-        llama_perf_context_print(ctx);
+        llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);

        llama_free(ctx);

--- a/examples/llava/MobileVLM-README.md
+++ b/examples/llava/MobileVLM-README.md
@@ -39,7 +39,7 @@ python ./examples/llava/llava_surgery.py -m path/to/MobileVLM-1.7B
 3. Use `convert_image_encoder_to_gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:

 ```sh
-python ./examples/llava/convert_image_encoder_to_gguf.py \
+python ./examples/llava/convert_image_encoder_to_gguf.py \
    -m path/to/clip-vit-large-patch14-336 \
    --llava-projector path/to/MobileVLM-1.7B/llava.projector \
    --output-dir path/to/MobileVLM-1.7B \
@@ -47,7 +47,7 @@ python ./examples/llava/convert_image_encoder_to_gguf.py \
 ```

 ```sh
-python ./examples/llava/convert_image_encoder_to_gguf.py \
+python ./examples/llava/convert_image_encoder_to_gguf.py \
    -m path/to/clip-vit-large-patch14-336 \
    --llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \
    --output-dir path/to/MobileVLM-1.7B_V2 \
@@ -57,12 +57,12 @@ python ./examples/llava/convert_image_encoder_to_gguf.py \
 4. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF:

 ```sh
-python ./examples/convert_legacy_llama.py path/to/MobileVLM-1.7B --skip-unknown
+python ./examples/convert_legacy_llama.py path/to/MobileVLM-1.7B
 ```

-5. Use `quantize` to convert LLaMA part's DataType from `fp32` to `q4_k`
+5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k`
 ```sh
-./llama-quantize path/to/MobileVLM-1.7B/ggml-model-F32.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
+./llama-quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
 ```

 Now both the LLaMA part and the image encoder is in the `MobileVLM-1.7B` directory.
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -3,6 +3,7 @@
 // I'll gradually clean and extend it
 // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
 #include "clip.h"
+#include "log.h"
 #include "ggml.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
@@ -39,11 +40,6 @@
 #include <cinttypes>
 #include <limits>

-#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
-#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
-#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
-#define LOG_DBG(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
-
 //#define CLIP_DEBUG_FUNCTIONS

 // RGB uint8 image
@@ -169,7 +165,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
 static int get_key_idx(const gguf_context * ctx, const char * key) {
    int i = gguf_find_key(ctx, key);
    if (i == -1) {
-        LOG_ERR("key %s not found in file\n", key);
+        LOG_TEE("key %s not found in file\n", key);
        throw std::runtime_error(format("Missing required key: %s", key));
    }

@@ -274,7 +270,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {

 static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") {
    size_t tensor_size = ggml_nbytes(tensor);
-    LOG_INF("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
+    LOG_TEE("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
            prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
            tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type));
 }
@@ -292,7 +288,7 @@ static projector_type clip_projector_type_from_string(const std::string & name)
 static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
    std::ofstream file(filename, std::ios::binary);
    if (!file.is_open()) {
-        LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
+        LOG_TEE("Failed to open file for writing: %s\n", filename.c_str());
        return;
    }

@@ -311,7 +307,7 @@ static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::s
 static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
    std::ofstream file(filename, std::ios::binary);
    if (!file.is_open()) {
-        LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
+        LOG_TEE("Failed to open file for writing: %s\n", filename.c_str());
        return;
    }

@@ -572,7 +568,7 @@ struct clip_ctx {

 static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
    if (!ctx->has_vision_encoder) {
-        LOG_ERR("This gguf file seems to have no vision encoder\n");
+        LOG_TEE("This gguf file seems to have no vision encoder\n");
        return nullptr;
    }

@@ -586,7 +582,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
        if (load_image_size == nullptr) {
            load_image_size = clip_image_size_init();
        }
-        LOG_DBG("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height);
+        LOG_TEE("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height);
        image_size_width  = load_image_size->width;
        image_size_height = load_image_size->height;
        if (is_inf) {
@@ -1051,21 +1047,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        const int idx_name = gguf_find_key(ctx, KEY_NAME);
        if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug
            const std::string name = gguf_get_val_str(ctx, idx_name);
-            LOG_INF("%s: model name:   %s\n", __func__, name.c_str());
+            LOG_TEE("%s: model name:   %s\n", __func__, name.c_str());
        }
-        LOG_INF("%s: description:  %s\n", __func__, description.c_str());
-        LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
-        LOG_INF("%s: alignment:    %zu\n", __func__, gguf_get_alignment(ctx));
-        LOG_INF("%s: n_tensors:    %d\n", __func__, n_tensors);
-        LOG_INF("%s: n_kv:         %d\n", __func__, n_kv);
-        LOG_INF("%s: ftype:        %s\n", __func__, ftype_str.c_str());
-        LOG_INF("\n");
+        LOG_TEE("%s: description:  %s\n", __func__, description.c_str());
+        LOG_TEE("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
+        LOG_TEE("%s: alignment:    %zu\n", __func__, gguf_get_alignment(ctx));
+        LOG_TEE("%s: n_tensors:    %d\n", __func__, n_tensors);
+        LOG_TEE("%s: n_kv:         %d\n", __func__, n_kv);
+        LOG_TEE("%s: ftype:        %s\n", __func__, ftype_str.c_str());
+        LOG_TEE("\n");
    }
    const int n_tensors = gguf_get_n_tensors(ctx);

    // kv
    const int n_kv = gguf_get_n_kv(ctx);
-    LOG_INF("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
+    LOG_TEE("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
        __func__, n_kv, n_tensors, fname);
    {
        std::map<enum ggml_type, uint32_t> n_type;
@@ -1076,7 +1072,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            n_type[type]++;
        }

-        LOG_INF("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
+        LOG_TEE("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
        for (int i = 0; i < n_kv; i++) {
            const char * name           = gguf_get_key(ctx, i);
            const enum gguf_type type   = gguf_get_kv_type(ctx, i);
@@ -1092,7 +1088,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            }
            replace_all(value, "\n", "\\n");

-            LOG_INF("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
+            LOG_TEE("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
        }

        // print type counts
@@ -1101,7 +1097,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
                continue;
            }

-            LOG_INF("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
+            LOG_TEE("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
        }
    }

@@ -1116,7 +1112,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            size_t tensor_size = ggml_nbytes(cur);
            model_size += tensor_size;
            if (verbosity >= 3) {
-                LOG_INF("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
+                LOG_TEE("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
                       __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
            }
        }
@@ -1143,27 +1139,27 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {

 #ifdef GGML_USE_CUDA
    new_clip->backend = ggml_backend_cuda_init(0);
-    LOG_INF("%s: CLIP using CUDA backend\n", __func__);
+    LOG_TEE("%s: CLIP using CUDA backend\n", __func__);
 #endif

 #ifdef GGML_USE_METAL
    new_clip->backend = ggml_backend_metal_init();
-    LOG_INF("%s: CLIP using Metal backend\n", __func__);
+    LOG_TEE("%s: CLIP using Metal backend\n", __func__);
 #endif

 #ifdef GGML_USE_CANN
    new_clip->backend = ggml_backend_cann_init(0);
-    LOG_INF("%s: CLIP using CANN backend\n", __func__);
+    LOG_TEE("%s: CLIP using CANN backend\n", __func__);
 #endif

 #ifdef GGML_USE_VULKAN
    new_clip->backend = ggml_backend_vk_init(0);
-    LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
+    LOG_TEE("%s: CLIP using Vulkan backend\n", __func__);
 #endif

    if (!new_clip->backend) {
        new_clip->backend = ggml_backend_cpu_init();
-        LOG_INF("%s: CLIP using CPU backend\n", __func__);
+        LOG_TEE("%s: CLIP using CPU backend\n", __func__);
    }

    // model size and capabilities
@@ -1198,16 +1194,16 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        new_clip->use_gelu = gguf_get_val_bool(ctx, idx);

        if (verbosity >= 1) {
-            LOG_INF("%s: text_encoder:   %d\n", __func__, new_clip->has_text_encoder);
-            LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
-            LOG_INF("%s: llava_projector:  %d\n", __func__, new_clip->has_llava_projector);
-            LOG_INF("%s: minicpmv_projector:  %d\n", __func__, new_clip->has_minicpmv_projector);
-            LOG_INF("%s: model size:     %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
-            LOG_INF("%s: metadata size:  %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
+            LOG_TEE("%s: text_encoder:   %d\n", __func__, new_clip->has_text_encoder);
+            LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
+            LOG_TEE("%s: llava_projector:  %d\n", __func__, new_clip->has_llava_projector);
+            LOG_TEE("%s: minicpmv_projector:  %d\n", __func__, new_clip->has_minicpmv_projector);
+            LOG_TEE("%s: model size:     %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
+            LOG_TEE("%s: metadata size:  %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
        }
    }

-    LOG_INF("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
+    LOG_TEE("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);

    // load tensors
    {
@@ -1220,7 +1216,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {

        new_clip->ctx_data = ggml_init(params);
        if (!new_clip->ctx_data) {
-            LOG_ERR("%s: ggml_init() failed\n", __func__);
+            LOG_TEE("%s: ggml_init() failed\n", __func__);
            clip_free(new_clip);
            gguf_free(ctx);
            return nullptr;
@@ -1228,7 +1224,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {

        auto fin = std::ifstream(fname, std::ios::binary);
        if (!fin) {
-            LOG_ERR("cannot open model file for loading tensors\n");
+            LOG_TEE("cannot open model file for loading tensors\n");
            clip_free(new_clip);
            gguf_free(ctx);
            return nullptr;
@@ -1250,7 +1246,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
            fin.seekg(offset, std::ios::beg);
            if (!fin) {
-                LOG_ERR("%s: failed to seek for tensor %s\n", __func__, name);
+                LOG_TEE("%s: failed to seek for tensor %s\n", __func__, name);
                clip_free(new_clip);
                gguf_free(ctx);
                return nullptr;
@@ -1321,23 +1317,23 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        }

        if (verbosity >= 2) {
-            LOG_INF("\n%s: vision model hparams\n", __func__);
-            LOG_INF("image_size         %d\n", hparams.image_size);
-            LOG_INF("patch_size         %d\n", hparams.patch_size);
-            LOG_INF("v_hidden_size      %d\n", hparams.hidden_size);
-            LOG_INF("v_n_intermediate   %d\n", hparams.n_intermediate);
-            LOG_INF("v_projection_dim   %d\n", hparams.projection_dim);
-            LOG_INF("v_n_head           %d\n", hparams.n_head);
-            LOG_INF("v_n_layer          %d\n", hparams.n_layer);
-            LOG_INF("v_eps              %f\n", hparams.eps);
-            LOG_INF("v_image_mean       %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
-            LOG_INF("v_image_std        %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
-            LOG_INF("v_image_grid_pinpoints: ");
+            LOG_TEE("\n%s: vision model hparams\n", __func__);
+            LOG_TEE("image_size         %d\n", hparams.image_size);
+            LOG_TEE("patch_size         %d\n", hparams.patch_size);
+            LOG_TEE("v_hidden_size      %d\n", hparams.hidden_size);
+            LOG_TEE("v_n_intermediate   %d\n", hparams.n_intermediate);
+            LOG_TEE("v_projection_dim   %d\n", hparams.projection_dim);
+            LOG_TEE("v_n_head           %d\n", hparams.n_head);
+            LOG_TEE("v_n_layer          %d\n", hparams.n_layer);
+            LOG_TEE("v_eps              %f\n", hparams.eps);
+            LOG_TEE("v_image_mean       %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
+            LOG_TEE("v_image_std        %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
+            LOG_TEE("v_image_grid_pinpoints: ");
            for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) {
-                LOG_INF("%d ", hparams.image_grid_pinpoints[i]);
+                LOG_TEE("%d ", hparams.image_grid_pinpoints[i]);
            }
-            LOG_INF("\n");
-            LOG_INF("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
+            LOG_TEE("\n");
+            LOG_TEE("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);

        }

@@ -1375,7 +1371,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            vision_model.patch_embeddings    = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
            vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
        } catch(const std::exception& /*e*/) {
-            LOG_ERR("%s: failed to load vision model tensors\n", __func__);
+            LOG_TEE("%s: failed to load vision model tensors\n", __func__);
        }

        // LLaVA projection
@@ -1404,7 +1400,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            } catch (std::runtime_error & /*e*/) { }
            try {
                vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
-                // LOG_INF("%s: image_newline tensor (llava-1.6) found\n", __func__);
+                // LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__);
            } catch (std::runtime_error & /*e*/) { }
        } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
            // MobileVLM projection
@@ -1505,7 +1501,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
        ggml_gallocr_reserve(new_clip->compute_alloc, gf);
        size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
-        LOG_INF("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
+        LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
    }

    return new_clip;
@@ -1556,7 +1552,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
    int nx, ny, nc;
    auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
    if (!data) {
-        LOG_ERR("%s: failed to load image '%s'\n", __func__, fname);
+        LOG_TEE("%s: failed to load image '%s'\n", __func__, fname);
        return false;
    }
    build_clip_img_from_data(data, nx, ny, img);
@@ -1568,7 +1564,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
    int nx, ny, nc;
    auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
    if (!data) {
-        LOG_ERR("%s: failed to decode image bytes\n", __func__);
+        LOG_TEE("%s: failed to decode image bytes\n", __func__);
        return false;
    }
    build_clip_img_from_data(data, nx, ny, img);
@@ -1758,7 +1754,7 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int> & or
        int downscaled_height = static_cast<int>(original_height * scale);
        int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
        int wasted_resolution = (width * height) - effective_resolution;
-        // LOG_INF("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
+        // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
        if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
            max_effective_resolution = effective_resolution;
            min_wasted_resolution = wasted_resolution;
@@ -1876,7 +1872,7 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
    const int multiple = fmin(ceil(ratio), max_slice_nums);

    std::vector<std::vector<clip_image_u8 *>> images;
-    LOG_INF("%s: multiple %d\n", __func__, multiple);
+    LOG_TEE("%s: multiple %d\n", __func__, multiple);
    images.push_back(std::vector<clip_image_u8 *>());

    if (multiple <= 1) {
@@ -1891,17 +1887,17 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
        clip_image_u8 * source_image = clip_image_u8_init();
        bicubic_resize(*img, *source_image, best_size.first, best_size.second);
        // source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
-        LOG_INF("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
+        LOG_TEE("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
        images[images.size()-1].push_back(source_image);

        std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
-        LOG_INF("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
+        LOG_TEE("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);

        auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
        clip_image_u8 * refine_image = clip_image_u8_init();
        bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second);

-        LOG_INF("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
+        LOG_TEE("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);

        // split_to_patches
        int width = refine_image->nx;
@@ -1958,7 +1954,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
        int idx = 0;
        for (size_t i = 0; i < imgs.size(); ++i) {
            for (size_t j = 0; j < imgs[i].size(); ++j) {
-                LOG_DBG("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
+                LOG_TEE("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
                clip_image_f32 * res = clip_image_f32_init();
                normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std);
                res_imgs->data[idx++] = *res;
@@ -1970,7 +1966,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli

    bool pad_to_square = true;
    if (!ctx->has_vision_encoder) {
-        LOG_ERR("This gguf file seems to have no vision encoder\n");
+        LOG_TEE("This gguf file seems to have no vision encoder\n");
        return false;
    }
    auto & params = ctx->vision_model.hparams;
@@ -2047,7 +2043,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
            }

            for (size_t i = 0; i < patches.size(); i++) {
-                // LOG_DBG("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
+                // LOG_TEE("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
                clip_image_u8_free(patches[i]);
            }

@@ -2283,7 +2279,7 @@ static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, co

 bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
    if (!ctx->has_vision_encoder) {
-        LOG_ERR("This gguf file seems to have no vision encoder\n");
+        LOG_TEE("This gguf file seems to have no vision encoder\n");
        return false;
    }

@@ -2295,7 +2291,7 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3

 bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
    if (!ctx->has_vision_encoder) {
-        LOG_ERR("This gguf file seems to have no vision encoder\n");
+        LOG_TEE("This gguf file seems to have no vision encoder\n");
        return false;
    }

@@ -2525,7 +2521,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
            new_type = type;
            if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) {
                new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type
-                // LOG_ERR("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
+                // LOG_TEE("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
            }
            const size_t n_elms = ggml_nelements(cur);
            float * f32_data;
@@ -2544,7 +2540,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
                f32_data = (float *)conv_buf.data();
                break;
            default:
-                LOG_ERR("Please use an input file in f32 or f16\n");
+                LOG_TEE("Please use an input file in f32 or f16\n");
                gguf_free(ctx_out);
                return false;
            }
@@ -2571,7 +2567,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
            fout.put(0);
        }

-        LOG_INF("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
+        LOG_TEE("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
               orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
    }

@@ -2587,8 +2583,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
    gguf_free(ctx_out);

    {
-        LOG_INF("%s: original  size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
-        LOG_INF("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
+        LOG_TEE("%s: original  size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
+        LOG_TEE("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
    }

    return true;
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -10,7 +10,6 @@

 #include <cstdio>
 #include <cstdlib>
-#include <cstring>
 #include <vector>

 static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
@@ -21,7 +20,7 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_toke
            n_eval = n_batch;
        }
        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
-            LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
+            LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
            return false;
        }
        *n_past += n_eval;
@@ -76,7 +75,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip
    size_t img_base64_str_start, img_base64_str_end;
    find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
    if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
-        LOG_ERR("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
+        LOG_TEE("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
        return NULL;
    }

@@ -90,7 +89,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip

    auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
    if (!embed) {
-        LOG_ERR("%s: could not load image from base64 string.\n", __func__);
+        LOG_TEE("%s: could not load image from base64 string.\n", __func__);
        return NULL;
    }

@@ -115,9 +114,9 @@ struct llava_context {
 };

 static void print_usage(int, char ** argv) {
-    LOG("\n example usage:\n");
-    LOG("\n     %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
-    LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
+    LOG_TEE("\n example usage:\n");
+    LOG_TEE("\n     %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+    LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
 }

 static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) {
@@ -127,11 +126,11 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
    auto prompt = params->prompt;
    if (prompt_contains_image(prompt)) {
        if (!params->image.empty()) {
-            LOG_INF("using base64 encoded image instead of command line image path\n");
+            LOG_TEE("using base64 encoded image instead of command line image path\n");
        }
        embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
        if (!embed) {
-            LOG_ERR("%s: can't load image from prompt\n", __func__);
+            LOG_TEE("%s: can't load image from prompt\n", __func__);
            return NULL;
        }
        params->prompt = remove_image_from_prompt(prompt);
@@ -157,18 +156,18 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
        // new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
        system_prompt = prompt.substr(0, image_pos);
        user_prompt = prompt.substr(image_pos + std::string("<image>").length());
-        LOG_INF("system_prompt: %s\n", system_prompt.c_str());
+        LOG_TEE("system_prompt: %s\n", system_prompt.c_str());
        if (params->verbose_prompt) {
            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
            for (int i = 0; i < (int) tmp.size(); i++) {
-                LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+                LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
            }
        }
-        LOG_INF("user_prompt: %s\n", user_prompt.c_str());
+        LOG_TEE("user_prompt: %s\n", user_prompt.c_str());
        if (params->verbose_prompt) {
            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
            for (int i = 0; i < (int) tmp.size(); i++) {
-                LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+                LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
            }
        }
    } else {
@@ -178,7 +177,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
        if (params->verbose_prompt) {
            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
            for (int i = 0; i < (int) tmp.size(); i++) {
-                LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+                LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
            }
        }
    }
@@ -189,11 +188,11 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_

    // generate the response

-    LOG("\n");
+    LOG_TEE("\n");

    struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams);
    if (!smpl) {
-        LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
+        fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
        exit(1);
    }

@@ -203,7 +202,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
        response += tmp;
        if (strcmp(tmp, "</s>") == 0) break;
        if (strstr(tmp, "###")) break; // Yi-VL behavior
-        LOG("%s", tmp);
+        printf("%s", tmp);
        if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
        if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
        if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
@@ -212,7 +211,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
    }

    gpt_sampler_free(smpl);
-    LOG("\n");
+    printf("\n");
 }

 static struct llama_model * llava_init(gpt_params * params) {
@@ -223,7 +222,7 @@ static struct llama_model * llava_init(gpt_params * params) {

    llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
    if (model == NULL) {
-        LOG_ERR("%s: unable to load model\n" , __func__);
+        LOG_TEE("%s: error: unable to load model\n" , __func__);
        return NULL;
    }
    return model;
@@ -246,11 +245,11 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode
    llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);

    if (ctx_llama == NULL) {
-        LOG_ERR("%s: failed to create the llama_context\n" , __func__);
+        LOG_TEE("%s: error: failed to create the llama_context\n" , __func__);
        return NULL;
    }

-    auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
+    auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));

    ctx_llava->ctx_llama = ctx_llama;
    ctx_llava->ctx_clip = ctx_clip;
@@ -269,6 +268,12 @@ static void llava_free(struct llava_context * ctx_llava) {
    llama_backend_free();
 }

+static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
+    (void) level;
+    (void) user_data;
+    LOG_TEE("%s", text);
+}
+
 int main(int argc, char ** argv) {
    ggml_time_init();

@@ -278,45 +283,49 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    gpt_init();
+#ifndef LOG_DISABLE_LOGS
+    log_set_target(log_filename_generator("llava", "log"));
+    LOG_TEE("Log start\n");
+    log_dump_cmdline(argc, argv);
+    llama_log_set(llama_log_callback_logTee, nullptr);
+#endif // LOG_DISABLE_LOGS

    if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
        print_usage(argc, argv);
        return 1;
    }
-
-    auto * model = llava_init(&params);
+    auto model = llava_init(&params);
    if (model == NULL) {
        fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
        return 1;
    }

    if (prompt_contains_image(params.prompt)) {
-        auto * ctx_llava = llava_init_context(&params, model);
+        auto ctx_llava = llava_init_context(&params, model);

-        auto * image_embed = load_image(ctx_llava, &params, "");
+        auto image_embed = load_image(ctx_llava, &params, "");

        // process the prompt
        process_prompt(ctx_llava, image_embed, &params, params.prompt);

-        llama_perf_context_print(ctx_llava->ctx_llama);
+        llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
        llava_image_embed_free(image_embed);
        ctx_llava->model = NULL;
        llava_free(ctx_llava);
    } else {
        for (auto & image : params.image) {
-            auto * ctx_llava = llava_init_context(&params, model);
+            auto ctx_llava = llava_init_context(&params, model);

-            auto * image_embed = load_image(ctx_llava, &params, image);
+            auto image_embed = load_image(ctx_llava, &params, image);
            if (!image_embed) {
-                LOG_ERR("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str());
+                std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
                return 1;
            }

            // process the prompt
            process_prompt(ctx_llava, image_embed, &params, params.prompt);

-            llama_perf_context_print(ctx_llava->ctx_llama);
+            llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
            llava_image_embed_free(image_embed);
            ctx_llava->model = NULL;
            llava_free(ctx_llava);
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -1,23 +1,13 @@
 #include "clip.h"
-#include "llava.h"
-
+#include "common.h"
 #include "llama.h"
+#include "llava.h"
+#include "base64.hpp"

-#include <algorithm>
-#include <cerrno>
 #include <cstdio>
 #include <cstdlib>
-#include <cstring>
-#include <limits>
 #include <vector>
-
-#define die(msg)          do { fputs("error: " msg "\n", stderr);                exit(1); } while (0)
-#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
-
-#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
-#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
-#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
-#define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+#include <numeric>

 // RGB uint8 image
 struct clip_image_u8 {
@@ -64,7 +54,7 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int>& ori
        int downscaled_height = static_cast<int>(original_height * scale);
        int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
        int wasted_resolution = (width * height) - effective_resolution;
-        // LOG_DBG("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
+        // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
        if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
            max_effective_resolution = effective_resolution;
            min_wasted_resolution = wasted_resolution;
@@ -193,7 +183,9 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
    struct ggml_tensor *flatten = ggml_view_2d(model.ctx, permuted_cont, clip_n_mmproj_embd(ctx_clip), num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side,  size_ele * clip_n_mmproj_embd(ctx_clip), 0);
    // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
    ggml_build_forward_expand(gf, flatten);
-    ggml_graph_compute_with_ctx(model.ctx, gf, 1);
+    ggml_graph_prepare(gf, 1, nullptr);
+    ggml_graph_work_init(gf, model.ctx);
+    ggml_graph_compute(gf);
    struct ggml_tensor* result = ggml_graph_node(gf, -1);

    memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
@@ -246,7 +238,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
    img_res_v.size = 0;
    img_res_v.data = nullptr;
    if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
-        LOG_ERR("%s: unable to preprocess image\n", __func__);
+        LOG_TEE("%s: unable to preprocess image\n", __func__);
        delete[] img_res_v.data;
        return false;
    }
@@ -275,14 +267,14 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
                encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
            }
            if (!encoded) {
-                LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
+                LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
                return false;
            }
            const int64_t t_img_enc_steop_batch_us = ggml_time_us();
-            LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
+            LOG_TEE("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
        }
        const int64_t t_img_enc_batch_us = ggml_time_us();
-        LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
+        LOG_TEE("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);

        int n_img_pos_out = 0;
        for (size_t i = 0; i < image_embd_v.size(); i++) {
@@ -297,7 +289,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
        load_image_size->width = img->nx;
        load_image_size->height = img->ny;
        clip_add_load_image_size(ctx_clip, load_image_size);
-        LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
+        LOG_TEE("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
    }
    else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
        // flat / default llava-1.5 type embedding
@@ -305,7 +297,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
        bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
        delete[] img_res_v.data;
        if (!encoded) {
-            LOG_ERR("Unable to encode image\n");
+            LOG_TEE("Unable to encode image\n");

            return false;
        }
@@ -319,12 +311,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
            image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
            const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
            if (!encoded) {
-                LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
+                LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
                return false;
            }
        }
        const int64_t t_img_enc_batch_us = ggml_time_us();
-        LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
+        LOG_TEE("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);

        const int32_t * image_grid = clip_image_grid(ctx_clip);

@@ -357,12 +349,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
        // clip_image_save_to_bmp(*tmp, "image_feature.bmp");
    }

-    LOG_INF("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
+    LOG_TEE("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);

    const int64_t t_img_enc_end_us = ggml_time_us();
    float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;

-    LOG_INF("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
+    LOG_TEE("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);

    return true;
 }
@@ -372,7 +364,7 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx *
    int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
    auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
    if (n_image_embd != n_llama_embd) {
-        LOG_ERR("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
+        LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
        return false;
    }
    return true;
@@ -385,13 +377,13 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co
    }
    float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
    if (!image_embd) {
-        LOG_ERR("Unable to allocate memory for image embeddings\n");
+        LOG_TEE("Unable to allocate memory for image embeddings\n");
        return false;
    }

    int n_img_pos;
    if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) {
-        LOG_ERR("%s: cannot encode image, aborting\n", __func__);
+        LOG_TEE("%s: cannot encode image, aborting\n", __func__);
        free(image_embd);
        return false;
    }
@@ -411,7 +403,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
        }
        llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
        if (llama_decode(ctx_llama, batch)) {
-            LOG_ERR("%s : failed to eval\n", __func__);
+            LOG_TEE("%s : failed to eval\n", __func__);
            return false;
        }
        *n_past += n_eval;
@@ -423,7 +415,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
    clip_image_u8 * img = clip_image_u8_init();
    if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
        clip_image_u8_free(img);
-        LOG_ERR("%s: can't load image from bytes, is it a valid image?", __func__);
+        LOG_TEE("%s: can't load image from bytes, is it a valid image?", __func__);
        return NULL;
    }

@@ -432,7 +424,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
    bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
    if (!image_embed_result) {
        clip_image_u8_free(img);
-        LOG_ERR("%s: coulnd't embed the image\n", __func__);
+        LOG_TEE("%s: coulnd't embed the image\n", __func__);
        return NULL;
    }

@@ -446,7 +438,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
 static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) {
    auto file = fopen(path, "rb");
    if (file == NULL) {
-        LOG_ERR("%s: can't read file %s\n", __func__, path);
+        LOG_TEE("%s: can't read file %s\n", __func__, path);
        return false;
    }

@@ -456,7 +448,7 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long

    auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data
    if (buffer == NULL) {
-        LOG_ERR("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
+        LOG_TEE("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
        perror("Memory allocation error");
        fclose(file);
        return false;
@@ -481,7 +473,7 @@ struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx
    long image_bytes_length;
    auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
    if (!loaded) {
-        LOG_ERR("%s: failed to load %s\n", __func__, image_path);
+        LOG_TEE("%s: failed to load %s\n", __func__, image_path);
        return NULL;
    }

--- a/examples/llava/minicpmv-cli.cpp
+++ b/examples/llava/minicpmv-cli.cpp
@@ -7,12 +7,9 @@
 #include "llama.h"
 #include "ggml.h"

-#include <algorithm>
 #include <cstdio>
 #include <cstdlib>
-#include <cstring>
 #include <vector>
-#include <iostream> // TODO: remove me

 struct llava_context {
    struct clip_ctx * ctx_clip = NULL;
@@ -21,8 +18,14 @@ struct llava_context {
 };

 static void show_additional_info(int /*argc*/, char ** argv) {
-    LOG("\nexample usage:\n\n%s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
-    LOG("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n");
+    LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+    LOG_TEE("  note: a lower temperature value like 0.1 is recommended for better quality.\n");
+}
+
+static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
+    (void) level;
+    (void) user_data;
+    LOG_TEE("%s", text);
 }

 static struct llama_model * llava_init(gpt_params * params) {
@@ -33,7 +36,7 @@ static struct llama_model * llava_init(gpt_params * params) {

    llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
    if (model == NULL) {
-        LOG_ERR("%s: unable to load model\n" , __func__);
+        LOG_TEE("%s: error: unable to load model\n" , __func__);
        return NULL;
    }
    return model;
@@ -48,7 +51,7 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode
    llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
    if (params->n_ctx < 2048) {
        // warn user here, "Image processing requires at least 2048 context, setting context to 2048"
-        LOG_WRN("%s: Image processing requires at least 2048 context, setting context to 2048\n" , __func__);
+        LOG_TEE("%s: warn: Image processing requires at least 2048 context, setting context to 2048\n" , __func__);
        ctx_params.n_ctx = 2048;
    } else {
        ctx_params.n_ctx = params->n_ctx;
@@ -57,11 +60,11 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode
    llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);

    if (ctx_llama == NULL) {
-        LOG_ERR("%s: failed to create the llama_context\n" , __func__);
+        LOG_TEE("%s: error: failed to create the llama_context\n" , __func__);
        return NULL;
    }

-    auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
+    auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));

    ctx_llava->ctx_llama = ctx_llama;
    ctx_llava->model = model;
@@ -86,7 +89,7 @@ static struct clip_ctx * clip_init_context(gpt_params * params) {
    if (prompt.empty()) {
        prompt = "describe the image in detail.";
    }
-    auto * ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
+    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
    return ctx_clip;
 }

@@ -98,7 +101,7 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_toke
            n_eval = n_batch;
        }
        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
-            LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
+            LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
            return false;
        }
        *n_past += n_eval;
@@ -122,7 +125,7 @@ static void process_eval_image_embed(struct llava_context * ctx_llava, const str
    float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip));
    std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip));

-    auto * slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
+    auto slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
    slice_embed->embed = image_embed;
    slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip);
    llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past);
@@ -140,7 +143,7 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
    else if (has_minicpmv_projector == 3) {
        system_prompt = "<|im_start|>user\n";
    }
-    LOG_INF("%s: image token past: %d\n", __func__, n_past);
+    LOG_TEE("%s: image token past: %d\n", __func__, n_past);
    eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
    process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
    eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
@@ -159,7 +162,7 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
        }
        eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
    }
-    LOG_INF("%s: image token past: %d\n", __func__, n_past);
+    LOG_TEE("%s: image token past: %d\n", __func__, n_past);
 }

 static const char * sample(struct gpt_sampler * smpl,
@@ -178,42 +181,42 @@ static const char * sample(struct gpt_sampler * smpl,
 }

 static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){
-    auto * ctx_clip = clip_init_context(params);
-    auto * embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
+    auto ctx_clip = clip_init_context(params);
+    auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
    if (!embeds) {
-        LOG_ERR("failed to load image %s. Terminating\n\n", fname.c_str());
+        std::cerr << "error: failed to load image " << fname << ". Terminating\n\n";
        return NULL;
    }

    // process the prompt
    if (params->prompt.empty() && params->interactive == false) {
-        LOG_ERR("prompt should be given or interactive mode should be on");
+        LOG_TEE("prompt should be given or interactive mode should be on");
        return NULL;
    }

-    auto * model = llava_init(params);
+    auto model = llava_init(params);
    if (model == NULL) {
        fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__);
        return NULL;
    }
    const int64_t t_llava_init_start_us = ggml_time_us();
-    auto * ctx_llava = llava_init_context(params, model);
+    auto ctx_llava = llava_init_context(params, model);
    ctx_llava->ctx_clip = ctx_clip;
    const int64_t t_llava_init_end_us = ggml_time_us();
    float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0;
-    LOG_INF("%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);
+    LOG_TEE("\n%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);

    const int64_t t_process_image_start_us = ggml_time_us();
    process_image(ctx_llava, embeds, params, n_past);
    const int64_t t_process_image_end_us = ggml_time_us();
    float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
-    LOG_INF("%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
+    LOG_TEE("\n%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);

    llava_image_embed_free(embeds);
    return ctx_llava;
 }

-static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_params * params, const std::string & prompt, int & n_past, bool is_first = false){
+static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){
    std::string user_prompt = prompt;
    int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
    if (!is_first) {
@@ -235,7 +238,7 @@ static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_par

    // generate the response

-    LOG_INF("\n");
+    LOG_TEE("\n");

    struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams);
    return smpl;
@@ -252,11 +255,16 @@ int main(int argc, char ** argv) {

    gpt_params params;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, show_additional_info)) {
        return 1;
    }

-    gpt_init();
+#ifndef LOG_DISABLE_LOGS
+    log_set_target(log_filename_generator("llava", "log"));
+    LOG_TEE("Log start\n");
+    log_dump_cmdline(argc, argv);
+    llama_log_set(llama_log_callback_logTee, nullptr);
+#endif // LOG_DISABLE_LOGS

    if (params.mmproj.empty() || (params.image.empty())) {
        show_additional_info(argc, argv);
@@ -265,23 +273,21 @@ int main(int argc, char ** argv) {

    for (auto & image : params.image) {
        int n_past = 0;
-        auto * ctx_llava = minicpmv_init(&params, image, n_past);
+        auto ctx_llava = minicpmv_init(&params, image, n_past);

        if (!params.prompt.empty()) {
-            LOG("<user>%s\n", params.prompt.c_str());
-            LOG("<assistant>");
-            auto * smpl = llama_init(ctx_llava, &params, params.prompt, n_past, true);
+            LOG_TEE("<user>%s\n", params.prompt.c_str());
+            LOG_TEE("<assistant>");
+            auto smpl = llama_init(ctx_llava, &params, params.prompt.c_str(), n_past, true);
            const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
-            std::string response;
+            std::string response = "";
            bool have_tmp = false;
            for (int i = 0; i < max_tgt_len; i++) {
-                const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
+                auto tmp = llama_loop(ctx_llava, smpl, n_past);
                response += tmp;
                if (strcmp(tmp, "</s>") == 0){
-                    if (!have_tmp) {
-                        continue;
-                    }
-                    break;
+                    if(!have_tmp)continue;
+                    else break;
                }
                if (strstr(tmp, "###")) break; // Yi-VL behavior
                have_tmp = true;
@@ -293,15 +299,15 @@ int main(int argc, char ** argv) {
            gpt_sampler_free(smpl);
        }else {
            while (true) {
-                LOG("<user>");
+                LOG_TEE("<user>");
                std::string prompt;
                std::getline(std::cin, prompt);
-                LOG("<assistant>");
-                auto * smpl = llama_init(ctx_llava, &params, prompt, n_past, true);
+                LOG_TEE("<assistant>");
+                auto smpl = llama_init(ctx_llava, &params, prompt, n_past, true);
                const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
-                std::string response;
+                std::string response = "";
                for (int i = 0; i < max_tgt_len; i++) {
-                    const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
+                    auto tmp = llama_loop(ctx_llava, smpl, n_past);
                    response += tmp;
                    if (strcmp(tmp, "</s>") == 0) break;
                    if (strstr(tmp, "###")) break; // Yi-VL behavior
@@ -313,7 +319,7 @@ int main(int argc, char ** argv) {
            }
        }
        printf("\n");
-        llama_perf_context_print(ctx_llava->ctx_llama);
+        llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);

        ctx_llava->model = NULL;
        llava_free(ctx_llava);
--- a/examples/lookahead/lookahead.cpp
+++ b/examples/lookahead/lookahead.cpp
@@ -1,7 +1,6 @@
 #include "arg.h"
 #include "common.h"
 #include "sampling.h"
-#include "log.h"
 #include "llama.h"

 #include <cstdio>
@@ -43,14 +42,18 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    gpt_init();
-
    const int W = 15; // lookahead window
    const int N = 5;  // n-gram size
    const int G = 15; // max verification n-grams

    const bool dump_kv_cache = params.dump_kv_cache;

+#ifndef LOG_DISABLE_LOGS
+    log_set_target(log_filename_generator("lookahead", "log"));
+    LOG_TEE("Log start\n");
+    log_dump_cmdline(argc, argv);
+#endif // LOG_DISABLE_LOGS
+
    // init llama.cpp
    llama_backend_init();
    llama_numa_init(params.numa);
@@ -72,14 +75,14 @@ int main(int argc, char ** argv) {
    const int max_tokens_list_size = max_context_size - 4;

    if ((int) inp.size() > max_tokens_list_size) {
-        LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
+        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
        return 1;
    }

-    LOG("\n\n");
+    fprintf(stderr, "\n\n");

    for (auto id : inp) {
-        LOG("%s", llama_token_to_piece(ctx, id).c_str());
+        fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
    }

    fflush(stderr);
@@ -163,7 +166,7 @@ int main(int argc, char ** argv) {
        {
            const std::string token_str = llama_token_to_piece(ctx, id);

-            LOG("%s", token_str.c_str());
+            printf("%s", token_str.c_str());
            fflush(stdout);
        }
    }
@@ -253,7 +256,7 @@ int main(int argc, char ** argv) {
        }

        if (llama_decode(ctx, batch) != 0) {
-            LOG_ERR("\n\n%s: llama_decode failed - increase KV cache size\n", __func__);
+            fprintf(stderr, "\n\n%s: error: llama_decode failed - increase KV cache size\n", __func__);
            return 1;
        }

@@ -290,10 +293,10 @@ int main(int argc, char ** argv) {
                const std::string token_str = llama_token_to_piece(ctx, id);

                if (v == 0) {
-                    LOG("%s", token_str.c_str());
+                    printf("%s", token_str.c_str());
                } else {
                    // print light cyan
-                    LOG("\033[0;96m%s\033[0m", token_str.c_str());
+                    printf("\033[0;96m%s\033[0m", token_str.c_str());
                }
                fflush(stdout);

@@ -327,21 +330,21 @@ int main(int argc, char ** argv) {
            // print known n-grams starting with token id (debug)
            if (0 && v == 0) {
                if (ngrams_observed.cnt[id] > 0) {
-                    LOG("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str());
+                    printf("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str());
                }

                for (int i = 0; i < ngrams_observed.cnt[id]; i++) {
-                    LOG("   - ngram %2d: ", i);
+                    printf("   - ngram %2d: ", i);

                    const int idx = id*(N - 1)*G + i*(N - 1);

                    for (int j = 0; j < N - 1; j++) {
                        const std::string token_str = llama_token_to_piece(ctx, ngrams_observed.tokens[idx + j]);

-                        LOG("%s", token_str.c_str());
+                        printf("%s", token_str.c_str());
                    }

-                    LOG("\n");
+                    printf("\n");
                }
            }

@@ -452,20 +455,20 @@ int main(int argc, char ** argv) {

    auto t_dec_end = ggml_time_us();

-    LOG("\n\n");
+    LOG_TEE("\n\n");

-    LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input,   (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
-    LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict  / ((t_dec_end - t_dec_start) / 1e6f));
+    LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input,   (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
+    LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict  / ((t_dec_end - t_dec_start) / 1e6f));

-    LOG_INF("\n");
-    LOG_INF("W = %2d\n", W);
-    LOG_INF("N = %2d\n", N);
-    LOG_INF("G = %2d\n", G);
-    LOG_INF("\n");
-    LOG_INF("n_predict = %d\n", n_predict);
-    LOG_INF("n_accept  = %d\n", n_accept);
+    LOG_TEE("\n");
+    LOG_TEE("W = %2d\n", W);
+    LOG_TEE("N = %2d\n", N);
+    LOG_TEE("G = %2d\n", G);
+    LOG_TEE("\n");
+    LOG_TEE("n_predict = %d\n", n_predict);
+    LOG_TEE("n_accept  = %d\n", n_accept);

-    LOG_INF("\n");
+    LOG_TEE("\n");
    gpt_perf_print(ctx, smpl);

    gpt_sampler_free(smpl);
@@ -479,7 +482,7 @@ int main(int argc, char ** argv) {

    llama_backend_free();

-    LOG("\n\n");
+    fprintf(stderr, "\n\n");

    return 0;
 }
--- a/examples/lookup/lookup-stats.cpp
+++ b/examples/lookup/lookup-stats.cpp
@@ -5,12 +5,13 @@
 #include "llama.h"
 #include "ggml.h"

+#include <cmath>
 #include <cstdint>
 #include <cstdio>
-#include <cinttypes>
 #include <fstream>
 #include <string>
 #include <vector>
+#include <unordered_map>

 int main(int argc, char ** argv){
    gpt_params params;
@@ -19,8 +20,6 @@ int main(int argc, char ** argv){
        return 1;
    }

-    gpt_init();
-
    const int n_draft = params.n_draft;

    // init llama.cpp
@@ -50,7 +49,7 @@ int main(int argc, char ** argv){
            try {
                ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
            } catch (std::ifstream::failure const &) {
-                LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
+                fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
                exit(1);
            }
        }
@@ -129,7 +128,7 @@ int main(int argc, char ** argv){
            const int64_t eta_min  = eta_ms / (60*1000);
            const int64_t eta_s    = (eta_ms - 60*1000*eta_min) / 1000;

-            LOG_INF("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s);
+            LOG_TEE("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s);
        }

        // After each chunk, update the dynamic ngram cache with the context ngram cache:
@@ -137,24 +136,24 @@ int main(int argc, char ** argv){
        ngram_cache_context.clear();
    }

-    LOG("\n");
+    LOG_TEE("\n");

-    LOG_INF("\n");
-    LOG_INF("n_draft      = %d\n", n_draft);
-    LOG_INF("n_predict    = %d\n", n_input - n_input % n_ctx);
-    LOG_INF("n_drafted    = %d\n", n_drafted);
-    LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
-    LOG_INF("t_draft      = %.2f ms, %.2f us per token, %.2f tokens per second\n",
+    LOG_TEE("\n");
+    LOG_TEE("n_draft      = %d\n", n_draft);
+    LOG_TEE("n_predict    = %d\n", n_input - n_input % n_ctx);
+    LOG_TEE("n_drafted    = %d\n", n_drafted);
+    LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
+    LOG_TEE("t_draft      = %.2f ms, %.2f us per token, %.2f tokens per second\n",
            t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
-    LOG_INF("n_accept     = %d\n", n_accept);
-    LOG_INF("accept       = %.3f%%\n", 100.0f * n_accept / n_drafted);
+    LOG_TEE("n_accept     = %d\n", n_accept);
+    LOG_TEE("accept       = %.3f%%\n", 100.0f * n_accept / n_drafted);

    llama_free(ctx);
    llama_free_model(model);

    llama_backend_free();

-    LOG("\n\n");
+    fprintf(stderr, "\n\n");

    return 0;
 }
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@@ -3,7 +3,6 @@
 #include "common.h"
 #include "ngram-cache.h"
 #include "sampling.h"
-#include "log.h"
 #include "llama.h"

 #include <cstdint>
@@ -19,13 +18,17 @@ int main(int argc, char ** argv){
        return 1;
    }

-    gpt_init();
-
    // max. number of additional tokens to draft if match is found
    const int n_draft = params.n_draft;

    const bool dump_kv_cache = params.dump_kv_cache;

+#ifndef LOG_DISABLE_LOGS
+    log_set_target(log_filename_generator("lookup", "log"));
+    LOG_TEE("Log start\n");
+    log_dump_cmdline(argc, argv);
+#endif // LOG_DISABLE_LOGS
+
    // init llama.cpp
    llama_backend_init();
    llama_numa_init(params.numa);
@@ -55,7 +58,7 @@ int main(int argc, char ** argv){
            try {
                ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
            } catch (std::ifstream::failure const &) {
-                LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
+                fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
                exit(1);
            }
        }
@@ -73,14 +76,14 @@ int main(int argc, char ** argv){
    const int max_tokens_list_size = max_context_size - 4;

    if ((int) inp.size() > max_tokens_list_size) {
-        LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
+        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
        return 1;
    }

-    LOG("\n\n");
+    fprintf(stderr, "\n\n");

    for (auto id : inp) {
-        LOG("%s", llama_token_to_piece(ctx, id).c_str());
+        fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
    }

    fflush(stderr);
@@ -121,7 +124,7 @@ int main(int argc, char ** argv){
        }

        // print current draft sequence
-        LOG_DBG("drafted %s\n", string_from(ctx, draft).c_str());
+        LOG("drafted %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, draft).c_str());

        int i_dft = 0;
        while (true) {
@@ -133,7 +136,7 @@ int main(int argc, char ** argv){
            const std::string token_str = llama_token_to_piece(ctx, id);

            if (!params.use_color) {
-                LOG("%s", token_str.c_str());
+                printf("%s", token_str.c_str());
            }

            if (llama_token_is_eog(model, id)) {
@@ -144,7 +147,7 @@ int main(int argc, char ** argv){

            // check if the target token matches the draft
            if (i_dft < (int) draft.size() && id == draft[i_dft]) {
-                LOG_DBG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
+                LOG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
                ++n_accept;
                ++n_past;
                ++i_dft;
@@ -158,19 +161,19 @@ int main(int argc, char ** argv){

                if (params.use_color) {
                    // color accepted draft token
-                    LOG("\033[34m%s\033[0m", token_str.c_str());
+                    printf("\033[34m%s\033[0m", token_str.c_str());
                    fflush(stdout);
                }
                continue;
            }

            if (params.use_color) {
-                LOG("%s", token_str.c_str());
+                printf("%s", token_str.c_str());
            }
            fflush(stdout);


-            LOG_DBG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
+            LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());

            draft.clear();
            draft.push_back(id);
@@ -221,23 +224,24 @@ int main(int argc, char ** argv){
    llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
    llama_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic);

-    LOG("\n\n");
+    LOG_TEE("\n\n");

-    LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input,   (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
-    LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict  / ((t_dec_end - t_dec_start) / 1e6f));
+    LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input,   (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
+    LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict  / ((t_dec_end - t_dec_start) / 1e6f));

-    LOG_INF("\n");
-    LOG_INF("n_draft      = %d\n", n_draft);
-    LOG_INF("n_predict    = %d\n", n_predict);
-    LOG_INF("n_drafted    = %d\n", n_drafted);
-    LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
-    LOG_INF("t_draft      = %.2f ms, %.2f us per token, %.2f tokens per second\n",
+    LOG_TEE("\n");
+    LOG_TEE("n_draft      = %d\n", n_draft);
+    LOG_TEE("n_predict    = %d\n", n_predict);
+    LOG_TEE("n_drafted    = %d\n", n_drafted);
+    LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
+    LOG_TEE("t_draft      = %.2f ms, %.2f us per token, %.2f tokens per second\n",
            t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
-    LOG_INF("n_accept     = %d\n", n_accept);
-    LOG_INF("accept       = %.3f%%\n", 100.0f * n_accept / n_drafted);
+    LOG_TEE("n_accept     = %d\n", n_accept);
+    LOG_TEE("accept       = %.3f%%\n", 100.0f * n_accept / n_drafted);

-    LOG_INF("\ntarget:\n\n");
-    gpt_perf_print(ctx, smpl);
+    LOG_TEE("\ntarget:\n\n");
+    llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
+    llama_perf_print(ctx,  LLAMA_PERF_TYPE_CONTEXT);

    gpt_sampler_free(smpl);

@@ -248,7 +252,7 @@ int main(int argc, char ** argv){

    llama_backend_free();

-    LOG("\n\n");
+    fprintf(stderr, "\n\n");

    return 0;
 }
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -161,8 +161,6 @@ A value of -1 will enable infinite text generation, even though we have a finite

 If the pause is undesirable, a value of -2 will stop generation immediately when the context is filled.

-The `--no-context-shift` option allows you to stop the infinite text generation once the finite context window is full.
-
 It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode, text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `--predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the `--ignore-eos` parameter.

 ### Temperature
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -1,11 +1,12 @@
 #include "arg.h"
 #include "common.h"
 #include "console.h"
-#include "log.h"
 #include "sampling.h"
 #include "llama.h"

 #include <cassert>
+#include <cinttypes>
+#include <cmath>
 #include <cstdio>
 #include <cstring>
 #include <ctime>
@@ -41,13 +42,11 @@ static std::vector<llama_token> * g_output_tokens;
 static bool is_interacting  = false;
 static bool need_insert_eot = false;

-static void print_usage(int argc, char ** argv) {
-    (void) argc;
-
-    LOG("\nexample usage:\n");
-    LOG("\n  text generation:     %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]);
-    LOG("\n  chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]);
-    LOG("\n");
+static void print_usage(int, char ** argv) {
+    printf("\nexample usage:\n");
+    printf("\n  text generation:     %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]);
+    printf("\n  chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]);
+    printf("\n");
 }

 static bool file_exists(const std::string & path) {
@@ -75,7 +74,8 @@ static void write_logfile(

    const bool success = fs_create_directory_with_parents(params.logdir);
    if (!success) {
-        LOG_ERR("%s: failed to create logdir %s, cannot write logfile\n", __func__, params.logdir.c_str());
+        fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
+                __func__, params.logdir.c_str());
        return;
    }

@@ -83,7 +83,7 @@ static void write_logfile(
    FILE * logfile = fopen(logfile_path.c_str(), "w");

    if (logfile == NULL) {
-        LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
+        fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
        return;
    }

@@ -113,7 +113,7 @@ static void sigint_handler(int signo) {
            need_insert_eot = true;
        } else {
            console::cleanup();
-            LOG("\n");
+            printf("\n");
            gpt_perf_print(*g_ctx, *g_smpl);
            write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
            _exit(130);
@@ -122,11 +122,17 @@ static void sigint_handler(int signo) {
 }
 #endif

-static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, const std::string & role, const std::string & content) {
+static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
+    (void) level;
+    (void) user_data;
+    LOG_TEE("%s", text);
+}
+
+static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, std::string role, std::string content) {
    llama_chat_msg new_msg{role, content};
    auto formatted = llama_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user");
    chat_msgs.push_back({role, content});
-    LOG_DBG("formatted: '%s'\n", formatted.c_str());
+    LOG("formatted: %s\n", formatted.c_str());
    return formatted;
 }

@@ -137,46 +143,57 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    gpt_init();
-
    auto & sparams = params.sparams;

+#ifndef LOG_DISABLE_LOGS
+    log_set_target(log_filename_generator("main", "log"));
+    LOG_TEE("Log start\n");
+    log_dump_cmdline(argc, argv);
+    llama_log_set(llama_log_callback_logTee, nullptr);
+#endif // LOG_DISABLE_LOGS
+
+    // TODO: Dump params ?
+    //LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity));
+
    // save choice to use color for later
    // (note for later: this is a slightly awkward choice)
    console::init(params.simple_io, params.use_color);
    atexit([]() { console::cleanup(); });

    if (params.logits_all) {
-        LOG_ERR("************\n");
-        LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
-        LOG_ERR("************\n\n");
+        printf("\n************\n");
+        printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
+        printf("************\n\n");

        return 0;
    }

    if (params.embedding) {
-        LOG_ERR("************\n");
-        LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
-        LOG_ERR("************\n\n");
+        printf("\n************\n");
+        printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
+        printf("************\n\n");

        return 0;
    }

    if (params.n_ctx != 0 && params.n_ctx < 8) {
-        LOG_WRN("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
+        LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
        params.n_ctx = 8;
    }

    if (params.rope_freq_base != 0.0) {
-        LOG_WRN("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
+        LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
    }

    if (params.rope_freq_scale != 0.0) {
-        LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
+        LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
    }

-    LOG_INF("%s: llama backend init\n", __func__);
+    print_build_info();

+    LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
+
+    LOG("%s: llama backend init\n", __func__);
    llama_backend_init();
    llama_numa_init(params.numa);

@@ -191,19 +208,21 @@ int main(int argc, char ** argv) {
    g_smpl = &smpl;

    // load the model and apply lora adapter, if any
-    LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
+    LOG("%s: load the model and apply lora adapter, if any\n", __func__);
    llama_init_result llama_init = llama_init_from_gpt_params(params);

    model = llama_init.model;
    ctx = llama_init.context;

    if (model == NULL) {
-        LOG_ERR("%s: error: unable to load model\n", __func__);
+        LOG_TEE("%s: error: unable to load model\n", __func__);
        return 1;
    }

-    LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
-
+    LOG("%s: llama threadpool init = n_threads = %d\n",
+        __func__,
+        (int) params.cpuparams.n_threads
+    );
    struct ggml_threadpool_params tpp_batch =
            ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
    struct ggml_threadpool_params tpp =
@@ -215,8 +234,8 @@ int main(int argc, char ** argv) {
    if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
        threadpool_batch = ggml_threadpool_new(&tpp_batch);
        if (!threadpool_batch) {
-            LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
-            return 1;
+            LOG_TEE("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
+            exit(1);
        }

        // Start the non-batch threadpool in the paused state
@@ -225,54 +244,55 @@ int main(int argc, char ** argv) {

    struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
    if (!threadpool) {
-        LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
-        return 1;
+        LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
+        exit(1);
    }

    llama_attach_threadpool(ctx, threadpool, threadpool_batch);

    const int n_ctx_train = llama_n_ctx_train(model);
    const int n_ctx = llama_n_ctx(ctx);
+    LOG("n_ctx: %d\n", n_ctx);

    if (n_ctx > n_ctx_train) {
-        LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
+        LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
+                __func__, n_ctx_train, n_ctx);
    }

    // print chat template example in conversation mode
    if (params.conversation) {
        if (params.enable_chat_template) {
-            LOG_INF("%s: chat template example:\n%s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());
+            LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());
        } else {
-            LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
+            LOG_TEE("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
        }
    }

    // print system information
    {
-        LOG_INF("\n");
-        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
-        LOG_INF("\n");
+        LOG_TEE("\n");
+        LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
    }

    std::string path_session = params.path_prompt_cache;
    std::vector<llama_token> session_tokens;

    if (!path_session.empty()) {
-        LOG_INF("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
+        LOG_TEE("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
        if (!file_exists(path_session)) {
-            LOG_INF("%s: session file does not exist, will create.\n", __func__);
+            LOG_TEE("%s: session file does not exist, will create.\n", __func__);
        } else if (file_is_empty(path_session)) {
-            LOG_INF("%s: The session file is empty. A new session will be initialized.\n", __func__);
+            LOG_TEE("%s: The session file is empty. A new session will be initialized.\n", __func__);
        } else {
            // The file exists and is not empty
            session_tokens.resize(n_ctx);
            size_t n_token_count_out = 0;
            if (!llama_state_load_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
-                LOG_ERR("%s: failed to load session file '%s'\n", __func__, path_session.c_str());
+                LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
                return 1;
            }
            session_tokens.resize(n_token_count_out);
-            LOG_INF("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
+            LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
        }
    }

@@ -280,8 +300,7 @@ int main(int argc, char ** argv) {
    if (!llama_model_has_encoder(model)) {
        GGML_ASSERT(!llama_add_eos_token(model));
    }
-
-    LOG_DBG("n_ctx: %d, add_bos: %d\n", n_ctx, add_bos);
+    LOG("add_bos: %d\n", add_bos);

    std::vector<llama_token> embd_inp;

@@ -290,31 +309,31 @@ int main(int argc, char ** argv) {
            ? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
            : params.prompt;
        if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
-            LOG_DBG("tokenize the prompt\n");
+            LOG("tokenize the prompt\n");
            embd_inp = ::llama_tokenize(ctx, prompt, true, true);
        } else {
-            LOG_DBG("use session tokens\n");
+            LOG("use session tokens\n");
            embd_inp = session_tokens;
        }

-        LOG_DBG("prompt: \"%s\"\n", prompt.c_str());
-        LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
+        LOG("prompt: \"%s\"\n", log_tostr(prompt));
+        LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
    }

    // Should not run without any tokens
    if (embd_inp.empty()) {
        if (add_bos) {
            embd_inp.push_back(llama_token_bos(model));
-            LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
+            LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
        } else {
-            LOG_ERR("input is empty\n");
+            LOG_TEE("error: input is empty\n");
            return -1;
        }
    }

    // Tokenize negative prompt
    if ((int) embd_inp.size() > n_ctx - 4) {
-        LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
+        LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
        return 1;
    }

@@ -328,28 +347,29 @@ int main(int argc, char ** argv) {
            n_matching_session_tokens++;
        }
        if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) {
-            LOG_INF("%s: using full prompt from session file\n", __func__);
+            LOG_TEE("%s: using full prompt from session file\n", __func__);
        } else if (n_matching_session_tokens >= embd_inp.size()) {
-            LOG_INF("%s: session file has exact match for prompt!\n", __func__);
+            LOG_TEE("%s: session file has exact match for prompt!\n", __func__);
        } else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
-            LOG_WRN("%s: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
-                    __func__, n_matching_session_tokens, embd_inp.size());
+            LOG_TEE("%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
+                __func__, n_matching_session_tokens, embd_inp.size());
        } else {
-            LOG_INF("%s: session file matches %zu / %zu tokens of prompt\n",
-                    __func__, n_matching_session_tokens, embd_inp.size());
+            LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n",
+                __func__, n_matching_session_tokens, embd_inp.size());
        }

        // remove any "future" tokens that we might have inherited from the previous session
        llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1);
    }

-    LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n",
-         embd_inp.size(), n_matching_session_tokens, embd_inp.size(), session_tokens.size());
+    LOGLN(
+            "recalculate the cached logits (check): embd_inp.empty() %s, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu",
+            log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size());

    // if we will use the cache for the full prompt without reaching the end of the cache, force
    // reevaluation of the last token to recalculate the cached logits
    if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) {
-        LOG_DBG("recalculate the cached logits (do): session_tokens.resize( %zu )\n", embd_inp.size() - 1);
+        LOGLN("recalculate the cached logits (do): session_tokens.resize( %zu )", embd_inp.size() - 1);

        session_tokens.resize(embd_inp.size() - 1);
    }
@@ -371,20 +391,21 @@ int main(int argc, char ** argv) {
    }

    if (params.verbose_prompt) {
-        LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
-        LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+        LOG_TEE("\n");
+        LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
+        LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
        for (int i = 0; i < (int) embd_inp.size(); i++) {
-            LOG_INF("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
+            LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
        }

        if (params.n_keep > add_bos) {
-            LOG_INF("%s: static prompt based on n_keep: '", __func__);
+            LOG_TEE("%s: static prompt based on n_keep: '", __func__);
            for (int i = 0; i < params.n_keep; i++) {
-                LOG("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
+                LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
            }
-            LOG("'\n");
+            LOG_TEE("'\n");
        }
-        LOG_INF("\n");
+        LOG_TEE("\n");
    }

    // ctrl+C handling
@@ -404,40 +425,40 @@ int main(int argc, char ** argv) {
    }

    if (params.interactive) {
-        LOG("%s: interactive mode on.\n", __func__);
+        LOG_TEE("%s: interactive mode on.\n", __func__);

        if (!params.antiprompt.empty()) {
            for (const auto & antiprompt : params.antiprompt) {
-                LOG("Reverse prompt: '%s'\n", antiprompt.c_str());
+                LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str());
                if (params.verbose_prompt) {
                    auto tmp = ::llama_tokenize(ctx, antiprompt, false, true);
                    for (int i = 0; i < (int) tmp.size(); i++) {
-                        LOG("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+                        LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
                    }
                }
            }
        }

        if (params.input_prefix_bos) {
-            LOG("Input prefix with BOS\n");
+            LOG_TEE("Input prefix with BOS\n");
        }

        if (!params.input_prefix.empty()) {
-            LOG("Input prefix: '%s'\n", params.input_prefix.c_str());
+            LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
            if (params.verbose_prompt) {
                auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true);
                for (int i = 0; i < (int) tmp.size(); i++) {
-                    LOG("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+                    LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
                }
            }
        }

        if (!params.input_suffix.empty()) {
-            LOG("Input suffix: '%s'\n", params.input_suffix.c_str());
+            LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
            if (params.verbose_prompt) {
                auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true);
                for (int i = 0; i < (int) tmp.size(); i++) {
-                    LOG("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+                    LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
                }
            }
        }
@@ -445,15 +466,13 @@ int main(int argc, char ** argv) {

    smpl = gpt_sampler_init(model, sparams);
    if (!smpl) {
-        LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
-        return 1;
+        fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
+        exit(1);
    }

-    LOG_INF("sampler seed: %u\n",     gpt_sampler_get_seed(smpl));
-    LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
-    LOG_INF("sampler chain: %s\n",    gpt_sampler_print(smpl).c_str());
-
-    LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
+    LOG_TEE("sampling params: \n%s\n", sparams.print().c_str());
+    LOG_TEE(" sampler constr: \n%s\n", gpt_sampler_print(smpl).c_str());
+    LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);

    // group-attention state
    // number of grouped KV tokens so far (used only if params.grp_attn_n > 1)
@@ -467,9 +486,9 @@ int main(int argc, char ** argv) {
        GGML_ASSERT(ga_w % ga_n == 0            && "grp_attn_w must be a multiple of grp_attn_n");     // NOLINT
      //GGML_ASSERT(n_ctx_train % ga_w == 0     && "n_ctx_train must be a multiple of grp_attn_w");    // NOLINT
      //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT
-        LOG_INF("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w);
+        LOG_TEE("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w);
    }
-    LOG("\n");
+    LOG_TEE("\n\n");

    if (params.interactive) {
        const char * control_message;
@@ -481,11 +500,11 @@ int main(int argc, char ** argv) {
                              " - To return control without starting a new line, end your input with '/'.\n"
                              " - If you want to submit another line, end your input with '\\'.\n";
        }
-        LOG("== Running in interactive mode. ==\n");
+        LOG_TEE("== Running in interactive mode. ==\n");
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
-        LOG(       " - Press Ctrl+C to interject at any time.\n");
+        LOG_TEE(       " - Press Ctrl+C to interject at any time.\n");
 #endif
-        LOG(       "%s\n", control_message);
+        LOG_TEE(       "%s\n", control_message);

        is_interacting = params.interactive_first;
    }
@@ -524,7 +543,7 @@ int main(int argc, char ** argv) {
        llama_token * enc_input_buf = embd_inp.data();

        if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) {
-            LOG_ERR("%s : failed to eval\n", __func__);
+            LOG_TEE("%s : failed to eval\n", __func__);
            return 1;
        }

@@ -550,8 +569,9 @@ int main(int argc, char ** argv) {
                embd.resize(max_embd_size);

                console::set_display(console::error);
-                LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
+                printf("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
                console::set_display(console::reset);
+                fflush(stdout);
            }

            if (ga_n == 1) {
@@ -559,35 +579,29 @@ int main(int argc, char ** argv) {
                // if we run out of context:
                // - take the n_keep first tokens from the original prompt (via n_past)
                // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
-
                if (n_past + (int) embd.size() >= n_ctx) {
-                    if (!params.ctx_shift){
-                        LOG_DBG("\n\n%s: context full and context shift is disabled => stopping\n", __func__);
+                    if (params.n_predict == -2) {
+                        LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
                        break;
-                    } else {
-                        if (params.n_predict == -2) {
-                            LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
-                            break;
-                        }
-
-                        const int n_left    = n_past - params.n_keep;
-                        const int n_discard = n_left/2;
-
-                        LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
-                                n_past, n_left, n_ctx, params.n_keep, n_discard);
-
-                        llama_kv_cache_seq_rm (ctx, 0, params.n_keep            , params.n_keep + n_discard);
-                        llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
-
-                        n_past -= n_discard;
-
-                        LOG_DBG("after swap: n_past = %d\n", n_past);
-
-                        LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
-
-                        LOG_DBG("clear session path\n");
-                        path_session.clear();
                    }
+
+                    const int n_left    = n_past - params.n_keep;
+                    const int n_discard = n_left/2;
+
+                    LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
+                            n_past, n_left, n_ctx, params.n_keep, n_discard);
+
+                    llama_kv_cache_seq_rm (ctx, 0, params.n_keep            , params.n_keep + n_discard);
+                    llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
+
+                    n_past -= n_discard;
+
+                    LOG("after swap: n_past = %d\n", n_past);
+
+                    LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
+
+                    LOG("clear session path\n");
+                    path_session.clear();
                }
            } else {
                // context extension via Self-Extend
@@ -596,10 +610,10 @@ int main(int argc, char ** argv) {
                    const int bd = (ga_w/ga_n)*(ga_n - 1);
                    const int dd = (ga_w/ga_n) - ib*bd - ga_w;

-                    LOG_DBG("\n");
-                    LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd);
-                    LOG_DBG("div:   [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
-                    LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);
+                    LOG("\n");
+                    LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd);
+                    LOG("div:   [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
+                    LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);

                    llama_kv_cache_seq_add(ctx, 0, ga_i,                n_past,              ib*bd);
                    llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd,        ga_i + ib*bd + ga_w, ga_n);
@@ -609,7 +623,7 @@ int main(int argc, char ** argv) {

                    ga_i += ga_w/ga_n;

-                    LOG_DBG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i);
+                    LOG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i);
                }
            }

@@ -641,19 +655,19 @@ int main(int argc, char ** argv) {
                    n_eval = params.n_batch;
                }

-                LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
+                LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());

                if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
-                    LOG_ERR("%s : failed to eval\n", __func__);
+                    LOG_TEE("%s : failed to eval\n", __func__);
                    return 1;
                }

                n_past += n_eval;

-                LOG_DBG("n_past = %d\n", n_past);
+                LOG("n_past = %d\n", n_past);
                // Display total tokens alongside total time
                if (params.n_print > 0 && n_past % params.n_print == 0) {
-                    LOG_DBG("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx);
+                    LOG_TEE("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx);
                }
            }

@@ -671,14 +685,14 @@ int main(int argc, char ** argv) {
                need_to_save_session = false;
                llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());

-                LOG_DBG("saved session to %s\n", path_session.c_str());
+                LOG("saved session to %s\n", path_session.c_str());
            }

            const llama_token id = gpt_sampler_sample(smpl, ctx, -1);

-            gpt_sampler_accept(smpl, id, /* accept_grammar= */ true);
+            gpt_sampler_accept(smpl, id, /* apply_grammar= */ true);

-            // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
+            // LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, smpl->prev.to_vector()).c_str());

            embd.push_back(id);

@@ -688,16 +702,16 @@ int main(int argc, char ** argv) {
            // decrement remaining sampling budget
            --n_remain;

-            LOG_DBG("n_remain: %d\n", n_remain);
+            LOG("n_remain: %d\n", n_remain);
        } else {
            // some user input remains from prompt or interaction, forward it to processing
-            LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
+            LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
            while ((int) embd_inp.size() > n_consumed) {
                embd.push_back(embd_inp[n_consumed]);

                // push the prompt in the sampling context in order to apply repetition penalties later
                // for the prompt, we don't apply grammar rules
-                gpt_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false);
+                gpt_sampler_accept(smpl, embd_inp[n_consumed], /* apply_grammar= */ false);

                ++n_consumed;
                if ((int) embd.size() >= params.n_batch) {
@@ -712,7 +726,7 @@ int main(int argc, char ** argv) {
                const std::string token_str = llama_token_to_piece(ctx, id, params.special);

                // Console/Stream Output
-                LOG("%s", token_str.c_str());
+                fprintf(stdout, "%s", token_str.c_str());

                // Record Displayed Tokens To Log
                // Note: Generated tokens are created one by one hence this check
@@ -724,6 +738,8 @@ int main(int argc, char ** argv) {
                    output_tokens.push_back(id);
                    output_ss << token_str;
                }
+
+                fflush(stdout);
            }
        }

@@ -772,13 +788,13 @@ int main(int argc, char ** argv) {
                }

                if (is_antiprompt) {
-                    LOG_DBG("found antiprompt: %s\n", last_output.c_str());
+                    LOG("found antiprompt: %s\n", last_output.c_str());
                }
            }

            // deal with end of generation tokens in interactive mode
            if (llama_token_is_eog(model, gpt_sampler_last(smpl))) {
-                LOG_DBG("found an EOG token\n");
+                LOG("found an EOG token\n");

                if (params.interactive) {
                    if (!params.antiprompt.empty()) {
@@ -792,7 +808,7 @@ int main(int argc, char ** argv) {
                        chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str());
                    }
                    is_interacting = true;
-                    LOG("\n");
+                    printf("\n");
                }
            }

@@ -803,21 +819,21 @@ int main(int argc, char ** argv) {
            }

            if (n_past > 0 && is_interacting) {
-                LOG_DBG("waiting for user input\n");
+                LOG("waiting for user input\n");

                if (params.conversation) {
-                    LOG("\n> ");
+                    printf("\n> ");
                }

                if (params.input_prefix_bos) {
-                    LOG_DBG("adding input prefix BOS token\n");
+                    LOG("adding input prefix BOS token\n");
                    embd_inp.push_back(llama_token_bos(model));
                }

                std::string buffer;
                if (!params.input_prefix.empty() && !params.conversation) {
-                    LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
-                    LOG("%s", params.input_prefix.c_str());
+                    LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
+                    printf("%s", params.input_prefix.c_str());
                }

                // color user input only
@@ -840,11 +856,11 @@ int main(int argc, char ** argv) {
                if (buffer.length() > 1) {
                    // append input suffix if any
                    if (!params.input_suffix.empty() && !params.conversation) {
-                        LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
-                        LOG("%s", params.input_suffix.c_str());
+                        LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
+                        printf("%s", params.input_suffix.c_str());
                    }

-                    LOG_DBG("buffer: '%s'\n", buffer.c_str());
+                    LOG("buffer: '%s'\n", buffer.c_str());

                    const size_t original_size = embd_inp.size();

@@ -861,7 +877,7 @@ int main(int argc, char ** argv) {
                    const auto line_inp = ::llama_tokenize(ctx, user_inp,            false, format_chat);
                    const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);

-                    LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
+                    LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());

                    // if user stop generation mid-way, we must add EOT to finish model's last response
                    if (need_insert_eot && format_chat) {
@@ -884,9 +900,9 @@ int main(int argc, char ** argv) {
                    assistant_ss.str("");

                    n_remain -= line_inp.size();
-                    LOG_DBG("n_remain: %d\n", n_remain);
+                    LOG("n_remain: %d\n", n_remain);
                } else {
-                    LOG_DBG("empty line, passing control back\n");
+                    LOG("empty line, passing control back\n");
                }

                input_echo = false; // do not echo this again
@@ -902,7 +918,7 @@ int main(int argc, char ** argv) {

        // end of generation
        if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) {
-            LOG(" [end of text]\n");
+            LOG_TEE(" [end of text]\n");
            break;
        }

@@ -915,11 +931,11 @@ int main(int argc, char ** argv) {
    }

    if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) {
-        LOG("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
+        LOG_TEE("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
        llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
    }

-    LOG("\n\n");
+    LOG_TEE("\n");
    gpt_perf_print(ctx, smpl);
    write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);

@@ -933,5 +949,9 @@ int main(int argc, char ** argv) {
    ggml_threadpool_free(threadpool);
    ggml_threadpool_free(threadpool_batch);

+#ifndef LOG_DISABLE_LOGS
+    LOG_TEE("Log end\n");
+#endif // LOG_DISABLE_LOGS
+
    return 0;
 }
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -4,7 +4,6 @@
 #include "arg.h"
 #include "common.h"
 #include "sampling.h"
-#include "log.h"
 #include "llama.h"

 #include <cmath>
@@ -84,9 +83,7 @@ static void print_date_time() {
    char buffer[80];
    strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", local_time);

-    LOG_INF("\n");
-    LOG_INF("\033[35mrun parameters as of %s\033[0m\n", buffer);
-    LOG_INF("\n");
+    printf("\n\033[35mrun parameters as at %s\033[0m\n", buffer);
 }

 // Define a split string function to ...
@@ -109,8 +106,6 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    gpt_init();
-
    // number of simultaneous "clients" to simulate
    const int32_t n_clients = params.n_parallel;

@@ -125,6 +120,12 @@ int main(int argc, char ** argv) {

    const bool dump_kv_cache = params.dump_kv_cache;

+#ifndef LOG_DISABLE_LOGS
+    log_set_target(log_filename_generator("parallel", "log"));
+    LOG_TEE("Log start\n");
+    log_dump_cmdline(argc, argv);
+#endif // LOG_DISABLE_LOGS
+
    // init llama.cpp
    llama_backend_init();
    llama_numa_init(params.numa);
@@ -137,22 +138,23 @@ int main(int argc, char ** argv) {

    // load the prompts from an external file if there are any
    if (params.prompt.empty()) {
-        LOG_INF("\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
+        printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
    } else {
        // Output each line of the input params.prompts vector and copy to k_prompts
        int index = 0;
-        LOG_INF("\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str());
+        printf("\n\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str());

        std::vector<std::string> prompts = split_string(params.prompt, '\n');
        for (const auto& prompt : prompts) {
            k_prompts.resize(index + 1);
            k_prompts[index] = prompt;
            index++;
-            LOG_INF("%3d prompt: %s\n", index, prompt.c_str());
+            printf("%3d prompt: %s\n", index, prompt.c_str());
        }
    }

-    LOG_INF("\n\n");
+    fprintf(stderr, "\n\n");
+    fflush(stderr);

    const int n_ctx = llama_n_ctx(ctx);

@@ -181,19 +183,19 @@ int main(int argc, char ** argv) {

    const auto t_main_start = ggml_time_us();

-    LOG_INF("%s: Simulating parallel requests from clients:\n", __func__);
-    LOG_INF("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
-    LOG_INF("\n");
+    LOG_TEE("%s: Simulating parallel requests from clients:\n", __func__);
+    LOG_TEE("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
+    LOG_TEE("\n");

    {
-        LOG_INF("%s: Evaluating the system prompt ...\n", __func__);
+        LOG_TEE("%s: Evaluating the system prompt ...\n", __func__);

        for (int32_t i = 0; i < n_tokens_system; ++i) {
            llama_batch_add(batch, tokens_system[i], i, { 0 }, false);
        }

        if (llama_decode(ctx, batch) != 0) {
-            LOG_ERR("%s: llama_decode() failed\n", __func__);
+            LOG_TEE("%s: llama_decode() failed\n", __func__);
            return 1;
        }

@@ -202,10 +204,10 @@ int main(int argc, char ** argv) {
            llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
        }

-        LOG_INF("\n");
+        LOG_TEE("\n");
    }

-    LOG_INF("Processing requests ...\n\n");
+    LOG_TEE("Processing requests ...\n\n");

    while (true) {
        if (dump_kv_cache) {
@@ -236,7 +238,7 @@ int main(int argc, char ** argv) {
                llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
            }

-            LOG_INF("%s: clearing the KV cache\n", __func__);
+            LOG_TEE("%s: clearing the KV cache\n", __func__);
        }

        // insert new sequences for decoding
@@ -271,7 +273,7 @@ int main(int argc, char ** argv) {
                    client.n_decoded = 0;
                    client.i_batch   = batch.n_tokens - 1;

-                    LOG_INF("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
+                    LOG_TEE("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);

                    g_seq_id += 1;

@@ -315,11 +317,11 @@ int main(int argc, char ** argv) {
            if (ret != 0) {
                if (n_batch == 1 || ret < 0) {
                    // if you get here, it means the KV cache is full - try increasing it via the context size
-                    LOG_ERR("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
+                    LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
                    return 1;
                }

-                LOG_ERR("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
+                LOG("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);

                n_cache_miss += 1;

@@ -330,7 +332,7 @@ int main(int argc, char ** argv) {
                continue;
            }

-            LOG_DBG("%s : decoded batch of %d tokens\n", __func__, n_tokens);
+            LOG("%s : decoded batch of %d tokens\n", __func__, n_tokens);

            for (auto & client : clients) {
                if (client.i_batch < (int) i || client.i_batch >= (int) (i + n_tokens)) {
@@ -375,7 +377,7 @@ int main(int argc, char ** argv) {

                    const auto t_main_end = ggml_time_us();

-                    LOG_INF("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \n\nInput:    %s\n\033[35mResponse: %s\033[0m\n\n",
+                    LOG_TEE("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \nInput:    %s\n\033[35mResponse: %s\033[0m\n\n",
                            client.id, client.seq_id, n_seq, client.n_prompt, client.n_decoded,
                            (t_main_end - client.t_start_prompt) / 1e6,
                            (double) (client.n_prompt + client.n_decoded) / (t_main_end - client.t_start_prompt) * 1e6,
@@ -398,22 +400,22 @@ int main(int argc, char ** argv) {

    print_date_time();

-    LOG_INF("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
+    LOG_TEE("\n%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
    if (params.prompt_file.empty()) {
        params.prompt_file = "used built-in defaults";
    }
-    LOG_INF("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
-    LOG_INF("Model and path used:  \033[32m%s\033[0m\n\n", params.model.c_str());
+    LOG_TEE("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
+    LOG_TEE("Model and path used:  \033[32m%s\033[0m\n\n", params.model.c_str());

-    LOG_INF("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt              ) / (t_main_end - t_main_start) * 1e6);
-    LOG_INF("Total gen tokens:    %6d, speed: %5.2f t/s\n", n_total_gen,    (double) (n_total_gen                 ) / (t_main_end - t_main_start) * 1e6);
-    LOG_INF("Total speed (AVG):   %6s  speed: %5.2f t/s\n", "",             (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6);
-    LOG_INF("Cache misses:        %6d\n", n_cache_miss);
+    LOG_TEE("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt              ) / (t_main_end - t_main_start) * 1e6);
+    LOG_TEE("Total gen tokens:    %6d, speed: %5.2f t/s\n", n_total_gen,    (double) (n_total_gen                 ) / (t_main_end - t_main_start) * 1e6);
+    LOG_TEE("Total speed (AVG):   %6s  speed: %5.2f t/s\n", "",             (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6);
+    LOG_TEE("Cache misses:        %6d\n", n_cache_miss);

-    LOG_INF("\n");
+    LOG_TEE("\n");

    // TODO: print sampling/grammar timings for all clients
-    llama_perf_context_print(ctx);
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);

    llama_batch_free(batch);

@@ -422,7 +424,7 @@ int main(int argc, char ** argv) {

    llama_backend_free();

-    LOG("\n\n");
+    fprintf(stderr, "\n\n");

    return 0;
 }
--- a/examples/passkey/passkey.cpp
+++ b/examples/passkey/passkey.cpp
@@ -1,6 +1,5 @@
 #include "arg.h"
 #include "common.h"
-#include "log.h"
 #include "llama.h"

 #include <cmath>
@@ -9,9 +8,9 @@
 #include <vector>

 static void print_usage(int, char ** argv) {
-    LOG("\nexample usage:\n");
-    LOG("\n    %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]);
-    LOG("\n");
+    LOG_TEE("\nexample usage:\n");
+    LOG_TEE("\n    %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]);
+    LOG_TEE("\n");
 }

 int main(int argc, char ** argv) {
@@ -25,8 +24,6 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    gpt_init();
-
    int n_junk = params.n_junk;
    int n_keep = params.n_keep;
    int n_grp  = params.grp_attn_n;
@@ -66,7 +63,7 @@ int main(int argc, char ** argv) {
    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);

    if (model == NULL) {
-        LOG_ERR("%s: unable to load model\n" , __func__);
+        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
        return 1;
    }

@@ -80,7 +77,7 @@ int main(int argc, char ** argv) {

    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
    if (ctx == NULL) {
-        LOG_ERR("%s: failed to create the llama_context\n" , __func__);
+        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
        return 1;
    }

@@ -110,14 +107,14 @@ int main(int argc, char ** argv) {
    const int n_batch     = ctx_params.n_batch;
    const int n_batch_grp = ctx_params.n_batch/n_grp;

-    LOG_INF("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos);
+    LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos);

    // print the prompt token-by-token

-    LOG_INF("\n");
-    LOG_INF("prefix tokens: %d\n", n_tokens_prefix);
-    LOG_INF("prompt tokens: %d\n", n_tokens_all);
-    //LOG_INF("prompt: %s\n", params.prompt.c_str());
+    LOG_TEE("\n");
+    LOG_TEE("prefix tokens: %d\n", n_tokens_prefix);
+    LOG_TEE("prompt tokens: %d\n", n_tokens_all);
+    //LOG_TEE("prompt: %s\n", params.prompt.c_str());

    llama_batch batch = llama_batch_init(params.n_batch, 0, 1);

@@ -148,11 +145,11 @@ int main(int argc, char ** argv) {
        }

        if (llama_decode(ctx, batch) != 0) {
-            LOG_INF("%s: llama_decode() failed\n", __func__);
+            LOG_TEE("%s: llama_decode() failed\n", __func__);
            return 1;
        }

-        LOG_INF("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
+        LOG_TEE("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));

        if (i + n_batch >= n_tokens_all) {
            break;
@@ -162,7 +159,7 @@ int main(int argc, char ** argv) {
    for (int i = n_ctx; i < n_tokens_all; i += n_batch) {
        const int n_discard = n_batch;

-        LOG_INF("%s: shifting KV cache with %d\n", __func__, n_discard);
+        LOG_TEE("%s: shifting KV cache with %d\n", __func__, n_discard);

        llama_kv_cache_seq_rm (ctx, 0, n_keep            , n_keep + n_discard);
        llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx,  -n_discard);
@@ -182,18 +179,18 @@ int main(int argc, char ** argv) {
        }

        if (llama_decode(ctx, batch) != 0) {
-            LOG_ERR("%s: llama_decode() failed\n", __func__);
+            LOG_TEE("%s: llama_decode() failed\n", __func__);
            return 1;
        }

-        LOG_INF("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
+        LOG_TEE("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
    }

    {
        const int n_discard = n_past - n_ctx + n_predict;

        if (n_discard > 0) {
-            LOG_INF("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);
+            LOG_TEE("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);

            llama_kv_cache_seq_rm (ctx, 0, n_keep            , n_keep + n_discard);
            llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx,  -n_discard);
@@ -204,16 +201,17 @@ int main(int argc, char ** argv) {
        }
    }

-    LOG_INF("\n");
-    LOG_INF("%s: passkey = %d, inserted at position %d / %d (token pos: ~%d)\n", __func__, passkey, i_pos, n_junk, (i_pos * n_tokens_all) / n_junk);
-    LOG_INF("\n");
+    LOG_TEE("\n");
+    LOG_TEE("%s: passkey = %d, inserted at position %d / %d (token pos: ~%d)\n", __func__, passkey, i_pos, n_junk, (i_pos * n_tokens_all) / n_junk);
+    LOG_TEE("\n");

    // main loop

    int n_cur    = n_tokens_all;
    int n_decode = 0;

-    LOG_INF("%s", prompt_suffix.c_str());
+    LOG_TEE("%s", prompt_suffix.c_str());
+    fflush(stdout);

    const auto t_main_start = ggml_time_us();

@@ -224,12 +222,13 @@ int main(int argc, char ** argv) {

            // is it an end of generation?
            if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
-                LOG("\n");
+                LOG_TEE("\n");

                break;
            }

-            LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
+            LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
+            fflush(stdout);

            n_decode += 1;

@@ -244,22 +243,22 @@ int main(int argc, char ** argv) {

        // evaluate the current batch with the transformer model
        if (llama_decode(ctx, batch)) {
-            LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
+            fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
            return 1;
        }
    }

-    LOG("\n");
+    LOG_TEE("\n");

    const auto t_main_end = ggml_time_us();

-    LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
+    LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
            __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));

-    LOG("\n");
-    llama_perf_context_print(ctx);
+    LOG_TEE("\n");
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);

-    LOG("\n");
+    fprintf(stderr, "\n");

    llama_sampler_free(smpl);

--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -1,9 +1,7 @@
 #include "arg.h"
 #include "common.h"
-#include "log.h"
 #include "llama.h"

-#include <algorithm>
 #include <array>
 #include <atomic>
 #include <cmath>
@@ -43,7 +41,7 @@ static void write_logfile(
    }

    if (params.hellaswag) {
-        LOG_WRN("%s: logging results is not implemented for HellaSwag. No files will be written.\n", __func__);
+        fprintf(stderr, "%s: warning: logging results is not implemented for HellaSwag. No files will be written.\n", __func__);
        return;
    }

@@ -51,7 +49,7 @@ static void write_logfile(

    const bool success = fs_create_directory_with_parents(params.logdir);
    if (!success) {
-        LOG_WRN("%s: failed to create logdir %s, cannot write logfile\n",
+        fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
                __func__, params.logdir.c_str());
        return;
    }
@@ -60,7 +58,7 @@ static void write_logfile(
    FILE * logfile = fopen(logfile_path.c_str(), "w");

    if (logfile == NULL) {
-        LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
+        fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
        return;
    }

@@ -346,16 +344,16 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
    const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
    GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));

-    LOG_INF("%s: tokenizing the input ..\n", __func__);
+    fprintf(stderr, "%s: tokenizing the input ..\n", __func__);

    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);

    const int n_ctx = llama_n_ctx(ctx);

    if (int(tokens.size()) < 2*n_ctx) {
-        LOG_ERR("%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
+        fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
                n_ctx);
-        LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
+        fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
        return {std::move(tokens), 0., {}, {}};
    }

@@ -366,16 +364,16 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
    prob_history.resize(tokens.size());

    if (params.ppl_stride <= 0) {
-        LOG_ERR("%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
+        fprintf(stderr, "%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
        return {tokens, -1, logit_history, prob_history};
    }

    const int calc_chunk = n_ctx;

-    LOG_INF("%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk);
+    fprintf(stderr, "%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk);

    if (int(tokens.size()) <= calc_chunk) {
-        LOG_ERR("%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
+        fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
                tokens.size(), n_ctx, params.ppl_stride);
        return {tokens, -1, logit_history, prob_history};
    }
@@ -389,14 +387,14 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
    int count = 0;
    double nll = 0.0;

-    LOG_INF("%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
+    fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);

    for (int i = 0; i < n_chunk; ++i) {
        const int start =     i * params.ppl_stride;
        const int end   = start + calc_chunk;

        const int num_batches = (calc_chunk + n_batch - 1) / n_batch;
-        //LOG_DBG("%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches);
+        //fprintf(stderr, "%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches);

        std::vector<float> logits;

@@ -409,10 +407,10 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
            const int batch_start = start + j * n_batch;
            const int batch_size  = std::min(end - batch_start, n_batch);

-            //LOG_DBG("    Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
+            //fprintf(stderr, "    Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
            // TODO: use llama_batch.logits instead of relying on logits_all == true
            if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
-                //LOG_ERR("%s : failed to eval\n", __func__);
+                //fprintf(stderr, "%s : failed to eval\n", __func__);
                return {tokens, -1, logit_history, prob_history};
            }

@@ -436,17 +434,16 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &

        if (i == 0) {
            const float t_total = std::chrono::duration<float>(t_end - t_start).count();
-            LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
+            fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
            int total_seconds = (int)(t_total * n_chunk);
            if (total_seconds >= 60*60) {
-                LOG("%d hours ", total_seconds / (60*60));
+                fprintf(stderr, "%d hours ", total_seconds / (60*60));
                total_seconds = total_seconds % (60*60);
            }
-            LOG("%.2f minutes\n", total_seconds / 60.0);
+            fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
        }
-        LOG("\n");

-        //LOG_DBG("%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
+        //fprintf(stderr, "%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
        for (int j = n_ctx - params.ppl_stride - 1; j < n_ctx - 1; ++j) {

            // Calculate probability of next token, given the previous ones.
@@ -463,12 +460,13 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
        }
        // perplexity is e^(average negative log-likelihood)
        if (params.ppl_output_type == 0) {
-            LOG("[%d]%.4lf,", i + 1, std::exp(nll / count));
+            printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
        } else {
-            LOG("%8d  %.4lf\n", i*params.ppl_stride, std::exp(nll / count));
+            printf("%8d  %.4lf\n", i*params.ppl_stride, std::exp(nll / count));
        }
+        fflush(stdout);
    }
-    LOG("\n");
+    printf("\n");

    return {tokens, std::exp(nll / count), logit_history, prob_history};
 }
@@ -490,26 +488,26 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
    if (!params.logits_file.empty()) {
        logits_stream.open(params.logits_file.c_str(), std::ios::binary);
        if (!logits_stream.is_open()) {
-            LOG_ERR("%s: failed to open %s for writing\n", __func__, params.logits_file.c_str());
+            fprintf(stderr, "%s: failed to open %s for writing\n", __func__, params.logits_file.c_str());
            return {};
        }
-        LOG_INF("%s: saving all logits to %s\n", __func__, params.logits_file.c_str());
+        fprintf(stderr, "%s: saving all logits to %s\n", __func__, params.logits_file.c_str());
        logits_stream.write("_logits_", 8);
        logits_stream.write(reinterpret_cast<const char *>(&n_ctx), sizeof(n_ctx));
    }

    auto tim1 = std::chrono::high_resolution_clock::now();
-    LOG_INF("%s: tokenizing the input ..\n", __func__);
+    fprintf(stderr, "%s: tokenizing the input ..\n", __func__);

    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);

    auto tim2 = std::chrono::high_resolution_clock::now();
-    LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
+    fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());

    if (int(tokens.size()) < 2*n_ctx) {
-        LOG_ERR("%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
+        fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
                n_ctx);
-        LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
+        fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
        return {std::move(tokens), 0., {}, {}};
    }

@@ -542,7 +540,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
        logits.reserve((size_t)n_ctx * n_vocab);
    }

-    LOG_INF("%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);
+    fprintf(stderr, "%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);

    std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);

@@ -615,7 +613,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
            }

            if (llama_decode(ctx, batch)) {
-                LOG_INF("%s : failed to eval\n", __func__);
+                fprintf(stderr, "%s : failed to eval\n", __func__);
                return {tokens, -1, logit_history, prob_history};
            }

@@ -630,15 +628,14 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
            llama_synchronize(ctx);
            const auto t_end = std::chrono::high_resolution_clock::now();
            const float t_total = std::chrono::duration<float>(t_end - t_start).count();
-            LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
+            fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
            int total_seconds = (int)(t_total*n_chunk/n_seq);
            if (total_seconds >= 60*60) {
-                LOG("%d hours ", total_seconds / (60*60));
+                fprintf(stderr, "%d hours ", total_seconds / (60*60));
                total_seconds = total_seconds % (60*60);
            }
-            LOG("%.2f minutes\n", total_seconds / 60.0);
+            fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
        }
-        LOG("\n");

        for (int seq = 0; seq < n_seq_batch; seq++) {
            const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits_ith(ctx, seq*n_ctx + first);
@@ -659,18 +656,19 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par

            // perplexity is e^(average negative log-likelihood)
            if (params.ppl_output_type == 0) {
-                LOG("[%d]%.4lf,", i + seq + 1, std::exp(nll / count));
+                printf("[%d]%.4lf,", i + seq + 1, std::exp(nll / count));
            } else {
                double av = nll/count;
                double av2 = nll2/count - av*av;
                if (av2 > 0) av2 = sqrt(av2/(count-1));
-                LOG("%8d  %.4lf  %4lf  %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
+                printf("%8d  %.4lf  %4lf  %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
            }
        }
+        fflush(stdout);

        logits.clear();
    }
-    LOG("\n");
+    printf("\n");

    nll2 /= count;
    nll /= count;
@@ -678,9 +676,9 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
    nll2 -= nll * nll;
    if (nll2 > 0) {
        nll2 = sqrt(nll2/(count-1));
-        LOG_INF("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
+        printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
    } else {
-        LOG_ERR("Unexpected negative standard deviation of log(prob)\n");
+        printf("Unexpected negative standard deviation of log(prob)\n");
    }

    llama_batch_free(batch);
@@ -706,7 +704,7 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<

        const int ret = llama_decode(ctx, batch_view);
        if (ret != 0) {
-            LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
+            LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
            return false;
        }

@@ -792,15 +790,15 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
    }

    if (prompt_lines.size() % 6 != 0) {
-        LOG_ERR("%s : number of lines in prompt not a multiple of 6.\n", __func__);
+        fprintf(stderr, "%s : number of lines in prompt not a multiple of 6.\n", __func__);
        return;
    }

    size_t hs_task_count = prompt_lines.size()/6;
-    LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
+    fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);

    const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
-    LOG_INF("================================= is_spm = %d\n", is_spm);
+    fprintf(stderr, "================================= is_spm = %d\n", is_spm);

    // The tasks should be randomized so the score stabilizes quickly.
    bool randomize_tasks = true;
@@ -827,7 +825,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
        std::vector<llama_token> seq_tokens[4];
    };

-    LOG_INF("%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first")  );
+    fprintf(stderr, "%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first")  );

    // Select and read data from prompt lines
    std::vector<hs_data_t> hs_data(hs_task_count);
@@ -873,9 +871,9 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
        }
    }

-    LOG_INF("%s : calculating hellaswag score over selected tasks.\n", __func__);
+    fprintf(stderr, "%s : calculating hellaswag score over selected tasks.\n", __func__);

-    LOG("\ntask\tacc_norm\n");
+    printf("\ntask\tacc_norm\n");

    double acc = 0.0f;

@@ -943,7 +941,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
        }

        if (i0 == i1) {
-            LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
+            fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
            return;
        }

@@ -951,7 +949,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {

        // decode all tasks [i0, i1)
        if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
-            LOG_ERR("%s: llama_decode() failed\n", __func__);
+            fprintf(stderr, "%s: llama_decode() failed\n", __func__);
            return;
        }

@@ -1001,7 +999,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
                }
            }

-            //LOG("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_cur.gold_ending_idx);
+            //printf("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_cur.gold_ending_idx);

            // If the gold ending got the maximum logprobe add one accuracy point
            if (ending_logprob_max_idx == hs_cur.gold_ending_idx) {
@@ -1009,7 +1007,8 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
            }

            // Print the accumulated accuracy mean x 100
-            LOG("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0);
+            printf("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0);
+            fflush(stdout);
        }

        i0 = i1 - 1;
@@ -1017,7 +1016,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {

    llama_batch_free(batch);

-    LOG("\n");
+    printf("\n");
 }

 struct winogrande_entry {
@@ -1061,7 +1060,7 @@ static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string
            }
        }
        if (ipos != 4) {
-            LOG_ERR("%s: failed to find comma separators in <%s>\n", __func__, line.c_str());
+            printf("%s: failed to find comma separators in <%s>\n", __func__, line.c_str());
            continue;
        }
        auto sentence = line[comma_pos[0]+1] == '"' ? line.substr(comma_pos[0]+2, comma_pos[1] - comma_pos[0] - 3)
@@ -1075,13 +1074,13 @@ static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string
            if (sentence[where] == '_') break;
        }
        if (where == int(sentence.size())) {
-            LOG_ERR("%s: no _ in <%s>\n", __func__, sentence.c_str());
+            printf("%s: no _ in <%s>\n", __func__, sentence.c_str());
            continue;
        }
        std::istringstream stream(answer.c_str());
        int i_answer; stream >> i_answer;
        if (stream.fail() || i_answer < 1 || i_answer > 2) {
-            LOG_ERR("%s: failed to parse answer <%s>\n", __func__, answer.c_str());
+            printf("%s: failed to parse answer <%s>\n", __func__, answer.c_str());
            continue;
        }
        result.emplace_back();
@@ -1110,14 +1109,14 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {

    auto data = load_winogrande_from_csv(params.prompt);
    if (data.empty()) {
-        LOG_ERR("%s: no tasks\n", __func__);
+        fprintf(stderr, "%s: no tasks\n", __func__);
        return;
    }

-    LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, data.size());
+    fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, data.size());

    if (params.winogrande_tasks > 0 && params.winogrande_tasks < data.size()) {
-        LOG_INF("%s : selecting %zu random tasks\n", __func__, params.winogrande_tasks);
+        fprintf(stderr, "%s : selecting %zu random tasks\n", __func__, params.winogrande_tasks);
        std::mt19937 rng(1);
        std::vector<int> aux(data.size());
        for (int i = 0; i < int(data.size()); ++i) {
@@ -1135,7 +1134,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
        data = std::move(selected);
    }

-    LOG_INF("%s : tokenizing selected tasks\n", __func__);
+    fprintf(stderr, "%s : tokenizing selected tasks\n", __func__);

    for (auto & task : data) {
        task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, true);
@@ -1158,7 +1157,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
        task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], true).size();
    }

-    LOG_INF("%s : calculating winogrande score over selected tasks.\n", __func__);
+    fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__);

    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
    const int n_ctx   = llama_n_ctx(ctx);
@@ -1219,7 +1218,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
        }

        if (i0 == i1) {
-            LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
+            fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
            return;
        }

@@ -1227,7 +1226,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {

        // decode all tasks [i0, i1)
        if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
-            LOG_ERR("%s: llama_decode() failed\n", __func__);
+            fprintf(stderr, "%s: llama_decode() failed\n", __func__);
            return;
        }

@@ -1287,20 +1286,20 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
            ++n_done;

            // print the accumulated accuracy mean x 100
-            LOG("%zu\t%.4lf\t%10.6f  %10.6f  %d  %d\n", i+1, 100.0 * n_correct/n_done, score_1st, score_2nd, result, task.answer);
+            printf("%zu\t%.4lf\t%10.6f  %10.6f  %d  %d\n", i+1, 100.0 * n_correct/n_done, score_1st, score_2nd, result, task.answer);
+            fflush(stdout);
        }

        i0 = i1 - 1;
    }

-    LOG("\n");
+    printf("\n");

    if (n_done < 100) return;

    const float p = 1.f*n_correct/n_done;
    const float sigma = 100.f*sqrt(p*(1-p)/(n_done-1));
-
-    LOG_INF("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma);
+    printf("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma);
 }

 static bool deserialize_string(std::istream & in, std::string & str) {
@@ -1349,7 +1348,7 @@ struct multiple_choice_task {
 static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choice_task& task, bool log_error) {
    if (task.question.empty() || task.mc1.answers.empty()) {
        if (log_error) {
-            LOG_ERR("%s: found bad task with empty question and/or answers\n", __func__);
+            printf("%s: found bad task with empty question and/or answers\n", __func__);
        }
        return false;
    }
@@ -1357,7 +1356,7 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choic
    for (auto& answer : task.mc1.answers) {
        if (answer.empty()) {
            if (log_error) {
-                LOG_ERR("%s: found empty answer\n", __func__);
+                printf("%s: found empty answer\n", __func__);
            }
            return false;
        }
@@ -1411,14 +1410,14 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
    uint32_t n_task;
    strstream.read((char *)&n_task, sizeof(n_task));
    if (strstream.fail() || n_task == 0) {
-        LOG_ERR("%s: no tasks\n", __func__);
+        printf("%s: no tasks\n", __func__);
        return;
    }
-    LOG_INF("%s: there are %u tasks in prompt\n", __func__, n_task);
+    printf("%s: there are %u tasks in prompt\n", __func__, n_task);
    std::vector<uint32_t> task_pos(n_task);
    strstream.read((char *)task_pos.data(), task_pos.size()*sizeof(uint32_t));
    if (strstream.fail()) {
-        LOG_ERR("%s: failed to read task positions from prompt\n", __func__);
+        printf("%s: failed to read task positions from prompt\n", __func__);
        return;
    }

@@ -1426,21 +1425,21 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
    if (params.multiple_choice_tasks == 0 || params.multiple_choice_tasks >= (size_t)n_task) {
        // Use all tasks
        tasks.resize(n_task);
-        LOG_INF("%s: reading tasks", __func__);
+        printf("%s: reading tasks", __func__);
        int n_dot = std::max((int) n_task/100, 1);
        int i = 0;
        for (auto& task : tasks) {
            ++i;
            if (!task.deserialize(strstream)) {
-                LOG_ERR("%s: failed to read task %d of %u\n", __func__, i, n_task);
+                printf("%s: failed to read task %d of %u\n", __func__, i, n_task);
                return;
            }
-            if (i%n_dot == 0) LOG(".");
+            if (i%n_dot == 0) printf(".");
        }
-        LOG("done\n");
+        printf("done\n");
    }
    else {
-        LOG_INF("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task);
+        printf("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task);
        std::mt19937 rng(1);
        std::vector<int> aux(n_task);
        for (uint32_t i = 0; i < n_task; ++i) aux[i] = i;
@@ -1453,16 +1452,18 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
            aux.pop_back();
            strstream.seekg(task_pos[idx], std::ios::beg);
            if (!task.deserialize(strstream)) {
-                LOG_ERR("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]);
+                printf("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]);
                return;
            }
        }
        n_task = params.multiple_choice_tasks;
    }

-    LOG_INF("%s: preparing task data", __func__);
+    printf("%s: preparing task data", __func__);
+    fflush(stdout);
    if (n_task > 500) {
-        LOG("...");
+        printf("...");
+        fflush(stdout);
        std::atomic<int> counter(0);
        std::atomic<int> n_bad(0);
        auto prepare = [&counter, &n_bad, &tasks, ctx] () {
@@ -1486,10 +1487,11 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
        for (auto& w : workers) w = std::thread(prepare);
        prepare();
        for (auto& w : workers) w.join();
-        LOG("done\n");
+        printf("done\n");
+        fflush(stdout);
        int nbad = n_bad;
        if (nbad > 0) {
-            LOG_ERR("%s: found %d malformed tasks\n", __func__, nbad);
+            printf("%s: found %d malformed tasks\n", __func__, nbad);
            return;
        }
    } else {
@@ -1501,15 +1503,16 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
                return;
            }
            if (i_task%n_dot == 0) {
-                LOG(".");
+                printf(".");
+                fflush(stdout);
            }
        }
-        LOG("done\n");
+        printf("done\n");
    }

-    LOG_INF("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size());
+    printf("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size());

-    LOG("\ntask\tacc_norm\n");
+    printf("\ntask\tacc_norm\n");

    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
    const int n_ctx   = llama_n_ctx(ctx);
@@ -1588,7 +1591,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
        }

        if (i0 == i1) {
-            LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
+            fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
            return;
        }

@@ -1596,7 +1599,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params

        // decode all tasks [i0, i1)
        if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
-            LOG_ERR("%s: llama_decode() failed\n", __func__);
+            fprintf(stderr, "%s: llama_decode() failed\n", __func__);
            return;
        }

@@ -1620,13 +1623,13 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
        // compute the logprobs for each ending of the decoded tasks
        for (size_t i = i0; i < i1; ++i) {
            auto & cur_task = tasks[i];
-            //LOG("==== Evaluating <%s> with correct answer ", cur_task.question.c_str());
+            //printf("==== Evaluating <%s> with correct answer ", cur_task.question.c_str());
            //for (int j = 0; j < int(cur_task.mc1.labels.size()); ++j) {
            //    if (cur_task.mc1.labels[j] == 1) {
-            //        LOG("%d", j+1);
+            //        printf("%d", j+1);
            //    }
            //}
-            //LOG("\n    common_prefix: %zu\n", cur_task.common_prefix);
+            //printf("\n    common_prefix: %zu\n", cur_task.common_prefix);

            // get the logits of the last token of the common prefix
            std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*cur_task.i_logits, n_vocab*sizeof(float));
@@ -1638,13 +1641,13 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
                size_t count = 1;
                float  log_prob  = std::log(first_probs[cur_task.seq_tokens[s][cur_task.common_prefix]]);
                for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
-                    //LOG("        %zu  %g\n", ir, eval_results[ir]);
+                    //printf("        %zu  %g\n", ir, eval_results[ir]);
                    ++count;
                    log_prob += eval_results[ir++];
                }
                cur_task.log_probs[s] = log_prob / count;
-                //LOG("        Final: %g\n", log_prob / count);
-                //LOG("    <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count);
+                //printf("        Final: %g\n", log_prob / count);
+                //printf("    <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count);
            }

            // Find the ending with maximum logprob
@@ -1664,7 +1667,8 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
            ++n_done;

            // Print the accumulated accuracy mean x 100
-            LOG("%d\t%.8lf\n", n_done, 100.*n_correct/n_done);
+            printf("%d\t%.8lf\n", n_done, 100.*n_correct/n_done);
+            fflush(stdout);
        }

        i0 = i1 - 1;
@@ -1676,30 +1680,29 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params

    float p = 1.f*n_correct/n_done;
    float sigma = sqrt(p*(1-p)/(n_done-1));
-    LOG("\n");
-    LOG_INF("Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
+    printf("\n Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
    p = 1.f*n_done/n_tot_answers;
    sigma = sqrt(p*(1-p)/(n_done-1));
-    LOG_INF("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
+    printf("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);

-    LOG_INF("\n");
+    printf("\n");
 }

 static void kl_divergence(llama_context * ctx, const gpt_params & params) {
    if (params.logits_file.empty()) {
-        LOG_ERR("%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
+        fprintf(stderr, "%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
        return;
    }
    std::ifstream in(params.logits_file.c_str(), std::ios::binary);
    if (!in) {
-        LOG_ERR("%s: failed to open %s\n", __func__, params.logits_file.c_str());
+        fprintf(stderr, "%s: failed to open %s\n", __func__, params.logits_file.c_str());
        return;
    }
    {
        char check[9]; check[8] = 0;
        in.read(check, 8);
        if (in.fail() || strncmp("_logits_", check, 8) != 0) {
-            LOG_ERR("%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str());
+            fprintf(stderr, "%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str());
            return;
        }
    }
@@ -1707,7 +1710,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
    uint32_t n_ctx;
    in.read((char *)&n_ctx, sizeof(n_ctx));
    if (n_ctx > llama_n_ctx(ctx)) {
-        LOG_ERR("%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n",
+        fprintf(stderr, "%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n",
                __func__, params.logits_file.c_str(), n_ctx, params.n_ctx);
    }

@@ -1715,16 +1718,16 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
    in.read((char *)&n_vocab, sizeof(n_vocab));
    in.read((char *)&n_chunk, sizeof(n_chunk));
    if (in.fail()) {
-        LOG_ERR("%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str());
+        fprintf(stderr, "%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str());
        return;
    }
    if (n_vocab != llama_n_vocab(llama_get_model(ctx))) {
-        LOG_ERR("%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
+        fprintf(stderr, "%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
    }

    std::vector<llama_token> tokens(n_ctx * n_chunk);
    if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) {
-        LOG_ERR("%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
+        fprintf(stderr, "%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
        return;
    }

@@ -1773,7 +1776,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
        const auto t_start = std::chrono::high_resolution_clock::now();

        if (in.read((char *)log_probs_uint16.data(), log_probs_uint16.size()*sizeof(uint16_t)).fail()) {
-            LOG_ERR("%s: failed reading log-probs for chunk %d\n", __func__, i);
+            fprintf(stderr, "%s: failed reading log-probs for chunk %d\n", __func__, i);
            return;
        }

@@ -1794,7 +1797,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {

            // TODO: use llama_batch.logits instead of relying on logits_all == true
            if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
-                LOG_ERR("%s : failed to eval\n", __func__);
+                fprintf(stderr, "%s : failed to eval\n", __func__);
                return;
            }

@@ -1811,16 +1814,16 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {

        if (i == 0) {
            const float t_total = std::chrono::duration<float>(t_end - t_start).count();
-            LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
+            fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
            int total_seconds = (int)(t_total * n_chunk);
            if (total_seconds >= 60*60) {
-                LOG("%d hours ", total_seconds / (60*60));
+                fprintf(stderr, "%d hours ", total_seconds / (60*60));
                total_seconds = total_seconds % (60*60);
            }
-            LOG("%.2f minutes\n", total_seconds / 60.0);
+            fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
+
+            printf("\nchunk             PPL               ln(PPL(Q)/PPL(base))          KL Divergence              Δp RMS            Same top p\n");
        }
-        LOG("\n");
-        LOG("chunk             PPL               ln(PPL(Q)/PPL(base))          KL Divergence              Δp RMS            Same top p\n");

        const int first = n_ctx/2;
        const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
@@ -1829,77 +1832,79 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
        p_diff_ptr += n_ctx - 1 - first;
        kld_ptr    += n_ctx - 1 - first;

-        LOG("%4d", i+1);
+        printf("%4d", i+1);

        auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
        const double ppl_val = exp(log_ppl.first);
        const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
-        LOG("    %9.4lf ± %9.4lf", ppl_val, ppl_unc);
+        printf("    %9.4lf ± %9.4lf", ppl_val, ppl_unc);

        auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
        const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
        const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
        const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
-        LOG("    %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc);
+        printf("    %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc);

        auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
-        LOG("    %10.5lf ± %10.5lf", kl_div.first, kl_div.second);
+        printf("    %10.5lf ± %10.5lf", kl_div.first, kl_div.second);

        auto p_diff_mse   = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
        const double p_diff_rms_val = sqrt(p_diff_mse.first);
        const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
-        LOG("    %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
+        printf("    %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);

        double p_top_val = 1.*kld.n_same_top/kld.count;
        double p_top_unc = sqrt(p_top_val*(1 - p_top_val)/(kld.count - 1));
-        LOG("    %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc);
+        printf("    %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc);

-        LOG("\n");
+        printf("\n");
+
+        fflush(stdout);

        logits.clear();
    }
-    LOG("\n");
+    printf("\n");

    if (kld.count < 100) return; // we do not wish to do statistics on so few values

    std::sort(kld_values.begin(), kld_values.end());
    std::sort(p_diff_values.begin(), p_diff_values.end());

-    LOG("====== Perplexity statistics ======\n");
+    printf("====== Perplexity statistics ======\n");

    auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
    const double ppl_val = exp(log_ppl.first);
    const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
-    LOG("Mean PPL(Q)                   : %10.6lf ± %10.6lf\n", ppl_val, ppl_unc);
+    printf("Mean PPL(Q)                   : %10.6lf ± %10.6lf\n", ppl_val, ppl_unc);

    auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
    const double ppl_base_val = exp(log_ppl_base.first);
    const double ppl_base_unc = ppl_base_val * log_ppl_base.second; // ppl_base_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_base.second ** 2 )
-    LOG("Mean PPL(base)                : %10.6lf ± %10.6lf\n", ppl_base_val, ppl_base_unc);
+    printf("Mean PPL(base)                : %10.6lf ± %10.6lf\n", ppl_base_val, ppl_base_unc);

    const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
-    // LOG("Cov(ln(PPL(Q)), ln(PPL(base))): %10.6lf\n", log_ppl_cov);
+    // printf("Cov(ln(PPL(Q)), ln(PPL(base))): %10.6lf\n", log_ppl_cov);
    const double log_ppl_cor = log_ppl_cov / (log_ppl.second*log_ppl_base.second);
-    LOG("Cor(ln(PPL(Q)), ln(PPL(base))): %6.2lf%%\n", 100.0*log_ppl_cor);
+    printf("Cor(ln(PPL(Q)), ln(PPL(base))): %6.2lf%%\n", 100.0*log_ppl_cor);

    const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
    const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
-    LOG("Mean ln(PPL(Q)/PPL(base))     : %10.6lf ± %10.6lf\n", log_ppl_ratio_val, log_ppl_ratio_unc);
+    printf("Mean ln(PPL(Q)/PPL(base))     : %10.6lf ± %10.6lf\n", log_ppl_ratio_val, log_ppl_ratio_unc);

    const double ppl_ratio_val = exp(log_ppl_ratio_val);
    const double ppl_ratio_unc = ppl_ratio_val * log_ppl_ratio_unc; // ppl_ratio_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_ratio.second ** 2 )
-    LOG("Mean PPL(Q)/PPL(base)         : %10.6lf ± %10.6lf\n", ppl_ratio_val, ppl_ratio_unc);
+    printf("Mean PPL(Q)/PPL(base)         : %10.6lf ± %10.6lf\n", ppl_ratio_val, ppl_ratio_unc);

    const double ppl_cov = ppl_val * ppl_base_val * log_ppl_cov;
    const double ppl_diff_val = ppl_val - ppl_base_val;
    const double ppl_diff_unc = sqrt(ppl_unc*ppl_unc + ppl_base_unc*ppl_base_unc - 2.0*ppl_cov);
-    LOG("Mean PPL(Q)-PPL(base)         : %10.6lf ± %10.6lf\n", ppl_diff_val, ppl_diff_unc);
+    printf("Mean PPL(Q)-PPL(base)         : %10.6lf ± %10.6lf\n", ppl_diff_val, ppl_diff_unc);

-    LOG("\n");
+    printf("\n");

-    LOG("====== KL divergence statistics ======\n");
+    printf("====== KL divergence statistics ======\n");
    auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
-    LOG("Mean    KLD: %10.6lf ± %10.6lf\n", kl_div.first, kl_div.second);
+    printf("Mean    KLD: %10.6lf ± %10.6lf\n", kl_div.first, kl_div.second);
    auto kld_median = kld_values.size()%2 == 0 ? 0.5f*(kld_values[kld_values.size()/2] + kld_values[kld_values.size()/2-1])
                                               : kld_values[kld_values.size()/2];

@@ -1911,49 +1916,50 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
        return (1 - p)*values[ip] + p*values[std::min(ip+1, values.size()-1)];
    };

-    LOG("Maximum KLD: %10.6f\n", kld_values.back());
-    LOG("99.9%%   KLD: %10.6f\n", percentile(kld_values, 0.999f));
-    LOG("99.0%%   KLD: %10.6f\n", percentile(kld_values, 0.990f));
-    LOG("99.0%%   KLD: %10.6f\n", percentile(kld_values, 0.990f));
-    LOG("Median  KLD: %10.6f\n", kld_median);
-    LOG("10.0%%   KLD: %10.6f\n", percentile(kld_values, 0.100f));
-    LOG(" 5.0%%   KLD: %10.6f\n", percentile(kld_values, 0.050f));
-    LOG(" 1.0%%   KLD: %10.6f\n", percentile(kld_values, 0.010f));
-    LOG("Minimum KLD: %10.6f\n", kld_values.front());
+    // KLD distribution summary; kld_values is presumably sorted ascending (back() is reported as the maximum) - confirm at the sort site
+    printf("Maximum KLD: %10.6f\n", kld_values.back());
+    printf("99.9%%   KLD: %10.6f\n", percentile(kld_values, 0.999f));
+    printf("99.0%%   KLD: %10.6f\n", percentile(kld_values, 0.990f)); // printed once: the second identical 99.0% row carried no extra information
+    printf("Median  KLD: %10.6f\n", kld_median);
+    printf("10.0%%   KLD: %10.6f\n", percentile(kld_values, 0.100f));
+    printf(" 5.0%%   KLD: %10.6f\n", percentile(kld_values, 0.050f));
+    printf(" 1.0%%   KLD: %10.6f\n", percentile(kld_values, 0.010f));
+    printf("Minimum KLD: %10.6f\n", kld_values.front());

-    LOG("\n");
+    printf("\n");

-    LOG("====== Token probability statistics ======\n");
+    printf("====== Token probability statistics ======\n");

    auto p_diff = mean_and_uncertainty(kld.sum_p_diff, kld.sum_p_diff2, kld.count);
-    LOG("Mean    Δp: %6.3lf ± %5.3lf %%\n",  100.0*p_diff.first, 100.0*p_diff.second);
+    printf("Mean    Δp: %6.3lf ± %5.3lf %%\n",  100.0*p_diff.first, 100.0*p_diff.second);

    auto p_diff_median = p_diff_values.size()%2 == 0 ? 0.5f*(p_diff_values[p_diff_values.size()/2] + p_diff_values[p_diff_values.size()/2-1])
                                               : p_diff_values[p_diff_values.size()/2];

-    LOG("Maximum Δp: %6.3lf%%\n",  100.0*p_diff_values.back());
-    LOG("99.9%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.999f));
-    LOG("99.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.990f));
-    LOG("95.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.950f));
-    LOG("90.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.900f));
-    LOG("75.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.750f));
-    LOG("Median  Δp: %6.3lf%%\n",  100.0*p_diff_median);
-    LOG("25.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.250f));
-    LOG("10.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.100f));
-    LOG(" 5.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.050f));
-    LOG(" 1.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.010f));
-    LOG(" 0.1%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.001f));
-    LOG("Minimum Δp: %6.3lf%%\n",  100.0*p_diff_values.front());
+    printf("Maximum Δp: %6.3lf%%\n",  100.0*p_diff_values.back());
+    printf("99.9%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.999f));
+    printf("99.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.990f));
+    printf("95.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.950f));
+    printf("90.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.900f));
+    printf("75.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.750f));
+    printf("Median  Δp: %6.3lf%%\n",  100.0*p_diff_median);
+    printf("25.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.250f));
+    printf("10.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.100f));
+    printf(" 5.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.050f));
+    printf(" 1.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.010f));
+    printf(" 0.1%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.001f));
+    printf("Minimum Δp: %6.3lf%%\n",  100.0*p_diff_values.front());

    auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
-    // LOG("MSE Δp    : %10.6lf ± %10.6lf\n", p_diff_mse.first, p_diff_mse.second);
+    // printf("MSE Δp    : %10.6lf ± %10.6lf\n", p_diff_mse.first, p_diff_mse.second);

    const double p_diff_rms_val = sqrt(p_diff_mse.first);
    const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
-    LOG("RMS Δp    : %6.3lf ± %5.3lf %%\n", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
+    printf("RMS Δp    : %6.3lf ± %5.3lf %%\n", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);

    const double same_top_p = 1.0*kld.n_same_top/kld.count;
-    LOG("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1)));
+    printf("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1)));
+
 }

 int main(int argc, char ** argv) {
@@ -1966,12 +1972,10 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    gpt_init();
-
    const int32_t n_ctx = params.n_ctx;

    if (n_ctx <= 0) {
-        LOG_ERR("%s: perplexity tool requires '--ctx-size' > 0\n", __func__);
+        fprintf(stderr, "%s: perplexity tool requires '--ctx-size' > 0\n", __func__);
        return 1;
    }

@@ -1996,11 +2000,15 @@ int main(int argc, char ** argv) {
    }

    if (params.ppl_stride > 0) {
-        LOG_INF("Will perform strided perplexity calculation -> adjusting context size from %d to %d\n",
+        fprintf(stderr, "Will perform strided perplexity calculation -> adjusting context size from %d to %d\n",
                params.n_ctx, params.n_ctx + params.ppl_stride/2);
        params.n_ctx += params.ppl_stride/2;
    }

+    print_build_info();
+
+    LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
+
    llama_backend_init();
    llama_numa_init(params.numa);

@@ -2010,21 +2018,21 @@ int main(int argc, char ** argv) {
    llama_model * model = llama_init.model;
    llama_context * ctx = llama_init.context;
    if (model == NULL) {
-        LOG_ERR("%s: unable to load model\n", __func__);
+        fprintf(stderr, "%s: error: unable to load model\n", __func__);
        return 1;
    }

    const int n_ctx_train = llama_n_ctx_train(model);

    if (params.n_ctx > n_ctx_train) {
-        LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
+        fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
                __func__, n_ctx_train, params.n_ctx);
    }

    // print system information
    {
-        LOG_INF("\n");
-        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+        fprintf(stderr, "\n");
+        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
    }

    struct results_perplexity results;
@@ -2040,9 +2048,8 @@ int main(int argc, char ** argv) {
        results = perplexity(ctx, params, n_ctx);
    }

-    LOG("\n");
-    llama_perf_context_print(ctx);
-
+    LOG_TEE("\n");
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
    write_logfile(ctx, params, model, results);

    llama_free(ctx);
--- a/examples/quantize/CMakeLists.txt
+++ b/examples/quantize/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(TARGET llama-quantize)
 add_executable(${TARGET} quantize.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(${TARGET} PRIVATE ../../common)
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/retrieval/retrieval.cpp
+++ b/examples/retrieval/retrieval.cpp
@@ -1,16 +1,14 @@
 #include "arg.h"
 #include "common.h"
-#include "log.h"
 #include "llama.h"

 #include <algorithm>
 #include <fstream>
-#include <iostream> // TODO: remove me

 static void print_usage(int, char ** argv) {
-    LOG("\nexample usage:\n");
-    LOG("\n    %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]);
-    LOG("\n");
+    LOG_TEE("\nexample usage:\n");
+    LOG_TEE("\n    %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]);
+    LOG_TEE("\n");
 }

 struct chunk {
@@ -19,7 +17,7 @@ struct chunk {
    // original file position
    size_t filepos;
    // original text data
-    std::string textdata;
+    std::string textdata = "";
    // tokenized text data
    std::vector<llama_token> tokens;
    // embedding
@@ -33,14 +31,14 @@ static std::vector<chunk> chunk_file(const std::string & filename, int chunk_siz
    std::ifstream f(filename.c_str());

    if (!f.is_open()) {
-        LOG_ERR("could not open file %s\n", filename.c_str());
+        fprintf(stderr, "Error: could not open file %s\n", filename.c_str());
        return chunks;
    }

    chunk current_chunk;
    char buffer[1024];
    int64_t filepos = 0;
-    std::string current;
+    std::string current = "";
    while (f.read(buffer, 1024)) {
        current += std::string(buffer, f.gcount());
        size_t pos;
@@ -86,9 +84,9 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
    llama_kv_cache_clear(ctx);

    // run model
-    LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
+    fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
    if (llama_decode(ctx, batch) < 0) {
-        LOG_ERR("%s : failed to decode\n", __func__);
+        fprintf(stderr, "%s : failed to decode\n", __func__);
    }

    for (int i = 0; i < batch.n_tokens; i++) {
@@ -101,7 +99,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
        if (embd == NULL) {
            embd = llama_get_embeddings_ith(ctx, i);
            if (embd == NULL) {
-                LOG_ERR("%s: failed to get embeddings for token %d\n", __func__, i);
+                fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
                continue;
            }
        }
@@ -118,24 +116,24 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    gpt_init();
-
    // For BERT models, batch size must be equal to ubatch size
    params.n_ubatch = params.n_batch;
    params.embedding = true;

    if (params.chunk_size <= 0) {
-        LOG_ERR("chunk_size must be positive\n");
+        fprintf(stderr, "chunk_size must be positive\n");
        return 1;
    }
    if (params.context_files.empty()) {
-        LOG_ERR("context_files must be specified\n");
+        fprintf(stderr, "context_files must be specified\n");
        return 1;
    }

-    LOG_INF("processing files:\n");
+    print_build_info();
+
+    printf("processing files:\n");
    for (auto & context_file : params.context_files) {
-        LOG_INF("%s\n", context_file.c_str());
+        printf("%s\n", context_file.c_str());
    }

    std::vector<chunk> chunks;
@@ -143,7 +141,7 @@ int main(int argc, char ** argv) {
        std::vector<chunk> file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator);
        chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end());
    }
-    LOG_INF("Number of chunks: %ld\n", chunks.size());
+    printf("Number of chunks: %zu\n", chunks.size()); // %zu matches size_t; %ld is a mismatch where long is 32-bit (e.g. 64-bit Windows)

    llama_backend_init();
    llama_numa_init(params.numa);
@@ -155,7 +153,7 @@ int main(int argc, char ** argv) {
    llama_context * ctx = llama_init.context;

    if (model == NULL) {
-        LOG_ERR("%s: unable to load model\n", __func__);
+        fprintf(stderr, "%s: error: unable to load model\n", __func__);
        return 1;
    }

@@ -164,19 +162,19 @@ int main(int argc, char ** argv) {

    const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
    if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
-        LOG_ERR("%s: pooling type NONE not supported\n", __func__);
+        fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
        return 1;
    }

    if (n_ctx > n_ctx_train) {
-        LOG_WRN("%s: warning: model was trained on only %d context tokens (%d specified)\n",
+        fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
                __func__, n_ctx_train, n_ctx);
    }

    // print system information
    {
-        LOG_INF("\n");
-        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+        fprintf(stderr, "\n");
+        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
    }

    // max batch size
@@ -187,7 +185,7 @@ int main(int argc, char ** argv) {
    for (auto & chunk : chunks) {
        auto inp = ::llama_tokenize(ctx, chunk.textdata, true, false);
        if (inp.size() > n_batch) {
-            LOG_ERR("%s: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
+            fprintf(stderr, "%s: error: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
                    __func__, (long long int) inp.size(), (long long int) n_batch);
            return 1;
        }
@@ -201,12 +199,12 @@ int main(int argc, char ** argv) {
    // tokenization stats
    if (params.verbose_prompt) {
        for (int i = 0; i < (int) chunks.size(); i++) {
-            LOG_INF("%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str());
-            LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size());
+            fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str());
+            fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size());
            for (int j = 0; j < (int) chunks[i].tokens.size(); j++) {
-                LOG_INF("%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str());
+                fprintf(stderr, "%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str());
            }
-            LOG_INF("\n\n");
+            fprintf(stderr, "\n\n");
        }
    }

@@ -258,7 +256,7 @@ int main(int argc, char ** argv) {
    // start loop, receive query and return top k similar chunks based on cosine similarity
    std::string query;
    while (true) {
-        LOG("Enter query: ");
+        printf("Enter query: ");
        std::getline(std::cin, query);
        std::vector<int32_t> query_tokens = llama_tokenize(ctx, query, true);

@@ -282,19 +280,19 @@ int main(int argc, char ** argv) {
                return a.second > b.second;
            });

-            LOG("Top %d similar chunks:\n", params.sparams.top_k);
+            printf("Top %d similar chunks:\n", params.sparams.top_k);
            for (int i = 0; i < std::min(params.sparams.top_k, (int) chunks.size()); i++) {
-                LOG("filename: %s\n", chunks[similarities[i].first].filename.c_str());
-                LOG("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos);
-                LOG("similarity: %f\n", similarities[i].second);
-                LOG("textdata:\n%s\n", chunks[similarities[i].first].textdata.c_str());
-                LOG("--------------------\n");
+                printf("filename: %s\n", chunks[similarities[i].first].filename.c_str());
+                printf("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos);
+                printf("similarity: %f\n", similarities[i].second);
+                printf("textdata:\n%s\n", chunks[similarities[i].first].textdata.c_str());
+                printf("--------------------\n");
            }
        }
    }

-    LOG("\n");
-    llama_perf_context_print(ctx);
+    LOG_TEE("\n");
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);

    // clean up
    llama_batch_free(query_batch);
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(TARGET llama-server)
-
-option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
+option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
+option(LLAMA_SERVER_SSL     "Build SSL support for the server"        OFF)

 include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})

@@ -30,7 +30,6 @@ set(PUBLIC_ASSETS
    system-prompts.js
    prompt-formats.js
    json-schema-to-grammar.mjs
-    loading.html
 )

 foreach(asset ${PUBLIC_ASSETS})
@@ -46,6 +45,9 @@ endforeach()

 add_executable(${TARGET} ${TARGET_SRCS})
 install(TARGETS ${TARGET} RUNTIME)
+target_compile_definitions(${TARGET} PRIVATE
+    SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
+)

 target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})

--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -121,6 +121,7 @@ The project is under active development, and we are [looking for feedback and co
 | `-to, --timeout N` | server read/write timeout in seconds (default: 600) |
 | `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
 | `-spf, --system-prompt-file FNAME` | set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications |
+| `--log-format {text, json}` | log output format: json or text (default: json) |
 | `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
 | `--no-slots` | disables slots monitoring endpoint (default: enabled)<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
 | `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
@@ -406,44 +407,9 @@ Notice that each `probs` is an array of length `n_probs`.

    *Options:*

-    `content`: (Required) The text to tokenize.
+    `content`: Set the text to tokenize.

-    `add_special`: (Optional) Boolean indicating if special tokens, i.e. `BOS`, should be inserted.  Default: `false`
-
-    `with_pieces`: (Optional) Boolean indicating whether to return token pieces along with IDs.  Default: `false`
-
-**Response:**
-
-Returns a JSON object with a `tokens` field containing the tokenization result. The `tokens` array contains either just token IDs or objects with `id` and `piece` fields, depending on the `with_pieces` parameter. The piece field is a string if the piece is valid unicode or a list of bytes otherwise.
-
-
-If `with_pieces` is `false`:
-```json
-{
-  "tokens": [123, 456, 789]
-}
-```
-
-If `with_pieces` is `true`:
-```json
-{
-  "tokens": [
-    {"id": 123, "piece": "Hello"},
-    {"id": 456, "piece": " world"},
-    {"id": 789, "piece": "!"}
-  ]
-}
-```
-
-With input 'á' (utf8 hex: C3 A1) on tinyllama/stories260k
-```json
-{
-  "tokens": [
-    {"id": 198, "piece": [195]}, // hex C3
-    {"id": 164, "piece": [161]} // hex A1
-  ]
-}
-```
+    `add_special`: Boolean indicating if special tokens, i.e. `BOS`, should be inserted.  Default: `false`

 ### POST `/detokenize`: Convert tokens to text

--- a/examples/server/bench/README.md
+++ b/examples/server/bench/README.md
@@ -40,6 +40,7 @@ server --host localhost --port 8080 \
  --parallel 8 \
  --batch-size 512 \
  --ctx-size 4096 \
+  --log-format text \
  -ngl 33
 ```

--- a/examples/server/bench/bench.py
+++ b/examples/server/bench/bench.py
@@ -272,6 +272,7 @@ def start_server_background(args):
    server_args.append('--cont-batching')
    server_args.append('--metrics')
    server_args.append('--flash-attn')
+    server_args.extend(['--log-format', "text"])
    args = [str(arg) for arg in [server_path, *server_args]]
    print(f"bench: starting server with: {' '.join(args)}")
    pkwargs = {
--- a/examples/server/public/loading.html
+++ b/examples/server/public/loading.html
@@ -1,12 +0,0 @@
-<!DOCTYPE html>
-<html>
-    <head>
-        <meta http-equiv="refresh" content="5">
-    </head>
-    <body>
-        <div id="loading">
-            The model is loading. Please wait.<br/>
-            The user interface will appear soon.
-        </div>
-    </body>
-</html>
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
--- a/examples/server/tests/.gitignore
+++ b/examples/server/tests/.gitignore
@@ -1 +0,0 @@
-.venv
--- a/examples/server/tests/README.md
+++ b/examples/server/tests/README.md
@@ -40,6 +40,7 @@ It's possible to override some scenario steps values with environment variables:
 | `PORT`                   | `context.server_port` to set the listening port of the server during scenario, default: `8080` |
 | `LLAMA_SERVER_BIN_PATH`  | to change the server binary path, default: `../../../build/bin/llama-server`                         |
 | `DEBUG`                  | "ON" to enable steps and server verbose mode `--verbose`                                       |
+| `SERVER_LOG_FORMAT_JSON` | if set, switch server logs to JSON format                                                      |
 | `N_GPU_LAYERS`           | number of model layers to offload to VRAM `-ngl --n-gpu-layers`                                |

 ### Run @bug, @wip or @wrong_usage annotated scenario
--- a/examples/server/tests/features/server.feature
+++ b/examples/server/tests/features/server.feature
@@ -105,14 +105,6 @@ Feature: llama.cpp server
    Given first token is removed
    Then  tokens can be detokenized

-  Scenario: Tokenize with pieces
-    When  tokenizing with pieces:
-    """
-    What is the capital of Germany?
-    媽
-    """
-    Then  tokens are given with pieces
-
  Scenario: Models available
    Given available models
    Then  1 models are supported
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -1,6 +1,3 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
 import asyncio
 import json
 import os
@@ -700,32 +697,6 @@ def step_tokenize_set_add_special(context):
    context.tokenize_add_special = True


-@step("tokenizing with pieces")
-@async_run_until_complete
-async def step_tokenize_with_pieces(context):
-    context.tokenized_text = context_text(context)
-    async with aiohttp.ClientSession() as session:
-        tokenize_args = {"content": context.tokenized_text, "with_pieces": True}
-        if getattr(context, "tokenize_add_special", None) is not None:
-            tokenize_args["add_special"] = context.tokenize_add_special
-
-        async with session.post(
-            f"{context.base_url}/tokenize", json=tokenize_args
-        ) as response:
-            assert response.status == 200
-            tokenize_json = await response.json()
-            context.tokens_with_pieces = tokenize_json["tokens"]
-
-
-@step("tokens are given with pieces")
-@async_run_until_complete
-async def step_tokenize_with_pieces(context):
-    # Verify that the response contains both token IDs and pieces
-    assert all(
-        "id" in token and "piece" in token for token in context.tokens_with_pieces
-    )
-
-
@step('tokenizing')
@async_run_until_complete
 async def step_tokenize(context):
@@ -1020,8 +991,6 @@ async def oai_chat_completions(user_prompt,
                            event_data = line.split(': ', 1)
                            assert event_data[0] == 'data', f'Bad event code received: ```{event_data}```'
                            chunk_raw = event_data[1]
-                            if chunk_raw == '[DONE]':
-                                break

                            chunk = json.loads(chunk_raw)
                            assert len(chunk['choices']) == 1, f"no choices provided, line ```{line}```"
@@ -1372,6 +1341,8 @@ def start_server_background(context):
        server_args.append('--verbose')
    if context.lora_file:
        server_args.extend(['--lora', context.lora_file])
+    if 'SERVER_LOG_FORMAT_JSON' not in os.environ:
+        server_args.extend(['--log-format', "text"])

    args = [str(arg) for arg in [context.server_path, *server_args]]
    print(f"bench: starting server with: {' '.join(args)}")
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -1,8 +1,7 @@
 #pragma once

-#include "common.h"
-#include "log.h"
 #include "llama.h"
+#include "common.h"

 #ifndef NDEBUG
 // crash the server in debug mode, otherwise send an http 500 error
@@ -16,10 +15,10 @@
 #define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"

-#include <random>
-#include <sstream>
 #include <string>
 #include <vector>
+#include <sstream>
+#include <random>

 #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"

@@ -36,6 +35,32 @@ enum error_type {
    ERROR_TYPE_NOT_SUPPORTED, // custom error
 };

+extern bool server_verbose;
+extern bool server_log_json;
+
+#ifndef SERVER_VERBOSE
+#define SERVER_VERBOSE 1
+#endif
+
+#if SERVER_VERBOSE != 1
+#define LOG_VERBOSE(MSG, ...)
+#else
+#define LOG_VERBOSE(MSG, ...)                                            \
+    do                                                                   \
+    {                                                                    \
+        if (server_verbose)                                              \
+        {                                                                \
+            server_log("VERB", __func__, __LINE__, MSG, __VA_ARGS__); \
+        }                                                                \
+    } while (0)
+#endif
+
+#define LOG_ERROR(  MSG, ...) server_log("ERR",  __func__, __LINE__, MSG, __VA_ARGS__)
+#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
+#define LOG_INFO(   MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
+
+static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra);
+
 template <typename T>
 static T json_value(const json & body, const std::string & key, const T & default_value) {
    // Fallback null to default value
@@ -43,7 +68,9 @@ static T json_value(const json & body, const std::string & key, const T & defaul
        try {
            return body.at(key);
        } catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const &) {
-            LOG_WRN("Wrong type supplied for parameter '%s'. Expected '%s', using default value\n", key.c_str(), json(default_value).type_name());
+            std::stringstream ss;
+            ss << "Wrong type supplied for parameter '" << key << "'. Expected '" << json(default_value).type_name() << "', using default value.";
+            LOG_WARNING(ss.str().c_str(), body);
            return default_value;
        }
    } else {
@@ -51,6 +78,48 @@ static T json_value(const json & body, const std::string & key, const T & defaul
    }
 }

+static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra) { // write one log record to stdout: a JSON object when server_log_json is set, otherwise a "LEVEL [function] message | key=value ..." text line
+    std::stringstream ss_tid;
+    ss_tid << std::this_thread::get_id(); // thread id is stringified through a stream so it can be stored as a JSON string
+    json log = json{ // fields present in both output formats
+        {"tid",       ss_tid.str()},
+        {"timestamp", time(nullptr)},
+    };
+
+    if (server_log_json) { // JSON mode: level/function/line/msg become fields of the record
+        log.merge_patch({
+            {"level",    level},
+            {"function", function},
+            {"line",     line},
+            {"msg",      message},
+        });
+
+        if (!extra.empty()) {
+            log.merge_patch(extra); // caller-supplied fields are merged last, so they win over same-named keys set above
+        }
+
+        printf("%s\n", log.dump(-1, ' ', false, json::error_handler_t::replace).c_str()); // indent -1 = compact single line; invalid UTF-8 is replaced instead of throwing
+    } else { // text mode: note that `line` is not part of the output in this branch
+        char buf[1024];
+        snprintf(buf, 1024, "%4s [%24s] %s", level, function, message); // fixed-size buffer: a prefix + message longer than 1023 bytes is silently truncated
+
+        if (!extra.empty()) {
+            log.merge_patch(extra);
+        }
+        std::stringstream ss;
+        ss << buf << " |";
+        for (const auto & el : log.items()) // append tid, timestamp and every extra field as key=value
+        {
+            const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace); // values are JSON-encoded, so strings keep their quotes
+            ss << " " << el.key() << "=" << value;
+        }
+
+        const std::string str = ss.str();
+        printf("%.*s\n", (int)str.size(), str.data()); // explicit length: output is bounded by str.size() rather than by a NUL terminator
+    }
+    fflush(stdout); // flush after every record so output is not held back in the stdio buffer
+}
+
 //
 // chat template utils
 //
@@ -84,9 +153,8 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
        chat.push_back({role, content});
    }

-    const auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true);
-    LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str());
-
+    auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true);
+    LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
    return formatted_chat;
 }

@@ -175,7 +243,10 @@ static std::string random_string() {
 }

 static std::string gen_chatcmplid() {
-    return "chatcmpl-" + random_string();
+    std::stringstream chatcmplid;
+    chatcmplid << "chatcmpl-" << random_string();
+
+    return chatcmplid.str();
 }

 //
@@ -216,7 +287,7 @@ static size_t find_partial_stop_string(const std::string &stop, const std::strin
    return std::string::npos;
 }

-static bool json_is_array_of_numbers(const json & data) {
+static bool json_is_array_of_numbers(json data) {
    if (data.is_array()) {
        for (const auto & e : data) {
            if (!e.is_number()) {
@@ -292,13 +363,15 @@ static json probs_vector_to_json(const llama_context * ctx, const std::vector<co
    return out;
 }

-static bool server_sent_event(httplib::DataSink & sink, const char * event, const json & data) {
+static bool server_sent_event(httplib::DataSink & sink, const char * event, json & data) {
    const std::string str =
        std::string(event) + ": " +
        data.dump(-1, ' ', false, json::error_handler_t::replace) +
-        "\n\n"; // note: these newlines are important (not sure why though, if you know, add a comment to explain)
+        "\n\n";

-    LOG_DBG("data stream, to_send: %s", str.c_str());
+    LOG_VERBOSE("data stream", {
+        { "to_send", str }
+    });

    return sink.write(str.c_str(), str.size());
 }
@@ -352,7 +425,7 @@ static json oaicompat_completion_params_parse(

    // Params supported by OAI but unsupported by llama.cpp
    static const std::vector<std::string> unsupported_params { "tools", "tool_choice" };
-    for (const auto & param : unsupported_params) {
+    for (auto & param : unsupported_params) {
        if (body.contains(param)) {
            throw std::runtime_error("Unsupported param: " + param);
        }
@@ -371,7 +444,7 @@ static json oaicompat_completion_params_parse(
    return llama_params;
 }

-static json format_final_response_oaicompat(const json & request, const json & result, const std::string & completion_id, bool streaming = false, bool verbose = false) {
+static json format_final_response_oaicompat(const json & request, json result, const std::string & completion_id, bool streaming = false) {
    bool stopped_word        = result.count("stopped_word") != 0;
    bool stopped_eos         = json_value(result, "stopped_eos", false);
    int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
@@ -408,8 +481,7 @@ static json format_final_response_oaicompat(const json & request, const json & r
        {"id", completion_id}
    };

-    // extra fields for debugging purposes
-    if (verbose) {
+    if (server_verbose) {
        res["__verbose"] = result;
    }

@@ -421,7 +493,7 @@ static json format_final_response_oaicompat(const json & request, const json & r
 }

 // return value is vector as there is one case where we might need to generate two responses
-static std::vector<json> format_partial_response_oaicompat(const json & result, const std::string & completion_id) {
+static std::vector<json> format_partial_response_oaicompat(json result, const std::string & completion_id) {
    if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
        return std::vector<json>({result});
    }
@@ -523,7 +595,7 @@ static std::vector<json> format_partial_response_oaicompat(const json & result,
 static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) {
    json data = json::array();
    int i = 0;
-    for (const auto & elem : embeddings) {
+    for (auto & elem : embeddings) {
        data.push_back(json{
            {"embedding", json_value(elem, "embedding", json::array())},
            {"index",     i++},
@@ -544,40 +616,7 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
    return res;
 }

-static bool is_valid_utf8(const std::string & str) {
-    const unsigned char* bytes = reinterpret_cast<const unsigned char*>(str.data());
-    const unsigned char* end = bytes + str.length();
-
-    while (bytes < end) {
-        if (*bytes <= 0x7F) {
-            // 1-byte sequence (0xxxxxxx)
-            bytes++;
-        } else if ((*bytes & 0xE0) == 0xC0) {
-            // 2-byte sequence (110xxxxx 10xxxxxx)
-            if (end - bytes < 2 || (bytes[1] & 0xC0) != 0x80)
-                return false;
-            bytes += 2;
-        } else if ((*bytes & 0xF0) == 0xE0) {
-            // 3-byte sequence (1110xxxx 10xxxxxx 10xxxxxx)
-            if (end - bytes < 3 || (bytes[1] & 0xC0) != 0x80 || (bytes[2] & 0xC0) != 0x80)
-                return false;
-            bytes += 3;
-        } else if ((*bytes & 0xF8) == 0xF0) {
-            // 4-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
-            if (end - bytes < 4 || (bytes[1] & 0xC0) != 0x80 ||
-                (bytes[2] & 0xC0) != 0x80 || (bytes[3] & 0xC0) != 0x80)
-                return false;
-            bytes += 4;
-        } else {
-            // Invalid UTF-8 lead byte
-            return false;
-        }
-    }
-
-    return true;
-}
-
-static json format_tokenizer_response(const json & tokens) {
+static json format_tokenizer_response(const std::vector<llama_token> & tokens) {
    return json {
        {"tokens", tokens}
    };
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -1,14 +1,16 @@
 #include "arg.h"
 #include "common.h"
-#include "log.h"
 #include "llama.h"

+#include <cmath>
+#include <cstdio>
+#include <string>
 #include <vector>

 static void print_usage(int, char ** argv) {
-    LOG("\nexample usage:\n");
-    LOG("\n    %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]);
-    LOG("\n");
+    LOG_TEE("\nexample usage:\n");
+    LOG_TEE("\n    %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]);
+    LOG_TEE("\n");
 }

 int main(int argc, char ** argv) {
@@ -21,8 +23,6 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    gpt_init();
-
    // total length of the sequence including the prompt
    const int n_predict = params.n_predict;

@@ -69,24 +69,25 @@ int main(int argc, char ** argv) {
    const int n_ctx    = llama_n_ctx(ctx);
    const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size());

-    LOG("\n");
-    LOG_INF("%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req);
+    LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req);

    // make sure the KV cache is big enough to hold all the prompt and generated tokens
    if (n_kv_req > n_ctx) {
-        LOG_ERR("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
-        LOG_ERR("%s:        either reduce n_predict or increase n_ctx\n", __func__);
+        LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
+        LOG_TEE("%s:        either reduce n_predict or increase n_ctx\n", __func__);
        return 1;
    }

    // print the prompt token-by-token

-    LOG("\n");
+    fprintf(stderr, "\n");

    for (auto id : tokens_list) {
-        LOG("%s", llama_token_to_piece(ctx, id).c_str());
+        fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
    }

+    fflush(stderr);
+
    // create a llama_batch with size 512
    // we use this object to submit token data for decoding

@@ -101,7 +102,7 @@ int main(int argc, char ** argv) {
    batch.logits[batch.n_tokens - 1] = true;

    if (llama_decode(ctx, batch) != 0) {
-        LOG("%s: llama_decode() failed\n", __func__);
+        LOG_TEE("%s: llama_decode() failed\n", __func__);
        return 1;
    }

@@ -115,16 +116,16 @@ int main(int argc, char ** argv) {
    while (n_cur <= n_predict) {
        // sample the next token
        {
-            const llama_token new_token_id = llama_sampler_sample(smpl, ctx, -1);
+            const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1);

            // is it an end of generation?
            if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
-                LOG("\n");
+                LOG_TEE("\n");

                break;
            }

-            LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
+            LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
            fflush(stdout);

            // prepare the next batch
@@ -140,23 +141,23 @@ int main(int argc, char ** argv) {

        // evaluate the current batch with the transformer model
        if (llama_decode(ctx, batch)) {
-            LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
+            fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
            return 1;
        }
    }

-    LOG("\n");
+    LOG_TEE("\n");

    const auto t_main_end = ggml_time_us();

-    LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
+    LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
            __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));

-    LOG("\n");
-    llama_perf_sampler_print(smpl);
-    llama_perf_context_print(ctx);
+    LOG_TEE("\n");
+    llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
+    llama_perf_print(ctx,  LLAMA_PERF_TYPE_CONTEXT);

-    LOG("\n");
+    fprintf(stderr, "\n");

    llama_batch_free(batch);
    llama_sampler_free(smpl);
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -1,16 +1,13 @@
 #include "arg.h"
 #include "common.h"
 #include "sampling.h"
-#include "log.h"
 #include "llama.h"

-#include <algorithm>
 #include <cstdio>
-#include <cstring>
-#include <random>
-#include <set>
 #include <string>
 #include <vector>
+#include <set>
+#include <random>

 #define SPEC_VOCAB_MAX_SIZE_DIFFERENCE  100
 #define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
@@ -36,10 +33,8 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    gpt_init();
-
    if (params.model_draft.empty()) {
-        LOG_ERR("%s: --model-draft is required\n", __func__);
+        fprintf(stderr, "%s: error: --model-draft is required\n", __func__);
        return 1;
    }

@@ -52,6 +47,12 @@ int main(int argc, char ** argv) {
    std::default_random_engine rng(params.sparams.seed);
    std::uniform_real_distribution<> u_dist;

+#ifndef LOG_DISABLE_LOGS
+    log_set_target(log_filename_generator("speculative", "log"));
+    LOG_TEE("Log start\n");
+    log_dump_cmdline(argc, argv);
+#endif // LOG_DISABLE_LOGS
+
    // init llama.cpp
    llama_backend_init();
    llama_numa_init(params.numa);
@@ -80,14 +81,14 @@ int main(int argc, char ** argv) {
    ctx_dft = llama_init_dft.context;

    const bool vocab_type_tgt = llama_vocab_type(model_tgt);
-    LOG_DBG("vocab_type tgt: %d\n", vocab_type_tgt);
+    LOG("vocab_type tgt: %d\n", vocab_type_tgt);

    const bool vocab_type_dft = llama_vocab_type(model_dft);
-    LOG_DBG("vocab_type dft: %d\n", vocab_type_dft);
+    LOG("vocab_type dft: %d\n", vocab_type_dft);

    if (vocab_type_tgt != vocab_type_dft) {
-        LOG_ERR("%s: draft model vocab type must match target model to use speculation but ", __func__);
-        LOG_ERR("vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt);
+        fprintf(stderr, "%s: error: draft model vocab type must match target model to use speculation but ", __func__);
+        fprintf(stderr, "vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt);
        return 1;
    }

@@ -97,7 +98,7 @@ int main(int argc, char ** argv) {
        llama_token_bos(model_tgt) != llama_token_bos(model_dft) ||
        llama_token_eos(model_tgt) != llama_token_eos(model_dft)
    ) {
-        LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__);
+        fprintf(stderr, "%s: error: draft model special tokens must match target model to use speculation\n", __func__);
        return 1;
    }

@@ -109,8 +110,8 @@ int main(int argc, char ** argv) {
            : n_vocab_dft - n_vocab_tgt;

        if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
-            LOG_ERR("%s: draft model vocab must closely match target model to use speculation but ", __func__);
-            LOG_ERR("target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
+            fprintf(stderr, "%s: error: draft model vocab must closely match target model to use speculation but ", __func__);
+            fprintf(stderr, "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
                    n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
            return 1;
        }
@@ -119,8 +120,8 @@ int main(int argc, char ** argv) {
            const char * token_text_tgt = llama_token_get_text(model_tgt, i);
            const char * token_text_dft = llama_token_get_text(model_dft, i);
            if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
-                LOG_ERR("%s: draft model vocab must match target model to use speculation but ", __func__);
-                LOG_ERR("token %d content differs - target '%s', draft '%s'\n", i,
+                fprintf(stderr, "%s: error: draft model vocab must match target model to use speculation but ", __func__);
+                fprintf(stderr, "token %d content differs - target '%s', draft '%s'\n", i,
                        llama_token_to_piece(ctx_tgt, i).c_str(),
                        llama_token_to_piece(ctx_dft, i).c_str());
                return 1;
@@ -137,16 +138,18 @@ int main(int argc, char ** argv) {
    const int max_tokens_list_size = max_context_size - 4;

    if ((int) inp.size() > max_tokens_list_size) {
-        LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
+        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
        return 1;
    }

-    LOG("\n\n");
+    fprintf(stderr, "\n\n");

    for (auto id : inp) {
-        LOG("%s", llama_token_to_piece(ctx_tgt, id).c_str());
+        fprintf(stderr, "%s", llama_token_to_piece(ctx_tgt, id).c_str());
    }

+    fflush(stderr);
+
    const int n_input = inp.size();

    const auto t_enc_start = ggml_time_us();
@@ -208,7 +211,7 @@ int main(int argc, char ** argv) {
            active_seqs.insert(s);
            const auto & tokens = drafts[s].tokens;

-            LOG_DBG("draft %d: %s\n", s, string_from(ctx_dft, tokens).c_str());
+            LOG("draft %d: %s\n", s, LOG_TOKENS_TOSTR_PRETTY(ctx_dft, tokens).c_str());
        }

        int i_dft  = 0;
@@ -251,7 +254,7 @@ int main(int argc, char ** argv) {
                            continue;
                        }

-                        LOG_DBG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size());
+                        LOG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size());
                        float r = u_dist(rng);
                        llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), LLAMA_TOKEN_NULL, true };

@@ -269,7 +272,7 @@ int main(int argc, char ** argv) {
                                break;
                            }
                        }
-                        LOG_DBG("r = %f, p_dft = %f, p_tgt = %f\n", r, p_dft, p_tgt);
+                        LOG("r = %f, p_dft = %f, p_tgt = %f\n", r, p_dft, p_tgt);
                        if (r <= p_tgt / p_dft) {
                            s_keep = s;
                            accept = true;
@@ -277,10 +280,10 @@ int main(int argc, char ** argv) {
                            token_str = llama_token_to_piece(ctx_tgt, token_id);
                            gpt_sampler_accept(smpl, token_id, true);

-                            LOG_DBG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str());
+                            LOG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str());
                            break;
                        } else {
-                            LOG_DBG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str());
+                            LOG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str());
                            drafts[s].active = false;

                            // calculate residual probability
@@ -335,7 +338,7 @@ int main(int argc, char ** argv) {
                    if (!accept) {
                        // all drafted tokens were rejected
                        // sample from the target model
-                        LOG_DBG("all drafted tokens were rejected, sampling from residual distribution\n");
+                        LOG("all drafted tokens were rejected, sampling from residual distribution\n");
                        std::vector<float> probs(dist_tgt.size);
                        for (size_t i = 0; i < dist_tgt.size; ++i) {
                            probs[i] = dist_tgt.data[i].p;
@@ -353,11 +356,13 @@ int main(int argc, char ** argv) {
                    // greedy verification

                    // sample from the target model
-                    LOG_DBG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
+                    LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
                    token_id = gpt_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]);

                    gpt_sampler_accept(smpl, token_id, true);

+                    //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, smpl->prev).c_str());
+
                    token_str = llama_token_to_piece(ctx_tgt, token_id);

                    for (int s = 0; s < n_seq_dft; ++s) {
@@ -366,7 +371,7 @@ int main(int argc, char ** argv) {
                        }

                        if (i_dft < (int) drafts[s].tokens.size() && token_id == drafts[s].tokens[i_dft]) {
-                            LOG_DBG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, token_id, token_str.c_str());
+                            LOG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, token_id, token_str.c_str());

                            s_keep = s;
                            accept = true;
@@ -388,24 +393,26 @@ int main(int argc, char ** argv) {
                    ++i_dft;
                    if (params.use_color) {
                        // Color token according to its origin sequence
-                        LOG("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str());
+                        printf("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str());
                    } else {
-                        LOG("%s", token_str.c_str());
+                        printf("%s", token_str.c_str());
                    }
+                    fflush(stdout);
                    continue;
                } else {
-                    LOG("%s", token_str.c_str());
+                    printf("%s", token_str.c_str());
+                    fflush(stdout);
                    break;
                }
            }
        }

        {
-            LOG_DBG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", token_id, token_str.c_str());
+            LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", token_id, token_str.c_str());

            // TODO: simplify
            {
-                LOG_DBG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft);
+                LOG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft);

                llama_kv_cache_seq_keep(ctx_dft, s_keep);
                llama_kv_cache_seq_cp  (ctx_dft, s_keep, 0, -1, -1);
@@ -432,7 +439,7 @@ int main(int argc, char ** argv) {
            llama_batch_add  (batch_dft, token_id, n_past_dft, { 0 }, true);

            llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
-            // LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
+            // LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
            llama_decode(ctx_dft, batch_dft);

            ++n_past_dft;
@@ -479,7 +486,7 @@ int main(int argc, char ** argv) {
                const auto * cur_p = gpt_sampler_get_candidates(drafts[s].smpl);

                for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p->size); ++k) {
-                    LOG_DBG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n",
+                    LOG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n",
                            k, s, i, cur_p->data[k].id, cur_p->data[k].p, llama_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
                }

@@ -488,7 +495,7 @@ int main(int argc, char ** argv) {
                // attempt to split the branch if the probability is high enough
                for (int f = 1; f < 8; ++f) {
                    if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_split) {
-                        LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur);
+                        LOG("splitting seq %3d into %3d\n", s, n_seq_cur);

                        llama_kv_cache_seq_rm(ctx_dft,    n_seq_cur, -1, -1);
                        llama_kv_cache_seq_cp(ctx_dft, s, n_seq_cur, -1, -1);
@@ -577,7 +584,7 @@ int main(int argc, char ** argv) {
                llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1);
            }

-            // LOG_DBG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
+            // LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
            llama_decode(ctx_tgt, batch_tgt);
            ++n_past_tgt;
        }
@@ -595,25 +602,23 @@ int main(int argc, char ** argv) {

    auto t_dec_end = ggml_time_us();

-    LOG("\n\n");
+    LOG_TEE("\n\n");

-    LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input,   (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
-    LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict  / ((t_dec_end - t_dec_start) / 1e6f));
+    LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input,   (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
+    LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict  / ((t_dec_end - t_dec_start) / 1e6f));

-    LOG_INF("\n");
-    LOG_INF("n_draft   = %d\n", n_draft);
-    LOG_INF("n_predict = %d\n", n_predict);
-    LOG_INF("n_drafted = %d\n", n_drafted);
-    LOG_INF("n_accept  = %d\n", n_accept);
-    LOG_INF("accept    = %.3f%%\n", 100.0f * n_accept / n_drafted);
+    LOG_TEE("\n");
+    LOG_TEE("n_draft   = %d\n", n_draft);
+    LOG_TEE("n_predict = %d\n", n_predict);
+    LOG_TEE("n_drafted = %d\n", n_drafted);
+    LOG_TEE("n_accept  = %d\n", n_accept);
+    LOG_TEE("accept    = %.3f%%\n", 100.0f * n_accept / n_drafted);

-    LOG_INF("\n");
-    LOG_INF("draft:\n\n");
+    LOG_TEE("\ndraft:\n\n");
    // TODO: print sampling/grammar timings for all drafts
-    llama_perf_context_print(ctx_dft);
+    llama_perf_print(ctx_dft, LLAMA_PERF_TYPE_CONTEXT);

-    LOG_INF("\n");
-    LOG_INF("target:\n\n");
+    LOG_TEE("\ntarget:\n\n");
    gpt_perf_print(ctx_tgt, smpl);

    gpt_sampler_free(smpl);
@@ -632,7 +637,7 @@ int main(int argc, char ** argv) {

    llama_backend_free();

-    LOG("\n\n");
+    fprintf(stderr, "\n\n");

    return 0;
 }
--- a/examples/sycl/run-llama2.sh
+++ b/examples/sycl/run-llama2.sh
@@ -4,23 +4,33 @@
 #  Copyright (C) 2024 Intel Corporation
 #  SPDX-License-Identifier: MIT

+INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
 source /opt/intel/oneapi/setvars.sh

-#export GGML_SYCL_DEBUG=1
-
-#ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.
-
-INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
-MODEL_FILE=llama-2-7b.Q4_0.gguf
-NGL=33
-
 if [ $# -gt 0 ]; then
    GGML_SYCL_DEVICE=$1
+    GGML_SYCL_SINGLE_GPU=1
+else
+    GGML_SYCL_DEVICE=0
+    GGML_SYCL_SINGLE_GPU=0
+fi
+
+#export GGML_SYCL_DEBUG=1
+
+
+#ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.
+
+if [ $GGML_SYCL_SINGLE_GPU -eq 1 ]; then
    echo "use $GGML_SYCL_DEVICE as main GPU"
    #use signle GPU only
-    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -mg $GGML_SYCL_DEVICE -sm none
-
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
 else
    #use multiple GPUs with same max compute units
-    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
 fi
+
+#use main GPU only
+#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
+
+#use multiple GPUs with same max compute units
+#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
--- a/examples/tokenize/tokenize.cpp
+++ b/examples/tokenize/tokenize.cpp
@@ -1,13 +1,11 @@
 #include "common.h"
-//#include "log.h" // TODO: start using log.h
 #include "llama.h"

+#include <cmath>
 #include <cstdio>
-#include <cstring>
 #include <fstream>
 #include <string>
 #include <vector>
-#include <iostream> // TODO: remove me

 #if defined(_WIN32)
 #define WIN32_LEAN_AND_MEAN
@@ -15,25 +13,25 @@
 #include <shellapi.h>   // For CommandLineToArgvW
 #endif

-static void print_usage_information(const char * argv0) {
-    printf("usage: %s [options]\n\n", argv0);
-    printf("The tokenize program tokenizes a prompt using a given model,\n");
-    printf("and prints the resulting tokens to standard output.\n\n");
-    printf("It needs a model file, a prompt, and optionally other flags\n");
-    printf("to control the behavior of the tokenizer.\n\n");
-    printf("    The possible options are:\n");
-    printf("\n");
-    printf("    -h, --help                           print this help and exit\n");
-    printf("    -m MODEL_PATH, --model MODEL_PATH    path to model.\n");
-    printf("    --ids                                if given, only print numerical token IDs, and not token strings.\n");
-    printf("                                         The output format looks like [1, 2, 3], i.e. parseable by Python.\n");
-    printf("    -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n");
-    printf("    -p PROMPT, --prompt PROMPT           read prompt from the argument.\n");
-    printf("    --stdin                              read prompt from standard input.\n");
-    printf("    --no-bos                             do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
-    printf("    --no-parse-special                   do not parse control tokens.\n");
-    printf("    --log-disable                        disable logs. Makes stderr quiet when loading the model.\n");
-    printf("    --show-count                         print the total number of tokens.\n");
+static void print_usage_information(const char * argv0, FILE * stream) {
+    fprintf(stream, "usage: %s [options]\n\n", argv0);
+    fprintf(stream, "The tokenize program tokenizes a prompt using a given model,\n");
+    fprintf(stream, "and prints the resulting tokens to standard output.\n\n");
+    fprintf(stream, "It needs a model file, a prompt, and optionally other flags\n");
+    fprintf(stream, "to control the behavior of the tokenizer.\n\n");
+    fprintf(stream, "    The possible options are:\n");
+    fprintf(stream, "\n");
+    fprintf(stream, "    -h, --help                           print this help and exit\n");
+    fprintf(stream, "    -m MODEL_PATH, --model MODEL_PATH    path to model.\n");
+    fprintf(stream, "    --ids                                if given, only print numerical token IDs, and not token strings.\n");
+    fprintf(stream, "                                         The output format looks like [1, 2, 3], i.e. parseable by Python.\n");
+    fprintf(stream, "    -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n");
+    fprintf(stream, "    -p PROMPT, --prompt PROMPT           read prompt from the argument.\n");
+    fprintf(stream, "    --stdin                              read prompt from standard input.\n");
+    fprintf(stream, "    --no-bos                             do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
+    fprintf(stream, "    --no-parse-special                   do not parse control tokens.\n");
+    fprintf(stream, "    --log-disable                        disable logs. Makes stderr quiet when loading the model.\n");
+    fprintf(stream, "    --show-count                         print the total number of tokens.\n");
 }

 static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) {
@@ -187,7 +185,7 @@ int main(int raw_argc, char ** raw_argv) {
    const int argc = argv.size();

    if (argc <= 1) {
-        print_usage_information(argv[0].c_str());
+        print_usage_information(argv[0].c_str(), stderr);
        return 1;
    }

@@ -216,7 +214,7 @@ int main(int raw_argc, char ** raw_argv) {
    for (; iarg < argc; ++iarg) {
        std::string arg{argv[iarg]};
        if (arg == "-h" || arg == "--help") {
-            print_usage_information(argv[0].c_str());
+            print_usage_information(argv[0].c_str(), stdout);
            return 0;
        }
        else if (arg == "--ids") {
@@ -325,6 +323,10 @@ int main(int raw_argc, char ** raw_argv) {
    // Start actually doing the tokenizing stuff.
    //////

+#ifdef LOG_DISABLE_LOGS
+    disable_logging = true;
+#endif
+
    if (disable_logging) {
        llama_log_set(llama_log_callback_null, NULL);
    }
--- a/flake.lock
+++ b/flake.lock
@@ -5,11 +5,11 @@
        "nixpkgs-lib": "nixpkgs-lib"
      },
      "locked": {
-        "lastModified": 1726153070,
-        "narHash": "sha256-HO4zgY0ekfwO5bX0QH/3kJ/h4KvUDFZg8YpkNwIbg1U=",
+        "lastModified": 1725024810,
+        "narHash": "sha256-ODYRm8zHfLTH3soTFWE452ydPYz2iTvr9T8ftDMUQ3E=",
        "owner": "hercules-ci",
        "repo": "flake-parts",
-        "rev": "bcef6817a8b2aa20a5a6dbb19b43e63c5bf8619a",
+        "rev": "af510d4a62d071ea13925ce41c95e3dec816c01d",
        "type": "github"
      },
      "original": {
@@ -20,11 +20,11 @@
    },
    "nixpkgs": {
      "locked": {
-        "lastModified": 1726062873,
-        "narHash": "sha256-IiA3jfbR7K/B5+9byVi9BZGWTD4VSbWe8VLpp9B/iYk=",
+        "lastModified": 1724819573,
+        "narHash": "sha256-GnR7/ibgIH1vhoy8cYdmXE6iyZqKqFxQSVkFgosBh6w=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "4f807e8940284ad7925ebd0a0993d2a1791acb2f",
+        "rev": "71e91c409d1e654808b2621f28a327acfdad8dc2",
        "type": "github"
      },
      "original": {
@@ -36,14 +36,14 @@
    },
    "nixpkgs-lib": {
      "locked": {
-        "lastModified": 1725233747,
-        "narHash": "sha256-Ss8QWLXdr2JCBPcYChJhz4xJm+h/xjl4G0c0XlP6a74=",
+        "lastModified": 1722555339,
+        "narHash": "sha256-uFf2QeW7eAHlYXuDktm9c25OxOyCoUOQmh5SZ9amE5Q=",
        "type": "tarball",
-        "url": "https://github.com/NixOS/nixpkgs/archive/356624c12086a18f2ea2825fed34523d60ccc4e3.tar.gz"
+        "url": "https://github.com/NixOS/nixpkgs/archive/a5d394176e64ab29c852d03346c1fc9b0b7d33eb.tar.gz"
      },
      "original": {
        "type": "tarball",
-        "url": "https://github.com/NixOS/nixpkgs/archive/356624c12086a18f2ea2825fed34523d60ccc4e3.tar.gz"
+        "url": "https://github.com/NixOS/nixpkgs/archive/a5d394176e64ab29c852d03346c1fc9b0b7d33eb.tar.gz"
      }
    },
    "root": {
--- a/ggml/include/ggml-cann.h
+++ b/ggml/include/ggml-cann.h
@@ -80,13 +80,6 @@ ggml_backend_cann_buffer_type(int32_t device);
 */
 GGML_API GGML_CALL int32_t ggml_backend_cann_get_device_count(void);

-/**
- * @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU.
- *
- * @return A pointer to the host buffer type interface.
- */
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
-
 /**
 * @brief Retrieves the description of a specific CANN device.
 *
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -564,11 +564,10 @@ extern "C" {
    };

    enum ggml_log_level {
-        GGML_LOG_LEVEL_NONE  = 0,
-        GGML_LOG_LEVEL_INFO  = 1,
-        GGML_LOG_LEVEL_WARN  = 2,
-        GGML_LOG_LEVEL_ERROR = 3,
-        GGML_LOG_LEVEL_DEBUG = 4,
+        GGML_LOG_LEVEL_ERROR = 2,
+        GGML_LOG_LEVEL_WARN  = 3,
+        GGML_LOG_LEVEL_INFO  = 4,
+        GGML_LOG_LEVEL_DEBUG = 5
    };

    enum ggml_tensor_flag {
@@ -645,20 +644,6 @@ extern "C" {

    typedef struct ggml_threadpool * ggml_threadpool_t;

-    // the compute plan that needs to be prepared for ggml_graph_compute()
-    // since https://github.com/ggerganov/ggml/issues/287
-    struct ggml_cplan {
-        size_t    work_size; // size of work buffer, calculated by `ggml_graph_plan()`
-        uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
-
-        int n_threads;
-        struct ggml_threadpool * threadpool;
-
-        // abort ggml_graph_compute when true
-        ggml_abort_callback abort_callback;
-        void *              abort_callback_data;
-    };
-
    // scratch buffer
    struct ggml_scratch {
        size_t offs;
@@ -2048,7 +2033,6 @@ extern "C" {
    GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
    GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);

-    // graph allocation in a context
    GGML_API struct ggml_cgraph * ggml_new_graph       (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
    GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads);
    GGML_API struct ggml_cgraph * ggml_graph_dup       (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
@@ -2066,26 +2050,72 @@ extern "C" {
    GGML_API size_t ggml_graph_overhead(void);
    GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);

+    // TODO: move these declarations above before the ggml_graph API and reorder the implementation order in ggml.c
+    //       (unless the code has been moved to a separate source file)
    GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
    GGML_API void                          ggml_threadpool_params_init   (struct ggml_threadpool_params * p, int n_threads);
    GGML_API bool                          ggml_threadpool_params_match  (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
-    GGML_API struct ggml_threadpool *      ggml_threadpool_new          (struct ggml_threadpool_params  * params);
-    GGML_API void                          ggml_threadpool_free         (struct ggml_threadpool * threadpool);
-    GGML_API int                           ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
-    GGML_API void                          ggml_threadpool_pause        (struct ggml_threadpool * threadpool);
-    GGML_API void                          ggml_threadpool_resume       (struct ggml_threadpool * threadpool);
+    GGML_API struct ggml_threadpool *      ggml_threadpool_new           (struct ggml_threadpool_params * params);
+    GGML_API void                          ggml_threadpool_free          (struct ggml_threadpool * threadpool);
+    GGML_API int                           ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool);
+    GGML_API void                          ggml_threadpool_pause         (struct ggml_threadpool * threadpool);
+    GGML_API void                          ggml_threadpool_resume        (struct ggml_threadpool * threadpool);

-    // ggml_graph_plan() has to be called before ggml_graph_compute()
-    // when plan.work_size > 0, caller must allocate memory for plan.work_data
-    GGML_API struct ggml_cplan ggml_graph_plan(
-                  const struct ggml_cgraph * cgraph,
-                                       int   n_threads, /* = GGML_DEFAULT_N_THREADS */
-                    struct ggml_threadpool * threadpool /* = NULL */ );
-    GGML_API enum ggml_status  ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+    // =================================================================================================
+    // CPU-only API for ggml_cgraph
+    //
+    // TODO: move to the CPU backend
+    // NOTE: avoid using, will be removed
+    //

-    // same as ggml_graph_compute() but the work data is allocated as a part of the context
-    // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
-    GGML_API enum ggml_status  ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
+    // loops through the graph and determines:
+    //
+    // - work size needed for CPU computation
+    // - number of threads to start
+    //
+    GGML_API enum ggml_status ggml_graph_prepare(
+                struct ggml_cgraph * cgraph,
+                               int   n_threads, /* = GGML_DEFAULT_N_THREADS */
+            struct ggml_threadpool * threadpool /* = NULL */ );
+
+    // get the estimated work size for the graph from ggml_graph_prepare()
+    GGML_API size_t ggml_graph_work_size(const struct ggml_cgraph * cgraph);
+
+    // if ctx is NULL, the work buffer will be dynamically allocated. in this case, call ggml_graph_work_free() to free the buffer
+    // otherwise, the work buffer will be allocated in the context. no need to free it
+    GGML_API enum ggml_status ggml_graph_work_init(struct ggml_cgraph * cgraph, struct ggml_context * ctx);
+    GGML_API void             ggml_graph_work_free(struct ggml_cgraph * cgraph);
+
+    // note: call ggml_graph_prepare() and ggml_graph_work_init() first
+    //
+    // sample usages:
+    //
+    //   - no dynamic allocations:
+    //
+    //      ... prepare ggml_context ctx ...
+    //
+    //      ggml_graph_prepare  (cgraph, n_threads, threadpool);
+    //      ggml_graph_work_init(cgraph, ctx);
+    //
+    //      ggml_graph_compute  (cgraph); // can call many times
+    //
+    //      // no need to call ggml_graph_work_free() because it is allocated in ctx
+    //
+    //   - dynamic allocations:
+    //
+    //      ggml_graph_prepare  (cgraph, n_threads, threadpool);
+    //      ggml_graph_work_init(cgraph, NULL); // will allocate memory
+    //
+    //      ggml_graph_compute  (cgraph); // can call many times
+    //
+    //      ggml_graph_work_free(cgraph);
+    //
+    GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph);
+
+    // end of CPU-only API
+    // =================================================================================================
+
+    GGML_API void ggml_graph_set_abort_callback(struct ggml_cgraph * cgraph, ggml_abort_callback abort_callback, void * abort_data);

    GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);

@@ -2108,6 +2138,7 @@ extern "C" {
            struct ggml_cgraph    * gb_tmp,
            struct ggml_tensor  * * checkpoints,
            int                     n_checkpoints);
+
    //
    // optimization
    //
@@ -2471,7 +2502,6 @@ extern "C" {
    GGML_API int ggml_cpu_has_gpublas    (void);
    GGML_API int ggml_cpu_has_sse3       (void);
    GGML_API int ggml_cpu_has_ssse3      (void);
-    GGML_API int ggml_cpu_has_riscv_v    (void);
    GGML_API int ggml_cpu_has_sycl       (void);
    GGML_API int ggml_cpu_has_rpc        (void);
    GGML_API int ggml_cpu_has_vsx        (void);
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -26,9 +26,6 @@ if (NOT MSVC)
    endif()
 endif()

-unset(GGML_EXTRA_LIBS_PRIVATE)
-unset(GGML_EXTRA_LIBS_PUBLIC)
-
 if (APPLE AND GGML_ACCELERATE)
    find_library(ACCELERATE_FRAMEWORK Accelerate)
    if (ACCELERATE_FRAMEWORK)
@@ -38,7 +35,7 @@ if (APPLE AND GGML_ACCELERATE)
        add_compile_definitions(ACCELERATE_NEW_LAPACK)
        add_compile_definitions(ACCELERATE_LAPACK_ILP64)

-        list(APPEND GGML_EXTRA_LIBS_PRIVATE ${ACCELERATE_FRAMEWORK})
+        set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK})
    else()
        message(WARNING "Accelerate framework not found")
    endif()
@@ -90,7 +87,7 @@ if (GGML_METAL)
            COMMENT "Generate assembly for embedded Metal library"
        )

-        list(APPEND GGML_SOURCES_METAL ${METALLIB_EMBED_ASM})
+        set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${METALLIB_EMBED_ASM})
    else()
        if (GGML_METAL_SHADER_DEBUG)
            # custom command to do the following:
@@ -135,7 +132,7 @@ if (GGML_METAL)
            )
    endif() # GGML_METAL_EMBED_LIBRARY

-    list(APPEND GGML_EXTRA_LIBS_PRIVATE
+    set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS}
        ${FOUNDATION_LIBRARY}
        ${METAL_FRAMEWORK}
        ${METALKIT_FRAMEWORK}
@@ -160,11 +157,11 @@ if (GGML_OPENMP)

        add_compile_definitions(GGML_USE_OPENMP)

-        list(APPEND GGML_EXTRA_LIBS_PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
+        set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} OpenMP::OpenMP_C OpenMP::OpenMP_CXX)

        if (GGML_MUSA)
-            list(APPEND GGML_EXTRA_INCLUDES     "/usr/lib/llvm-10/include/openmp")
-            list(APPEND GGML_EXTRA_LIBS_PRIVATE "/usr/lib/llvm-10/lib/libomp.so")
+            set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} "/usr/lib/llvm-10/include/openmp")
+            set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} "/usr/lib/llvm-10/lib/libomp.so")
        endif()
    else()
        message(WARNING "OpenMP not found")
@@ -247,8 +244,8 @@ if (GGML_BLAS)
        set(GGML_HEADERS_BLAS ../include/ggml-blas.h)
        set(GGML_SOURCES_BLAS ggml-blas.cpp)

-        list(APPEND GGML_EXTRA_LIBS_PRIVATE ${BLAS_LIBRARIES})
-        list(APPEND GGML_EXTRA_INCLUDES     ${BLAS_INCLUDE_DIRS})
+        set(GGML_EXTRA_LIBS     ${GGML_EXTRA_LIBS}     ${BLAS_LIBRARIES})
+        set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
    else()
        message(WARNING "BLAS not found, please refer to "
        "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
@@ -371,19 +368,19 @@ if (GGML_CUDA)
        if (GGML_STATIC)
            if (WIN32)
                # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
-                list(APPEND GGML_EXTRA_LIBS_PRIVATE CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
+                set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
            else ()
                if (GGML_MUSA)
-                    list(APPEND GGML_EXTRA_LIBS_PRIVATE MUSA::musart_static MUSA::mublas_static)
+                    set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} MUSA::musart_static MUSA::mublas_static)
                else()
-                    list(APPEND GGML_EXTRA_LIBS_PRIVATE CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+                    set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
                endif()
            endif()
        else()
            if (GGML_MUSA)
-                list(APPEND GGML_EXTRA_LIBS_PRIVATE MUSA::musart MUSA::mublas)
+                set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} MUSA::musart MUSA::mublas)
            else()
-                list(APPEND GGML_EXTRA_LIBS_PRIVATE CUDA::cudart CUDA::cublas CUDA::cublasLt)
+                set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
            endif()
        endif()

@@ -391,9 +388,9 @@ if (GGML_CUDA)
            # No VMM requested, no need to link directly with the cuda driver lib (libcuda.so)
        else()
            if (GGML_MUSA)
-                list(APPEND GGML_EXTRA_LIBS_PRIVATE MUSA::musa_driver) # required by muDeviceGetAttribute(), muMemGetAllocationGranularity(...), ...
+                set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} MUSA::musa_driver) # required by muDeviceGetAttribute(), muMemGetAllocationGranularity(...), ...
            else()
-                list(APPEND GGML_EXTRA_LIBS_PRIVATE CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
+                set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
            endif()
        endif()
    else()
@@ -498,7 +495,7 @@ if (GGML_HIPBLAS)

    if (CXX_IS_HIPCC)
        set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
-        list(APPEND GGML_EXTRA_LIBS_PRIVATE hip::device)
+        set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} hip::device)
    else()
        set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE HIP)
    endif()
@@ -507,7 +504,7 @@ if (GGML_HIPBLAS)
        message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
    endif()

-    list(APPEND GGML_EXTRA_LIBS_PUBLIC hip::host roc::rocblas roc::hipblas)
+    set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} PUBLIC hip::host roc::rocblas roc::hipblas) # NOTE: the PUBLIC keyword is stored in the list itself; once expanded in target_link_libraries(ggml PRIVATE ... ${GGML_EXTRA_LIBS}) every entry appended after this point is linked PUBLIC as well
 endif()

 if (GGML_SYCL)
@@ -516,8 +513,7 @@ if (GGML_SYCL)
    endif()

    check_cxx_compiler_flag("-fsycl" SUPPORTS_SYCL)
-
-    if (DEFINED ENV{ONEAPI_ROOT})
+    if ( DEFINED ENV{ONEAPI_ROOT})
        message(STATUS "Using oneAPI Release SYCL compiler (icpx).")
    elseif(SUPPORTS_SYCL)
        message(WARNING "Using open-source SYCL compiler (clang++). Didn't detect ENV {ONEAPI_ROOT}.
@@ -555,29 +551,26 @@ if (GGML_SYCL)

    find_package(DNNL)
    message("-- DNNL found:" ${DNNL_FOUND})
-
    if (GGML_SYCL_TARGET STREQUAL "INTEL")
        add_compile_definitions(GGML_SYCL_DNNL=${DNNL_FOUND})
    else()
        add_compile_definitions(GGML_SYCL_DNNL=0)
    endif()
-
-    if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
-        list(APPEND GGML_EXTRA_LIBS_PRIVATE DNNL::dnnl)
-    endif()
-
    if (WIN32)
        find_package(IntelSYCL REQUIRED)
        find_package(MKL REQUIRED)
-        list(APPEND GGML_EXTRA_LIBS_PRIVATE IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
+        set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
    else()
        if (GGML_SYCL_TARGET STREQUAL "INTEL")
-            list(APPEND GGML_EXTRA_LIBS_PRIVATE sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
+            set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
        elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
-            list(APPEND GGML_EXTRA_LIBS_PRIVATE sycl pthread m dl onemkl)
+            set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl pthread m dl onemkl)
        endif()
    endif()
+    if (DNNL_FOUND AND GGML_SYCL_TARGET STREQUAL "INTEL") # let if() dereference DNNL_FOUND itself: an empty/undefined value is then simply false instead of a parse error
+        list(APPEND GGML_EXTRA_LIBS DNNL::dnnl) # link oneDNN only for the Intel SYCL target
+    endif()
 endif()

 if (GGML_RPC)
@@ -586,7 +579,7 @@ if (GGML_RPC)
    list(APPEND GGML_CDEF_PUBLIC GGML_USE_RPC)

    if (WIN32)
-        list(APPEND GGML_EXTRA_LIBS_PRIVATE ws2_32)
+        set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ws2_32)
    endif()

    set(GGML_HEADERS_RPC ../include/ggml-rpc.h)
@@ -664,8 +657,8 @@ if (GGML_VULKAN)
        set(GGML_HEADERS_VULKAN ${CMAKE_CURRENT_SOURCE_DIR}/../include/ggml-vulkan.h ${_ggml_vk_header})
        set(GGML_SOURCES_VULKAN ggml-vulkan.cpp ${_ggml_vk_source})

-        list(APPEND GGML_EXTRA_LIBS_PRIVATE Vulkan::Vulkan)
-        list(APPEND GGML_EXTRA_INCLUDES     ${CMAKE_CURRENT_BINARY_DIR})
+        set(GGML_EXTRA_LIBS     ${GGML_EXTRA_LIBS} Vulkan::Vulkan)
+        set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CMAKE_CURRENT_BINARY_DIR})
    else()
        message(WARNING "Vulkan not found")
    endif()
@@ -824,8 +817,8 @@ if (GGML_KOMPUTE)

        list(APPEND GGML_CDEF_PUBLIC GGML_USE_KOMPUTE)

-        list(APPEND GGML_EXTRA_LIBS_PRIVATE kompute)
-        list(APPEND GGML_EXTRA_INCLUDES     ${CMAKE_CURRENT_BINARY_DIR})
+        set(GGML_EXTRA_LIBS     ${GGML_EXTRA_LIBS}     kompute)
+        set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CMAKE_CURRENT_BINARY_DIR})
    else()
        message(WARNING "Kompute not found")
    endif()
@@ -890,10 +883,9 @@ if (GGML_CANN)
            message(STATUS "CANN: CANN_INCLUDE_DIRS =  ${CANN_INCLUDE_DIRS}")
            message(STATUS "CANN: CANN_LIBRARIES =  ${CANN_LIBRARIES}")

-            list(APPEND GGML_EXTRA_LIBS_PRIVATE ${CANN_LIBRARIES} )
-            list(APPEND GGML_EXTRA_INCLUDES     ${CANN_INCLUDE_DIRS})
-            list(APPEND GGML_EXTRA_LIBDIRS      ${CANN_INSTALL_DIR}/lib64)
-
+            set(GGML_EXTRA_LIBS     ${GGML_EXTRA_LIBS}     ${CANN_LIBRARIES} )
+            set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CANN_INCLUDE_DIRS})
+            set(GGML_EXTRA_LIBDIRS  ${GGML_EXTRA_LIBDIRS}  ${CANN_INSTALL_DIR}/lib64)
            list(APPEND GGML_CDEF_PUBLIC GGML_USE_CANN)
        endif()
    else()
@@ -1330,13 +1322,13 @@ if (EMSCRIPTEN)
    set_target_properties(ggml PROPERTIES COMPILE_FLAGS "-msimd128")
 endif()

-target_compile_definitions(ggml PUBLIC    ${GGML_CDEF_PUBLIC})
-target_include_directories(ggml PUBLIC  ../include)
+target_compile_definitions(ggml PUBLIC  ${GGML_CDEF_PUBLIC})
+target_include_directories(ggml PUBLIC ../include)
 target_include_directories(ggml PRIVATE . ${GGML_EXTRA_INCLUDES})
-target_link_directories   (ggml PRIVATE   ${GGML_EXTRA_LIBDIRS})
+target_link_directories(ggml PRIVATE ${GGML_EXTRA_LIBDIRS})
 target_compile_features   (ggml PRIVATE c_std_11) # don't bump

-list(APPEND GGML_EXTRA_LIBS_PRIVATE Threads::Threads)
+target_link_libraries(ggml PRIVATE Threads::Threads ${GGML_EXTRA_LIBS})

 find_library(MATH_LIBRARY m)
 if (MATH_LIBRARY)
@@ -1345,10 +1337,6 @@ if (MATH_LIBRARY)
    endif()
 endif()

-list(REMOVE_DUPLICATES GGML_EXTRA_LIBS_PRIVATE)
-list(REMOVE_DUPLICATES GGML_EXTRA_LIBS_PUBLIC)
-target_link_libraries(ggml PRIVATE ${GGML_EXTRA_LIBS_PRIVATE} PUBLIC ${GGML_EXTRA_LIBS_PUBLIC})
-
 if (BUILD_SHARED_LIBS)
    set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
    target_compile_definitions(ggml PRIVATE GGML_SHARED GGML_BUILD)
--- a/ggml/src/ggml-backend.c
+++ b/ggml/src/ggml-backend.c
@@ -752,7 +752,8 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_
 }

 struct ggml_backend_plan_cpu {
-    struct ggml_cplan cplan;
+    // TODO: move the CPU-only members from ggml_cgraph here when the public CPU-only API is removed
+
    struct ggml_cgraph cgraph;
 };

@@ -761,19 +762,19 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg

    struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));

-    cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
    cpu_plan->cgraph = *cgraph; // FIXME: deep copy
+    ggml_graph_prepare(&cpu_plan->cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); // NOTE(review): the ggml_status returned by ggml_graph_prepare() is not checked here

-    if (cpu_plan->cplan.work_size > 0) {
-        cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
-        if (cpu_plan->cplan.work_data == NULL) {
+    if (cpu_plan->cgraph.work_size > 0) {
+        cpu_plan->cgraph.work_data = malloc(cpu_plan->cgraph.work_size);
+        if (cpu_plan->cgraph.work_data == NULL) {
            free(cpu_plan);
            return NULL;
        }
    }

-    cpu_plan->cplan.abort_callback      = cpu_ctx->abort_callback;
-    cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
+    cpu_plan->cgraph.abort_callback      = cpu_ctx->abort_callback;
+    cpu_plan->cgraph.abort_callback_data = cpu_ctx->abort_callback_data;

    return cpu_plan;
 }
@@ -781,7 +782,7 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
 GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
    struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;

-    free(cpu_plan->cplan.work_data);
+    free(cpu_plan->cgraph.work_data);
    free(cpu_plan);

    GGML_UNUSED(backend);
@@ -790,7 +791,7 @@ GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, g
 GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
    struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;

-    return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
+    return ggml_graph_compute(&cpu_plan->cgraph);

    GGML_UNUSED(backend);
 }
@@ -798,23 +799,24 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backe
 GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;

-    struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
+    ggml_graph_prepare(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); // NOTE(review): the ggml_status returned by ggml_graph_prepare() is not checked here

-    if (cpu_ctx->work_size < cplan.work_size) {
+    if (cpu_ctx->work_size < cgraph->work_size) {
        free(cpu_ctx->work_data);
-        cpu_ctx->work_data = malloc(cplan.work_size);
+        cpu_ctx->work_data = malloc(cgraph->work_size);
        if (cpu_ctx->work_data == NULL) {
            cpu_ctx->work_size = 0;
            return GGML_STATUS_ALLOC_FAILED;
        }
-        cpu_ctx->work_size = cplan.work_size;
+        cpu_ctx->work_size = cgraph->work_size;
    }
-    cplan.work_data = cpu_ctx->work_data;
+    cgraph->work_data = cpu_ctx->work_data;
+    cgraph->work_own  = false; // the buffer belongs to cpu_ctx and is reused across calls; the graph does not own it

-    cplan.abort_callback      = cpu_ctx->abort_callback;
-    cplan.abort_callback_data = cpu_ctx->abort_callback_data;
+    cgraph->abort_callback      = cpu_ctx->abort_callback;
+    cgraph->abort_callback_data = cpu_ctx->abort_callback_data;

-    return ggml_graph_compute(cgraph, &cplan);
+    return ggml_graph_compute(cgraph);
 }

 GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
--- a/ggml/src/ggml-cann.cpp
+++ b/ggml/src/ggml-cann.cpp
@@ -1221,116 +1221,6 @@ ggml_backend_cann_buffer_type(int32_t device) {
    return &ggml_backend_cann_buffer_types[device];
 }

-/**
- * @brief Retrieves the name associated with a CANN host buffer type.
- *
- * This function returns the descriptive name associated with the specified
- * CANN host buffer type context.
- *
- * @param buft Pointer to the host buffer type context.
- * @return Const pointer to the C-style string containing the name.
- */
-GGML_CALL static const char * ggml_backend_cann_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
-    return "CANN_Host";
-
-    GGML_UNUSED(buft);
-}
-
-/**
- * @brief Retrieves the name associated with a CANN host buffer.
- *
- * This function returns the descriptive name associated with the specified
- * CANN host buffer context.
- *
- * @param buft Pointer to the host buffer context.
- * @return Const pointer to the C-style string containing the name.
- */
-GGML_CALL static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buffer) {
-    return "CANN_Host";
-
-    GGML_UNUSED(buffer);
-}
-
-/**
- * @brief Free resources associated with a CANN host buffer.
- *
- * This function frees the resources associated with a CANN host buffer, including
- * its context.
- *
- * @param buffer The CANN host buffer to free.
- */
-GGML_CALL static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
-    ACL_CHECK(aclrtFreeHost(buffer->context));
-}
-
-/**
- * @brief Allocates a new CANN host buffer of the specified size.
- *
- * This function allocates a new CANN host buffer with the given size.
- * @param size Size in bytes of the host buffer to allocate.
- * @return Pointer to the allocated host buffer, or nullptr if allocation fails.
- */
-static void * ggml_cann_host_malloc(size_t size) {
-    if (getenv("GGML_CANN_NO_PINNED") != nullptr) {
-        return nullptr;
-    }
-
-    void * hostPtr = nullptr;
-    aclError err = aclrtMallocHost((void **) &hostPtr, size);
-    if (err != ACL_SUCCESS) {
-
-        GGML_CANN_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
-                           size / 1024.0 / 1024.0, aclGetRecentErrMsg());
-        return nullptr;
-    }
-    return hostPtr;
-}
-
-/**
- * @brief Allocates a new CANN host buffer of the specified type and size.
- *
- * @param buft Pointer to the host buffer type context.
- * @param size Size in bytes of the host buffer to allocate.
- * @return Pointer to the allocated host buffer, or CPU buffer pointer if allocation fails.
- */
-GGML_CALL static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    void * hostPtr = ggml_cann_host_malloc(size);
-
-    if (hostPtr == nullptr) {
-        // fallback to cpu buffer
-        return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
-    }
-
-    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(hostPtr, size);
-    buffer->buft = buft;
-    buffer->iface.get_name = ggml_backend_cann_host_buffer_name;
-    buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free;
-
-    return buffer;
-}
-
-/**
- * @brief Interface for managing CANN host buffer types in the GGML backend.
- *
- * Provides function pointers for allocating, querying properties, and managing
- * memory for CANN buffer types in the GGML backend.
- */
-GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
-    static struct ggml_backend_buffer_type ggml_backend_cann_buffer_type_host = {
-        /* .iface    = */ {
-            /* .get_name         = */ ggml_backend_cann_host_buffer_type_name,
-            /* .alloc_buffer     = */ ggml_backend_cann_host_buffer_type_alloc_buffer,
-            /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
-            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
-            /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
-            /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
-        },
-        /* .context  = */ nullptr,
-    };
-
-    return &ggml_backend_cann_buffer_type_host;
-}
-
 /**
 * @brief Computes the forward operation for a given tensor using CANN
 * operations.
@@ -2053,7 +1943,7 @@ GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device) {
        GGML_CANN_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
        return nullptr;
    }
-    ggml_cann_set_device(ctx->device);
+
    ggml_backend_t cann_backend =
        new ggml_backend{/* .guid      = */ ggml_backend_cann_guid(),
                         /* .interface = */ ggml_backend_cann_interface,
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -26,11 +26,7 @@ void ggml_cuda_op_mul_mat_q(
    // nrows_dst == nrows of the matrix that the kernel writes into
    const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;

-    // The stream-k decomposition is only faster for recent NVIDIA GPUs.
-    // Also its fixup needs to allocate a temporary buffer in the memory pool.
-    // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
-    const bool use_stream_k = compute_capability >= CC_VOLTA && compute_capability < CC_OFFSET_AMD && src1_ncols == ne11;
-    const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k};
+    const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst};

    switch (src0->type) {
        case GGML_TYPE_Q4_0:
--- a/ggml/src/ggml-cuda/mmq.cuh
+++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -2742,7 +2742,6 @@ struct mmq_args {
    int64_t ne00; int64_t ne01; int64_t stride01;
    int64_t ne10; int64_t ne11; int64_t stride11;
    int64_t ne0;
-    bool use_stream_k;
 };

 template<ggml_type type>
@@ -2778,7 +2777,8 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
    const int ntx = (args.ne11 + mmq_x - 1) / mmq_x;
    const dim3 block_nums_xy_tiling(nty, ntx, 1);

-    if (!args.use_stream_k) {
+    const bool use_stream_k = cc >= CC_VOLTA && cc < CC_OFFSET_AMD;
+    if (!use_stream_k) {
        if (args.ne01 % mmq_y == 0) {
            constexpr bool need_check = false;
            mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, shmem, stream>>>
--- a/ggml/src/ggml-cuda/vendors/musa.h
+++ b/ggml/src/ggml-cuda/vendors/musa.h
@@ -130,3 +130,42 @@
 #define cudaKernelNodeParams musaKernelNodeParams
 #define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
 #define cudaStreamEndCapture musaStreamEndCapture
+
+// XXX: Clang builtins mapping
+#define __vsub4   __vsub4_musa
+#define __vcmpeq4 __vcmpeq4_musa
+#define __vcmpne4 __vcmpne4_musa
+
+#ifndef __has_builtin
+    #define __has_builtin(x) 0
+#endif
+
+typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
+
+static __device__ __forceinline__ int __vsub4_musa(const int a, const int b) { // stand-in for __vsub4 (see the #define above)
+    return __vsubss4(a, b); // NOTE(review): in CUDA, __vsubss4 is the signed-saturating per-byte subtract while __vsub4 wraps around - confirm the saturating variant is intended
+}
+
+static __device__ __forceinline__ unsigned int __vcmpeq4_musa(unsigned int a, unsigned int b) { // stand-in for __vcmpeq4: per-byte equality mask of a and b
+    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a); // view a as four 8-bit lanes
+    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b); // view b as four 8-bit lanes
+    unsigned int c; // every byte of c is written through vc in the loop below
+    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
+#pragma unroll
+    for (int i = 0; i < 4; ++i) {
+        vc[i] = va[i] == vb[i] ? 0xff : 0x00; // 0xff where the lanes are equal, 0x00 otherwise
+    }
+    return c;
+}
+
+static __device__ __forceinline__ unsigned int __vcmpne4_musa(unsigned int a, unsigned int b) { // stand-in for __vcmpne4: per-byte inequality mask of a and b
+    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a); // view a as four 8-bit lanes
+    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b); // view b as four 8-bit lanes
+    unsigned int c; // every byte of c is written through vc in the loop below
+    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
+#pragma unroll
+    for (int i = 0; i < 4; ++i) {
+        vc[i] = va[i] == vb[i] ? 0x00 : 0xff; // 0xff where the lanes differ, 0x00 otherwise
+    }
+    return c;
+}
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -773,6 +773,19 @@ struct ggml_cgraph {
    struct ggml_hash_set visited_hash_set;

    enum ggml_cgraph_eval_order order;
+
+    // TODO: after the CPU-only API is removed, we can move the members below to ggml_backend_plan_cpu
+
+    bool      work_own; // presumably true when work_data is owned by the graph itself - TODO confirm
+    size_t    work_size; // size of work buffer, calculated by `ggml_graph_prepare()`
+    uint8_t * work_data; // work buffer, must be set (e.g. via `ggml_graph_work_init()`) before calling `ggml_graph_compute()`
+
+    int n_threads;
+    struct ggml_threadpool * threadpool;
+
+    // abort ggml_graph_compute when true
+    ggml_abort_callback abort_callback;
+    void *              abort_callback_data;
 };

 struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -13,16 +13,13 @@
 #define MAX(a, b) ((a) > (b) ? (a) : (b))

 #ifdef GGML_METAL_NDEBUG
-#define GGML_METAL_LOG(...)
 #define GGML_METAL_LOG_INFO(...)
 #define GGML_METAL_LOG_WARN(...)
 #define GGML_METAL_LOG_ERROR(...)
 #else
-#define GGML_METAL_LOG(...)       ggml_metal_log(GGML_LOG_LEVEL_NONE,  __VA_ARGS__)
 #define GGML_METAL_LOG_INFO(...)  ggml_metal_log(GGML_LOG_LEVEL_INFO,  __VA_ARGS__)
 #define GGML_METAL_LOG_WARN(...)  ggml_metal_log(GGML_LOG_LEVEL_WARN,  __VA_ARGS__)
 #define GGML_METAL_LOG_ERROR(...) ggml_metal_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
-#define GGML_METAL_LOG_DEBUG(...) ggml_metal_log(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
 #endif

 #define UNUSED(x) (void)(x)
@@ -3186,7 +3183,7 @@ static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t s
 #ifndef GGML_METAL_NDEBUG
 #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
    if (@available(macOS 10.12, iOS 16.0, *)) {
-        GGML_METAL_LOG_DEBUG("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)\n",
+        GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)",
                __func__,
                size_aligned / 1024.0 / 1024.0,
                device.currentAllocatedSize / 1024.0 / 1024.0,
@@ -3194,6 +3191,8 @@ static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t s

        if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
            GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
+        } else {
+            GGML_METAL_LOG_INFO("\n");
        }
    } else {
        GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f)\n",
@@ -3225,19 +3224,15 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buff
    ctx->n_buffers = 1;

    if (ctx->all_data != NULL) {
-        ctx->buffers[0].data  = ctx->all_data;
-        ctx->buffers[0].size  = size;
-        ctx->buffers[0].metal = nil;
-
-        if (size_aligned > 0) {
-            ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data
-                            length:size_aligned
-                            options:MTLResourceStorageModeShared
-                            deallocator:nil];
-        }
+        ctx->buffers[0].data = ctx->all_data;
+        ctx->buffers[0].size = size;
+        ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data
+                        length:size_aligned
+                        options:MTLResourceStorageModeShared
+                        deallocator:nil];
    }

-    if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) {
+    if (ctx->all_data == NULL || ctx->buffers[0].metal == nil) {
        GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
        free(ctx);
        ggml_backend_metal_free_device();
@@ -3314,17 +3309,14 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data,

    // the buffer fits into the max buffer size allowed by the device
    if (size_aligned <= device.maxBufferLength) {
-        ctx->buffers[ctx->n_buffers].data  = data;
-        ctx->buffers[ctx->n_buffers].size  = size;
-        ctx->buffers[ctx->n_buffers].metal = nil;
+        ctx->buffers[ctx->n_buffers].data = data;
+        ctx->buffers[ctx->n_buffers].size = size;

-        if (size_aligned > 0) {
-            ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
+        ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];

-            if (ctx->buffers[ctx->n_buffers].metal == nil) {
-                GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
-                return false;
-            }
+        if (ctx->buffers[ctx->n_buffers].metal == nil) {
+            GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
+            return false;
        }

        ggml_backend_metal_log_allocated_size(device, size_aligned);
@@ -3340,17 +3332,14 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data,
        for (size_t i = 0; i < size; i += size_step) {
            const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);

-            ctx->buffers[ctx->n_buffers].data  = (void *) ((uint8_t *) data + i);
-            ctx->buffers[ctx->n_buffers].size  = size_step_aligned;
-            ctx->buffers[ctx->n_buffers].metal = nil;
+            ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i);
+            ctx->buffers[ctx->n_buffers].size = size_step_aligned;

-            if (size_step_aligned > 0) {
-                ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
+            ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];

-                if (ctx->buffers[ctx->n_buffers].metal == nil) {
-                    GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0);
-                    return false;
-                }
+            if (ctx->buffers[ctx->n_buffers].metal == nil) {
+                GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0);
+                return false;
            }

            ggml_backend_metal_log_allocated_size(device, size_step_aligned);
--- a/ggml/src/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl.cpp
@@ -5137,17 +5137,13 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
        case GGML_OP_SCALE:
        case GGML_OP_SQR:
        case GGML_OP_CLAMP:
-            return true;
        case GGML_OP_CONT:
-            return op->src[0]->type != GGML_TYPE_BF16;
        case GGML_OP_DIAG_MASK_INF:
        case GGML_OP_SOFT_MAX:
            return true;
        case GGML_OP_ROPE:
            return ggml_is_contiguous(op->src[0]);
        case GGML_OP_IM2COL:
-            // TODO: add support for the new F32 operations
-            return op->src[0]->type == GGML_TYPE_F16;
        case GGML_OP_POOL_2D:
        case GGML_OP_SUM_ROWS:
        case GGML_OP_ARGSORT:
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -2001,7 +2001,6 @@ struct ggml_threadpool {
    ggml_cond_t  cond;        // cond.var for waiting for new work

    struct ggml_cgraph * cgraph;
-    struct ggml_cplan  * cplan;

    // synchronization primitives
    atomic_int n_graph;       // incremented when there is work to be done (i.e each graph)
@@ -3399,7 +3398,7 @@ double ggml_type_sizef(enum ggml_type type) {
 }

 GGML_CALL const char * ggml_type_name(enum ggml_type type) {
-    return type < GGML_TYPE_COUNT ? type_traits[type].type_name : "NONE";
+    return type_traits[type].type_name;
 }

 GGML_CALL bool ggml_is_quantized(enum ggml_type type) {
@@ -19089,14 +19088,21 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
    assert(obj_size == (size_t)((char *)p - (char *)cgraph));

    *cgraph = (struct ggml_cgraph) {
-        /*.size         =*/ size,
-        /*.n_nodes      =*/ 0,
-        /*.n_leafs      =*/ 0,
-        /*.nodes        =*/ nodes_ptr,
-        /*.grads        =*/ grads_ptr,
-        /*.leafs        =*/ leafs_ptr,
-        /*.hash_table   =*/ { hash_size, hash_used, hash_keys_ptr },
-        /*.order        =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
+        /*.size                =*/ size,
+        /*.n_nodes             =*/ 0,
+        /*.n_leafs             =*/ 0,
+        /*.nodes               =*/ nodes_ptr,
+        /*.grads               =*/ grads_ptr,
+        /*.leafs               =*/ leafs_ptr,
+        /*.visited_hash_set    =*/ { hash_size, hash_used, hash_keys_ptr },
+        /*.order               =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
+        /*.work_own            =*/ false,
+        /*.work_size           =*/ 0,
+        /*.work_data           =*/ NULL,
+        /*.n_threads           =*/ GGML_DEFAULT_N_THREADS,
+        /*.threadpool          =*/ NULL,
+        /*.abort_callback      =*/ NULL,
+        /*.abort_callback_data =*/ NULL,
    };

    ggml_hash_set_reset(&cgraph->visited_hash_set);
@@ -19110,14 +19116,21 @@ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {

 struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) {
    struct ggml_cgraph cgraph = {
-        /*.size         =*/ 0,
-        /*.n_nodes      =*/ i1 - i0,
-        /*.n_leafs      =*/ 0,
-        /*.nodes        =*/ cgraph0->nodes + i0,
-        /*.grads        =*/ cgraph0->grads ? cgraph0->grads + i0 : NULL,
-        /*.leafs        =*/ NULL,
-        /*.hash_table   =*/ { 0, NULL, NULL },
-        /*.order        =*/ cgraph0->order,
+        /*.size                =*/ 0,
+        /*.n_nodes             =*/ i1 - i0,
+        /*.n_leafs             =*/ 0,
+        /*.nodes               =*/ cgraph0->nodes + i0,
+        /*.grads               =*/ cgraph0->grads ? cgraph0->grads + i0 : NULL,
+        /*.leafs               =*/ NULL,
+        /*.visited_hash_set    =*/ { 0, NULL, NULL },
+        /*.order               =*/ cgraph0->order,
+        /*.work_own            =*/ false,
+        /*.work_size           =*/ 0,
+        /*.work_data           =*/ NULL,
+        /*.n_threads           =*/ GGML_DEFAULT_N_THREADS,
+        /*.threadpool          =*/ NULL,
+        /*.abort_callback      =*/ NULL,
+        /*.abort_callback_data =*/ NULL,
    };

    return cgraph;
@@ -19202,7 +19215,6 @@ int ggml_graph_n_nodes(struct ggml_cgraph * cgraph) {
 }

 void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) {
-    GGML_ASSERT(cgraph->size > cgraph->n_nodes);
    cgraph->nodes[cgraph->n_nodes] = tensor;
    cgraph->n_nodes++;
 }
@@ -19754,11 +19766,10 @@ void ggml_threadpool_resume(struct ggml_threadpool * threadpool) {
 #endif
 }

-struct ggml_cplan ggml_graph_plan(
-          const struct ggml_cgraph * cgraph,
-                           int       n_threads,
-    struct ggml_threadpool * threadpool) {
-
+enum ggml_status ggml_graph_prepare(
+            struct ggml_cgraph * cgraph,
+                           int   n_threads,
+        struct ggml_threadpool * threadpool) {
    if (threadpool == NULL) {
        GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
    }
@@ -19768,9 +19779,6 @@ struct ggml_cplan ggml_graph_plan(

    size_t work_size = 0;

-    struct ggml_cplan cplan;
-    memset(&cplan, 0, sizeof(struct ggml_cplan));
-
    int max_tasks = 1;

    // thread scheduling for the different operations + work buffer size estimation
@@ -19922,28 +19930,63 @@ struct ggml_cplan ggml_graph_plan(
        work_size += CACHE_LINE_SIZE*(n_threads);
    }

-    cplan.threadpool = threadpool;
-    cplan.n_threads  = MIN(max_tasks, n_threads);
-    cplan.work_size  = work_size;
-    cplan.work_data  = NULL;
+    cgraph->threadpool = threadpool;
+    cgraph->n_threads  = MIN(max_tasks, n_threads);
+    cgraph->work_size  = work_size;

-    return cplan;
+    ggml_graph_work_free(cgraph);
+
+    return GGML_STATUS_SUCCESS;
+}
+
+size_t ggml_graph_work_size(const struct ggml_cgraph * cgraph) {
+    return cgraph->work_size; // work buffer size in bytes, as estimated by the last ggml_graph_prepare call
+}
+
+// allocate the work buffer (cgraph->work_size bytes): malloc'ed when ctx == NULL, otherwise a new object inside ctx
+enum ggml_status ggml_graph_work_init(struct ggml_cgraph * cgraph, struct ggml_context * ctx) {
+    GGML_ASSERT(cgraph->n_threads > 0 && "call ggml_graph_prepare first");
+    ggml_graph_work_free(cgraph); // drop the buffer of a previous init, if any
+
+    if (cgraph->work_size > 0) {
+        if (ctx == NULL) {
+            cgraph->work_data = GGML_ALIGNED_MALLOC(cgraph->work_size);
+            if (cgraph->work_data == NULL) {
+                return GGML_STATUS_ALLOC_FAILED;
+            }
+            cgraph->work_own = true;  // ggml_graph_work_free has to release it
+        } else {
+            struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cgraph->work_size);
+            if (obj == NULL) { // NOTE(review): assumes ggml_new_object reports a full ctx with NULL - confirm
+                return GGML_STATUS_ALLOC_FAILED;
+            }
+            cgraph->work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
+            cgraph->work_own  = false; // the memory belongs to ctx
+        }
+    }
+    return GGML_STATUS_SUCCESS;
+}
+
+void ggml_graph_work_free(struct ggml_cgraph * cgraph) {
+    // release only a buffer malloc'ed by ggml_graph_work_init; ctx-backed memory stays with its context
+    if (cgraph->work_data && cgraph->work_own) { GGML_ALIGNED_FREE(cgraph->work_data); }
+    cgraph->work_data = NULL;  // always forget the pointer, so a stale buffer cannot pass the check in ggml_graph_compute
+    cgraph->work_own  = false;
 }

 static thread_ret_t ggml_graph_compute_thread(void * data) {
    struct ggml_compute_state * state = (struct ggml_compute_state *) data;

    const struct ggml_cgraph * cgraph = state->threadpool->cgraph;
-    const struct ggml_cplan  * cplan  = state->threadpool->cplan;

    set_numa_thread_affinity(state->ith);

    struct ggml_compute_params params = {
-        /*.ith       =*/ state->ith,
-        /*.nth       =*/ state->threadpool->n_threads_cur,
-        /*.wsize     =*/ cplan->work_size,
-        /*.wdata     =*/ cplan->work_data,
-        /*.threadpool=*/ state->threadpool,
+        /*.ith        =*/ state->ith,
+        /*.nth        =*/ state->threadpool->n_threads_cur,
+        /*.wsize      =*/ cgraph->work_size,
+        /*.wdata      =*/ cgraph->work_data,
+        /*.threadpool =*/ state->threadpool,
    };

    for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
@@ -19951,7 +19994,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {

        ggml_compute_forward(&params, node);

-        if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
+        if (state->ith == 0 && cgraph->abort_callback && cgraph->abort_callback(cgraph->abort_callback_data)) {
            state->threadpool->ec = GGML_STATUS_ABORTED;
        }

@@ -20105,14 +20148,12 @@ bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, cons

 static struct ggml_threadpool * ggml_threadpool_new_impl(
    struct ggml_threadpool_params * tpp,
-               struct ggml_cgraph * cgraph,
-                struct ggml_cplan * cplan) {
+               struct ggml_cgraph * cgraph) {

    struct ggml_threadpool * threadpool =
        GGML_ALIGNED_MALLOC(sizeof(struct ggml_threadpool));
    {
        threadpool->cgraph           = cgraph;
-        threadpool->cplan            = cplan;
        threadpool->n_graph          = 0;
        threadpool->n_barrier        = 0;
        threadpool->n_barrier_passed = 0;
@@ -20170,16 +20211,15 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
 }

 struct ggml_threadpool * ggml_threadpool_new(struct ggml_threadpool_params * tpp) {
-    return ggml_threadpool_new_impl(tpp, NULL, NULL);
+    return ggml_threadpool_new_impl(tpp, NULL);
 }

-enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
-    GGML_ASSERT(cplan);
-    GGML_ASSERT(cplan->n_threads > 0);
-    GGML_ASSERT(cplan->work_size == 0 || cplan->work_data != NULL);
+enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph) {
+    GGML_ASSERT((cgraph->n_threads > 0                              ) && "call ggml_graph_prepare first");
+    GGML_ASSERT((cgraph->work_size == 0 || cgraph->work_data != NULL) && "call ggml_graph_work_init first");

-    int n_threads                               = cplan->n_threads;
-    struct ggml_threadpool * threadpool = cplan->threadpool;
+    int n_threads = cgraph->n_threads;
+    struct ggml_threadpool * threadpool = cgraph->threadpool;

    bool disposable_threadpool = false;

@@ -20188,19 +20228,18 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
        disposable_threadpool = true;

        struct ggml_threadpool_params ttp = ggml_threadpool_params_default(n_threads);
-        threadpool = ggml_threadpool_new_impl(&ttp, cgraph, cplan);
+        threadpool = ggml_threadpool_new_impl(&ttp, cgraph);
    } else {
        // Reset some of the parameters that need resetting
        // No worker threads should be accessing the parameters below at this stage
-        threadpool->cgraph           = cgraph;
-        threadpool->cplan            = cplan;
-        threadpool->n_threads_cur    = n_threads;
-        threadpool->current_chunk    = 0;
-        threadpool->ec               = GGML_STATUS_SUCCESS;
+        threadpool->cgraph        = cgraph;
+        threadpool->n_threads_cur = n_threads;
+        threadpool->current_chunk = 0;
+        threadpool->ec            = GGML_STATUS_SUCCESS;
    }

    if (n_threads > threadpool->n_threads_max) {
-        GGML_PRINT("WARNING: cplan is requesting more threads than the threadpool contains. Expect a bad time!\n");
+        GGML_PRINT("WARNING: cgraph is requesting more threads than the threadpool contains. Expect a bad time!\n");
    }

 #ifdef GGML_USE_OPENMP
@@ -20239,14 +20278,9 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
    return ret;
 }

-enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
-    struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads, NULL);
-
-    struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
-
-    cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
-
-    return ggml_graph_compute(cgraph, &cplan);
+void ggml_graph_set_abort_callback(struct ggml_cgraph * cgraph, ggml_abort_callback abort_callback, void * abort_data) {
+    cgraph->abort_callback = abort_callback; // polled by the ith == 0 thread after each node; returning true sets GGML_STATUS_ABORTED
+    cgraph->abort_callback_data = abort_data; // handed back to abort_callback on every poll
 }

 struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) {
@@ -21056,9 +21090,8 @@ static enum ggml_opt_result ggml_opt_adam(

    float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values

-    struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads, NULL);
-    struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
-    cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
+    ggml_graph_prepare  (gb, params.n_threads, NULL);
+    ggml_graph_work_init(gb, ctx);

    bool cancel = false;

@@ -21074,7 +21107,7 @@ static enum ggml_opt_result ggml_opt_adam(
        }
        // ggml_graph_reset  (gf);
        ggml_set_f32      (f->grad, 1.0f);
-        ggml_graph_compute(gb, &cplan);
+        ggml_graph_compute(gb);
        ggml_opt_acc_grad(np, ps, g, accum_norm);
        fx += ggml_get_f32_1d(f, 0);
    }
@@ -21165,7 +21198,7 @@ static enum ggml_opt_result ggml_opt_adam(
            }
            // ggml_graph_reset  (gf);
            ggml_set_f32      (f->grad, 1.0f);
-            ggml_graph_compute(gb, &cplan);
+            ggml_graph_compute(gb);
            ggml_opt_acc_grad(np, ps, g, accum_norm);
            fx += ggml_get_f32_1d(f, 0);
        }
@@ -21250,7 +21283,6 @@ static enum ggml_opt_result linesearch_backtracking(
        const float * xp,
        struct ggml_tensor * f,
        struct ggml_cgraph * gb,
-        struct ggml_cplan  * cplan,
        const int np,
        struct ggml_tensor * ps[],
        bool * cancel,
@@ -21307,7 +21339,7 @@ static enum ggml_opt_result linesearch_backtracking(
                }
                // ggml_graph_reset  (gf);
                ggml_set_f32      (f->grad, 1.0f);
-                ggml_graph_compute(gb, cplan);
+                ggml_graph_compute(gb);
                ggml_opt_acc_grad(np, ps, g, accum_norm);
                *fx += ggml_get_f32_1d(f, 0);
            }
@@ -21403,9 +21435,8 @@ static enum ggml_opt_result ggml_opt_lbfgs(
        opt->iter = iter;
    }

-    struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads, NULL);
-    struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
-    cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
+    ggml_graph_prepare  (gb, params.n_threads, NULL);
+    ggml_graph_work_init(gb, ctx);

    float * x  = opt->lbfgs.x->data;  // current parameters
    float * xp = opt->lbfgs.xp->data; // previous parameters
@@ -21450,7 +21481,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
            }
            // ggml_graph_reset  (gf);
            ggml_set_f32      (f->grad, 1.0f);
-            ggml_graph_compute(gb, &cplan);
+            ggml_graph_compute(gb);
            ggml_opt_acc_grad(np, ps, g, accum_norm);
            fx += ggml_get_f32_1d(f, 0);
        }
@@ -21516,7 +21547,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
        //       to determine if the optimization should be cancelled
        //       this is a simple change, but not doing this atm, since I don't have a nice
        //       way to test and don't want to break something with so many changes lined up
-        ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data);
+        ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gb, np, ps, &cancel, callback, callback_data);
        if (cancel) {
            return GGML_OPT_RESULT_CANCEL;
        }
@@ -23288,14 +23319,6 @@ int ggml_cpu_has_arm_fma(void) {
 #endif
 }

-int ggml_cpu_has_riscv_v(void) {
-#if defined(__riscv_v_intrinsic)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
 int ggml_cpu_has_metal(void) {
 #if defined(GGML_USE_METAL)
    return 1;
--- a/include/llama.h
+++ b/include/llama.h
@@ -343,7 +343,7 @@ extern "C" {
        bool embeddings;  // if true, extract embeddings (together with logits)
        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
        bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
-        bool no_perf;     // whether to measure performance timings
+      //bool no_perf;     // whether to measure performance timings, TODO: implement

        // Abort callback
        // if it returns true, execution of llama_decode() will be aborted
@@ -1056,9 +1056,6 @@ extern "C" {
    LLAMA_API struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i);
    LLAMA_API int                    llama_sampler_chain_n  (const struct llama_sampler * chain);

-    // after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed
-    LLAMA_API struct llama_sampler * llama_sampler_chain_remove(   struct llama_sampler * chain, int32_t i);
-
    // available samplers:

    LLAMA_API struct llama_sampler * llama_sampler_init_greedy     (void);
@@ -1130,10 +1127,6 @@ extern "C" {
                             int32_t   n_logit_bias,
              const llama_logit_bias * logit_bias);

-
-    // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
-    LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
-
    /// @details Sample and accept a token from the idx-th output of the last evaluation
    //
    // Shorthand for:
@@ -1176,30 +1169,13 @@ extern "C" {
    // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
    //

-    struct llama_perf_context_data {
-        double t_start_ms;
-        double t_load_ms;
-        double t_p_eval_ms;
-        double t_eval_ms;
-
-        int32_t n_p_eval;
-        int32_t n_eval;
+    enum llama_perf_type {
+        LLAMA_PERF_TYPE_CONTEXT       = 0,
+        LLAMA_PERF_TYPE_SAMPLER_CHAIN = 1,
    };

-    struct llama_perf_sampler_data {
-        double t_sample_ms;
-
-        int32_t n_sample;
-    };
-
-    LLAMA_API struct llama_perf_context_data llama_perf_context      (const struct llama_context * ctx);
-    LLAMA_API void                           llama_perf_context_print(const struct llama_context * ctx);
-    LLAMA_API void                           llama_perf_context_reset(      struct llama_context * ctx);
-
-    // NOTE: the following work only with samplers constructed via llama_sampler_chain_init
-    LLAMA_API struct llama_perf_sampler_data llama_perf_sampler      (const struct llama_sampler * chain);
-    LLAMA_API void                           llama_perf_sampler_print(const struct llama_sampler * chain);
-    LLAMA_API void                           llama_perf_sampler_reset(      struct llama_sampler * chain);
+    LLAMA_API void llama_perf_print(const void * ctx, enum llama_perf_type type);
+    LLAMA_API void llama_perf_reset(      void * ctx, enum llama_perf_type type);

    LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx);

--- a/src/llama-impl.h
+++ b/src/llama-impl.h
@@ -24,7 +24,6 @@ LLAMA_ATTRIBUTE_FORMAT(2, 3)
 void llama_log_internal        (ggml_log_level level, const char * format, ...);
 void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);

-#define LLAMA_LOG(...)       llama_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__)
 #define LLAMA_LOG_INFO(...)  llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
 #define LLAMA_LOG_WARN(...)  llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
 #define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -8,7 +8,6 @@
 #include <cstring>
 #include <ctime>
 #include <cfloat>
-#include <chrono>
 #include <cmath>
 #include <numeric>
 #include <random>
@@ -163,19 +162,6 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
    cur_p->size = k;
 }

-static uint32_t get_rng_seed(uint32_t seed) {
-    if (seed == LLAMA_DEFAULT_SEED) {
-        // use system clock if std::random_device is not a true RNG
-        static bool is_rd_prng = std::random_device().entropy() == 0;
-        if (is_rd_prng) {
-            return (uint32_t) std::chrono::system_clock::now().time_since_epoch().count();
-        }
-        std::random_device rd;
-        return rd();
-    }
-    return seed;
-}
-
 // llama_sampler API

 const char * llama_sampler_name(const struct llama_sampler * smpl) {
@@ -349,26 +335,13 @@ void llama_sampler_chain_add(struct llama_sampler * chain, struct llama_sampler
 struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i) {
    const auto * p = (const llama_sampler_chain *) chain->ctx;

-    if (i < 0 || (size_t) i >= p->samplers.size()) {
+    if (i < 0 || i >= (int32_t) p->samplers.size()) {
        return nullptr;
    }

    return p->samplers[i];
 }

-struct llama_sampler * llama_sampler_chain_remove(struct llama_sampler * chain, int32_t i) {
-    auto * p = (llama_sampler_chain *) chain->ctx;
-
-    if (i < 0 || (size_t) i >= p->samplers.size()) {
-        return nullptr;
-    }
-
-    auto * result = p->samplers[i];
-    p->samplers.erase(p->samplers.begin() + i);
-
-    return result;
-}
-
 int llama_sampler_chain_n(const struct llama_sampler * chain) {
    const auto * p = (const llama_sampler_chain *) chain->ctx;

@@ -414,7 +387,6 @@ struct llama_sampler * llama_sampler_init_greedy() {

 struct llama_sampler_dist {
    const uint32_t seed;
-          uint32_t seed_cur;

    std::mt19937 rng;
 };
@@ -444,8 +416,7 @@ static struct llama_sampler * llama_sampler_dist_clone(const struct llama_sample

 static void llama_sampler_dist_reset(struct llama_sampler * smpl) {
    auto * ctx = (llama_sampler_dist *) smpl->ctx;
-    ctx->seed_cur = get_rng_seed(ctx->seed);
-    ctx->rng.seed(ctx->seed_cur);
+    ctx->rng = std::mt19937(ctx->seed);
 }

 static void llama_sampler_dist_free(struct llama_sampler * smpl) {
@@ -462,13 +433,11 @@ static struct llama_sampler_i llama_sampler_dist_i = {
 };

 struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
-    auto seed_cur = get_rng_seed(seed);
    return new llama_sampler {
        /* .iface = */ &llama_sampler_dist_i,
        /* .ctx   = */ new llama_sampler_dist {
-            /* .seed     = */ seed,
-            /* .seed_cur = */ seed_cur,
-            /* .rng      = */ std::mt19937(seed_cur),
+            /* .seed = */ seed,
+            /* .rng  = */ std::mt19937(seed),
        },
    };
 }
@@ -1063,7 +1032,6 @@ struct llama_sampler_mirostat {
    const int32_t n_vocab;

    const uint32_t seed;
-          uint32_t seed_cur;

    const float tau;
    const float eta;
@@ -1132,8 +1100,7 @@ static struct llama_sampler * llama_sampler_mirostat_clone(const struct llama_sa
 static void llama_sampler_mirostat_reset(struct llama_sampler * smpl) {
    auto * ctx = (llama_sampler_mirostat *) smpl->ctx;
    ctx->mu = 2.0f*ctx->tau;
-    ctx->seed_cur = get_rng_seed(ctx->seed);
-    ctx->rng.seed(ctx->seed_cur);
+    ctx->rng = std::mt19937(ctx->seed);
 }

 static void llama_sampler_mirostat_free(struct llama_sampler * smpl) {
@@ -1150,18 +1117,16 @@ static struct llama_sampler_i llama_sampler_mirostat_i = {
 };

 struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t seed, float tau, float eta, int32_t m) {
-    auto seed_cur = get_rng_seed(seed);
    return new llama_sampler {
        /* .iface = */ &llama_sampler_mirostat_i,
        /* .ctx   = */ new llama_sampler_mirostat {
-            /* .n_vocab  = */ n_vocab,
-            /* .seed     = */ seed,
-            /* .seed_cur = */ seed_cur,
-            /* .tau      = */ tau,
-            /* .eta      = */ eta,
-            /* .m        = */ m,
-            /* .mu       = */ 2.0f*tau,
-            /* .rng      = */ std::mt19937(seed_cur),
+            /* .n_vocab = */ n_vocab,
+            /* .seed    = */ seed,
+            /* .tau     = */ tau,
+            /* .eta     = */ eta,
+            /* .m       = */ m,
+            /* .mu      = */ 2.0f*tau,
+            /* .rng     = */ std::mt19937(seed),
        },
    };
 }
@@ -1170,7 +1135,6 @@ struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t see

 struct llama_sampler_mirostat_v2 {
    const uint32_t seed;
-          uint32_t seed_cur;

    const float tau;
    const float eta;
@@ -1215,8 +1179,7 @@ static void llama_sampler_mirostat_v2_apply(struct llama_sampler * smpl, llama_t
 static void llama_sampler_mirostat_v2_reset(struct llama_sampler * smpl) {
    auto * ctx = (llama_sampler_mirostat_v2 *) smpl->ctx;
    ctx->mu = 2.0f*ctx->tau;
-    ctx->seed_cur = get_rng_seed(ctx->seed);
-    ctx->rng.seed(ctx->seed_cur);
+    ctx->rng = std::mt19937(ctx->seed);
 }

 static struct llama_sampler * llama_sampler_mirostat_v2_clone(const struct llama_sampler * smpl) {
@@ -1249,16 +1212,14 @@ static struct llama_sampler_i llama_sampler_mirostat_v2_i = {
 };

 struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau, float eta) {
-    auto seed_cur = get_rng_seed(seed);
    return new llama_sampler {
        /* .iface = */ &llama_sampler_mirostat_v2_i,
        /* .ctx   = */ new llama_sampler_mirostat_v2 {
-            /* .seed     = */ seed,
-            /* .seed_cur = */ seed_cur,
-            /* .tau      = */ tau,
-            /* .eta      = */ eta,
-            /* .mu       = */ 2.0f*tau,
-            /* .rng      = */ std::mt19937(seed_cur),
+            /* .seed  = */ seed,
+            /* .tau   = */ tau,
+            /* .eta   = */ eta,
+            /* .mu    = */ 2.0f*tau,
+            /* .rng   = */ std::mt19937(seed),
        },
    };
 }
@@ -1544,8 +1505,6 @@ struct llama_sampler * llama_sampler_init_penalties(
        ignore_eos = false;
    }

-    penalty_last_n = std::max(penalty_last_n, 0);
-
    return new llama_sampler {
        /* .iface = */ &llama_sampler_penalties_i,
        /* .ctx   = */ new llama_sampler_penalties {
@@ -1609,7 +1568,6 @@ static void llama_sampler_logit_bias_apply(struct llama_sampler * smpl, llama_to
        }
    }
 }
-
 static struct llama_sampler * llama_sampler_logit_bias_clone(const struct llama_sampler * smpl) {
    const auto * ctx = (const llama_sampler_logit_bias *) smpl->ctx;
    return llama_sampler_init_logit_bias(ctx->n_vocab, ctx->logit_bias.size(), ctx->logit_bias.data());
@@ -1641,65 +1599,3 @@ struct llama_sampler * llama_sampler_init_logit_bias(
        },
    };
 }
-
-// utils
-
-uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) {
-    if (smpl->iface == &llama_sampler_dist_i) {
-        return ((const llama_sampler_dist *) smpl->ctx)->seed_cur;
-    }
-
-    if (smpl->iface == &llama_sampler_mirostat_i) {
-        return ((const llama_sampler_mirostat *) smpl->ctx)->seed_cur;
-    }
-
-    if (smpl->iface == &llama_sampler_mirostat_v2_i) {
-        return ((const llama_sampler_mirostat_v2 *) smpl->ctx)->seed_cur;
-    }
-
-    if (smpl->iface == &llama_sampler_chain_i) {
-        const auto * ctx = (const llama_sampler_chain *) smpl->ctx;
-        for (auto it = ctx->samplers.rbegin(); it != ctx->samplers.rend(); ++it) {
-            const uint32_t seed = llama_sampler_get_seed(*it);
-            if (seed != LLAMA_DEFAULT_SEED) {
-                return seed;
-            }
-        }
-    }
-
-    return LLAMA_DEFAULT_SEED;
-}
-
-// perf
-
-struct llama_perf_sampler_data llama_perf_sampler(const struct llama_sampler * chain) {
-    struct llama_perf_sampler_data data = {};
-
-    if (chain == nullptr || chain->iface != &llama_sampler_chain_i) {
-        GGML_ABORT("%s: invalid sampler passed - requires a sampler created with llama_sampler_chain_init()\n", __func__);
-    }
-
-    const auto * ctx = (const struct llama_sampler_chain *) chain->ctx;
-
-    data.t_sample_ms = 1e-3 * ctx->t_sample_us;
-    data.n_sample    = std::max(0, ctx->n_sample);
-
-    return data;
-}
-
-void llama_perf_sampler_print(const struct llama_sampler * chain) {
-    const auto data = llama_perf_sampler(chain);
-
-    LLAMA_LOG_INFO("%s:    sampling time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample);
-}
-
-void llama_perf_sampler_reset(struct llama_sampler * chain) {
-    if (chain == nullptr || chain->iface != &llama_sampler_chain_i) {
-        GGML_ABORT("%s: invalid sampler passed - requires a sampler created with llama_sampler_chain_init()\n", __func__);
-    }
-
-    auto * ctx = (struct llama_sampler_chain *) chain->ctx;
-
-    ctx->t_sample_us = ctx->n_sample = 0;
-}
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2156,10 +2156,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
    if (host_buffer) {
        buft = ggml_backend_sycl_host_buffer_type();
    }
-#elif defined(GGML_USE_CANN)
-    if (host_buffer) {
-        buft = ggml_backend_cann_host_buffer_type();
-    }
 #elif defined(GGML_USE_CPU_HBM)
    buft = ggml_backend_cpu_hbm_buffer_type();
 #elif defined(GGML_USE_VULKAN)
@@ -2486,7 +2482,6 @@ struct llama_cparams {
    bool causal_attn;
    bool offload_kqv;
    bool flash_attn;
-    bool no_perf;

    enum llama_pooling_type pooling_type;

@@ -6662,6 +6657,8 @@ static bool llm_load_tensors(
        bool use_mlock,
        llama_progress_callback progress_callback,
        void * progress_callback_user_data) {
+    model.t_start_us = ggml_time_us();
+
    auto & hparams = model.hparams;

    model.split_mode   = split_mode;
@@ -8592,13 +8589,14 @@ static bool llm_load_tensors(
        }
    }

+    // loading time will be recalculated after the first eval, so
+    // we take page faults deferred by mmap() into consideration
+    model.t_load_us = ggml_time_us() - model.t_start_us;
    return true;
 }

 // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
 static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
-    model.t_start_us = ggml_time_us();
-
    try {
        llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);

@@ -8660,10 +8658,6 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
        return -1;
    }

-    // loading time will be recalculate after the first eval, so
-    // we take page faults deferred by mmap() into consideration
-    model.t_load_us = ggml_time_us() - model.t_start_us;
-
    return 0;
 }

@@ -15826,7 +15820,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {

            // clear unused states
            for (int i = 0; i < n_kv; ++i) {
-                const uint32_t  cell_id = i + kv_self.head;
+                uint32_t        cell_id = i + kv_self.head;
                llama_kv_cell & kv_cell = lctx.kv_self.cells[cell_id];

                data[i] = (float) (kv_cell.src >= 0);
@@ -16082,21 +16076,19 @@ static int llama_decode_internal(
        return -1;
    }

+    for (uint32_t i = 0; batch_all.token && i < n_tokens_all; ++i) { // token is NULL for embd-only batches: skip validation
+        if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= lctx.model.vocab.n_vocab) {
+            LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
+            return -1;
+        }
+    }
+
    const auto & model   = lctx.model;
    const auto & hparams = model.hparams;
    const auto & cparams = lctx.cparams;

    GGML_ASSERT((!batch_all.token && batch_all.embd) || (batch_all.token && !batch_all.embd)); // NOLINT

-    if (batch_all.token) {
-        for (uint32_t i = 0; i < n_tokens_all; ++i) {
-            if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= model.vocab.n_vocab) {
-                LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
-                return -1;
-            }
-        }
-    }
-
    GGML_ASSERT(n_tokens_all <= cparams.n_batch);

    GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens");
@@ -16383,21 +16375,19 @@ static int llama_encode_internal(
        return -1;
    }

+    for (uint32_t i = 0; batch.token && i < n_tokens; ++i) { // token is NULL for embd-only batches: skip validation
+        if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= lctx.model.vocab.n_vocab) {
+            LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
+            return -1;
+        }
+    }
+
    const auto & model   = lctx.model;
    const auto & hparams = model.hparams;
    const auto & cparams = lctx.cparams;

    GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT

-    if (batch.token) {
-        for (uint32_t i = 0; i < n_tokens; ++i) {
-            if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= model.vocab.n_vocab) {
-                LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
-                return -1;
-            }
-        }
-    }
-
    // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot
    GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens");

@@ -17951,7 +17941,6 @@ struct llama_context_params llama_context_default_params() {
        /*.embeddings                  =*/ false,
        /*.offload_kqv                 =*/ true,
        /*.flash_attn                  =*/ false,
-        /*.no_perf                     =*/ true,
        /*.abort_callback              =*/ nullptr,
        /*.abort_callback_data         =*/ nullptr,
    };
@@ -18074,9 +18063,9 @@ struct llama_model * llama_load_model_from_file(
            unsigned percentage = (unsigned) (100 * progress);
            while (percentage > *cur_percentage_p) {
                *cur_percentage_p = percentage;
-                LLAMA_LOG(".");
+                LLAMA_LOG_INFO(".");
                if (percentage >= 100) {
-                    LLAMA_LOG("\n");
+                    LLAMA_LOG_INFO("\n");
                }
            }
            return true;
@@ -18162,7 +18151,6 @@ struct llama_context * llama_new_context_with_model(
    cparams.embeddings       = params.embeddings;
    cparams.offload_kqv      = params.offload_kqv;
    cparams.flash_attn       = params.flash_attn;
-    cparams.no_perf          = params.no_perf;
    cparams.pooling_type     = params.pooling_type;

    cparams.n_ctx            = params.n_ctx           == 0    ? hparams.n_ctx_train           : params.n_ctx;
@@ -20081,14 +20069,10 @@ void llama_synchronize(struct llama_context * ctx) {

    // add the evaluation to the stats
    if (ctx->n_queued_tokens == 1) {
-        if (!ctx->cparams.no_perf) {
-            ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us;
-        }
+        ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us;
        ctx->n_eval++;
    } else if (ctx->n_queued_tokens > 1) {
-        if (!ctx->cparams.no_perf) {
-            ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us;
-        }
+        ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us;
        ctx->n_p_eval += ctx->n_queued_tokens;
    }

@@ -20684,7 +20668,6 @@ const char * llama_print_system_info(void) {
    s += "ARM_FMA = "     + std::to_string(ggml_cpu_has_arm_fma())     + " | ";
    s += "F16C = "        + std::to_string(ggml_cpu_has_f16c())        + " | ";
    s += "FP16_VA = "     + std::to_string(ggml_cpu_has_fp16_va())     + " | ";
-    s += "RISCV_VECT = "  + std::to_string(ggml_cpu_has_riscv_v())     + " | ";
    s += "WASM_SIMD = "   + std::to_string(ggml_cpu_has_wasm_simd())   + " | ";
    s += "BLAS = "        + std::to_string(ggml_cpu_has_blas())        + " | ";
    s += "SSE3 = "        + std::to_string(ggml_cpu_has_sse3())        + " | ";
@@ -20696,40 +20679,65 @@ const char * llama_print_system_info(void) {
    return s.c_str();
 }

-struct llama_perf_context_data llama_perf_context(const struct llama_context * ctx) {
-    struct llama_perf_context_data data = {};
+void llama_perf_print(const void * ctx, enum llama_perf_type type) {
+    switch (type) {
+        case LLAMA_PERF_TYPE_CONTEXT:
+            {
+                const auto * p = (const struct llama_context *) ctx;

-    if (ctx == nullptr) {
-        return data;
+                const double t_start_ms   = 1e-3 * p->t_start_us;
+                const double t_end_ms     = 1.00 * ggml_time_ms();
+                const double t_load_ms    = 1e-3 * p->t_load_us;
+                const double t_p_eval_ms  = 1e-3 * p->t_p_eval_us;
+                const double t_eval_ms    = 1e-3 * p->t_eval_us;
+
+                const int32_t n_p_eval  = std::max(0, p->n_p_eval);
+                const int32_t n_eval    = std::max(1, p->n_eval);
+
+                LLAMA_LOG_INFO("%s:        load time = %10.2f ms\n", __func__, t_load_ms);
+                LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+                        __func__, t_p_eval_ms, n_p_eval, t_p_eval_ms / n_p_eval, 1e3 / t_p_eval_ms * n_p_eval);
+                LLAMA_LOG_INFO("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+                        __func__, t_eval_ms, n_eval, t_eval_ms / n_eval, 1e3 / t_eval_ms * n_eval);
+                LLAMA_LOG_INFO("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - t_start_ms), (n_p_eval + n_eval));
+            } break;
+        case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
+            {
+                const auto * smpl = (const struct llama_sampler *) ctx;
+                const auto * p = (const struct llama_sampler_chain *) smpl->ctx;
+
+                const double t_sampler_ms = 1e-3 * p->t_sample_us;
+
+                const int32_t n_sampler = std::max(0, p->n_sample);
+
+                LLAMA_LOG_INFO("%s:    sampling time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+                        __func__, t_sampler_ms, n_sampler, t_sampler_ms / n_sampler, 1e3 / t_sampler_ms * n_sampler);
+            } break;
+        default:
+            GGML_ABORT("invalid perf type");
    }
-
-    data.t_start_ms  = 1e-3 * ctx->t_start_us;
-    data.t_load_ms   = 1e-3 * ctx->t_load_us;
-    data.t_p_eval_ms = 1e-3 * ctx->t_p_eval_us;
-    data.t_eval_ms   = 1e-3 * ctx->t_eval_us;
-    data.n_p_eval    = std::max(1, ctx->n_p_eval);
-    data.n_eval      = std::max(1, ctx->n_eval);
-
-    return data;
 }

-void llama_perf_context_print(const struct llama_context * ctx) {
-    const auto data = llama_perf_context(ctx);
+void llama_perf_reset(void * ctx, enum llama_perf_type type) {
+    switch (type) {
+        case LLAMA_PERF_TYPE_CONTEXT:
+            {
+                auto * p = (struct llama_context *) ctx;

-    const double t_end_ms = 1e-3 * ggml_time_us();
+                p->t_start_us  = ggml_time_us();
+                p->t_eval_us   = p->n_eval = 0;
+                p->t_p_eval_us = p->n_p_eval = 0;
+            } break;
+        case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
+            {
+                auto * smpl = (struct llama_sampler *) ctx;
+                auto * p = (struct llama_sampler_chain *) smpl->ctx;

-    LLAMA_LOG_INFO("%s:        load time = %10.2f ms\n", __func__, data.t_load_ms);
-    LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
-    LLAMA_LOG_INFO("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
-    LLAMA_LOG_INFO("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
-}
-
-void llama_perf_context_reset(struct llama_context * ctx) {
-    ctx->t_start_us  = ggml_time_us();
-    ctx->t_eval_us   = ctx->n_eval = 0;
-    ctx->t_p_eval_us = ctx->n_p_eval = 0;
+                p->t_sample_us = p->n_sample = 0;
+            } break;
+        default:
+            GGML_ABORT("invalid perf type");
+    }
 }

 void llama_perf_dump_yaml(FILE * stream, const llama_context * ctx) {
@@ -20781,8 +20789,8 @@ static void llama_log_internal_v(ggml_log_level level, const char * format, va_l
    if (len < 128) {
        g_state.log_callback(level, buffer, g_state.log_callback_user_data);
    } else {
-        char * buffer2 = new char[len + 1];
-        vsnprintf(buffer2, len + 1, format, args_copy);
+        char* buffer2 = new char[len+1];
+        vsnprintf(buffer2, len+1, format, args_copy);
        buffer2[len] = 0;
        g_state.log_callback(level, buffer2, g_state.log_callback_user_data);
        delete[] buffer2;
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -108,7 +108,6 @@ llama_test(test-tokenizer-1-spm  NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CU
 #llama_test(test-tokenizer-1-spm  NAME test-tokenizer-1-baichuan  ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)

 # llama_target_and_test(test-double-float.cpp) # SLOW
-llama_target_and_test(test-log.cpp)
 llama_target_and_test(test-arg-parser.cpp)
 llama_target_and_test(test-quantize-fns.cpp)
 llama_target_and_test(test-quantize-perf.cpp)
--- a/tests/test-arg-parser.cpp
+++ b/tests/test-arg-parser.cpp
@@ -85,7 +85,7 @@ int main(void) {

    argv = {"binary_name", "--verbose"};
    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
-    assert(params.verbosity > 1);
+    assert(params.verbosity == 1);

    argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"};
    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
--- a/tests/test-grad0.cpp
+++ b/tests/test-grad0.cpp
@@ -242,12 +242,16 @@ static bool check_gradient(
    ggml_graph_cpy(gf, gb);
    ggml_build_backward_expand(ctx0, gf, gb, false);

-    ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
+    ggml_graph_prepare(gf, n_threads, nullptr);
+    ggml_graph_work_init(gf, ctx0);
+    ggml_graph_compute(gf);

    ggml_graph_reset  (gf);
    ggml_set_f32      (f->grad, 1.0f);

-    ggml_graph_compute_with_ctx(ctx0, gb, n_threads);
+    ggml_graph_prepare(gb, n_threads, nullptr);
+    ggml_graph_work_init(gb, ctx0);
+    ggml_graph_compute(gb);

    // ggml_graph_dump_dot(gf, NULL, "test-grad0-forward.dot");
    // ggml_graph_dump_dot(gb, gf,  "test-grad0-backward.dot");
@@ -262,13 +266,17 @@ static bool check_gradient(
            const float xp = x0 + eps;
            ggml_set_f32_1d(x[i], k, xp);

-            ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
+            ggml_graph_prepare(gf, n_threads, nullptr);
+            ggml_graph_work_init(gf, ctx0);
+            ggml_graph_compute(gf);

            const double f0 = ggml_get_f32_1d(f, 0);

            ggml_set_f32_1d(x[i], k, xm);

-            ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
+            ggml_graph_prepare(gf, n_threads, nullptr);
+            ggml_graph_work_init(gf, ctx0);
+            ggml_graph_compute(gf);

            const double f1 = ggml_get_f32_1d(f, 0);
            const double g0 = (f0 - f1)/(2.0*(double) eps);
@@ -301,7 +309,9 @@ static bool check_gradient(
            ggml_graph_reset  (gf);
            ggml_set_f32      (f->grad, 1.0f);

-            ggml_graph_compute_with_ctx(ctx0, gb, n_threads);
+            ggml_graph_prepare(gb, n_threads, nullptr);
+            ggml_graph_work_init(gb, ctx0);
+            ggml_graph_compute(gb);

            const double g1 = ggml_get_f32_1d(x[i]->grad, k);

--- a/tests/test-log.cpp
+++ b/tests/test-log.cpp
@@ -1,39 +0,0 @@
-#include "log.h"
-
-#include <cstdlib>
-#include <thread>
-
-int main() {
-    const int n_thread = 8;
-
-    std::thread threads[n_thread];
-    for (int i = 0; i < n_thread; i++) {
-        threads[i] = std::thread([i]() {
-            const int n_msg = 1000;
-
-            for (int j = 0; j < n_msg; j++) {
-                const int log_type = std::rand() % 4;
-
-                switch (log_type) {
-                    case 0: LOG_INF("Thread %d: %d\n", i, j); break;
-                    case 1: LOG_WRN("Thread %d: %d\n", i, j); break;
-                    case 2: LOG_ERR("Thread %d: %d\n", i, j); break;
-                    case 3: LOG_DBG("Thread %d: %d\n", i, j); break;
-                    default:
-                        break;
-                }
-
-                if (rand () % 10 < 5) {
-                    gpt_log_set_timestamps(gpt_log_main(), rand() % 2);
-                    gpt_log_set_prefix    (gpt_log_main(), rand() % 2);
-                }
-            }
-        });
-    }
-
-    for (int i = 0; i < n_thread; i++) {
-        threads[i].join();
-    }
-
-    return 0;
-}
--- a/tests/test-opt.cpp
+++ b/tests/test-opt.cpp
@@ -113,7 +113,10 @@ int main(void) {
    ggml_build_forward_expand(ge, e);
    ggml_graph_reset(ge);

-    ggml_graph_compute_with_ctx(ctx, ge, /*n_threads*/ 1);
+    ggml_graph_prepare(ge, 1, nullptr);
+    ggml_graph_work_init(ge, nullptr);
+    ggml_graph_compute(ge);
+    ggml_graph_work_free(ge);

    const float fe = ggml_get_f32_1d(e, 0);
    printf("%s: e = %.4f\n", __func__, fe);
@@ -124,7 +127,10 @@ int main(void) {

    ggml_graph_reset(ge);

-    ggml_graph_compute_with_ctx(ctx, ge, /*n_threads*/ 1);
+    ggml_graph_prepare(ge, 1, nullptr);
+    ggml_graph_work_init(ge, nullptr);
+    ggml_graph_compute(ge);
+    ggml_graph_work_free(ge);

    const float fe_opt = ggml_get_f32_1d(e, 0);
    printf("%s: original  e = %.4f\n", __func__, fe);
--- a/tests/test-rope.cpp
+++ b/tests/test-rope.cpp
@@ -112,17 +112,6 @@ static struct ggml_tensor * get_random_tensor_f32(
    return result;
 }

-static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
-
-    if (plan.work_size > 0) {
-        buf.resize(plan.work_size);
-        plan.work_data = buf.data();
-    }
-
-    ggml_graph_compute(graph, &plan);
-}
-
 int main(int /*argc*/, const char ** /*argv*/) {
    struct ggml_init_params params = {
        /* .mem_size   = */ 128*1024*1024,
@@ -130,8 +119,6 @@ int main(int /*argc*/, const char ** /*argv*/) {
        /* .no_alloc   = */ false,
    };

-    std::vector<uint8_t> work_buffer;
-
    struct ggml_context * ctx0 = ggml_init(params);

    struct ggml_tensor * x;
@@ -175,7 +162,10 @@ int main(int /*argc*/, const char ** /*argv*/) {
        ggml_build_forward_expand(gf, r1);
        ggml_build_forward_expand(gf, r2);

-        ggml_graph_compute_helper(work_buffer, gf, 4);
+        ggml_graph_prepare(gf, 4, nullptr);
+        ggml_graph_work_init(gf, nullptr);
+        ggml_graph_compute(gf);
+        ggml_graph_work_free(gf);

        // check that r1 and r2 are the same
        {
Author	SHA1	Message	Date
Georgi Gerganov	f9968f661d	ggml : update comments [no ci]	2024-09-11 13:16:39 +03:00
Georgi Gerganov	119e0bc9ae	ggml : remove ggml_cplan + rework ggml_cgraph ggml-ci	2024-09-11 13:05:10 +03:00
Georgi Gerganov	ee154457dd	ggml : fix compiler warnings Some checks failed flake8 Lint / Lint (push) Has been cancelled ggml-ci	2024-09-11 13:03:18 +03:00
Georgi Gerganov	92a96865cd	ggml : add ggml-impl.h to backends	2024-09-11 10:07:21 +03:00
Georgi Gerganov	c8a3f291fe	ggml : hide ggml_object, ggml_cgraph, ggml_hash_set ggml-ci	2024-09-10 16:42:16 +03:00