cont : fix comments

context : simplify output counting logic during decode
ggml-ci
2026-04-23 16:37:33 +03:00 · 2025-06-12 10:43:55 +03:00 · 2025-06-12 10:35:09 +03:00 · 2025-06-12 10:10:45 +03:00
22 changed files with 252 additions and 343 deletions
--- a/.devops/intel.Dockerfile
+++ b/.devops/intel.Dockerfile
@@ -49,23 +49,19 @@ COPY --from=build /app/full /app

 WORKDIR /app

-RUN apt-get update && \
-    apt-get install -y \
-        git \
-        python3 \
-        python3-pip \
-        python3-venv && \
-    python3 -m venv /opt/venv && \
-    . /opt/venv/bin/activate && \
-    pip install --upgrade pip setuptools wheel && \
-    pip install -r requirements.txt && \
-    apt autoremove -y && \
-    apt clean -y && \
-    rm -rf /tmp/* /var/tmp/* && \
-    find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
-    find /var/cache -type f -delete
+RUN apt-get update \
+    && apt-get install -y \
+    git \
+    python3 \
+    python3-pip \
+    && pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt \
+    && apt autoremove -y \
+    && apt clean -y \
+    && rm -rf /tmp/* /var/tmp/* \
+    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+    && find /var/cache -type f -delete

-ENV PATH="/opt/venv/bin:$PATH"

 ENTRYPOINT ["/app/tools.sh"]

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -89,14 +89,6 @@ option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake)

-if (NOT DEFINED LLAMA_BUILD_NUMBER)
-    set(LLAMA_BUILD_NUMBER        ${BUILD_NUMBER})
-endif()
-if (NOT DEFINED LLAMA_BUILD_COMMIT)
-    set(LLAMA_BUILD_COMMIT        ${BUILD_COMMIT})
-endif()
-set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
-
 # override ggml options
 set(GGML_ALL_WARNINGS   ${LLAMA_ALL_WARNINGS})
 set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})
@@ -163,8 +155,6 @@ if (LLAMA_USE_SYSTEM_GGML)
 endif()

 if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML)
-    set(GGML_BUILD_NUMBER ${LLAMA_BUILD_NUMBER})
-    set(GGML_BUILD_COMMIT ${LLAMA_BUILD_COMMIT})
    add_subdirectory(ggml)
    # ... otherwise assume ggml is added by a parent CMakeLists.txt
 endif()
@@ -214,6 +204,10 @@ endif()
 include(GNUInstallDirs)
 include(CMakePackageConfigHelpers)

+set(LLAMA_BUILD_NUMBER        ${BUILD_NUMBER})
+set(LLAMA_BUILD_COMMIT        ${BUILD_COMMIT})
+set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
+
 set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header  files")
 set(LLAMA_LIB_INSTALL_DIR     ${CMAKE_INSTALL_LIBDIR}     CACHE PATH "Location of library files")
 set(LLAMA_BIN_INSTALL_DIR     ${CMAKE_INSTALL_BINDIR}     CACHE PATH "Location of binary  files")
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@
 [![Release](https://img.shields.io/github/v/release/ggml-org/llama.cpp)](https://github.com/ggml-org/llama.cpp/releases)
 [![Server](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml)

-[Roadmap](https://github.com/users/ggerganov/projects/7) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml)
+[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggml-org/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml)

 Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++

@@ -18,6 +18,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
 ## Hot topics

 - 🔥 Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
+- **GGML developer experience survey (organized and reviewed by NVIDIA):** [link](https://forms.gle/Gasw3cRgyhNEnrwK9)
 - A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141](https://github.com/ggml-org/llama.cpp/pull/13141)), `libllava` will be deprecated
 - VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
 - Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -23,21 +23,31 @@ if(EXISTS "${PROJECT_SOURCE_DIR}/.git")
    endif()

    if(EXISTS "${GIT_DIR}/index")
-        # For build-info.cpp below
-        set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS "${GIT_DIR}/index")
+        set(GIT_INDEX "${GIT_DIR}/index")
    else()
        message(WARNING "Git index not found in git repository.")
+        set(GIT_INDEX "")
    endif()
 else()
    message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
+    set(GIT_INDEX "")
 endif()

-set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in")
-set(OUTPUT_FILE   "${CMAKE_CURRENT_BINARY_DIR}/build-info.cpp")
-configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
-
+# Add a custom command to rebuild build-info.cpp when .git/index changes
+add_custom_command(
+    OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp"
+    COMMENT "Generating build details from Git"
+    COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
+            -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
+            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+            -DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME} -DCMAKE_SYSTEM_PROCESSOR=${CMAKE_SYSTEM_PROCESSOR}
+            -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake"
+    WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}"
+    DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
+    VERBATIM
+)
 set(TARGET build_info)
-add_library(${TARGET} OBJECT ${OUTPUT_FILE})
+add_library(${TARGET} OBJECT build-info.cpp)
 if (BUILD_SHARED_LIBS)
    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 endif()
--- a/common/build-info.cpp.in
+++ b/common/build-info.cpp.in
@@ -1,4 +1,4 @@
-int LLAMA_BUILD_NUMBER = @LLAMA_BUILD_NUMBER@;
-char const *LLAMA_COMMIT = "@LLAMA_BUILD_COMMIT@";
+int LLAMA_BUILD_NUMBER = @BUILD_NUMBER@;
+char const *LLAMA_COMMIT = "@BUILD_COMMIT@";
 char const *LLAMA_COMPILER = "@BUILD_COMPILER@";
 char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@";
--- a/common/cmake/build-info-gen-cpp.cmake
+++ b/common/cmake/build-info-gen-cpp.cmake
@@ -0,0 +1,34 @@
+# Script-mode generator for common/build-info.cpp (invoked via `cmake -P`).
+#
+# Renders build-info.cpp.in into build-info.cpp using the BUILD_* values that
+# cmake/build-info.cmake is expected to define (BUILD_NUMBER, BUILD_COMMIT,
+# BUILD_COMPILER, BUILD_TARGET). All paths are resolved against
+# CMAKE_CURRENT_SOURCE_DIR, so the script must be run from the project root.
+include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
+
+set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in")
+set(OUTPUT_FILE   "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp")
+
+# Only write the build info if it changed
+if(EXISTS "${OUTPUT_FILE}")
+    file(READ "${OUTPUT_FILE}" CONTENTS)
+    # CONTENTS must be expanded quoted: the generated C++ is full of ';', and an
+    # unquoted expansion would be split into a list and re-joined without them,
+    # so the patterns below (which end in ';') could never match.
+    string(REGEX MATCH "LLAMA_COMMIT = \"([^\"]*)\";" _ "${CONTENTS}")
+    set(OLD_COMMIT "${CMAKE_MATCH_1}")
+    string(REGEX MATCH "LLAMA_COMPILER = \"([^\"]*)\";" _ "${CONTENTS}")
+    set(OLD_COMPILER "${CMAKE_MATCH_1}")
+    string(REGEX MATCH "LLAMA_BUILD_TARGET = \"([^\"]*)\";" _ "${CONTENTS}")
+    set(OLD_TARGET "${CMAKE_MATCH_1}")
+    # Regenerate only when one of the recorded values differs from the current one
+    if (
+        NOT OLD_COMMIT   STREQUAL BUILD_COMMIT   OR
+        NOT OLD_COMPILER STREQUAL BUILD_COMPILER OR
+        NOT OLD_TARGET   STREQUAL BUILD_TARGET
+    )
+        configure_file("${TEMPLATE_FILE}" "${OUTPUT_FILE}")
+    endif()
+else()
+    configure_file("${TEMPLATE_FILE}" "${OUTPUT_FILE}")
+endif()
--- a/docs/multimodal.md
+++ b/docs/multimodal.md
@@ -107,7 +107,3 @@ NOTE: some models may require large context window, for example: `-c 8192`
 (tool_name) -hf ggml-org/Qwen2.5-Omni-3B-GGUF
 (tool_name) -hf ggml-org/Qwen2.5-Omni-7B-GGUF
 ```
-
-## Finding more models:
-
-GGUF models on Huggingface with vision capabilities can be found here: https://huggingface.co/models?pipeline_tag=image-text-to-text&sort=trending&search=gguf
--- a/ggml/src/ggml-metal/CMakeLists.txt
+++ b/ggml/src/ggml-metal/CMakeLists.txt
@@ -44,22 +44,21 @@ if (GGML_METAL_EMBED_LIBRARY)
    set(METALLIB_SOURCE_EMBED_TMP "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.metal.tmp")

    add_custom_command(
-        OUTPUT "${METALLIB_EMBED_ASM}"
+        OUTPUT ${METALLIB_EMBED_ASM}
        COMMAND echo "Embedding Metal library"
-        COMMAND sed -e "/__embed_ggml-common.h__/r ${METALLIB_COMMON}"       -e "/__embed_ggml-common.h__/d"         < "${METALLIB_SOURCE}"           > "${METALLIB_SOURCE_EMBED_TMP}"
-        COMMAND sed -e "/\#include \"ggml-metal-impl.h\"/r ${METALLIB_IMPL}" -e "/\#include \"ggml-metal-impl.h\"/d" < "${METALLIB_SOURCE_EMBED_TMP}" > "${METALLIB_SOURCE_EMBED}"
-        COMMAND echo ".section __DATA,__ggml_metallib"          >  "${METALLIB_EMBED_ASM}"
-        COMMAND echo ".globl _ggml_metallib_start"              >> "${METALLIB_EMBED_ASM}"
-        COMMAND echo "_ggml_metallib_start:"                    >> "${METALLIB_EMBED_ASM}"
-        COMMAND echo .incbin "\"${METALLIB_SOURCE_EMBED}\""     >> "${METALLIB_EMBED_ASM}"
-        COMMAND echo ".globl _ggml_metallib_end"                >> "${METALLIB_EMBED_ASM}"
-        COMMAND echo "_ggml_metallib_end:"                      >> "${METALLIB_EMBED_ASM}"
+        COMMAND sed -e '/__embed_ggml-common.h__/r         ${METALLIB_COMMON}' -e '/__embed_ggml-common.h__/d'         < ${METALLIB_SOURCE}           > ${METALLIB_SOURCE_EMBED_TMP}
+        COMMAND sed -e '/\#include \"ggml-metal-impl.h\"/r ${METALLIB_IMPL}'   -e '/\#include \"ggml-metal-impl.h\"/d' < ${METALLIB_SOURCE_EMBED_TMP} > ${METALLIB_SOURCE_EMBED}
+        COMMAND echo ".section __DATA,__ggml_metallib"          >  ${METALLIB_EMBED_ASM}
+        COMMAND echo ".globl _ggml_metallib_start"              >> ${METALLIB_EMBED_ASM}
+        COMMAND echo "_ggml_metallib_start:"                    >> ${METALLIB_EMBED_ASM}
+        COMMAND echo ".incbin \\\"${METALLIB_SOURCE_EMBED}\\\"" >> ${METALLIB_EMBED_ASM}
+        COMMAND echo ".globl _ggml_metallib_end"                >> ${METALLIB_EMBED_ASM}
+        COMMAND echo "_ggml_metallib_end:"                      >> ${METALLIB_EMBED_ASM}
        DEPENDS ../ggml-common.h ggml-metal.metal ggml-metal-impl.h
        COMMENT "Generate assembly for embedded Metal library"
-        VERBATIM
    )

-    target_sources(ggml-metal PRIVATE "${METALLIB_EMBED_ASM}")
+    target_sources(ggml-metal PRIVATE ${METALLIB_EMBED_ASM})
 else()
    if (GGML_METAL_SHADER_DEBUG)
        # custom command to do the following:
--- a/ggml/src/ggml-sycl/CMakeLists.txt
+++ b/ggml/src/ggml-sycl/CMakeLists.txt
@@ -142,7 +142,7 @@ else()
        FetchContent_Declare(
            ONEMATH
            GIT_REPOSITORY https://github.com/uxlfoundation/oneMath.git
-            GIT_TAG 8efe85f5aaebb37f1d8c503b7af66315feabf142
+            GIT_TAG c255b1b4c41e2ee3059455c1f96a965d6a62568a
        )
        FetchContent_MakeAvailable(ONEMATH)
        # Create alias to match with find_package targets name
--- a/ggml/src/ggml-sycl/common.hpp
+++ b/ggml/src/ggml-sycl/common.hpp
@@ -513,9 +513,9 @@ constexpr size_t ceil_div(const size_t m, const size_t n) {

 bool gpu_has_xmx(sycl::device &dev);

-template <int N, class T> std::string debug_get_array_str(const std::string & prefix, const T array[N]) {
+template <int N, class T> void debug_print_array(const std::string & prefix, const T array[N]) {
    if (LIKELY(!g_ggml_sycl_debug)) {
-        return "";
+        return;
    }
    std::stringstream ss;
    ss << prefix << "=[";
@@ -526,26 +526,29 @@ template <int N, class T> std::string debug_get_array_str(const std::string & pr
        ss << array[N - 1];
    }
    ss << "]";
-    return ss.str();
+    GGML_SYCL_DEBUG("%s", ss.str().c_str());
 }

-inline std::string debug_get_tensor_str(const std::string &prefix,
-        const ggml_tensor *tensor, const std::string &suffix = "") {
-    std::stringstream ss;
-    if (LIKELY(!g_ggml_sycl_debug)) { return ss.str(); }
-    ss << prefix.c_str() << "=";
-    if (tensor) {
-        ss << "'" << tensor->name << "':type=" << ggml_type_name(tensor->type);
-        ss << debug_get_array_str<GGML_MAX_DIMS>(";ne", tensor->ne);
-        ss << debug_get_array_str<GGML_MAX_DIMS>(";nb", tensor->nb);
-
-        if (!ggml_is_contiguous(tensor)) { ss << ";strided"; }
-        if (ggml_is_permuted(tensor)) { ss << ";permuted"; }
-    } else {
-        ss << "nullptr";
+inline void debug_print_tensor(const std::string & prefix, const ggml_tensor * tensor,
+                               const std::string & suffix = "") {
+    if (LIKELY(!g_ggml_sycl_debug)) {
+        return;
    }
-    ss << suffix;
-    return ss.str();
+    GGML_SYCL_DEBUG("%s=", prefix.c_str());
+    if (tensor) {
+        GGML_SYCL_DEBUG("'%s':type=%s", tensor->name, ggml_type_name(tensor->type));
+        debug_print_array<GGML_MAX_DIMS>(";ne", tensor->ne);
+        debug_print_array<GGML_MAX_DIMS>(";nb", tensor->nb);
+        if (!ggml_is_contiguous(tensor)) {
+            GGML_SYCL_DEBUG(";strided");
+        }
+        if (ggml_is_permuted(tensor)) {
+            GGML_SYCL_DEBUG(";permuted");
+        }
+    } else {
+        GGML_SYCL_DEBUG("nullptr");
+    }
+    GGML_SYCL_DEBUG("%s", suffix.c_str());
 }

 // Use scope_op_debug_print to log operations coming from running a model
@@ -561,10 +564,10 @@ struct scope_op_debug_print {
            return;
        }
        GGML_SYCL_DEBUG("[SYCL][OP] call %s%s:", func.data(), func_suffix.data());
-        GGML_SYCL_DEBUG("%s", debug_get_tensor_str(" dst", dst).c_str());
+        debug_print_tensor(" dst", dst);
        if (dst) {
            for (std::size_t i = 0; i < num_src; ++i) {
-                GGML_SYCL_DEBUG("%s", debug_get_tensor_str("\tsrc" + std::to_string(i), dst->src[i]).c_str());
+                debug_print_tensor("\tsrc" + std::to_string(i), dst->src[i]);
            }
        }
        GGML_SYCL_DEBUG("%s\n", suffix.data());
--- a/ggml/src/ggml-sycl/cpy.cpp
+++ b/ggml/src/ggml-sycl/cpy.cpp
@@ -723,7 +723,8 @@ static void ggml_cpy_q4_1_q4_1(const char * cx, char * cdst, const int ne, const

 void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1) try {
    // Unlike other operators ggml_sycl_cpy takes 2 distinct tensors instead of a dst ggml_tensor and rely on its src field
-    scope_op_debug_print scope_dbg_print(__func__, src1, /*num_src=*/0, debug_get_tensor_str("\tsrc0", src0));
+    scope_op_debug_print scope_dbg_print(__func__, src1, /*num_src=*/0,
+                                         std::string(" src0 type=") + ggml_type_name(src0->type));
    const int64_t ne = ggml_nelements(src0);
    GGML_ASSERT(ne == ggml_nelements(src1));

--- a/ggml/src/ggml-sycl/gemm.hpp
+++ b/ggml/src/ggml-sycl/gemm.hpp
@@ -65,9 +65,6 @@ public:

        dnnl::primitive_attr primitive_attr;
        primitive_attr.set_scratchpad_mode(dnnl::scratchpad_mode::user);
-#ifdef GGML_SYCL_F16
-        primitive_attr.set_fpmath_mode(dnnl::fpmath_mode::f16);
-#endif

        auto a_mem = dnnl::memory(a_in_md, eng, const_cast<void*>(a));
        auto b_mem = dnnl::memory(b_in_md, eng, const_cast<void*>(b));
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -347,7 +347,7 @@ static enum ggml_status
 ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
                                     ggml_tensor *tensor) try {
    GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
-    GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor, "\n").c_str());
+    debug_print_tensor(": tensor", tensor, "\n");  // helper appends '=' after the prefix
    ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;

    if (tensor->view_src != NULL) {
@@ -385,7 +385,7 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
                                                const void *data, size_t offset,
                                                size_t size) try {
    GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
-    GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
+    debug_print_tensor(": tensor", tensor);  // helper appends '=' after the prefix
    GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
    ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
    ggml_sycl_set_device(ctx->device);
@@ -413,7 +413,7 @@ static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer,
                                                void *data, size_t offset,
                                                size_t size) try {
    GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
-    GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
+    debug_print_tensor(": tensor", tensor);  // helper appends '=' after the prefix
    GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
    ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;

@@ -444,8 +444,8 @@ ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
                                    ggml_tensor *dst) try {
    bool is_cpy_supported = ggml_backend_buffer_is_sycl(src->buffer);
    GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
-    GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": dst", dst).c_str());
-    GGML_SYCL_DEBUG("%s", debug_get_tensor_str(" src", src).c_str());
+    debug_print_tensor(": dst", dst);  // helper appends '=' after the prefix
+    debug_print_tensor(" src", src);
    GGML_SYCL_DEBUG(" is_cpy_supported=%d\n", is_cpy_supported);
    if (is_cpy_supported) {
        ggml_backend_sycl_buffer_context * src_ctx = (ggml_backend_sycl_buffer_context *)src->buffer->context;
@@ -525,7 +525,7 @@ catch (sycl::exception const &exc) {
 static void ggml_backend_sycl_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value,
                                                   size_t offset, size_t size) {
    GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
-    GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
+    debug_print_tensor(": tensor", tensor);  // helper appends '=' after the prefix
    GGML_SYCL_DEBUG(" size=%zu offset=%zu value=%u\n", size, offset, value);
    ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *) buffer->context;
    SYCL_CHECK(ggml_sycl_set_device(ctx->device));
@@ -805,7 +805,7 @@ static enum ggml_status
 ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
                                           ggml_tensor *tensor) try {
    GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
-    GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor, "\n").c_str());
+    debug_print_tensor(": tensor", tensor, "\n");  // helper appends '=' after the prefix
    GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported

    ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
@@ -891,7 +891,7 @@ ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer,
                                          ggml_tensor *tensor, const void *data,
                                          size_t offset, size_t size) try {
    GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
-    GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
+    debug_print_tensor(": tensor", tensor);  // helper appends '=' after the prefix
    GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
    // split tensors must always be set in their entirety at once
    GGML_ASSERT(offset == 0);
@@ -947,7 +947,7 @@ ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer,
                                          const ggml_tensor *tensor, void *data,
                                          size_t offset, size_t size) try {
    GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
-    GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
+    debug_print_tensor(": tensor", tensor);  // helper appends '=' after the prefix
    GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
    // split tensors must always be set in their entirety at once
    GGML_ASSERT(offset == 0);
@@ -2127,18 +2127,21 @@ inline void ggml_sycl_op_mul_mat_sycl(
        const sycl::half *src1_ptr = src1->type == GGML_TYPE_F16
                ? (const sycl::half *)src1->data + src1_padded_row_size
                                         : src1_as_f16.get();
+        ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool(), row_diff * src1_ncols);

 #if GGML_SYCL_DNNL
        if (!g_ggml_sycl_disable_dnn) {
            DnnlGemmWrapper::row_gemm(ctx, src1_ncols, row_diff, ne10, src1_ptr,
                                      DnnlGemmWrapper::to_dt<sycl::half>(), src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
-                                      dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
+                                      dst_f16.get(), DnnlGemmWrapper::to_dt<sycl::half>(), stream);
+            scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2,
+                                                 " : converting dst to fp32");
+            const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
+            to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff* src1_ncols, stream);
        }
        else
 #endif
        {
-            ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool(), row_diff * src1_ncols);
-
            const sycl::half alpha_f16 = 1.0f;
            const sycl::half beta_f16  = 0.0f;
            SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm(
@@ -3863,7 +3866,7 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
                                               const void *data, size_t offset,
                                               size_t size) try {
    GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
-    GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
+    debug_print_tensor(": tensor", tensor);  // helper appends '=' after the prefix
    GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
@@ -3884,7 +3887,7 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
                                               void *data, size_t offset,
                                               size_t size) try {
    GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
-    GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
+    debug_print_tensor(": tensor", tensor);  // helper appends '=' after the prefix
    GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
@@ -3907,8 +3910,8 @@ static bool ggml_backend_sycl_cpy_tensor_async(ggml_backend_t backend,
    bool is_cpy_supported                = dst->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) &&
                            ggml_backend_buffer_is_sycl(src->buffer);
    GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
-    GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": dst", dst).c_str());
-    GGML_SYCL_DEBUG("%s", debug_get_tensor_str(" src", src).c_str());
+    debug_print_tensor(": dst", dst);  // helper appends '=' after the prefix
+    debug_print_tensor(" src", src);
    GGML_SYCL_DEBUG(" is_cpy_supported=%d\n", is_cpy_supported);
    if (is_cpy_supported) {
        /*
--- a/src/llama-batch.cpp
+++ b/src/llama-batch.cpp
@@ -1,13 +1,8 @@
 #include "llama-batch.h"

-#include "llama-impl.h"
-#include "llama-cparams.h"
-#include "llama-vocab.h"
-
 #include <cassert>
 #include <cstring>
 #include <algorithm>
-#include <sstream>

 llama_ubatch llama_sbatch::reserve_ubatch(size_t n_ubatch, bool has_embd) {
    // clear empty sequences
@@ -284,45 +279,9 @@ llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple
            );
 }

-llama_batch_allocr::llama_batch_allocr() {
-    const char * LLAMA_BATCH_DEBUG = getenv("LLAMA_BATCH_DEBUG");
-    debug = LLAMA_BATCH_DEBUG ? atoi(LLAMA_BATCH_DEBUG) : 0;
-}
-
-bool llama_batch_allocr::init(const llama_batch & batch_inp, const llama_vocab & vocab, llama_pos p0) {
-    clear();
-
-    batch = batch_inp;
-
+llama_batch_allocr::llama_batch_allocr(struct llama_batch in_batch, llama_pos p0) {
+    batch = in_batch;
    GGML_ASSERT(batch.n_tokens > 0);
-
-    if (!batch.pos) {
-        if (batch.seq_id) {
-            LLAMA_LOG_ERROR("%s: pos == NULL, but seq_id != NULL\n", __func__);
-            return false;
-        }
-    }
-
-    if (batch.token) {
-        for (int32_t i = 0; i < batch.n_tokens; ++i) {
-            if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= vocab.n_tokens()) {
-                LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
-                return false;
-            }
-        }
-    }
-
-    if (batch.seq_id) {
-        for (int32_t i = 0; i < batch.n_tokens; ++i) {
-            for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) {
-                if (batch.seq_id && (batch.seq_id[i][s] < 0 || batch.seq_id[i][s] >= LLAMA_MAX_PARALLEL_SEQUENCES)) {
-                    LLAMA_LOG_ERROR("%s: invalid seq_id[%d][%d] = %d > %d\n", __func__, i, s, batch.seq_id[i][s], LLAMA_MAX_PARALLEL_SEQUENCES);
-                    return false;
-                }
-            }
-        }
-    }
-
    if (!batch.pos) {
        assert(p0 >= 0);
        pos.resize(batch.n_tokens);
@@ -331,7 +290,6 @@ bool llama_batch_allocr::init(const llama_batch & batch_inp, const llama_vocab &
        }
        batch.pos = pos.data();
    }
-
    if (!batch.n_seq_id) {
        n_seq_id.resize(batch.n_tokens);
        for (int32_t i = 0; i < batch.n_tokens; i++) {
@@ -339,7 +297,6 @@ bool llama_batch_allocr::init(const llama_batch & batch_inp, const llama_vocab &
        }
        batch.n_seq_id = n_seq_id.data();
    }
-
    if (!batch.seq_id) {
        seq_id.resize(batch.n_tokens + 1);
        seq_id[batch.n_tokens] = NULL;
@@ -348,84 +305,12 @@ bool llama_batch_allocr::init(const llama_batch & batch_inp, const llama_vocab &
        }
        batch.seq_id = seq_id.data();
    }
-
    if (!batch.logits) {
        // by default return the output only for the last token
        output.resize(batch.n_tokens);
        output[output.size() - 1] = true;
        batch.logits = output.data();
    }
-
-    for (int32_t i = 0; i < batch.n_tokens; ++i) {
-        n_outputs += batch.logits[i] != 0;
-    }
-
-    if (debug > 0) {
-        LLAMA_LOG_DEBUG("%s: input batch info (p0 = %d):\n", __func__, p0);
-        LLAMA_LOG_DEBUG("%s:   n_tokens  = %d\n", __func__, batch.n_tokens);
-        LLAMA_LOG_DEBUG("%s:   token     = %p\n", __func__, (void *) batch.token);
-        LLAMA_LOG_DEBUG("%s:   embd      = %p\n", __func__, (void *) batch.embd);
-        LLAMA_LOG_DEBUG("%s:   pos       = %p\n", __func__, (void *) batch.pos);
-        LLAMA_LOG_DEBUG("%s:   n_seq_id  = %p\n", __func__, (void *) batch.n_seq_id);
-        LLAMA_LOG_DEBUG("%s:   seq_id    = %p\n", __func__, (void *) batch.seq_id);
-        LLAMA_LOG_DEBUG("%s:   logits    = %p\n", __func__, (void *) batch.logits);
-        LLAMA_LOG_DEBUG("%s:   n_outputs = %d\n", __func__, n_outputs);
-
-        if (debug > 1) {
-            int seq_id_max = 0;
-            for (int32_t i = 0; i < batch.n_tokens; ++i) {
-                for (int s = 0; s < batch.n_seq_id[i]; ++s) {
-                    for (int s = 0; s < batch.n_seq_id[i]; ++s) {
-                        seq_id_max = std::max(seq_id_max, batch.seq_id[i][s]);
-                    }
-                }
-            }
-            ++seq_id_max;
-
-            LLAMA_LOG_DEBUG("%s:   token     = [\n", __func__);
-            for (int32_t i = 0; i < batch.n_tokens; ++i) {
-                std::vector<int8_t> seq_id(seq_id_max);
-
-                for (int s = 0; s < batch.n_seq_id[i]; ++s) {
-                    seq_id[batch.seq_id[i][s]] = 1;
-                }
-
-                std::stringstream ss;
-                for (int s = 0; s < seq_id_max; ++s) {
-                    if (seq_id[s]) {
-                        ss << s%10;
-                    } else {
-                        ss << ".";
-                    }
-                }
-
-                LLAMA_LOG_DEBUG("%s:  %4d: id = %6d (%16s), pos = %4d, n_seq_id = %2d, seq_id = [%s], output = %d\n",
-                        __func__, i, batch.token[i], vocab.token_to_piece(batch.token[i]).c_str(),
-                        batch.pos[i], batch.n_seq_id[i], ss.str().c_str(), batch.logits[i]);
-            }
-            LLAMA_LOG_DEBUG("%s:   ]\n", __func__);
-        }
-    }
-
-    return true;
-}
-
-const llama_batch & llama_batch_allocr::get_batch() const {
-    return batch;
-}
-
-uint32_t llama_batch_allocr::get_n_outputs() const {
-    return n_outputs;
-}
-
-void llama_batch_allocr::clear() {
-    n_outputs = 0;
-
-    batch = {};
-    pos.clear();
-    n_seq_id.clear();
-    seq_id.clear();
-    output.clear();
 }

 //
--- a/src/llama-batch.h
+++ b/src/llama-batch.h
@@ -18,8 +18,8 @@ struct llama_ubatch {
    llama_token  *  token;    // [n_tokens]
    float        *  embd;     // [n_embd, n_tokens]
    llama_pos    *  pos;      // [n_tokens]
-    int32_t      *  n_seq_id; // [n_seqs]
-    llama_seq_id ** seq_id;   // [n_seqs]
+    int32_t      *  n_seq_id; // [n_seqs] // TODO: remove, should belong to only 1 sequence
+    llama_seq_id ** seq_id;   // [n_seqs] // TODO: become llama_seq_id * seq_id;
    int8_t       *  output;   // [n_tokens]
 };

@@ -78,30 +78,15 @@ struct llama_sbatch {
 };

 // temporary allocate memory for the input batch if needed
-class llama_batch_allocr {
-public:
-    llama_batch_allocr();
-
-    // optionally fulfill the batch returned by llama_batch_get_one
-    bool init(const llama_batch & batch_inp, const llama_vocab & vocab, llama_pos p0);
-
-    const llama_batch & get_batch() const;
-
-    uint32_t get_n_outputs() const;
-
-private:
-    void clear();
-
-    llama_batch batch;
-
-    uint32_t n_outputs;
+struct llama_batch_allocr {
+    struct llama_batch batch;

    std::array<llama_seq_id, 1> seq_id_0 = { 0 }; // default sequence id
-
    std::vector<llama_pos>      pos;
    std::vector<int32_t>        n_seq_id;
    std::vector<llama_seq_id *> seq_id;
    std::vector<int8_t>         output;

-    int debug;
+    // optionally fulfill the batch returned by llama_batch_get_one
+    llama_batch_allocr(struct llama_batch in_batch, llama_pos p0);
 };
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1,7 +1,6 @@
 #include "llama-context.h"

 #include "llama-impl.h"
-#include "llama-batch.h"
 #include "llama-io.h"
 #include "llama-memory.h"
 #include "llama-mmap.h"
@@ -19,8 +18,7 @@
 llama_context::llama_context(
        const llama_model & model,
              llama_context_params params) :
-    model(model),
-    batch_allocr(std::make_unique<llama_batch_allocr>()) {
+    model(model) {
    LLAMA_LOG_INFO("%s: constructing llama_context\n", __func__);

    t_start_us = model.t_start_us;
@@ -496,7 +494,7 @@ float * llama_context::get_logits() {
 }

 float * llama_context::get_logits_ith(int32_t i) {
-    int64_t j = -1;
+    int32_t j = -1;

    try {
        if (logits == nullptr) {
@@ -519,7 +517,7 @@ float * llama_context::get_logits_ith(int32_t i) {
        }
        if (j >= n_outputs) {
            // This should not happen
-            throw std::runtime_error(format("corrupt output buffer (j=%" PRId64 ", n_outputs=%d)", j, n_outputs));
+            throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs));
        }

        return logits + j*model.vocab.n_tokens();
@@ -538,7 +536,7 @@ float * llama_context::get_embeddings() {
 }

 float * llama_context::get_embeddings_ith(int32_t i) {
-    int64_t j = -1;
+    int32_t j = -1;

    try {
        if (embd == nullptr) {
@@ -561,7 +559,7 @@ float * llama_context::get_embeddings_ith(int32_t i) {
        }
        if (j >= n_outputs) {
            // This should not happen
-            throw std::runtime_error(format("corrupt output buffer (j=%" PRId64 ", n_outputs=%d)", j, n_outputs));
+            throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs));
        }

        return embd + j*model.hparams.n_embd;
@@ -721,27 +719,40 @@ llm_graph_result_ptr llama_context::process_ubatch(const llama_ubatch & ubatch,
    return res;
 }

-int llama_context::encode(const llama_batch & batch_inp) {
-    if (batch_inp.n_tokens == 0) {
+int llama_context::encode(llama_batch & inp_batch) {
+    if (inp_batch.n_tokens == 0) {
        LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
        return -1;
    }

    // temporary allocate memory for the input batch if needed
    // note: during encode, we always pass the full sequence starting from pos = 0
-    if (!batch_allocr->init(batch_inp, model.vocab, batch_inp.pos ? -1 : 0)) {
-        LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
-        return -1;
-    }
+    llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : 0);

-    const llama_batch & batch = batch_allocr->get_batch();
+    const llama_batch & batch = batch_allocr.batch;
+    const int32_t n_tokens = batch.n_tokens;

-    const uint32_t n_tokens = batch.n_tokens;
+    const auto & hparams = model.hparams;

    GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT

+    // TODO: move the validation to the llama_batch_allocr
+    if (batch.token) {
+        for (int32_t i = 0; i < n_tokens; ++i) {
+            if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) {
+                LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
+                return -1;
+            }
+
+            if (batch.seq_id && (batch.seq_id[i][0] < 0 || batch.seq_id[i][0] >= LLAMA_MAX_PARALLEL_SEQUENCES)) {
+                LLAMA_LOG_ERROR("%s: invalid seq_id[%d] = %d >= %d\n", __func__, i, batch.seq_id[i][0], LLAMA_MAX_PARALLEL_SEQUENCES);
+                return -1; // report via the status code like the token check above, instead of throwing a bare int
+            }
+        }
+    }
+
    // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot
-    GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens");
+    GGML_ASSERT(cparams.n_ubatch >= (uint32_t) n_tokens && "encoder requires n_ubatch >= n_tokens");

    if (t_compute_start_us == 0) {
        t_compute_start_us = ggml_time_us();
@@ -752,8 +763,6 @@ int llama_context::encode(const llama_batch & batch_inp) {

    n_queued_tokens += n_tokens;

-    const auto & hparams = model.hparams;
-
    const int64_t n_embd = hparams.n_embd;

    llama_sbatch sbatch = llama_sbatch(batch, n_embd, /* simple_split */ true);
@@ -766,7 +775,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
        return -2;
    };

-    for (uint32_t i = 0; i < n_tokens; ++i) {
+    for (int32_t i = 0; i < n_tokens; ++i) {
        output_ids[i] = i;
    }

@@ -822,8 +831,7 @@ int llama_context::encode(const llama_batch & batch_inp) {

                    GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits

-                    // TODO: fix indexing [UBATCH_IDX]
-                    for (uint32_t i = 0; i < n_tokens; i++) {
+                    for (int32_t i = 0; i < n_tokens; i++) {
                        const llama_seq_id seq_id = ubatch.seq_id[i][0];
                        if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
                            continue;
@@ -838,7 +846,6 @@ int llama_context::encode(const llama_batch & batch_inp) {
                    auto & embd_seq_out = embd_seq;
                    const uint32_t n_cls_out = hparams.n_cls_out;

-                    // TODO: fix indexing [UBATCH_IDX]
                    for (uint32_t s = 0; s < ubatch.n_seqs; ++s) {
                        const llama_seq_id seq_id = ubatch.seq_id[s][0];
                        if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
@@ -871,11 +878,13 @@ int llama_context::encode(const llama_batch & batch_inp) {
        memcpy(cross.v_embd.data(), embd, ggml_nbytes(t_embd));

        // remember the sequence ids used during the encoding - needed for cross attention later
+        // TODO: the sequence indexing here is likely not correct in the general case
+        //       probably works only for split_simple
        cross.seq_ids_enc.resize(n_tokens);
-        for (uint32_t i = 0; i < n_tokens; i++) {
+        for (int32_t i = 0; i < n_tokens; i++) {
            cross.seq_ids_enc[i].clear();
-            for (int s = 0; s < batch.n_seq_id[i]; s++) {
-                llama_seq_id seq_id = batch.seq_id[i][s];
+            for (int s = 0; s < ubatch.n_seq_id[i]; s++) {
+                llama_seq_id seq_id = ubatch.seq_id[i][s];
                cross.seq_ids_enc[i].insert(seq_id);
            }
        }
@@ -884,44 +893,68 @@ int llama_context::encode(const llama_batch & batch_inp) {
    return 0;
 }

-int llama_context::decode(const llama_batch & batch_inp) {
+int llama_context::decode(llama_batch & inp_batch) {
    if (!memory) {
        LLAMA_LOG_DEBUG("%s: cannot decode batches with this context (calling encode() instead)\n", __func__);
-        return encode(batch_inp);
+        return encode(inp_batch);
    }

-    if (batch_inp.n_tokens == 0) {
+    if (inp_batch.n_tokens == 0) {
        LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
        return -1;
    }

-    // temporary allocate memory for the input batch if needed
-    if (!batch_allocr->init(batch_inp, model.vocab, batch_inp.pos ? -1 : memory->seq_pos_max(0) + 1)) {
-        LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
-        return -1;
+    if (!inp_batch.pos) {
+        if (inp_batch.seq_id) {
+            LLAMA_LOG_ERROR("%s: pos == NULL, but seq_id != NULL\n", __func__);
+            return -1;
+        }
    }

-    const llama_batch & batch = batch_allocr->get_batch();
+    // temporary allocate memory for the input batch if needed
+    llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : memory->seq_pos_max(0) + 1);
+
+    const llama_batch & batch = batch_allocr.batch;

    const auto & vocab   = model.vocab;
    const auto & hparams = model.hparams;

    const int32_t n_vocab = vocab.n_tokens();
-    const int64_t n_embd  = hparams.n_embd;

-    const uint32_t n_tokens_all = batch.n_tokens;
+    const int64_t n_tokens_all = batch.n_tokens;
+    const int64_t n_embd       = hparams.n_embd;

    GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT

+    // TODO: move the validation to the llama_batch_allocr
+    if (batch.token) {
+        for (int64_t i = 0; i < n_tokens_all; ++i) {
+            if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) {
+                LLAMA_LOG_ERROR("%s: invalid token[%" PRId64 "] = %d\n", __func__, i, batch.token[i]);
+                return -1;
+            }
+
+            if (batch.seq_id && (batch.seq_id[i][0] < 0 || batch.seq_id[i][0] >= LLAMA_MAX_PARALLEL_SEQUENCES)) {
+                LLAMA_LOG_ERROR("%s: invalid seq_id[%" PRId64 "] = %d >= %d\n", __func__, i, batch.seq_id[i][0], LLAMA_MAX_PARALLEL_SEQUENCES);
+                return -1;
+            }
+        }
+    }
+
    // this indicates we are doing pooled embedding
    const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;

-    const uint32_t n_outputs_all = batch_allocr->get_n_outputs();
+    int64_t n_outputs_all = 0;
+
+    // count outputs
+    for (uint32_t i = 0; i < n_tokens_all; ++i) {
+        n_outputs_all += batch.logits[i] != 0;
+    }

    if (embd_pooled) {
        // require that all tokens are output
        if (n_outputs_all != n_tokens_all) {
-            LLAMA_LOG_ERROR("%s: pooled embedding requires that all tokens are output (n_outputs_all = %d, n_tokens_all = %d)\n",
+            LLAMA_LOG_ERROR("%s: pooled embedding requires that all tokens are output (n_outputs_all = %" PRId64 ", n_tokens_all = %" PRId64 ")\n",
                    __func__, n_outputs_all, n_tokens_all);
            return -1;
        }
@@ -991,7 +1024,7 @@ int llama_context::decode(const llama_batch & batch_inp) {

    // reserve output buffer
    if (output_reserve(n_outputs_all) < n_outputs_all) {
-        LLAMA_LOG_ERROR("%s: could not reserve space for batch with %d outputs\n", __func__, n_outputs_all);
+        LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all);
        return -2;
    };

@@ -1030,7 +1063,6 @@ int llama_context::decode(const llama_batch & batch_inp) {
                pos_min[s] = std::numeric_limits<llama_pos>::max();
            }

-            // TODO: fix sequence indexing
            for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
                const auto & seq_id = ubatch.seq_id[i][0];

@@ -1144,14 +1176,14 @@ int llama_context::decode(const llama_batch & batch_inp) {
    n_outputs = n_outputs_all;

    // set output mappings
-    if (n_outputs > 0) {
+    {
        bool sorted_output = true;

        auto & out_ids = mstate->out_ids();

-        GGML_ASSERT(out_ids.size() == (size_t) n_outputs);
+        GGML_ASSERT(out_ids.size() == (size_t) n_outputs_all);

-        for (int64_t i = 0; i < n_outputs; ++i) {
+        for (int64_t i = 0; i < n_outputs_all; ++i) {
            int64_t out_id = out_ids[i];
            output_ids[out_id] = i;
            if (out_id != i) {
@@ -1163,22 +1195,20 @@ int llama_context::decode(const llama_batch & batch_inp) {
        // note: this is mostly relevant for recurrent models atm
        if (!sorted_output) {
            const uint32_t n_vocab = model.vocab.n_tokens();
-            const uint64_t n_embd  = model.hparams.n_embd;
+            const uint32_t n_embd  = model.hparams.n_embd;

            GGML_ASSERT((size_t) n_outputs == out_ids.size());

            // TODO: is there something more efficient which also minimizes swaps?
            // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort)
-            for (uint32_t i = 0; i < n_outputs - 1; ++i) {
-                uint32_t j_min = i;
-                for (uint32_t j = i + 1; j < n_outputs; ++j) {
+            for (int32_t i = 0; i < n_outputs - 1; ++i) {
+                int32_t j_min = i;
+                for (int32_t j = i + 1; j < n_outputs; ++j) {
                    if (out_ids[j] < out_ids[j_min]) {
                        j_min = j;
                    }
                }
-                if (j_min == i) {
-                    continue;
-                }
+                if (j_min == i) { continue; }
                std::swap(out_ids[i], out_ids[j_min]);
                if (logits_size > 0) {
                    for (uint32_t k = 0; k < n_vocab; k++) {
@@ -1191,10 +1221,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
                    }
                }
            }
-
            std::fill(output_ids.begin(), output_ids.end(), -1);
-
-            for (uint32_t i = 0; i < n_outputs; ++i) {
+            for (int32_t i = 0; i < n_outputs; ++i) {
                output_ids[out_ids[i]] = i;
            }
        }
@@ -1214,7 +1242,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
 // output
 //

-uint32_t llama_context::output_reserve(int32_t n_outputs) {
+int32_t llama_context::output_reserve(int32_t n_outputs) {
    const auto & hparams = model.hparams;
    const auto & vocab   = model.vocab;

@@ -1280,7 +1308,8 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
    // set all ids as invalid (negative)
    std::fill(output_ids.begin(), output_ids.end(), -1);

-    this->n_outputs = 0;
+    this->n_outputs     = 0;
+    this->n_outputs_max = n_outputs_max;

    return n_outputs_max;
 }
@@ -1771,12 +1800,14 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {

        std::vector<int32_t> w_output_pos;

+        GGML_ASSERT(n_outputs <= n_outputs_max);
+
        w_output_pos.resize(n_outputs);

        // build a more compact representation of the output ids
        for (size_t i = 0; i < n_batch(); ++i) {
            // map an output id to a position in the batch
-            int64_t pos = output_ids[i];
+            int32_t pos = output_ids[i];
            if (pos >= 0) {
                GGML_ASSERT(pos < n_outputs);
                w_output_pos[pos] = i;
@@ -2051,7 +2082,7 @@ void llama_context::opt_epoch_iter(

        embd_seq.clear();

-        uint32_t n_outputs_all = n_tokens_all;
+        int64_t n_outputs_all = n_tokens_all;

        auto mstate = memory->init_batch(batch, cparams.n_ubatch, embd_pooled);
        if (!mstate || mstate->get_status() != LLAMA_MEMORY_STATUS_SUCCESS) {
@@ -2061,7 +2092,7 @@ void llama_context::opt_epoch_iter(

        // reserve output buffer
        if (output_reserve(n_outputs_all) < n_outputs_all) {
-            LLAMA_LOG_ERROR("%s: could not reserve space for batch with %d outputs\n", __func__, n_outputs_all);
+            LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all);
            GGML_ABORT("TODO: handle this error");
        };

--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -1,6 +1,7 @@
 #pragma once

 #include "llama.h"
+#include "llama-batch.h"
 #include "llama-cparams.h"
 #include "llama-graph.h"
 #include "llama-adapter.h"
@@ -12,7 +13,6 @@
 #include <vector>

 struct llama_model;
-class llama_batch_allocr;

 class llama_io_read_i;
 class llama_io_write_i;
@@ -102,8 +102,8 @@ struct llama_context {
            llama_memory_state_i * mstate,
                     ggml_status & ret);

-    int encode(const llama_batch & batch_inp);
-    int decode(const llama_batch & batch_inp);
+    int encode(llama_batch & inp_batch);
+    int decode(llama_batch & inp_batch);

    //
    // state save/load
@@ -181,7 +181,7 @@ private:

    // Make sure enough space is available for outputs.
    // Returns max number of outputs for which space was reserved.
-    uint32_t output_reserve(int32_t n_outputs);
+    int32_t output_reserve(int32_t n_outputs);

    //
    // graph
@@ -246,10 +246,8 @@ private:
    // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
    std::map<llama_seq_id, std::vector<float>> embd_seq;

-    // reuse the batch_allocr to avoid unnecessary memory allocations
-    std::unique_ptr<llama_batch_allocr> batch_allocr;
-
-    uint32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch
+    int32_t n_outputs     = 0; // number of actually-used outputs in the current ubatch or last logical batch
+    int32_t n_outputs_max = 0; // capacity (of tokens positions) for the output buffers

    std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers

--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -139,7 +139,6 @@ void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) {

        std::vector<uint64_t> sum(n_tokens, 0);

-        // TODO: fix indexing [UBATCH_IDX]
        for (int s = 0; s < n_seqs; ++s) {
            const llama_seq_id seq_id = ubatch->seq_id[s][0];

@@ -157,7 +156,6 @@ void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) {
            }
        }

-        // TODO: fix indexing [UBATCH_IDX]
        for (int s = 0; s < n_seqs; ++s) {
            const llama_seq_id seq_id = ubatch->seq_id[s][0];

@@ -182,7 +180,6 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
        uint32_t * data = (uint32_t *) cls->data;
        memset(cls->data, 0, n_tokens * ggml_element_size(cls));

-        // TODO: fix indexing [UBATCH_IDX]
        for (int s = 0; s < n_seqs; ++s) {
            const llama_seq_id seq_id = ubatch->seq_id[s][0];

@@ -213,7 +210,6 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
        std::vector<int> last_pos(n_tokens, -1);
        std::vector<int> last_row(n_tokens, -1);

-        // TODO: fix indexing [UBATCH_IDX]
        for (int s = 0; s < n_seqs; ++s) {
            const llama_seq_id seq_id = ubatch->seq_id[s][0];

@@ -287,7 +283,6 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
                                const int32_t ti = s0*n_seq_tokens + i;
                                float f = -INFINITY;

-                                // TODO: fix indexing [UBATCH_IDX]
                                for (int s = 0; s < ubatch->n_seq_id[s0]; ++s) {
                                    if (ubatch->seq_id[s0][s] == seq_id && ubatch->pos[ti] <= ubatch->pos[tj]) {
                                        if (hparams.use_alibi) {
@@ -327,7 +322,6 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
                                const int32_t ti = s0*n_seq_tokens + i;
                                float f = -INFINITY;

-                                // TODO: fix indexing [UBATCH_IDX]
                                for (int s = 0; s < ubatch->n_seq_id[s0]; ++s) {
                                    if (ubatch->seq_id[s0][s] == seq_id) {
                                        if (hparams.use_alibi) {
@@ -383,7 +377,6 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
            for (int j = 0; j < n_tokens; ++j) {
                for (int i = 0; i < n_enc; ++i) {
                    float f = -INFINITY;
-                    // TODO: fix indexing [UBATCH_IDX]
                    for (int s = 0; s < ubatch->n_seq_id[j]; ++s) {
                        const llama_seq_id seq_id = ubatch->seq_id[j][s];
                        if (cross->seq_ids_enc[i].find(seq_id) != cross->seq_ids_enc[i].end()) {
@@ -1563,30 +1556,23 @@ void llm_graph_context::build_pooling(
                ggml_tensor * inp_cls = build_inp_cls();
                inp = ggml_get_rows(ctx0, inp, inp_cls);

-                if (cls) {
+                if (cls != nullptr && cls_b != nullptr) {
                    // classification head
                    // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
-                    cur = ggml_mul_mat(ctx0, cls, inp);
-                    if (cls_b) {
-                        cur = ggml_add(ctx0, cur, cls_b);
-                    }
+                    cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls, inp), cls_b);
                    cur = ggml_tanh(ctx0, cur);

                    // some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
                    // https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
                    if (cls_out) {
-                        cur = ggml_mul_mat(ctx0, cls_out, cur);
-                        if (cls_out_b) {
-                            cur = ggml_add(ctx0, cur, cls_out_b);
-                        }
+                        GGML_ASSERT(cls_out_b != nullptr);
+                        cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls_out, cur), cls_out_b);
                    }
                } else if (cls_out) {
                    // Single layer classification head (direct projection)
                    // https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476
-                    cur = ggml_mul_mat(ctx0, cls_out, inp);
-                    if (cls_out_b) {
-                        cur = ggml_add(ctx0, cur, cls_out_b);
-                    }
+                    GGML_ASSERT(cls_out_b != nullptr);
+                    cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls_out, inp), cls_out_b);
                } else {
                    GGML_ABORT("RANK pooling requires either cls+cls_b or cls_out+cls_out_b");
                }
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -378,7 +378,7 @@ struct llm_graph_params {
    const llama_memory_state_i * mstate;
    const llama_cross          * cross;

-    uint32_t n_outputs;
+    int32_t n_outputs;

    const llm_graph_cb & cb;
 };
@@ -412,8 +412,8 @@ struct llm_graph_context {
    const float norm_eps;
    const float norm_rms_eps;

-    const int64_t n_tokens;
-    const int64_t n_outputs;
+    const int32_t n_tokens;
+    const int32_t n_outputs;
    const int32_t n_ctx_orig; // yarn

    const enum llama_pooling_type pooling_type;
--- a/src/llama-kv-cache-unified.cpp
+++ b/src/llama-kv-cache-unified.cpp
@@ -674,7 +674,6 @@ void llama_kv_cache_unified::apply_ubatch(uint32_t head_cur, const llama_ubatch

            cells.pos_set(head_cur + idx, ubatch.pos[idx]);

-            // TODO: fix indexing [UBATCH_IDX]
            for (int32_t i = 0; i < ubatch.n_seq_id[s]; i++) {
                cells.seq_add(head_cur + idx, ubatch.seq_id[s][i]);
            }
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -9,16 +9,16 @@

 #include <algorithm>
 #include <cassert>
-#include <cctype>
 #include <cfloat>
+#include <climits>
 #include <cstdarg>
 #include <cstring>
 #include <forward_list>
-#include <limits>
 #include <map>
 #include <queue>
 #include <set>
 #include <unordered_map>
+#include <cctype>

 //
 // helpers
@@ -2572,10 +2572,6 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t
    // copy piece chars to output text buffer
    // skip up to 'lstrip' leading spaces before copying
    auto _try_copy = [=] (const char * token, size_t size) -> int32_t {
-        if (size >= static_cast<size_t>(std::numeric_limits<int32_t>::max())) {
-            GGML_ABORT("invalid token size: %zu exceeds int32_t limit", size);
-        }
-
        for (int32_t i = 0; i < lstrip && size && *token == ' '; ++i) {
            token++;
            size--;
@@ -2772,26 +2768,26 @@ void llama_vocab::impl::print_info() const {
    LLAMA_LOG_INFO("%s: n_merges         = %u\n",     __func__, (uint32_t) bpe_ranks.size());

    // special tokens
-    if (special_bos_id  != LLAMA_TOKEN_NULL)    { LLAMA_LOG_INFO( "%s: BOS token        = %d '%s'\n", __func__, special_bos_id,     id_to_token.at(special_bos_id).text.c_str() );  }
-    if (special_eos_id  != LLAMA_TOKEN_NULL)    { LLAMA_LOG_INFO( "%s: EOS token        = %d '%s'\n", __func__, special_eos_id,     id_to_token.at(special_eos_id).text.c_str() );  }
-    if (special_eot_id  != LLAMA_TOKEN_NULL)    { LLAMA_LOG_INFO( "%s: EOT token        = %d '%s'\n", __func__, special_eot_id,     id_to_token.at(special_eot_id).text.c_str() );  }
-    if (special_eom_id  != LLAMA_TOKEN_NULL)    { LLAMA_LOG_INFO( "%s: EOM token        = %d '%s'\n", __func__, special_eom_id,     id_to_token.at(special_eom_id).text.c_str() );  }
-    if (special_unk_id  != LLAMA_TOKEN_NULL)    { LLAMA_LOG_INFO( "%s: UNK token        = %d '%s'\n", __func__, special_unk_id,     id_to_token.at(special_unk_id).text.c_str() );  }
-    if (special_sep_id  != LLAMA_TOKEN_NULL)    { LLAMA_LOG_INFO( "%s: SEP token        = %d '%s'\n", __func__, special_sep_id,     id_to_token.at(special_sep_id).text.c_str() );  }
-    if (special_pad_id  != LLAMA_TOKEN_NULL)    { LLAMA_LOG_INFO( "%s: PAD token        = %d '%s'\n", __func__, special_pad_id,     id_to_token.at(special_pad_id).text.c_str() );  }
-    if (special_mask_id != LLAMA_TOKEN_NULL)    { LLAMA_LOG_INFO( "%s: MASK token       = %d '%s'\n", __func__, special_mask_id,    id_to_token.at(special_mask_id).text.c_str() ); }
+    if (special_bos_id  != LLAMA_TOKEN_NULL)    { LLAMA_LOG_INFO( "%s: BOS token        = %d '%s'\n", __func__, special_bos_id,     id_to_token[special_bos_id].text.c_str() );  }
+    if (special_eos_id  != LLAMA_TOKEN_NULL)    { LLAMA_LOG_INFO( "%s: EOS token        = %d '%s'\n", __func__, special_eos_id,     id_to_token[special_eos_id].text.c_str() );  }
+    if (special_eot_id  != LLAMA_TOKEN_NULL)    { LLAMA_LOG_INFO( "%s: EOT token        = %d '%s'\n", __func__, special_eot_id,     id_to_token[special_eot_id].text.c_str() );  }
+    if (special_eom_id  != LLAMA_TOKEN_NULL)    { LLAMA_LOG_INFO( "%s: EOM token        = %d '%s'\n", __func__, special_eom_id,     id_to_token[special_eom_id].text.c_str() );  }
+    if (special_unk_id  != LLAMA_TOKEN_NULL)    { LLAMA_LOG_INFO( "%s: UNK token        = %d '%s'\n", __func__, special_unk_id,     id_to_token[special_unk_id].text.c_str() );  }
+    if (special_sep_id  != LLAMA_TOKEN_NULL)    { LLAMA_LOG_INFO( "%s: SEP token        = %d '%s'\n", __func__, special_sep_id,     id_to_token[special_sep_id].text.c_str() );  }
+    if (special_pad_id  != LLAMA_TOKEN_NULL)    { LLAMA_LOG_INFO( "%s: PAD token        = %d '%s'\n", __func__, special_pad_id,     id_to_token[special_pad_id].text.c_str() );  }
+    if (special_mask_id != LLAMA_TOKEN_NULL)    { LLAMA_LOG_INFO( "%s: MASK token       = %d '%s'\n", __func__, special_mask_id,    id_to_token[special_mask_id].text.c_str() ); }

-    if (linefeed_id != LLAMA_TOKEN_NULL)        { LLAMA_LOG_INFO( "%s: LF token         = %d '%s'\n", __func__, linefeed_id,        id_to_token.at(linefeed_id).text.c_str() ); }
+    if (linefeed_id != LLAMA_TOKEN_NULL)        { LLAMA_LOG_INFO( "%s: LF token         = %d '%s'\n", __func__, linefeed_id,        id_to_token[linefeed_id].text.c_str() ); }

-    if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token    = %d '%s'\n", __func__, special_fim_pre_id, id_to_token.at(special_fim_pre_id).text.c_str() ); }
-    if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token    = %d '%s'\n", __func__, special_fim_suf_id, id_to_token.at(special_fim_suf_id).text.c_str() ); }
-    if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token    = %d '%s'\n", __func__, special_fim_mid_id, id_to_token.at(special_fim_mid_id).text.c_str() ); }
-    if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token    = %d '%s'\n", __func__, special_fim_pad_id, id_to_token.at(special_fim_pad_id).text.c_str() ); }
-    if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token    = %d '%s'\n", __func__, special_fim_rep_id, id_to_token.at(special_fim_rep_id).text.c_str() ); }
-    if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token    = %d '%s'\n", __func__, special_fim_sep_id, id_to_token.at(special_fim_sep_id).text.c_str() ); }
+    if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token    = %d '%s'\n", __func__, special_fim_pre_id, id_to_token[special_fim_pre_id].text.c_str() ); }
+    if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token    = %d '%s'\n", __func__, special_fim_suf_id, id_to_token[special_fim_suf_id].text.c_str() ); }
+    if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token    = %d '%s'\n", __func__, special_fim_mid_id, id_to_token[special_fim_mid_id].text.c_str() ); }
+    if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token    = %d '%s'\n", __func__, special_fim_pad_id, id_to_token[special_fim_pad_id].text.c_str() ); }
+    if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token    = %d '%s'\n", __func__, special_fim_rep_id, id_to_token[special_fim_rep_id].text.c_str() ); }
+    if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token    = %d '%s'\n", __func__, special_fim_sep_id, id_to_token[special_fim_sep_id].text.c_str() ); }

    for (const auto & id : special_eog_ids) {
-        LLAMA_LOG_INFO( "%s: EOG token        = %d '%s'\n", __func__, id, id_to_token.at(id).text.c_str() );
+        LLAMA_LOG_INFO( "%s: EOG token        = %d '%s'\n", __func__, id, id_to_token[id].text.c_str() );
    }

    LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, max_token_len);
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -2017,6 +2017,11 @@ struct server_context {
                params_base.n_cache_reuse = 0;
                SRV_WRN("%s\n", "cache_reuse is not supported by this context, it will be disabled");
            }
+
+            if (!params_base.speculative.model.path.empty()) {
+                SRV_ERR("%s\n", "err: speculative decode is not supported by this context");
+                return false;
+            }
        }

        return true;
@@ -3217,7 +3222,7 @@ struct server_context {
                                }

                                const auto n_swa = llama_model_n_swa(model);
-                                if (pos_min > std::max(0, slot.n_past - n_swa)) {
+                                if (pos_min > slot.n_past - n_swa) {
                                    SLT_WRN(slot, "n_past = %d, cache_tokens.size() = %d, seq_id = %d, pos_min = %d, n_swa = %d\n", slot.n_past, (int) slot.cache_tokens.size(), slot.id, pos_min, n_swa);
                                    SLT_WRN(slot, "forcing full prompt re-processing due to lack of cache data (likely due to SWA, see %s)\n",
                                            "https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055");
Author	SHA1	Message	Date
Georgi Gerganov	ed99a8ea04	cont : fix comments	2025-06-12 10:43:55 +03:00
Georgi Gerganov	b8b8d3f368	context : simplify output counting logic during decode ggml-ci	2025-06-12 10:35:09 +03:00
Georgi Gerganov	c53acda0b8	batch : remove logits_all flag ggml-ci	2025-06-12 10:10:45 +03:00