mirror of https://github.com/ggerganov/llama.cpp.git, synced 2026-03-12 14:43:22 +02:00

Compare commits: b8167...compilade/ (32 commits)
SHA1:
942c55cd57 183eeb5518 50f53b3e40 42423ec4d3 0ee322cd0f e33de128c7
118d52fefc 0e79355075 43cd2b3eb5 1a9454a3d2 ba6f6be6ce 2c0945027a
1d19025909 635f945ed1 a5165a6ca9 16202d6f96 1be357d990 db502ddd0e
c7a32e761d 2d79a7077c 8c13e16bb0 2217247051 efa9186dc8 894ed8d7b6
9e6b0e9419 503630e88a d19101c9a0 3ad0603c65 c8ab6a3ba3 3de9300c37
347247a24e bce54642c8
@@ -448,6 +448,15 @@ void string_replace_all(std::string & s, const std::string & search, const std::
 bool string_ends_with(const std::string_view & str, const std::string_view & suffix) {
     return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
 }
 
+bool string_remove_suffix(std::string & str, const std::string_view & suffix) {
+    bool has_suffix = string_ends_with(str, suffix);
+    if (has_suffix) {
+        str = str.substr(0, str.size() - suffix.size());
+    }
+    return has_suffix;
+}
+
 size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop) {
     if (!str.empty() && !stop.empty()) {
         const char text_last_char = str.back();
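The new `string_remove_suffix` helper both tests for and strips a suffix in one call, returning whether anything was removed. A minimal, self-contained usage sketch (the tensor name here is illustrative, not from the patch):

```cpp
#include <cassert>
#include <string>
#include <string_view>

// same logic as the helper added above
static bool string_remove_suffix(std::string & str, const std::string_view & suffix) {
    const bool has_suffix = str.size() >= suffix.size() &&
        str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
    if (has_suffix) {
        str = str.substr(0, str.size() - suffix.size());
    }
    return has_suffix;
}

int main() {
    std::string name = "blk.0.ffn_down.weight.in_sum2";
    assert(string_remove_suffix(name, ".in_sum2"));  // strips in place, reports presence
    assert(name == "blk.0.ffn_down.weight");
    assert(!string_remove_suffix(name, ".in_sum2")); // second call is a no-op
    return 0;
}
```

This is how the imatrix loaders further down split tensor names like `*.in_sum2` and `*.counts` back into their base entry names.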
@@ -522,6 +522,7 @@ static bool string_starts_with(const std::string & str,
 
 // While we wait for C++20's std::string::ends_with...
 bool string_ends_with(const std::string_view & str, const std::string_view & suffix);
+bool string_remove_suffix(std::string & str, const std::string_view & suffix);
 size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop);
 
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
@@ -233,6 +233,11 @@ class Keys:
         TYPE       = "adapter.type"
         LORA_ALPHA = "adapter.lora.alpha"
 
+    class IMatrix:
+        CHUNK_COUNT = "imatrix.chunk_count"
+        CHUNK_SIZE  = "imatrix.chunk_size"
+        DATASETS    = "imatrix.datasets"
+
     class Clip:
         PROJECTOR_TYPE     = "clip.projector_type"
         HAS_VISION_ENCODER = "clip.has_vision_encoder"
@@ -282,6 +287,7 @@ class Keys:
 class GGUFType:
     MODEL   = "model"
     ADAPTER = "adapter"
+    IMATRIX = "imatrix"
     MMPROJ  = "mmproj"  # dummy, unused for now
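These Python-side keys mirror the C++ constants (`LLM_KV_IMATRIX_DATASETS`, `LLM_KV_IMATRIX_CHUNK_COUNT`, `LLM_KV_IMATRIX_CHUNK_SIZE`) declared in `imatrix.cpp` further down, and the new `GGUFType.IMATRIX` value matches the `general.type` string written by `save_imatrix`, so the gguf-py tooling can identify and read the same files.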
@@ -7,14 +7,15 @@ More information is available here: https://github.com/ggml-org/llama.cpp/pull/4
 
 ```
 ./llama-imatrix \
-    -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \
+    -m model.gguf -f some-text.txt [-o imatrix.gguf] [--process-output] \
     [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \
-    [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]
+    [--in-file imatrix-prev-0.gguf --in-file imatrix-prev-1.gguf ...] \
+    [--parse-special]
 ```
 
 Here `-m` with a model name and `-f` with a file containing training data (such as e.g. `wiki.train.raw`) are mandatory.
 The parameters in square brackets are optional and have the following meaning:
-* `-o` (or `--output-file`) specifies the name of the file where the computed data will be stored. If missing `imatrix.dat` is used.
+* `-o` (or `--output-file`) specifies the name of the file where the computed data will be stored. If missing `imatrix.gguf` is used.
 * `--verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`.
 * `--output-frequency` specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks)
 * `--save-frequency` specifies how often to save a copy of the imatrix in a separate file. Default is 0 (i.e., never)
@@ -25,9 +26,9 @@ For faster computation, make sure to use GPU offloading via the `-ngl` argument
 ## Example
 
 ```bash
-# generate importance matrix (imatrix.dat)
+# generate importance matrix (imatrix.gguf)
 ./llama-imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
 
 # use the imatrix to perform a Q4_K_M quantization
-./llama-quantize --imatrix imatrix.dat ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m
+./llama-quantize --imatrix imatrix.gguf ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m
 ```
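A side effect worth noting: since `load_imatrix` in the code below falls back to the legacy reader when a file is not GGUF, and `save_imatrix` writes GGUF whenever the output name ends in `.gguf`, an old `imatrix.dat` can presumably be converted with something like `./llama-imatrix --in-file imatrix.dat -o imatrix.gguf` (no prompt, so the tool only combines and re-saves; whether `-m` is still required depends on argument validation). This is an inference from the code, not a workflow documented in the patch.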
@@ -2,7 +2,9 @@
 #include "common.h"
 #include "log.h"
 #include "llama.h"
+#include "gguf.h"
 
+#include <algorithm>
 #include <chrono>
 #include <cmath>
 #include <cstdio>
@@ -13,7 +15,7 @@
 #include <vector>
 #include <fstream>
 #include <unordered_map>
-#include <algorithm>
+#include <map>
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -22,17 +24,20 @@
 static void print_usage(int, char ** argv) {
     LOG("\nexample usage:\n");
     LOG("\n    %s \\\n"
-        "       -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] \\\n"
+        "       -m model.gguf -f some-text.txt [-o imatrix.gguf] [--process-output] \\\n"
         "       [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
-        "       [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...] \\\n"
+        "       [--in-file imatrix-prev-0.gguf --in-file imatrix-prev-1.gguf ...] \\\n"
         "       [--parse-special]\n" , argv[0]);
     LOG("\n");
 }
 
+static const char * const LLM_KV_IMATRIX_DATASETS    = "imatrix.datasets";
+static const char * const LLM_KV_IMATRIX_CHUNK_COUNT = "imatrix.chunk_count";
+static const char * const LLM_KV_IMATRIX_CHUNK_SIZE  = "imatrix.chunk_size";
+
 struct Stats {
-    std::vector<float> values;
-    std::vector<int>   counts;
-    int ncall = 0;
+    std::vector<float>   values;
+    std::vector<int64_t> counts;
 };
 
 class IMatrixCollector {
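After this change, `Stats::values` holds the running sums of squared activations for all sub-matrices of a tensor laid out contiguously (`ne0 * n_mat` floats), while `Stats::counts` holds one token count per sub-matrix (one per expert for MoE tensors) instead of one count per element, and the per-call `ncall` counter disappears. A small standalone sketch of the indexing, with illustrative sizes:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// mirrors the new Stats layout: values is ne0 x n_mat, counts is per-matrix
struct Stats {
    std::vector<float>   values;
    std::vector<int64_t> counts;
};

int main() {
    const int64_t ne0   = 4;  // columns of the weight (illustrative)
    const int64_t n_mat = 2;  // e.g. two experts in a merged MoE tensor

    Stats e;
    e.values.resize(ne0 * n_mat, 0.0f);
    e.counts.resize(n_mat, 0);

    // accumulate one activation row x for expert `ex`
    const float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    const int64_t ex = 1;
    e.counts[ex]++;                      // one more token seen by this expert
    for (int64_t j = 0; j < ne0; ++j) {
        e.values[ex*ne0 + j] += x[j] * x[j];
    }

    printf("expert 1 saw %lld token(s)\n", (long long) e.counts[ex]);
    return 0;
}
```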
@@ -40,13 +45,16 @@ public:
     IMatrixCollector() = default;
     void set_params(common_params params) { m_params = std::move(params); }
     bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
-    void save_imatrix(int ncall = -1) const;
-    bool load_imatrix(const char * fname);
+    void save_imatrix_legacy(int32_t ncall = -1) const;
+    void save_imatrix(int32_t n_chunk = -1) const;
+    bool load_imatrix_legacy(const char * fname);
+    bool load_imatrix(const char * file_name);
 private:
     std::unordered_map<std::string, Stats> m_stats;
     common_params                          m_params;
     std::mutex                             m_mutex;
-    int                                    m_last_call = 0;
+    std::vector<std::string>               m_datasets;
+    int32_t                                m_last_chunk = 0;
     std::vector<char>                      m_src1_data;
     std::vector<char>                      m_ids; // the expert ids from ggml_mul_mat_id
 };
@@ -77,6 +85,8 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
     const struct ggml_tensor * src1 = t->src[1];
     std::string wname = filter_tensor_name(src0->name);
 
+    const int32_t chunk_size = m_params.n_ctx / m_params.n_parallel;
+
     // when ask is true, the scheduler wants to know if we are interested in data from this tensor
     // if we return true, a follow-up call will be made with ask=false in which we can do the actual collection
     if (ask) {
@@ -102,14 +112,21 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
     const char * data = is_host ? (const char *) src1->data : m_src1_data.data();
     GGML_ASSERT(src1->nb[0] == ggml_element_size(src1));
 
+    // TODO: 4d? (is that even used in practice?)
+    // the extra dimension would need to be stored somewhere to be reflected in the imatrix file
+    if (ggml_nrows(src1) != src1->ne[1] * src1->ne[2]) {
+        LOG_ERR("%s: tensor has more than 3 dimensions: %s", __func__, wname.c_str());
+        GGML_ASSERT(false);
+    }
+
     // this has been adapted to the new format of storing merged experts in a single 3d tensor
     // ref: https://github.com/ggml-org/llama.cpp/pull/6387
     if (t->op == GGML_OP_MUL_MAT_ID) {
         // ids  -> [n_experts_used, n_tokens]
         // src1 -> [cols, n_expert_used, n_tokens]
         const ggml_tensor * ids = t->src[2];
-        const int n_as = src0->ne[2];
-        const int n_ids = ids->ne[0];
+        const int64_t n_as  = src0->ne[2];
+        const int64_t n_ids = ids->ne[0];
 
         // the top-k selected expert ids are stored in the ids tensor
         // for simplicity, always copy ids to host, because it is small
@@ -122,23 +139,29 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
 
         auto & e = m_stats[wname];
 
-        ++e.ncall;
-
+        if (e.counts.size() == 1 && n_as > 1) {
+            // broadcast, when loading an old imatrix
+            e.counts.resize(n_as, e.counts[0]);
+        }
         if (e.values.empty()) {
             e.values.resize(src1->ne[0]*n_as, 0);
-            e.counts.resize(src1->ne[0]*n_as, 0);
+            e.counts.resize(n_as, 0);
         }
         else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
-            LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
+            LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)(src1->ne[0]*n_as));
+            exit(1); //GGML_ABORT("fatal error");
+        }
+        else if (e.counts.size() != (size_t)n_as) {
+            LOG_ERR("%s: inconsistent expert count for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.counts.size(), (int)n_as);
             exit(1); //GGML_ABORT("fatal error");
         }
-        LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
+        LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_chunk, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
         // loop over all possible experts, regardless if they are used or not in the batch
-        for (int ex = 0; ex < n_as; ++ex) {
+        for (int64_t ex = 0; ex < n_as; ++ex) {
             size_t e_start = ex*src1->ne[0];
 
-            for (int idx = 0; idx < n_ids; ++idx) {
-                for (int row = 0; row < (int)src1->ne[2]; ++row) {
+            for (int64_t idx = 0; idx < n_ids; ++idx) {
+                for (int64_t row = 0; row < src1->ne[2]; ++row) {
                     const int excur = *(const int32_t *) (m_ids.data() + row*ids->nb[1] + idx*ids->nb[0]);
 
                     GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check
@@ -149,57 +172,73 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
                     const int64_t i12 = row;
                     const float * x = (const float *)(data + i11*src1->nb[1] + i12*src1->nb[2]);
 
-                    for (int j = 0; j < (int)src1->ne[0]; ++j) {
-                        e.values[e_start + j] += x[j]*x[j];
-                        e.counts[e_start + j]++;
-                        if (!std::isfinite(e.values[e_start + j])) {
-                            LOG("\n");
-                            LOG_ERR("%f detected in %s\n", e.values[e_start + j], wname.c_str());
+                    e.counts[ex]++;
+
+                    for (int64_t j = 0; j < src1->ne[0]; ++j) {
+                        e.values[e_start + j] += x[j] * x[j];
+                        if (!std::isfinite((float)e.values[e_start + j])) {
+                            LOG_ERR("%f detected in %s\n", (float)e.values[e_start + j], wname.c_str());
                             exit(1);
                         }
                     }
                 }
             }
-            if (e.ncall > m_last_call) {
-                m_last_call = e.ncall;
-                if (m_last_call % m_params.n_out_freq == 0) {
+            const int32_t n_chunk = e.counts[ex] / chunk_size;
+            if (n_chunk > m_last_chunk) {
+                const int32_t chunk_step = n_chunk - m_last_chunk;
+                m_last_chunk = n_chunk;
+                if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) {
                     save_imatrix();
                 }
-                if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) {
-                    save_imatrix(m_last_call);
+                if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) {
+                    save_imatrix(m_last_chunk);
                 }
             }
         }
     } else {
         auto & e = m_stats[wname];
+        const int64_t n_mat = src1->ne[2] * src1->ne[3];
+
         if (e.values.empty()) {
-            e.values.resize(src1->ne[0], 0);
-            e.counts.resize(src1->ne[0], 0);
+            e.values.resize(src1->ne[0] * n_mat, 0);
+            e.counts.resize(n_mat, 0);
         }
-        else if (e.values.size() != (size_t)src1->ne[0]) {
-            LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
+        else if (e.values.size() != (size_t)(src1->ne[0] * n_mat)) {
+            LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)(src1->ne[0] * n_mat));
             exit(1); //GGML_ABORT("fatal error");
         }
-        ++e.ncall;
-        LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
-        for (int row = 0; row < (int)src1->ne[1]; ++row) {
-            const float * x = (const float *) (data + row * src1->nb[1]);
-            for (int j = 0; j < (int)src1->ne[0]; ++j) {
-                e.values[j] += x[j]*x[j];
-                e.counts[j]++;
-                if (!std::isfinite(e.values[j])) {
-                    LOG_ERR("%f detected in %s\n", e.values[j], wname.c_str());
-                    exit(1);
-                }
-            }
-        }
-        if (e.ncall > m_last_call) {
-            m_last_call = e.ncall;
-            if (m_last_call % m_params.n_out_freq == 0) {
-                save_imatrix();
-            }
-            if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) {
-                save_imatrix(m_last_call);
-            }
-        }
+        else if (e.counts.size() != (size_t)n_mat) {
+            LOG_ERR("%s: inconsistent expert count for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.counts.size(), (int)n_mat);
+            exit(1); //GGML_ABORT("fatal error");
+        }
+        LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d x %5d, %d\n", __func__, m_last_chunk, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->ne[2], (int)src1->type);
+        for (int64_t i3 = 0; i3 < src1->ne[3]; ++i3) {
+            for (int64_t i2 = 0; i2 < src1->ne[2]; ++i2) {
+                const int64_t mat_id    = i3 * src1->ne[2] + i2;
+                const int64_t mat_start = mat_id * src1->ne[0];
+
+                for (int64_t row = 0; row < src1->ne[1]; ++row) {
+                    const float * x = (const float *) (data + row * src1->nb[1] + i2 * src1->nb[2] + i3 * src1->ne[3]);
+                    e.counts[mat_id]++;
+                    for (int64_t j = 0; j < src1->ne[0]; ++j) {
+                        e.values[mat_start + j] += x[j] * x[j];
+                        if (!std::isfinite((float)e.values[j])) {
+                            LOG_ERR("%f detected in %s\n", (float)e.values[j], wname.c_str());
+                            exit(1);
+                        }
+                    }
+                }
+                const int32_t n_chunk = e.counts[mat_id] / chunk_size;
+                if (n_chunk > m_last_chunk) {
+                    const int32_t chunk_step = n_chunk - m_last_chunk;
+                    m_last_chunk = n_chunk;
+                    if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) {
+                        save_imatrix();
+                    }
+                    if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) {
+                        save_imatrix(m_last_chunk);
+                    }
+                }
+            }
+        }
     }
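The save triggers now fire on chunk boundaries rather than call counts. Since several chunks can complete inside one callback, `chunk_step` can exceed 1, and `(m_last_chunk % n_out_freq) / chunk_step == 0` is true exactly when a multiple of `n_out_freq` was crossed within the current step. A small standalone check of that condition (values illustrative):

```cpp
#include <cstdio>
#include <initializer_list>

// same test as (m_last_chunk % n_out_freq) / chunk_step == 0 above:
// true when a multiple of `freq` lies within the last `step` chunks
static bool crossed(int last_chunk, int step, int freq) {
    return (last_chunk % freq) / step == 0;
}

int main() {
    const int freq = 10; // --output-frequency
    int last = 0;
    // chunks complete in uneven steps (e.g. several sequences per batch)
    for (int step : {3, 3, 3, 3}) {
        last += step;
        printf("chunk %2d: %s\n", last, crossed(last, step, freq) ? "save" : "-");
    }
    // chunks 3, 6, 9 do not save; chunk 12 saves because 10 was crossed
    return 0;
}
```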
@@ -207,7 +246,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
     return true;
 }
 
-void IMatrixCollector::save_imatrix(int ncall) const {
+void IMatrixCollector::save_imatrix_legacy(int32_t ncall) const {
     auto fname = m_params.out_file;
 
     if (ncall > 0) {
@@ -215,7 +254,7 @@ void IMatrixCollector::save_imatrix(int ncall) const {
         fname += std::to_string(ncall);
     }
 
-    // avoid writing imatrix entries that do not have full data
+    // warn when writing imatrix entries that do not have full data
     // this can happen with MoE models where some of the experts end up not being exercised by the provided training data
 
     int n_entries = 0;
@@ -247,8 +286,7 @@ void IMatrixCollector::save_imatrix(int ncall) const {
     }
 
     if (n_zeros > 0) {
-        LOG_WRN("%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
-        continue;
+        LOG_WRN("%s: entry '%40s' has partial data (%.2f%%)\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
     }
 
     n_entries++;
@@ -259,93 +297,378 @@ void IMatrixCollector::save_imatrix(int ncall) const {
         LOG_WRN("%s: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
     }
 
     // deterministic tensor name order
     std::sort(to_store.begin(), to_store.end());
 
+    const int32_t chunk_size = m_params.n_ctx / m_params.n_parallel;
+
     std::ofstream out(fname, std::ios::binary);
     out.write((const char *) &n_entries, sizeof(n_entries));
     for (const auto & name : to_store) {
         const auto & stat = m_stats.at(name);
-        int len = name.size();
+        const int32_t len = name.size();
         out.write((const char *) &len, sizeof(len));
         out.write(name.c_str(), len);
-        out.write((const char *) &stat.ncall, sizeof(stat.ncall));
-        int nval = stat.values.size();
+        // ceiling division to avoid accidental zeros
+        const int32_t ncall = (*std::max_element(stat.counts.begin(), stat.counts.end()) + (chunk_size - 1)) / chunk_size;
+        out.write((const char *) &ncall, sizeof(ncall));
+        const int32_t nval = stat.values.size();
+        const int32_t nmat = stat.counts.size();
         out.write((const char *) &nval, sizeof(nval));
-        if (nval > 0) {
+        if (nval > 0 && nmat > 0) {
             std::vector<float> tmp(nval);
-            for (int i = 0; i < nval; i++) {
-                tmp[i] = (stat.values[i] / static_cast<float>(stat.counts[i])) * static_cast<float>(stat.ncall);
+            for (int32_t i = 0; i < nval; i++) {
+                float count = static_cast<float>(stat.counts[i / (nval / nmat)]);
+                float value = stat.values[i];
+                if (count == 0.0f) {
+                    // store 1 for partial data
+                    value = 1.0f;
+                    count = 1.0f;
+                }
+                tmp[i] = (value / count) * static_cast<float>(ncall);
             }
-            out.write((const char*)tmp.data(), nval*sizeof(float));
+            out.write((const char *) tmp.data(), nval * sizeof(float));
         }
     }
 
     // Write the number of call the matrix was computed with
-    out.write((const char *) &m_last_call, sizeof(m_last_call));
+    out.write((const char *) &m_last_chunk, sizeof(m_last_chunk));
 
     // Write the input filename at the end of the file to later on specify it in quantize
     {
-        int len = m_params.prompt_file.size();
+        const char * dataset_file = m_params.prompt_file.c_str();
+        int32_t len = m_params.prompt_file.size();
+        // When there is no prompt but there were other imatrix files loaded, use the last dataset
+        if (m_params.prompt_file.empty() && !m_datasets.empty()) {
+            const std::string & dataset_str = m_datasets[m_datasets.size() - 1];
+            dataset_file = dataset_str.c_str();
+            len = dataset_str.size();
+        }
         out.write((const char *) &len, sizeof(len));
-        out.write(m_params.prompt_file.c_str(), len);
+        out.write(dataset_file, len);
     }
 
     LOGV(1, "\n");
-    LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
+    LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_chunk, fname.c_str());
 }
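In the legacy writer above, `counts` now holds token counts, so the stored value is `(sum_of_squares / tokens) * n_chunks`, which collapses to roughly `sum_of_squares / chunk_size`; the legacy loader further down undoes this by multiplying by the guessed chunk size. A minimal numeric check of that round trip (values illustrative, not from the patch):

```cpp
#include <cassert>
#include <cstdint>

int main() {
    const int32_t chunk_size = 512;          // n_ctx / n_parallel
    const int32_t n_chunks   = 10;
    const int64_t tokens     = (int64_t) n_chunks * chunk_size;
    const float   sum_sq     = 12345.0f;     // accumulated x[j]*x[j]

    const float stored = (sum_sq / tokens) * n_chunks;  // what save_imatrix_legacy writes
    const float loaded = stored * chunk_size;           // what load_imatrix_legacy recovers
    assert(loaded > 0.99f * sum_sq && loaded < 1.01f * sum_sq);
    return 0;
}
```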
-bool IMatrixCollector::load_imatrix(const char * fname) {
+void IMatrixCollector::save_imatrix(int32_t n_chunk) const {
+    auto fname = m_params.out_file;
+
+    // TODO: use the new format in more cases
+    if (!string_ends_with(fname, ".gguf")) {
+        LOG_WRN("\n%s: saving to legacy imatrix format because output suffix is not .gguf\n", __func__);
+        this->save_imatrix_legacy(n_chunk);
+        return;
+    }
+
+    if (n_chunk > 0) {
+        fname += ".at_";
+        fname += std::to_string(n_chunk);
+    }
+
+    // write imatrix entries even if they don't have full data. (can be corrected when reading)
+    // this can happen with MoE models where some of the experts end up not being exercised by the provided training data
+
+    std::vector<std::string> to_store;
+    size_t data_size = 0;
+
+    bool is_first = true; // for printing
+    for (const auto & kv : m_stats) {
+        const int n_all = kv.second.counts.size();
+
+        int n_zeros = 0;
+        for (const auto c : kv.second.counts) {
+            if (c == 0) {
+                n_zeros++;
+            }
+        }
+
+        if (n_zeros != 0 && is_first) {
+            LOG_INF("\n");
+            is_first = false;
+        }
+
+        if (n_zeros > 0) {
+            LOG_WRN("%s: entry '%40s' has partial data (%.2f%%)\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
+        }
+
+        to_store.push_back(kv.first);
+        data_size += GGML_PAD(ggml_tensor_overhead() + sizeof(float) * kv.second.values.size(), GGML_MEM_ALIGN);
+        data_size += GGML_PAD(ggml_tensor_overhead() + sizeof(float) * kv.second.counts.size(), GGML_MEM_ALIGN);
+    }
+
+    // deterministic tensor name order
+    std::sort(to_store.begin(), to_store.end());
+
+    struct ggml_init_params params = {
+        /* .mem_size   = */ data_size,
+        /* .mem_buffer = */ NULL,
+        /* .no_alloc   = */ false,
+    };
+    struct ggml_context * ctx = ggml_init(params);
+    struct gguf_context * ctx_gguf = gguf_init_empty();
+
+    {
+        std::vector<const char *> datasets;
+        datasets.reserve(m_datasets.size() + 1);
+        for (size_t i = 0; i < m_datasets.size(); ++i) {
+            datasets.push_back(m_datasets[i].c_str());
+        }
+        if (!m_params.prompt_file.empty()) {
+            datasets.push_back(m_params.prompt_file.c_str());
+        }
+
+        gguf_set_val_str(ctx_gguf, "general.type", "imatrix");
+        // Write the dataset paths
+        gguf_set_arr_str(ctx_gguf, LLM_KV_IMATRIX_DATASETS, datasets.data(), datasets.size());
+        // Write the number of chunks the matrix was computed with
+        gguf_set_val_u32(ctx_gguf, LLM_KV_IMATRIX_CHUNK_COUNT, m_last_chunk);
+        gguf_set_val_u32(ctx_gguf, LLM_KV_IMATRIX_CHUNK_SIZE, m_params.n_ctx / m_params.n_parallel);
+    }
+
+    for (const auto & name : to_store) {
+        const auto & stat = m_stats.at(name);
+        const int32_t nval = (int32_t) stat.values.size();
+        const int32_t nmat = (int32_t) stat.counts.size();
+        if (nval > 0 && nmat > 0) {
+            struct ggml_tensor * in_sum2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nval / nmat, nmat);
+            struct ggml_tensor * counts  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, nmat);
+            ggml_format_name(in_sum2, "%s.in_sum2", name.c_str());
+            ggml_format_name(counts,  "%s.counts",  name.c_str());
+
+            for (int32_t j = 0; j < nval; ++j) {
+                ((float *) in_sum2->data)[j] = (float) stat.values[j];
+            }
+            for (int32_t j = 0; j < nmat; ++j) {
+                ((float *) counts->data)[j] = (float) stat.counts[j];
+            }
+
+            gguf_add_tensor(ctx_gguf, in_sum2);
+            gguf_add_tensor(ctx_gguf, counts);
+        }
+    }
+
+    gguf_write_to_file(ctx_gguf, fname.c_str(), false);
+
+    LOGV(1, "\n");
+    LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_chunk, fname.c_str());
+
+    gguf_free(ctx_gguf);
+    ggml_free(ctx);
+}
+
+bool IMatrixCollector::load_imatrix_legacy(const char * fname) {
     std::ifstream in(fname, std::ios::binary);
     if (!in) {
-        LOG_ERR("%s: failed to open %s\n",__func__, fname);
+        LOG_ERR("%s: failed to open %s\n", __func__, fname);
         return false;
     }
     int n_entries;
-    in.read((char*)&n_entries, sizeof(n_entries));
+    in.read((char *) &n_entries, sizeof(n_entries));
     if (in.fail() || n_entries < 1) {
         LOG_ERR("%s: no data in file %s\n", __func__, fname);
         return false;
     }
+
+    // Guess the chunk size because it's not stored in the file
+    const int32_t chunk_size = m_params.n_ctx / m_params.n_parallel;
+
     for (int i = 0; i < n_entries; ++i) {
-        int len; in.read((char *)&len, sizeof(len));
-        std::vector<char> name_as_vec(len+1);
-        in.read((char *)name_as_vec.data(), len);
+        int32_t len = 0;
+        in.read((char *) &len, sizeof(len));
+        std::vector<char> name_as_vec(len + 1);
+        in.read((char *) name_as_vec.data(), len);
         if (in.fail()) {
-            LOG_ERR("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
+            LOG_ERR("%s: failed reading name for entry %d from %s\n", __func__, i + 1, fname);
             return false;
         }
         name_as_vec[len] = 0;
-        std::string name{name_as_vec.data()};
+        std::string name{ name_as_vec.data() };
         auto & e = m_stats[std::move(name)];
-        int ncall;
-        in.read((char*)&ncall, sizeof(ncall));
-        int nval;
-        in.read((char *)&nval, sizeof(nval));
+        int32_t ncall = 0;
+        in.read((char *) &ncall, sizeof(ncall));
+        int32_t nval = 0;
+        in.read((char *) &nval, sizeof(nval));
         if (in.fail() || nval < 1) {
-            LOG_ERR("%s: failed reading number of values for entry %d\n",__func__,i);
+            LOG_ERR("%s: failed reading number of values for entry %d\n", __func__, i);
             m_stats = {};
             return false;
         }
 
         if (e.values.empty()) {
-            e.values.resize(nval, 0);
-            e.counts.resize(nval, 0);
+            e.values.resize(nval, 0.0f);
+            e.counts.resize(1, 0);
         }
 
         std::vector<float> tmp(nval);
-        in.read((char*)tmp.data(), nval*sizeof(float));
+        in.read((char *) tmp.data(), nval * sizeof(float));
         if (in.fail()) {
-            LOG_ERR("%s: failed reading data for entry %d\n",__func__,i);
+            LOG_ERR("%s: failed reading data for entry %d\n", __func__, i);
             m_stats = {};
             return false;
         }
 
-        // Recreate the state as expected by save_imatrix(), and corerct for weighted sum.
+        // Recreate the state as expected by save_imatrix(), and correct for weighted sum.
         for (int i = 0; i < nval; i++) {
-            e.values[i] += tmp[i];
-            e.counts[i] += ncall;
+            e.values[i] += tmp[i] * chunk_size;
+        }
+        // The legacy format doesn't distinguish the counts for different experts
+        for (size_t j = 0; j < e.counts.size(); ++j) {
+            e.counts[j] += ncall * chunk_size;
         }
-        e.ncall += ncall;
-
     }
+
+    {
+        // TODO: extract into its own method; this is also used by the GGUF-based format
+        // Calculate the last chunk count
+        int64_t max_count = 0;
+        for (const auto & stats : m_stats) {
+            for (int64_t count : stats.second.counts) {
+                if (count > max_count) {
+                    max_count = count;
+                }
+            }
+        }
+        m_last_chunk = max_count / (chunk_size);
+    }
+
+    {
+        // Read the number of calls the matrix was computed with
+        int32_t n_calls;
+        in.read((char *) &n_calls, sizeof(n_calls));
+        // ignore it because it's not important
+    }
+
+    // Read the dataset path to include it when writing to GGUF
+    if (!in.fail()){
+        int32_t len = 0;
+        in.read((char *) &len, sizeof(len));
+        if (!in.fail()) {
+            std::vector<char> dataset;
+            dataset.resize(len + 1, 0);
+            in.read(dataset.data(), len);
+            if (!in.fail()) {
+                m_datasets.push_back(dataset.data());
+            }
+        }
+    }
+
     return true;
 }
 
+// Using GGUF as the file format, for greater extensibility
+bool IMatrixCollector::load_imatrix(const char * file_name) {
+    struct ggml_context * ctx = nullptr;
+    struct gguf_init_params meta_gguf_params = {
+        /* .no_alloc = */ false, // the data is needed
+        /* .ctx      = */ &ctx,
+    };
+    struct gguf_context * ctx_gguf = gguf_init_from_file(file_name, meta_gguf_params);
+    if (!ctx_gguf) {
+        return this->load_imatrix_legacy(file_name);
+    }
+    const int32_t n_entries = gguf_get_n_tensors(ctx_gguf);
+    if (n_entries < 1) {
+        LOG_ERR("%s: no data in file %s\n", __func__, file_name);
+        gguf_free(ctx_gguf);
+        ggml_free(ctx);
+        return false;
+    }
+
+    const int64_t datasets_key = gguf_find_key(ctx_gguf, LLM_KV_IMATRIX_DATASETS);
+    if (datasets_key != -1 && gguf_get_arr_type(ctx_gguf, datasets_key) == GGUF_TYPE_STRING) {
+        const int64_t n = gguf_get_arr_n(ctx_gguf, datasets_key);
+        m_datasets.reserve(m_datasets.size() + n);
+        for (int64_t i = 0; i < n; ++i) {
+            m_datasets.push_back(gguf_get_arr_str(ctx_gguf, datasets_key, i));
+        }
+    }
+
+    const std::string in_sum2_suffix{ ".in_sum2" };
+    const std::string counts_suffix{ ".counts" };
+
+    // Could re-use m_stats instead, but this allows
+    // checking for completeness of *each* loaded imatrix file
+    // and also makes it easier to re-use a similar implementation in quantize.cpp
+    // Using an ordered map to get a deterministic iteration order.
+    std::map<std::string, std::pair<struct ggml_tensor *, struct ggml_tensor *>> sums_counts_for;
+
+    for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
+        std::string name = cur->name;
+
+        if (name.empty()) { continue; }
+
+        if (string_remove_suffix(name, in_sum2_suffix)) {
+            // in_sum2
+            sums_counts_for[std::move(name)].first = cur;
+        } else if (string_remove_suffix(name, counts_suffix)) {
+            // counts
+            sums_counts_for[std::move(name)].second = cur;
+        } else {
+            // ignore other tensors
+        }
+    }
+
+    for (const auto & sc : sums_counts_for) {
+        const std::string & name = sc.first;
+        const struct ggml_tensor * in_sum2 = sc.second.first;
+        const struct ggml_tensor * counts  = sc.second.second;
+
+        if (!in_sum2 || !counts) {
+            LOG_ERR("%s: mismatched sums and counts for %s\n", __func__, name.c_str());
+            gguf_free(ctx_gguf);
+            ggml_free(ctx);
+            return false;
+        }
+
+        auto & e = m_stats[name];
+
+        int64_t nval = ggml_nelements(in_sum2);
+        if (e.values.empty()) {
+            e.values.resize(nval, 0.0f);
+        } else if ((size_t) nval != e.values.size()) {
+            LOG_ERR("%s: mismatched sums size for %s: %zu != %zu\n", __func__, name.c_str(), (size_t) nval, e.values.size());
+            gguf_free(ctx_gguf);
+            ggml_free(ctx);
+            return false;
+        }
+
+        int64_t ncounts = ggml_nelements(counts);
+        if (e.counts.empty()) {
+            e.counts.resize(ncounts, 0);
+        } else if (e.counts.size() == 1 && ncounts > 1) {
+            // broadcast, when loading an old imatrix
+            e.counts.resize(ncounts, e.counts[0]);
+        } else if ((size_t) ncounts != e.counts.size()) {
+            LOG_ERR("%s: mismatched counts size for %s: %zu != %zu\n", __func__, name.c_str(), (size_t) ncounts, e.counts.size());
+            gguf_free(ctx_gguf);
+            ggml_free(ctx);
+            return false;
+        }
+
+        // Recreate the state as expected by save_imatrix()
+        for (int64_t j = 0; j < nval; j++) {
+            e.values[j] += ((const float *) in_sum2->data)[j];
+        }
+        for (int64_t j = 0; j < ncounts; j++) {
+            e.counts[j] += std::lround(((const float *) counts->data)[j]);
+        }
+    }
+
+    // TODO: extract into its own method; this is also used by the legacy format
+    // Calculate the last chunk count
+    int64_t max_count = 0;
+    for (const auto & stats : m_stats) {
+        for (int64_t count : stats.second.counts) {
+            if (count > max_count) {
+                max_count = count;
+            }
+        }
+    }
+    m_last_chunk = max_count / (m_params.n_ctx / m_params.n_parallel);
+
+    gguf_free(ctx_gguf);
+    ggml_free(ctx);
+    return true;
+}
@@ -428,12 +751,11 @@ static void process_logits(
     }
 }
 
-static bool compute_imatrix(llama_context * ctx, const common_params & params) {
+static bool compute_imatrix(llama_context * ctx, const common_params & params, const int32_t n_ctx) {
     const llama_model * model = llama_get_model(ctx);
     const llama_vocab * vocab = llama_model_get_vocab(model);
 
     const bool add_bos = llama_vocab_get_add_bos(vocab);
-    const int n_ctx = llama_n_ctx(ctx);
 
     GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
 
@@ -478,45 +800,61 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
     double nll = 0.0;
     double nll2 = 0.0;
 
-    LOG_INF("%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch);
-
-    std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
-
     const int num_batches = (n_ctx + n_batch - 1) / n_batch;
+    const int n_seq = std::max(1, n_batch / n_ctx);
+
+    GGML_ASSERT(n_batch < n_ctx || n_batch % n_ctx == 0);
+    GGML_ASSERT(params.n_ctx == n_seq * n_ctx);
+
+    llama_batch batch = llama_batch_init(std::min(n_batch, n_ctx*n_seq), 0, 1);
+
+    std::vector<float> logits;
+    if (params.compute_ppl && num_batches > 1) {
+        logits.reserve((size_t)n_ctx * n_vocab);
+    }
+
+    LOG_INF("%s: computing over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);
+
+    std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
 
-    for (int i = 0; i < n_chunk; ++i) {
+    for (int i = 0; i < n_chunk; i += n_seq) {
         const int start =     i * n_ctx;
         const int end   = start + n_ctx;
 
-        std::vector<float> logits;
+        const int n_seq_batch = std::min(n_seq, n_chunk - i);
 
         const auto t_start = std::chrono::high_resolution_clock::now();
 
         // clear the KV cache
         llama_memory_clear(llama_get_memory(ctx), true);
 
-        llama_batch batch = llama_batch_init(n_batch, 0, 1);
-
         for (int j = 0; j < num_batches; ++j) {
             const int batch_start = start + j * n_batch;
             const int batch_size  = std::min(end - batch_start, n_batch);
 
-            // save original token and restore it after eval
-            const auto token_org = tokens[batch_start];
-
-            // add BOS token for the first batch of each chunk
-            if (add_bos && j == 0) {
-                tokens[batch_start] = llama_vocab_bos(vocab);
-            }
-
             // clear the batch
             common_batch_clear(batch);
-            for (int i = 0; i < batch_size; i++) {
-                common_batch_add(batch, tokens[batch_start + i], j*n_batch + i, {0}, true);
+
+            for (int seq = 0; seq < n_seq_batch; seq++) {
+                int seq_start = batch_start + seq*n_ctx;
+
+                // save original token and restore it after eval
+                const auto token_org = tokens[seq_start];
+
+                // add BOS token for the first batch of each chunk
+                if (add_bos && j == 0) {
+                    tokens[seq_start] = llama_vocab_bos(vocab);
+                }
+                for (int k = 0; k < batch_size; ++k) {
+                    // NOTE: specifying all logits to get activations for the output.weight tensor
+                    //       and also for the perplexity calculation.
+                    // TODO: only get outputs when (params.process_output || params.compute_ppl)
+                    //       (not possible when this skips FFN computation of the last layer)
+                    common_batch_add(batch, tokens[seq_start + k], j*n_batch + k, { seq }, true);
+                }
+
+                // restore the original token in case it was set to BOS
+                tokens[seq_start] = token_org;
             }
 
             if (llama_decode(ctx, batch)) {
@@ -525,23 +863,19 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
                 return false;
             }
 
-            // restore the original token in case it was set to BOS
-            tokens[batch_start] = token_org;
-
             if (params.compute_ppl && num_batches > 1) {
                 const auto * batch_logits = llama_get_logits(ctx);
                 logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
             }
         }
 
-        llama_batch_free(batch);
-
-        const auto t_end = std::chrono::high_resolution_clock::now();
-
         if (i == 0) {
+            llama_synchronize(ctx);
+            const auto t_end = std::chrono::high_resolution_clock::now();
            const float t_total = std::chrono::duration<float>(t_end - t_start).count();
            LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
-            int total_seconds = (int)(t_total * n_chunk);
+            int total_seconds = (int)(t_total * n_chunk / n_seq);
            if (total_seconds >= 60*60) {
                LOG("%d hours ", total_seconds / (60*60));
                total_seconds = total_seconds % (60*60);
@@ -551,17 +885,27 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
 
         if (params.compute_ppl) {
             const int first = n_ctx/2;
-            const auto * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
-            process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
-                    workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
-            count += n_ctx - first - 1;
-
-            LOG("[%d]%.4lf,", i + 1, std::exp(nll / count));
+            for (int seq = 0; seq < n_seq_batch; seq++) {
+                const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits_ith(ctx, seq*n_ctx);
+
+                llama_token * tokens_data = tokens.data() + start + seq*n_ctx + first;
+
+                process_logits(n_vocab, all_logits + first*n_vocab,
+                        tokens_data, n_ctx - 1 - first,
+                        workers, nll, nll2,
+                        logit_history.data() + start + seq*n_ctx + first,
+                        prob_history.data() + start + seq*n_ctx + first);
+
+                count += n_ctx - first - 1;
+
+                LOG("[%d]%.4lf,", i + seq + 1, std::exp(nll / count));
+            }
             fflush(stdout);
 
+            logits.clear();
         }
     }
 
     LOG("\n");
 
     if (params.compute_ppl) {
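The reworked loop evaluates `n_seq` chunks per `llama_decode` call, each in its own sequence, instead of one chunk at a time. A standalone sketch of the sequence-batching arithmetic used above and in `main()` below (values illustrative):

```cpp
#include <algorithm>
#include <cstdio>

int main() {
    const int n_ctx   = 512;   // tokens per imatrix chunk (from --ctx-size)
    const int n_batch = 2048;  // logical batch size (from --batch-size)

    // number of chunks evaluated together in one llama_decode call
    const int n_seq = std::max(1, n_batch / n_ctx);   // -> 4
    const int n_kv  = n_seq * n_ctx;                  // KV cache size -> 2048

    printf("n_seq = %d, n_kv = %d\n", n_seq, n_kv);
    return 0;
}
```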
@@ -577,13 +921,15 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
         }
     }
 
+    llama_batch_free(batch);
+
     return true;
 }
 
 int main(int argc, char ** argv) {
     common_params params;
 
-    params.out_file = "imatrix.dat" ;
+    params.out_file = "imatrix.gguf";
 
     params.n_ctx = 512;
     params.escape = false;
@@ -594,7 +940,22 @@ int main(int argc, char ** argv) {
 
     common_init();
 
-    params.n_batch = std::min(params.n_batch, params.n_ctx);
+    const int32_t n_ctx = params.n_ctx;
+
+    if (n_ctx <= 0) {
+        LOG_ERR("%s: imatrix tool requires '--ctx-size' > 0\n", __func__);
+        return 1;
+    }
+
+    {
+        const int32_t n_seq = std::max(1, params.n_batch / n_ctx);
+        const int32_t n_kv  = n_seq * n_ctx;
+
+        params.n_parallel = n_seq;
+        params.n_ctx      = n_kv;
+
+        params.n_batch = std::min(params.n_batch, n_kv);
+    }
 
     g_collector.set_params(params);
 
@@ -606,9 +967,23 @@ int main(int argc, char ** argv) {
         }
     }
 
-    if (params.in_files.size() > 1) {
-        LOG_INF("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
+    if (params.prompt.empty()) {
+        LOG_INF("No prompt provided; combining precomputed matrices only.\n");
+
+        if (params.in_files.empty()) {
+            LOG_ERR("Error: No prompt provided and no precomputed matrices (--in-file) to combine.\n");
+            return 1;
+        }
+
+        if (params.in_files.size() == 1) {
+            LOG_INF("%s : saving imatrix to '%s'\n", __func__, params.out_file.c_str());
+        } else if (params.in_files.size() > 1) {
+            LOG_INF("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
+        }
+
+        g_collector.save_imatrix();
+
+        return 0;
     }
 
     llama_backend_init();
@@ -643,19 +1018,10 @@ int main(int argc, char ** argv) {
         LOG_INF("%s\n", common_params_get_system_info(params).c_str());
     }
 
-    if (params.prompt.empty()) {
-        if (params.in_files.empty()) {
-            LOG_ERR("Error: No prompt provided and no precomputed matrices (--in-file) to combine.\n");
-            return 1;
-        }
-        LOG_INF("No prompt provided; combining precomputed matrices only.\n");
-    } else {
-        if (!compute_imatrix(ctx, params)) {
-            return 1;
-        }
+    if (!compute_imatrix(ctx, params, n_ctx)) {
+        return 1;
     }
 
     g_collector.save_imatrix();
 
     LOG("\n");
@@ -1,11 +1,13 @@
 #include "common.h"
 #include "llama.h"
+#include "gguf.h"
 
 #include <cstdio>
 #include <cstring>
 #include <vector>
 #include <string>
 #include <unordered_map>
+#include <map>
 #include <fstream>
 #include <cmath>
 #include <cctype>
@@ -68,6 +70,11 @@ static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET    = "quantize.imatrix
 static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES  = "quantize.imatrix.entries_count";
 static const char * const LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS   = "quantize.imatrix.chunks_count";
 
+// TODO: share with imatrix.cpp
+static const char * const LLM_KV_IMATRIX_DATASETS    = "imatrix.datasets";
+static const char * const LLM_KV_IMATRIX_CHUNK_COUNT = "imatrix.chunk_count";
+static const char * const LLM_KV_IMATRIX_CHUNK_SIZE  = "imatrix.chunk_size";
+
 static bool striequals(const char * a, const char * b) {
     while (*a && *b) {
         if (std::tolower(*a) != std::tolower(*b)) {
@@ -84,7 +91,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
     for (auto ch : ftype_str_in) {
         ftype_str.push_back(std::toupper(ch));
     }
-    for (auto & it : QUANT_OPTIONS) {
+    for (const auto & it : QUANT_OPTIONS) {
         if (striequals(it.name.c_str(), ftype_str.c_str())) {
             ftype = it.ftype;
             ftype_str_out = it.name;
@@ -93,7 +100,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftyp
     }
     try {
         int ftype_int = std::stoi(ftype_str);
-        for (auto & it : QUANT_OPTIONS) {
+        for (const auto & it : QUANT_OPTIONS) {
             if (it.ftype == ftype_int) {
                 ftype = it.ftype;
                 ftype_str_out = it.name;
@@ -129,7 +136,7 @@ static void usage(const char * executable) {
     printf("      Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
     printf("Note: --include-weights and --exclude-weights cannot be used together\n");
     printf("\nAllowed quantization types:\n");
-    for (auto & it : QUANT_OPTIONS) {
+    for (const auto & it : QUANT_OPTIONS) {
         if (it.name != "COPY") {
             printf("  %2d  or  ", it.ftype);
         } else {
@@ -140,7 +147,7 @@ static void usage(const char * executable) {
     exit(1);
 }
 
-static int load_imatrix(const std::string & imatrix_file, std::string & imatrix_dataset, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
+static int load_legacy_imatrix(const std::string & imatrix_file, std::vector<std::string> & imatrix_datasets, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
     std::ifstream in(imatrix_file.c_str(), std::ios::binary);
     if (!in) {
         printf("%s: failed to open %s\n",__func__, imatrix_file.c_str());
@@ -180,7 +187,9 @@ static int load_imatrix(const std::string & imatrix_file, std::string & imatrix_
         exit(1);
     }
     if (ncall > 0) {
-        for (auto& v : e) v /= ncall;
+        for (auto & v : e) {
+            v /= ncall;
+        }
     }
 
     if (getenv("LLAMA_TRACE")) {
@@ -188,7 +197,7 @@ static int load_imatrix(const std::string & imatrix_file, std::string & imatrix_
     }
 }
 
-    // latest imatrix version contains the dataset filename at the end of the file
+    // latest legacy imatrix version contains the dataset filename at the end of the file
     int m_last_call = 0;
     if (in.peek() != EOF) {
         in.read((char *)&m_last_call, sizeof(m_last_call));
@@ -196,15 +205,130 @@ static int load_imatrix(const std::string & imatrix_file, std::string & imatrix_
         in.read((char *)&dataset_len, sizeof(dataset_len));
         std::vector<char> dataset_as_vec(dataset_len);
         in.read(dataset_as_vec.data(), dataset_len);
-        imatrix_dataset.assign(dataset_as_vec.begin(), dataset_as_vec.end());
-        printf("%s: imatrix dataset='%s'\n", __func__, imatrix_dataset.c_str());
+        imatrix_datasets.resize(1);
+        imatrix_datasets[0].assign(dataset_as_vec.begin(), dataset_as_vec.end());
+        printf("%s: imatrix dataset='%s'\n", __func__, imatrix_datasets[0].c_str());
     }
     printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(imatrix_data.size()), imatrix_file.c_str(), m_last_call);
     return m_last_call;
 }
+static int load_imatrix(const std::string & imatrix_file, std::vector<std::string> & imatrix_datasets, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
+
+    struct ggml_context * ctx = nullptr;
+    struct gguf_init_params meta_gguf_params = {
+        /* .no_alloc = */ false, // the data is needed
+        /* .ctx      = */ &ctx,
+    };
+    struct gguf_context * ctx_gguf = gguf_init_from_file(imatrix_file.c_str(), meta_gguf_params);
+    if (!ctx_gguf) {
+        fprintf(stderr, "%s: imatrix file '%s' is using old format\n", __func__, imatrix_file.c_str());
+        return load_legacy_imatrix(imatrix_file, imatrix_datasets, imatrix_data);
+    }
+    const int32_t n_entries = gguf_get_n_tensors(ctx_gguf);
+    if (n_entries < 1) {
+        fprintf(stderr, "%s: no data in file %s\n", __func__, imatrix_file.c_str());
+        gguf_free(ctx_gguf);
+        ggml_free(ctx);
+        exit(1);
+    }
+
+    const int dataset_idx     = gguf_find_key(ctx_gguf, LLM_KV_IMATRIX_DATASETS);
+    const int chunk_count_idx = gguf_find_key(ctx_gguf, LLM_KV_IMATRIX_CHUNK_COUNT);
+    const int chunk_size_idx  = gguf_find_key(ctx_gguf, LLM_KV_IMATRIX_CHUNK_SIZE);
+    if (dataset_idx < 0 || chunk_count_idx < 0 || chunk_size_idx < 0) {
+        fprintf(stderr, "%s: missing imatrix metadata in file %s\n", __func__, imatrix_file.c_str());
+        gguf_free(ctx_gguf);
+        ggml_free(ctx);
+        exit(1);
+    }
+
+    const uint32_t chunk_size = gguf_get_val_u32(ctx_gguf, chunk_size_idx);
+
+    const std::string sums_suffix{ ".in_sum2" };
+    const std::string counts_suffix{ ".counts" };
+
+    // Using an ordered map to get a deterministic iteration order.
+    std::map<std::string, std::pair<struct ggml_tensor *, struct ggml_tensor *>> sums_counts_for;
+
+    for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
+        std::string name = cur->name;
+
+        if (name.empty()) { continue; }
+
+        if (string_remove_suffix(name, sums_suffix)) {
+            // in_sum2
+            sums_counts_for[std::move(name)].first = cur;
+        } else if (string_remove_suffix(name, counts_suffix)) {
+            // counts
+            sums_counts_for[std::move(name)].second = cur;
+        } else {
+            // ignore other tensors
+        }
+    }
+
+    for (const auto & sc : sums_counts_for) {
+        const std::string & name = sc.first;
+        const struct ggml_tensor * sums   = sc.second.first;
+        const struct ggml_tensor * counts = sc.second.second;
+
+        if (!sums || !counts) {
+            fprintf(stderr, "%s: mismatched sums and counts for %s\n", __func__, name.c_str());
+            gguf_free(ctx_gguf);
+            ggml_free(ctx);
+            exit(1);
+        }
+
+        const int64_t ne0 = sums->ne[0];
+        const int64_t ne1 = sums->ne[1];
+
+        auto & e = imatrix_data[name];
+        e.resize(ggml_nelements(sums));
+        float max_count = 0.0f;
+        for (int64_t j = 0; j < ne1; ++j) {
+            const float count = ((const float *) counts->data)[j];
+            if (count > 0.0f) {
+                for (int64_t i = 0; i < ne0; ++i) {
+                    e[j*ne0 + i] = ((const float *) sums->data)[j*ne0 + i] / count;
+                }
+            } else {
+                // Partial imatrix data, this tensor never got any input during calibration
+                for (int64_t i = 0; i < ne0; ++i) {
+                    e[j*ne0 + i] = 1;
+                }
+            }
+            if (count > max_count) {
+                max_count = count;
+            }
+        }
+        if (getenv("LLAMA_TRACE")) {
+            printf("%s: loaded data (size = %6d, n_tokens = %6d, n_chunks = %6d) for '%s'\n", __func__, int(e.size()), int(max_count), int(max_count / chunk_size), name.c_str());
+        }
+    }
+
+    int m_last_chunk = gguf_get_val_u32(ctx_gguf, chunk_count_idx);
+
+    int64_t n_datasets = gguf_get_arr_n(ctx_gguf, dataset_idx);
+    imatrix_datasets.reserve(n_datasets);
+    for (int64_t i = 0; i < n_datasets; ++i) {
+        imatrix_datasets.push_back(gguf_get_val_str(ctx_gguf, dataset_idx));
+    }
+    printf("%s: imatrix datasets=['%s'", __func__, imatrix_datasets[0].c_str());
+    for (size_t i = 1; i < imatrix_datasets.size(); ++i) {
+        printf(", '%s'", imatrix_datasets[i].c_str());
+    }
+    printf("]\n");
+
+    printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(imatrix_data.size()), imatrix_file.c_str(), m_last_chunk);
+
+    gguf_free(ctx_gguf);
+    ggml_free(ctx);
+
+    return m_last_chunk;
+}
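The normalization above turns the stored sums of squares back into per-channel means, with unseen experts falling back to a neutral weight of 1. A minimal sketch of that logic, assuming a hypothetical 2-expert tensor where the second expert never received input during calibration:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const int64_t ne0 = 4;                                  // columns per expert
    std::vector<float> sums   = {4.f, 8.f, 12.f, 16.f,      // expert 0
                                 0.f, 0.f, 0.f,  0.f};      // expert 1 (no data)
    std::vector<float> counts = {2.f, 0.f};                 // tokens per expert

    std::vector<float> e(sums.size());
    for (size_t j = 0; j < counts.size(); ++j) {
        for (int64_t i = 0; i < ne0; ++i) {
            // unseen experts fall back to a neutral weight of 1
            e[j*ne0 + i] = counts[j] > 0.0f ? sums[j*ne0 + i] / counts[j] : 1.0f;
        }
    }
    for (float v : e) { printf("%g ", v); }                 // 2 4 6 8 1 1 1 1
    printf("\n");
    return 0;
}
```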
 static int prepare_imatrix(const std::string & imatrix_file,
-        std::string & imatrix_dataset,
+        std::vector<std::string> & imatrix_dataset,
         const std::vector<std::string> & included_weights,
         const std::vector<std::string> & excluded_weights,
         std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
@@ -216,18 +340,21 @@ static int prepare_imatrix(const std::string & imatrix_file,
         return m_last_call;
     }
     if (!excluded_weights.empty()) {
-        for (auto& name : excluded_weights) {
-            for (auto it = imatrix_data.begin(); it != imatrix_data.end(); ) {
+        for (const auto & name : excluded_weights) {
+            for (auto it = imatrix_data.begin(); it != imatrix_data.end();) {
                 auto pos = it->first.find(name);
-                if (pos != std::string::npos) it = imatrix_data.erase(it);
-                else ++it;
+                if (pos != std::string::npos) {
+                    it = imatrix_data.erase(it);
+                } else {
+                    ++it;
+                }
             }
         }
     }
     if (!included_weights.empty()) {
         std::unordered_map<std::string, std::vector<float>> tmp;
-        for (auto& name : included_weights) {
-            for (auto& e : imatrix_data) {
+        for (const auto & name : included_weights) {
+            for (auto & e : imatrix_data) {
                 auto pos = e.first.find(name);
                 if (pos != std::string::npos) {
                     tmp.emplace(std::move(e));
@@ -396,9 +523,9 @@ int main(int argc, char ** argv) {
         usage(argv[0]);
     }
 
-    std::string imatrix_dataset;
+    std::vector<std::string> imatrix_datasets;
     std::unordered_map<std::string, std::vector<float>> imatrix_data;
-    int m_last_call = prepare_imatrix(imatrix_file, imatrix_dataset, included_weights, excluded_weights, imatrix_data);
+    int m_last_call = prepare_imatrix(imatrix_file, imatrix_datasets, included_weights, excluded_weights, imatrix_data);
     if (!imatrix_data.empty()) {
         params.imatrix = &imatrix_data;
         {
@@ -409,11 +536,12 @@ int main(int argc, char ** argv) {
             kvo.val_str[127] = '\0';
             kv_overrides.emplace_back(std::move(kvo));
         }
-        if (!imatrix_dataset.empty()) {
+        if (!imatrix_datasets.empty()) {
             llama_model_kv_override kvo;
+            // TODO: list multiple datasets when there are more than one
             std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_DATASET);
             kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
-            strncpy(kvo.val_str, imatrix_dataset.c_str(), 127);
+            strncpy(kvo.val_str, imatrix_datasets[0].c_str(), 127);
             kvo.val_str[127] = '\0';
             kv_overrides.emplace_back(std::move(kvo));
         }