llama : clean-up

llama : suffix the internal APIs with "_impl"
ggml-ci
2026-04-23 16:37:33 +03:00 · 2024-07-23 08:38:50 +03:00 · 2024-07-22 19:59:00 +03:00 · 2024-07-22 19:46:13 +03:00 · 2024-07-22 19:46:13 +03:00 · 2024-07-22 19:46:11 +03:00
26 changed files with 669 additions and 701 deletions
--- a/.devops/llama-cli-intel.Dockerfile
+++ b/.devops/llama-cli-intel.Dockerfile
@@ -14,9 +14,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
        echo "GGML_SYCL_F16 is set" && \
        export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
    fi && \
-    echo "Building with static libs" && \
-    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \
-    ${OPT_SYCL_F16} -DBUILD_SHARED_LIBS=OFF && \
+    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
    cmake --build build --config Release --target llama-cli

 FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
--- a/.devops/llama-server-intel.Dockerfile
+++ b/.devops/llama-server-intel.Dockerfile
@@ -14,7 +14,6 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
        echo "GGML_SYCL_F16 is set" && \
        export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
    fi && \
-    echo "Building with dynamic libs" && \
    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
    cmake --build build --config Release --target llama-server

--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,17 +1,12 @@
-# Pull requests (for contributors)
+# Pull requests

+- Always squash-merge the PR before merging
+- Use the following format for your final commit: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
 - Test your changes:
  - Using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library
  - Execute [the full CI locally on your machine](ci/README.md) before publishing
 - Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
-  - The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your convenience
- If your PR becomes stale, don't hesitate to ping the maintainers in the comments
-
-# Pull requests (for collaborators)
-
- Squash-merge PRs
- Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
- Optionally, pick a `<module>` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules
+  - The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your conveience

 # Coding guidelines

--- a/2
+++ b/2
@@ -1322,7 +1322,7 @@ llama-finetune: examples/finetune/finetune.cpp \
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

 llama-export-lora: examples/export-lora/export-lora.cpp \
-	$(OBJ_ALL)
+	$(OBJ_GGML) common/log.h
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

--- a/README.md
+++ b/README.md
@@ -138,7 +138,6 @@ Typically finetunes of the base models below are supported as well.

 Unless otherwise noted these projects are open-source with permissive licensing:

- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT)
 - [iohub/collama](https://github.com/iohub/coLLaMA)
 - [janhq/jan](https://github.com/janhq/jan) (AGPL)
 - [nat/openplayground](https://github.com/nat/openplayground)
@@ -182,9 +181,6 @@ Unless otherwise noted these projects are open-source with permissive licensing:

 - [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp

-**Games:**
- [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you.
-
 ## Demo

 <details>
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -694,6 +694,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
        return true;
    }
+    if (arg == "--lora-base") {
+        CHECK_ARG
+        params.lora_base = argv[i];
+        return true;
+    }
    if (arg == "--control-vector") {
        CHECK_ARG
        params.control_vectors.push_back({ 1.0f, argv[i], });
@@ -1269,7 +1274,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        CHECK_ARG
        params.out_file = argv[i];
        params.cvector_outfile = argv[i];
-        params.lora_outfile = argv[i];
        return true;
    }
    if (arg == "-ofreq" || arg == "--output-frequency") {
@@ -1579,8 +1583,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
    options.push_back({ "*",           "       --override-kv KEY=TYPE:VALUE",
                                                                        "advanced option to override model metadata by key. may be specified multiple times.\n"
                                                                        "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false" });
-    options.push_back({ "*",           "       --lora FNAME",           "apply LoRA adapter (can be repeated to use multiple adapters)" });
-    options.push_back({ "*",           "       --lora-scaled FNAME S",  "apply LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
+    options.push_back({ "*",           "       --lora FNAME",           "apply LoRA adapter (implies --no-mmap)" });
+    options.push_back({ "*",           "       --lora-scaled FNAME S",  "apply LoRA adapter with user defined scaling S (implies --no-mmap)" });
+    options.push_back({ "*",           "       --lora-base FNAME",      "optional model to use as a base for the layers modified by the LoRA adapter" });
    options.push_back({ "*",           "       --control-vector FNAME", "add a control vector\n"
                                                                        "note: this argument can be repeated to add multiple control vectors" });
    options.push_back({ "*",           "       --control-vector-scaled FNAME SCALE",
@@ -1671,13 +1676,6 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
    options.push_back({ "cvector",     "       --pca-iter N",           "number of iterations used for PCA (default: %d)", params.n_pca_iterations });
    options.push_back({ "cvector",     "       --method {pca,mean}",    "dimensionality reduction method to be used (default: pca)" });

-    options.push_back({ "export-lora" });
-    options.push_back({ "export-lora", "-m,    --model",                "model path from which to load base model (default '%s')", params.model.c_str() });
-    options.push_back({ "export-lora", "       --lora FNAME",           "path to LoRA adapter  (can be repeated to use multiple adapters)" });
-    options.push_back({ "export-lora", "       --lora-scaled FNAME S",  "path to LoRA adapter with user defined scaling S  (can be repeated to use multiple adapters)" });
-    options.push_back({ "*",           "-t,    --threads N",            "number of threads to use during computation (default: %d)", params.n_threads });
-    options.push_back({ "export-lora", "-o,    --output FNAME",         "output file (default: '%s')", params.lora_outfile.c_str() });
-
    printf("usage: %s [options]\n", argv[0]);

    for (const auto & o : options) {
@@ -2723,7 +2721,7 @@ std::string llama_chat_format_single(const struct llama_model * model,
        const llama_chat_msg & new_msg,
        bool add_ass) {
    std::ostringstream ss;
-    auto fmt_past_msg = past_msg.empty() ? "" : llama_chat_apply_template(model, tmpl, past_msg, false);
+    auto fmt_past_msg = llama_chat_apply_template(model, tmpl, past_msg, false);
    std::vector<llama_chat_msg> chat_new(past_msg);
    // if the past_msg ends with a newline, we must preserve it in the formatted version
    if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
@@ -3168,6 +3166,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
        }
        fprintf(stream, "  - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
    }
+    fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
    fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
    fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
    fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
--- a/common/common.h
+++ b/common/common.h
@@ -128,6 +128,7 @@ struct gpt_params {

    // TODO: avoid tuple, use struct
    std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
+    std::string lora_base  = "";                              // base model path for the lora adapter

    std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale

@@ -254,8 +255,6 @@ struct gpt_params {
    std::string cvector_negative_file = "examples/cvector-generator/negative.txt";

    bool spm_infill = false; // suffix/prefix/middle pattern for infill
-
-    std::string lora_outfile = "ggml-lora-merged-f16.gguf";
 };

 void gpt_params_handle_hf_token(gpt_params & params);
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -2084,7 +2084,6 @@ class Phi3MiniModel(Model):
        self.gguf_writer.add_rope_dimension_count(rope_dims)
        self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
        self.gguf_writer.add_file_type(self.ftype)
-        self.gguf_writer.add_sliding_window(self.find_hparam(["sliding_window"]))

        # write rope scaling for long context (128k) model
        rope_scaling = self.find_hparam(['rope_scaling'], True)
--- a/examples/export-lora/README.md
+++ b/examples/export-lora/README.md
@@ -6,11 +6,12 @@ Apply LORA adapters to base model and export the resulting model.
 usage: llama-export-lora [options]

 options:
-  -m,    --model                  model path from which to load base model (default '')
-         --lora FNAME             path to LoRA adapter  (can be repeated to use multiple adapters)
-         --lora-scaled FNAME S    path to LoRA adapter with user defined scaling S  (can be repeated to use multiple adapters)
-  -t,    --threads N              number of threads to use during computation (default: 4)
-  -o,    --output FNAME           output file (default: 'ggml-lora-merged-f16.gguf')
+  -h, --help                         show this help message and exit
+  -m FNAME, --model-base FNAME       model path from which to load base model (default '')
+  -o FNAME, --model-out FNAME        path to save exported model (default '')
+  -l FNAME, --lora FNAME             apply LoRA adapter
+  -s FNAME S, --lora-scaled FNAME S  apply LoRA adapter with user defined scaling S
+  -t N, --threads N                  number of threads to use during computation (default: 4)
 ```

 For example:
@@ -19,7 +20,7 @@ For example:
 ./bin/llama-export-lora \
    -m open-llama-3b-v2-q8_0.gguf \
    -o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \
-    --lora lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.bin
+    -l lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.bin
 ```

-Multiple LORA adapters can be applied by passing multiple `--lora FNAME` or `--lora-scaled FNAME S` command line parameters.
+Multiple LORA adapters can be applied by passing multiple `-l FN` or `-s FN S` command line parameters.
--- a/examples/export-lora/export-lora.cpp
+++ b/examples/export-lora/export-lora.cpp
@@ -1,406 +1,465 @@
+
 #include "common.h"
 #include "ggml.h"
 #include "ggml-alloc.h"

-#include <map>
 #include <vector>
 #include <string>
 #include <thread>
-#include <fstream>

-static bool g_verbose = false;
-
-static std::string get_kv_str(struct gguf_context * ctx_gguf, const std::string & key){
-    int id = gguf_find_key(ctx_gguf, key.c_str());
-    return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id));
-}
-
-static float get_kv_f32(struct gguf_context * ctx_gguf, const std::string & key) {
-    int id = gguf_find_key(ctx_gguf, key.c_str());
-    return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf, id);
-}
-
-static void zeros(std::ofstream & file, size_t n) {
-    char zero = 0;
-    for (size_t i = 0; i < n; ++i) {
-        file.write(&zero, 1);
-    }
-}
-
-static std::string ggml_ne_string(const ggml_tensor * t) {
-    std::string str;
-    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
-        str += std::to_string(t->ne[i]);
-        if (i + 1 < GGML_MAX_DIMS) {
-            str += ", ";
-        }
-    }
-    return str;
-}
-
-static struct gguf_context * load_gguf(std::string & fname, struct ggml_context ** ctx_ggml) {
-    struct gguf_init_params params = {
-        /*.no_alloc = */ true,
-        /*.ctx      = */ ctx_ggml,
-    };
-    struct gguf_context * ctx_gguf = gguf_init_from_file(fname.c_str(), params);
-    if (!ctx_gguf) {
-        throw std::runtime_error("failed to load input GGUF from " + fname);
-    }
-    return ctx_gguf;
-}
-
-static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
-    std::string result;
-    for (size_t pos = 0; ; pos += search.length()) {
-        auto new_pos = s.find(search, pos);
-        if (new_pos == std::string::npos) {
-            result += s.substr(pos, s.size() - pos);
-            break;
-        }
-        result += s.substr(pos, new_pos - pos) + replace;
-        pos = new_pos;
-    }
-    s = std::move(result);
-}
-
-struct file_input {
-    struct ggml_context * ctx_meta = nullptr;
-    struct gguf_context * ctx_gguf = nullptr;
-    std::ifstream f_in;
-    std::map<std::string, ggml_tensor *> tensors;
-    float alpha;
+struct lora_info {
+    std::string filename;
    float scale;
-
-    file_input(std::string & fname, float scale): f_in(fname, std::ios::binary), scale(scale) {
-        if (!f_in.is_open()) {
-            throw std::runtime_error("failed to open input gguf from " + fname);
-        }
-
-        ctx_gguf = load_gguf(fname, &ctx_meta);
-        alpha = get_kv_f32(ctx_gguf, "adapter.lora.alpha");
-        printf("%s: loaded gguf from %s\n", __func__, fname.c_str());
-
-        for (ggml_tensor * cur = ggml_get_first_tensor(ctx_meta); cur; cur = ggml_get_next_tensor(ctx_meta, cur)) {
-            std::string name(cur->name);
-            tensors[name] = cur;
-            if (g_verbose) {
-                printf("%s: %s\n", __func__, cur->name);
-            }
-        }
-    }
-
-    ggml_tensor * get_tensor(std::string name) {
-        if (tensors.find(name) == tensors.end()) {
-            return nullptr;
-        }
-        return tensors[name];
-    }
-
-    void read_tensor_data(std::string name, std::vector<uint8_t> & buf) {
-        if (tensors.find(name) == tensors.end()) {
-            throw std::runtime_error("cannot find tensor with name: " + name);
-        }
-        auto len = ggml_nbytes(tensors[name]);
-        if (buf.size() < len) {
-            buf.resize(len);
-        }
-        auto i_tensor_in = gguf_find_tensor(ctx_gguf, name.c_str()); // idx of tensor in the input file
-        auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor_in);
-        f_in.seekg(offset);
-        f_in.read((char* )buf.data(), len);
-    }
-
-    ~file_input() {
-        gguf_free(ctx_gguf);
-        ggml_free(ctx_meta);
-    }
 };

-struct lora_merge_ctx {
-    // input base model + adapters
-    file_input base_model;
-    std::vector<std::unique_ptr<file_input>> adapters;
-
-    // for computing merged tensor
+struct export_lora_params {
+    std::string fn_model_base;
+    std::string fn_model_out;
+    std::vector<struct lora_info> lora;
    int n_threads;
-    ggml_backend_t backend = nullptr;
-    ggml_gallocr_t allocr = nullptr;
-    std::vector<uint8_t> read_buf;
+};

-    // output file
-    struct gguf_context * ctx_out;
-    struct ggml_context * ctx_out_ggml;
-    std::ofstream fout;
+struct lora_data {
+    struct lora_info     info;
+    std::vector<uint8_t> data;
+    struct ggml_context * ctx;

-    lora_merge_ctx(
-            std::string & base_fname,
-            std::vector<std::tuple<std::string, float>> & lora_files,
-            std::string & outfile,
-            int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) {
-        fout.exceptions(std::ofstream::failbit); // fail fast on write errors
+    uint32_t lora_r;
+    uint32_t lora_alpha;
+};

-        if (gguf_find_key(base_model.ctx_gguf, LLM_KV_SPLIT_COUNT) >= 0) {
-            throw std::runtime_error("split model is not yet supported");
-        }
+struct llama_file {
+    // use FILE * so we don't have to re-open the file to mmap
+    FILE * fp;
+    size_t size;

-        for (auto lora_inp : lora_files) {
-            auto fname = std::get<0>(lora_inp);
-            auto scale = std::get<1>(lora_inp);
-            std::unique_ptr<file_input> adapter(new file_input(fname, scale));
-            check_metadata_lora(adapter.get());
-            adapters.push_back(std::move(adapter));
-        }
-
-        ctx_out = gguf_init_empty();
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ gguf_get_n_tensors(base_model.ctx_gguf)*ggml_tensor_overhead(),
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ true,
-        };
-        ctx_out_ggml = ggml_init(params);
-        backend = ggml_backend_cpu_init();
-        allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
-    }
-
-    void check_metadata_lora(file_input * adapter) {
-        auto general_type = get_kv_str(adapter->ctx_gguf, "general.type");
-        if (general_type != "adapter") {
-            throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
-        }
-
-        auto adapter_type = get_kv_str(adapter->ctx_gguf, "adapter.type");
-        if (adapter_type != "lora") {
-            throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type);
-        }
-
-        auto general_arch_base = get_kv_str(base_model.ctx_gguf, "general.architecture");
-        auto general_arch_lora = get_kv_str(adapter->ctx_gguf,   "general.architecture");
-        if (general_arch_base != general_arch_lora) {
-            throw std::runtime_error("model arch and LoRA arch mismatch");
-        }
-    }
-
-    ggml_type get_out_tensor_type(struct ggml_tensor * t) {
-        if (t->type == GGML_TYPE_F32) {
-            return GGML_TYPE_F32;
+    llama_file(const char * fname, const char * mode) {
+        fp = std::fopen(fname, mode);
+        if (fp == NULL) {
+            size = 0;
        } else {
-            return GGML_TYPE_F16;
+            seek(0, SEEK_END);
+            size = tell();
+            seek(0, SEEK_SET);
        }
    }

-    void run_merge() {
-        // prepare metadata
-        gguf_set_kv(ctx_out, base_model.ctx_gguf);
-        // output is forced to f16 for now
-        gguf_set_val_u32(ctx_out, "general.file_type", LLAMA_FTYPE_MOSTLY_F16);
-
-        // check if all lora adapters have the same tensors
-        // TODO: remove this when we can support merging subset of adapters. Ref: https://github.com/ggerganov/llama.cpp/pull/8607#discussion_r1686027777
-        static const char * err_no_subset_adapter = "Input adapters do not have the same list of tensors. This is not yet supported. Please merge the adapter one-by-one instead of merging all at once.";
-        if (adapters.size() > 1) {
-            for (size_t i = 1; i < adapters.size(); ++i) {
-                if (adapters[0]->tensors.size() != adapters[i]->tensors.size()) {
-                    throw std::runtime_error(err_no_subset_adapter);
-                }
-                for (auto & it : adapters[i]->tensors) {
-                    if (adapters[0]->get_tensor(it.first) == nullptr) {
-                        throw std::runtime_error(err_no_subset_adapter);
-                    }
-                }
-            }
-        }
-
-        // if true, this tensor can be lora-merged. if false, we skip merging and just copy data to outfile
-        std::vector<std::pair<struct ggml_tensor *, bool>> base_tensors;
-        for (auto & it : base_model.tensors) {
-            bool t_a = true;
-            bool t_b = true;
-            for (auto & adapter : adapters) {
-                t_a &= nullptr != adapter->get_tensor(it.first + ".lora_a");
-                t_b &= nullptr != adapter->get_tensor(it.first + ".lora_b");
-            }
-            auto base_tensor = it.second;
-            struct ggml_tensor * out_tensor;
-            if (!t_a && !t_b) {
-                // only copy
-                out_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor);
-                ggml_set_name(out_tensor, base_tensor->name);
-                base_tensors.push_back(std::make_pair(out_tensor, false));
-            } else if (t_a && t_b) {
-                // need merging
-                out_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor);
-                out_tensor->type = get_out_tensor_type(base_tensor);
-                ggml_set_name(out_tensor, base_tensor->name);
-                base_tensors.push_back(std::make_pair(out_tensor, true));
-            } else {
-                throw std::runtime_error("tensor " + it.first + " missing either lora_a or lora_b");
-            }
-            gguf_add_tensor(ctx_out, out_tensor);
-        }
-
-        // placeholder for the meta data
-        {
-            size_t meta_size = gguf_get_meta_size(ctx_out);
-            zeros(fout, meta_size);
-        }
-
-        // process base model tensors
-        size_t n_merged = 0;
-        for (auto & it : base_tensors) {
-            if (it.second) {
-                merge_tensor(it.first);
-                n_merged++;
-            } else {
-                copy_tensor(it.first);
-            }
-        }
-
-        // write output metadata
-        {
-            std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
-            gguf_get_meta_data(ctx_out, data.data());
-            fout.seekp(0);
-            fout.write((const char *)data.data(), data.size());
-        }
-
-        printf("%s : merged %ld tensors with lora adapters\n", __func__, n_merged);
-        printf("%s : wrote %ld tensors to output file\n", __func__, base_tensors.size());
+    size_t tell() const {
+#ifdef _WIN32
+        __int64 ret = _ftelli64(fp);
+#else
+        long ret = std::ftell(fp);
+#endif
+        GGML_ASSERT(ret != -1); // this really shouldn't fail
+        return (size_t) ret;
    }

-    void copy_tensor(struct ggml_tensor * base) {
-        printf("%s :  %s [%s]\n", __func__, base->name, ggml_ne_string(base).c_str());
-        size_t len = ggml_nbytes(base);
-        base_model.read_tensor_data(base->name, read_buf);
-        fout.write((char* )read_buf.data(), len);
-        zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len);
+    void seek(size_t offset, int whence) {
+#ifdef _WIN32
+        int ret = _fseeki64(fp, (__int64) offset, whence);
+#else
+        int ret = std::fseek(fp, (long) offset, whence);
+#endif
+        GGML_ASSERT(ret == 0); // same
    }

-    void merge_tensor(struct ggml_tensor * base) {
-        std::string name_base(base->name);
-        std::string name_lora_a = name_base + ".lora_a";
-        std::string name_lora_b = name_base + ".lora_b";
-
-        printf("%s : %s [%s]\n", __func__, base->name, ggml_ne_string(base).c_str());
-
-        // context for input tensor
-        std::vector<struct ggml_tensor *> inp_a(adapters.size());
-        std::vector<struct ggml_tensor *> inp_b(adapters.size());
-        struct ggml_init_params params {
-            /*.mem_size   =*/ ggml_tensor_overhead()*(1+adapters.size()*2),
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ true,
-        };
-        struct ggml_context * ctx = ggml_init(params);
-
-        // alloc tensors
-        struct ggml_tensor * inp = ggml_dup_tensor(ctx, base);
-        for (size_t i = 0; i < adapters.size(); ++i) {
-            auto t_a = adapters[i]->get_tensor(name_lora_a);
-            auto t_b = adapters[i]->get_tensor(name_lora_b);
-            inp_a[i] = ggml_dup_tensor(ctx, t_a);
-            inp_b[i] = ggml_dup_tensor(ctx, t_b);
+    void read_raw(void * ptr, size_t size) {
+        if (size == 0) {
+            return;
        }
-        ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
-
-        // load data to backend buffer
-        base_model.read_tensor_data(name_base, read_buf);
-        ggml_backend_tensor_set(inp, read_buf.data(), 0, ggml_nbytes(inp));
-        for (size_t i = 0; i < adapters.size(); ++i) {
-            adapters[i]->read_tensor_data(name_lora_a, read_buf);
-            ggml_backend_tensor_set(inp_a[i], read_buf.data(), 0, ggml_nbytes(inp_a[i]));
-            adapters[i]->read_tensor_data(name_lora_b, read_buf);
-            ggml_backend_tensor_set(inp_b[i], read_buf.data(), 0, ggml_nbytes(inp_b[i]));
+        errno = 0;
+        std::size_t ret = std::fread(ptr, size, 1, fp);
+        if (ferror(fp)) {
+            die_fmt("read error: %s", strerror(errno));
        }
-
-        // build graph
-        struct ggml_cgraph * gf;
-        {
-            static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
-            static std::vector<uint8_t> buf(buf_size);
-            struct ggml_init_params params0 = {
-                /*.mem_size   =*/ buf_size,
-                /*.mem_buffer =*/ buf.data(),
-                /*.no_alloc   =*/ true,
-            };
-            struct ggml_context * ctx0 = ggml_init(params0);
-            gf = ggml_new_graph(ctx0);
-            struct ggml_tensor * cur = inp;
-            for (size_t i = 0; i < adapters.size(); ++i) {
-                struct ggml_tensor * a_T = ggml_cont(ctx0, ggml_transpose(ctx0, inp_a[i]));
-                struct ggml_tensor * delta = ggml_mul_mat(ctx0, a_T, inp_b[i]);
-                // scale
-                const float alpha = adapters[i]->alpha;
-                const float rank  = (float) inp_b[i]->ne[0];
-                const float scale = alpha ? adapters[i]->scale * alpha / rank : adapters[i]->scale;
-                delta = ggml_scale(ctx0, delta, scale);
-                cur = ggml_add(ctx0, cur, delta);
-                printf("%s :   + merging from adapter[%ld]\n", __func__, i);
-                printf("%s :     input_scale=%f calculated_scale=%f rank=%d\n", __func__, adapters[i]->scale, scale, (int) inp_b[i]->ne[0]);
-            }
-            cur = ggml_cast(ctx0, cur, get_out_tensor_type(base));
-            ggml_build_forward_expand(gf, cur);
-            ggml_free(ctx0);
+        if (ret != 1) {
+            die("unexpectedly reached end of file");
        }
-
-        // compute
-        {
-            ggml_gallocr_alloc_graph(allocr, gf);
-            ggml_backend_cpu_set_n_threads(backend, n_threads);
-            ggml_backend_graph_compute(backend, gf);
-        }
-
-        // write data to output file
-        {
-            auto result = gf->nodes[gf->n_nodes - 1];
-            size_t len = ggml_nbytes(result);
-            if (read_buf.size() < len) {
-                read_buf.resize(len);
-            }
-            ggml_backend_tensor_get(result, read_buf.data(), 0, len);
-            fout.write((char* )read_buf.data(), len);
-            zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len);
-        }
-
-        ggml_free(ctx);
-        ggml_backend_buffer_free(buffer);
    }

-    ~lora_merge_ctx() {
-        ggml_gallocr_free(allocr);
-        ggml_backend_free(backend);
-        gguf_free(ctx_out);
-        ggml_free(ctx_out_ggml);
+    std::uint32_t read_u32() {
+        std::uint32_t ret;
+        read_raw(&ret, sizeof(ret));
+        return ret;
+    }
+
+    std::string read_string(std::uint32_t len) {
+        std::vector<char> chars(len);
+        read_raw(chars.data(), len);
+        return std::string(chars.data(), len);
+    }
+
+    void write_raw(const void * ptr, size_t size) {
+        if (size == 0) {
+            return;
+        }
+        errno = 0;
+        size_t ret = std::fwrite(ptr, size, 1, fp);
+        if (ret != 1) {
+            die_fmt("write error: %s", strerror(errno));
+        }
+    }
+
+    void write_u32(std::uint32_t val) {
+        write_raw(&val, sizeof(val));
+    }
+
+    bool eof() {
+        return tell() >= size;
+    }
+
+    ~llama_file() {
+        if (fp) {
+            std::fclose(fp);
+        }
    }
 };

-static void print_usage(int argc, char ** argv, const gpt_params & params) {
-    gpt_params_print_usage(argc, argv, params);
+static struct export_lora_params get_default_export_lora_params() {
+    struct export_lora_params result;
+    result.fn_model_base = "";
+    result.fn_model_out  = "";
+    result.n_threads = GGML_DEFAULT_N_THREADS;
+    return result;
+}

-    printf("\nexample usage:\n");
-    printf("\n  %s -m base-model.gguf --lora lora-file.gguf -o merged-model-f16.gguf\n", argv[0]);
-    printf("\nNOTE: output model is F16\n");
+static void export_lora_print_usage(int /*argc*/, char ** argv, const struct export_lora_params * params) {
+    fprintf(stderr, "usage: %s [options]\n", argv[0]);
+    fprintf(stderr, "\n");
+    fprintf(stderr, "options:\n");
+    fprintf(stderr, "  -h, --help                         show this help message and exit\n");
+    fprintf(stderr, "  -m FNAME, --model-base FNAME       model path from which to load base model (default '%s')\n", params->fn_model_base.c_str());
+    fprintf(stderr, "  -o FNAME, --model-out FNAME        path to save exported model (default '%s')\n", params->fn_model_out.c_str());
+    fprintf(stderr, "  -l FNAME, --lora FNAME             apply LoRA adapter\n");
+    fprintf(stderr, "  -s FNAME S, --lora-scaled FNAME S  apply LoRA adapter with user defined scaling S\n");
+    fprintf(stderr, "  -t N, --threads N                  number of threads to use during computation (default: %d)\n", params->n_threads);
+}
+
+static bool export_lora_params_parse(int argc, char ** argv, struct export_lora_params * params) {
+    bool invalid_param = false;
+    std::string arg;
+    struct export_lora_params default_params = get_default_export_lora_params();
+    const std::string arg_prefix = "--";
+
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
+            std::replace(arg.begin(), arg.end(), '_', '-');
+        }
+
+        if (arg == "-m" || arg == "--model-base") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params->fn_model_base = argv[i];
+        } else if (arg == "-o" || arg == "--model-out") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params->fn_model_out = argv[i];
+        } else if (arg == "-l" || arg == "--lora") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            struct lora_info lora;
+            lora.filename = argv[i];
+            lora.scale = 1.0f;
+            params->lora.push_back(lora);
+        } else if (arg == "-s" || arg == "--lora-scaled") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            struct lora_info lora;
+            lora.filename = argv[i];
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            lora.scale = std::stof(argv[i]);
+            params->lora.push_back(lora);
+        } else if (arg == "-t" || arg == "--threads") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params->n_threads = std::stoi(argv[i]);
+            if (params->n_threads <= 0) {
+                params->n_threads = std::thread::hardware_concurrency();
+            }
+        } else if (arg == "-h" || arg == "--help") {
+            export_lora_print_usage(argc, argv, &default_params);
+            exit(0);
+        } else {
+            fprintf(stderr, "error: unknown argument: '%s'\n", arg.c_str());
+            export_lora_print_usage(argc, argv, &default_params);
+            exit(1);
+        }
+    }
+
+    if (params->fn_model_base == default_params.fn_model_base) {
+        fprintf(stderr, "error: please specify a filename for model-base.\n");
+        export_lora_print_usage(argc, argv, &default_params);
+        exit(1);
+    }
+    if (params->fn_model_out == default_params.fn_model_out) {
+        fprintf(stderr, "error: please specify a filename for model-out.\n");
+        export_lora_print_usage(argc, argv, &default_params);
+        exit(1);
+    }
+    if (invalid_param) {
+        fprintf(stderr, "error: invalid parameter for argument: '%s'\n", arg.c_str());
+        export_lora_print_usage(argc, argv, &default_params);
+        exit(1);
+    }
+    return true;
+}
+
+static void free_lora(struct lora_data * lora) {
+    if (lora->ctx != NULL) {
+        ggml_free(lora->ctx);
+    }
+    delete lora;
+}
+
+static struct lora_data * load_lora(struct lora_info * info) {
+    struct lora_data * result = new struct lora_data;
+    result->info = *info;
+    result->ctx = NULL;
+    result->lora_r     = 1;
+    result->lora_alpha = 1;
+
+    struct llama_file file(info->filename.c_str(), "rb");
+    if (file.fp == NULL) {
+        fprintf(stderr, "warning: Could not open lora adapter '%s'. Ignoring this adapter.\n",
+            info->filename.c_str());
+        free_lora(result);
+        return NULL;
+    }
+
+    struct ggml_init_params params_ggml;
+    params_ggml.mem_size   = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE;
+    params_ggml.mem_buffer = NULL;
+    params_ggml.no_alloc   = true;
+    result->ctx = ggml_init(params_ggml);
+
+    uint32_t magic   = file.read_u32();
+    if (magic != LLAMA_FILE_MAGIC_GGLA) {
+        die_fmt("unexpected lora header file magic in '%s'", info->filename.c_str());
+    }
+    uint32_t version = file.read_u32();
+    if (version != 1) {
+        die_fmt("unexpected lora file version '%u' in '%s'", (unsigned) version, info->filename.c_str());
+    }
+    result->lora_r     = file.read_u32();
+    result->lora_alpha = file.read_u32();
+    // read tensor infos from file
+    std::vector<char> name_buf;
+    std::vector<struct ggml_tensor *> tensors;
+    std::vector<size_t> tensors_offset;
+    size_t total_nbytes_pad = 0;
+    while(!file.eof()) {
+        int64_t ne[4]   = {1,1,1,1};
+        uint32_t n_dims  = file.read_u32();
+        uint32_t namelen = file.read_u32();
+        uint32_t type    = file.read_u32();
+        for (uint32_t k = 0; k < n_dims; ++k) {
+            ne[k] = (int64_t)file.read_u32();
+        }
+        name_buf.clear();
+        name_buf.resize(namelen + 1, '\0');
+        file.read_raw(name_buf.data(), namelen);
+        file.seek((0-file.tell()) & 31, SEEK_CUR);
+        size_t offset = file.tell();
+        struct ggml_tensor * tensor = ggml_new_tensor(result->ctx, (enum ggml_type) type, n_dims, ne);
+        ggml_set_name(tensor, name_buf.data());
+        size_t nbytes     = ggml_nbytes(tensor);
+        size_t nbytes_pad = ggml_nbytes_pad(tensor);
+        total_nbytes_pad += nbytes_pad;
+        tensors.push_back(tensor);
+        tensors_offset.push_back(offset);
+        file.seek(nbytes, SEEK_CUR);
+    }
+    // read tensor data
+    result->data.resize(total_nbytes_pad);
+    size_t data_offset = 0;
+    for (size_t i = 0; i < tensors.size(); ++i) {
+        struct ggml_tensor * tensor = tensors[i];
+        size_t offset     = tensors_offset[i];
+        size_t nbytes     = ggml_nbytes(tensor);
+        size_t nbytes_pad = ggml_nbytes_pad(tensor);
+        file.seek(offset, SEEK_SET);
+        tensor->data = result->data.data() + data_offset;
+        file.read_raw(tensor->data, nbytes);
+        data_offset += nbytes_pad;
+    }
+    return result;
+}
+
+
+static struct ggml_cgraph * build_graph_lora(
+    struct ggml_context * ctx,
+    struct ggml_tensor * tensor,
+    struct ggml_tensor * lora_a,
+    struct ggml_tensor * lora_b,
+    float scaling
+) {
+    struct ggml_tensor * ab = ggml_mul_mat(ctx, lora_a, lora_b);
+    if (scaling != 1.0f) {
+        ab = ggml_scale(ctx, ab, scaling);
+    }
+    struct ggml_tensor * res = ggml_add_inplace(ctx, tensor, ab);
+
+    struct ggml_cgraph * gf = ggml_new_graph(ctx);
+    ggml_build_forward_expand (gf, res);
+    return gf;
+}
+
+static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int n_threads) {
+    if (lora->ctx == NULL) {
+        return false;
+    }
+    std::string name = ggml_get_name(tensor);
+    std::string name_a = name + std::string(".loraA");
+    std::string name_b = name + std::string(".loraB");
+    struct ggml_tensor * lora_a = ggml_get_tensor(lora->ctx, name_a.c_str());
+    struct ggml_tensor * lora_b = ggml_get_tensor(lora->ctx, name_b.c_str());
+    if (lora_a == NULL || lora_b == NULL) {
+        return false;
+    }
+
+    float scaling = lora->info.scale * (float)lora->lora_alpha / (float)lora->lora_r;
+
+    struct ggml_init_params params;
+    params.mem_size   = GGML_OBJECT_SIZE + ggml_graph_overhead() + ggml_tensor_overhead()*4 + GGML_MEM_ALIGN*5;
+    params.mem_buffer = NULL;
+    params.no_alloc   = true;
+    struct ggml_context * ctx = NULL;
+    struct ggml_gallocr * alloc = NULL;
+    struct ggml_cgraph  * gf = NULL;
+
+    ctx   = ggml_init(params);
+    alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
+    gf    = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling);
+
+    ggml_gallocr_alloc_graph(alloc, gf);
+
+    struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads);
+    static std::vector<uint8_t> data_work;
+    data_work.resize(cplan.work_size);
+    cplan.work_data = data_work.data();
+
+    ggml_graph_compute(gf, &cplan);
+
+    ggml_gallocr_free(alloc);
+    ggml_free(ctx);
+    return true;
+}
+
+static void export_lora(struct export_lora_params * params) {
+    // load all loras
+    std::vector<struct lora_data *> loras;
+    for (size_t i = 0; i < params->lora.size(); ++i) {
+        struct lora_data * lora = load_lora(&params->lora[i]);
+        if (lora != NULL) {
+            loras.push_back(lora);
+        }
+    }
+    if (loras.size() == 0) {
+        fprintf(stderr, "warning: no lora adapters will be applied.\n");
+    }
+
+    // open input file
+    struct llama_file fin(params->fn_model_base.c_str(), "rb");
+    if (!fin.fp) {
+        die_fmt("Could not open file '%s'\n", params->fn_model_base.c_str());
+    }
+
+    // open base model gguf, read tensors without their data
+    struct ggml_context * ctx_in;
+    struct gguf_init_params params_gguf;
+    params_gguf.no_alloc = true;
+    params_gguf.ctx      = &ctx_in;
+    struct gguf_context * gguf_in = gguf_init_from_file(params->fn_model_base.c_str(), params_gguf);
+
+    // create new gguf
+    struct gguf_context * gguf_out = gguf_init_empty();
+
+    // copy meta data from base model: kv and tensors
+    gguf_set_kv(gguf_out, gguf_in);
+    int n_tensors = gguf_get_n_tensors(gguf_in);
+    for (int i=0; i < n_tensors; ++i) {
+        const char * name = gguf_get_tensor_name(gguf_in, i);
+        struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name);
+        gguf_add_tensor(gguf_out, tensor);
+    }
+
+    // create output file
+    struct llama_file fout(params->fn_model_out.c_str(), "wb");
+    if (!fout.fp) {
+        die_fmt("Could not create file '%s'\n", params->fn_model_out.c_str());
+    }
+
+    // write gguf meta data
+    std::vector<uint8_t> meta;
+    meta.resize(gguf_get_meta_size(gguf_out));
+    gguf_get_meta_data(gguf_out, meta.data());
+    fout.write_raw(meta.data(), meta.size());
+
+    std::vector<uint8_t> data;
+    std::vector<uint8_t> padding;
+    for (int i=0; i < n_tensors; ++i) {
+        const char * name = gguf_get_tensor_name(gguf_in, i);
+        struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name);
+
+        // read tensor data
+        data.resize(ggml_nbytes(tensor));
+        tensor->data = data.data();
+        size_t offset = gguf_get_tensor_offset(gguf_in, i);
+        fin.seek(offset + meta.size(), SEEK_SET);
+        fin.read_raw(data.data(), data.size());
+
+        // apply all loras
+        for (size_t k = 0; k < loras.size(); ++k) {
+            apply_lora(tensor, loras[k], params->n_threads);
+        }
+
+        // write tensor data + padding
+        padding.clear();
+        padding.resize(GGML_PAD(data.size(), gguf_get_alignment(gguf_out)) - data.size(), 0);
+
+        GGML_ASSERT(fout.tell() == offset + meta.size());
+        // fout.seek(offset + meta.size(), SEEK_SET);
+        fout.write_raw(data.data(), data.size());
+        fout.write_raw(padding.data(), padding.size());
+
+        if (i % 2 == 0) {
+            printf(".");
+        }
+    }
    printf("\n");
+
+    // close gguf
+    gguf_free(gguf_out);
+    gguf_free(gguf_in);
+
+    // free loras
+    for (size_t i = 0; i < loras.size(); ++i) {
+        free_lora(loras[i]);
+    }
 }

 int main(int argc, char ** argv) {
-    gpt_params params;
+    struct export_lora_params params = get_default_export_lora_params();

-    if (!gpt_params_parse(argc, argv, params)) {
-        print_usage(argc, argv, params);
+    if (!export_lora_params_parse(argc, argv, &params)) {
        return 1;
    }

-    g_verbose = (params.verbosity == 1);
-    try {
-        lora_merge_ctx ctx(params.model, params.lora_adapter, params.lora_outfile, params.n_threads);
-        ctx.run_merge();
-    } catch (const std::exception & err) {
-        fprintf(stderr, "%s\n", err.what());
-        exit(EXIT_FAILURE);
-    }
-
-    printf("done, output file is %s\n", params.lora_outfile.c_str());
+    export_lora(&params);

    return 0;
 }
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -124,7 +124,6 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector<l
    auto formatted = llama_chat_format_single(
        model, g_params->chat_template, chat_msgs, new_msg, role == "user");
    chat_msgs.push_back({role, content});
-    LOG("formatted: %s\n", formatted.c_str());
    return formatted;
 }

--- a/examples/server/public/index-new.html
+++ b/examples/server/public/index-new.html
@@ -225,7 +225,7 @@
        throw new Error("already running");
      }
      controller.value = new AbortController();
-      for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: new URL('.', document.baseURI).href })) {
+      for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: URL.parse('.', document.baseURI).href })) {
        const data = chunk.data;
        if (data.stop) {
          while (
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@@ -479,7 +479,7 @@
        throw new Error("already running");
      }
      controller.value = new AbortController();
-      for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: new URL('.', document.baseURI).href })) {
+      for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: URL.parse('.', document.baseURI).href })) {
        const data = chunk.data;

        if (data.stop) {
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -467,19 +467,16 @@ if (GGML_SYCL)
        message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL or NVIDIA")
    endif()

-    check_cxx_compiler_flag("-fsycl" SUPPORTS_SYCL)
-    if ( DEFINED ENV{ONEAPI_ROOT})
-        message(STATUS "Using oneAPI Release SYCL compiler (icpx).")
-    elseif(SUPPORTS_SYCL)
-        message(WARNING "Using open-source SYCL compiler (clang++). Didn't detect ENV {ONEAPI_ROOT}.
-         If you expected the oneAPI Release compiler, please install oneAPI & source it, like:
-         source /opt/intel/oneapi/setvars.sh")
-    else()
-        message(FATAL_ERROR, "C++ compiler lacks SYCL support.")
+    if ( NOT DEFINED ENV{ONEAPI_ROOT})
+        message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh")
    endif()
-    message(STATUS "SYCL found")
    #todo: AOT

+    find_package(IntelSYCL REQUIRED)
+    find_package(MKL REQUIRED)
+
+    message(STATUS "SYCL found")
+
    list(APPEND GGML_CDEF_PUBLIC GGML_USE_SYCL)

    if (GGML_SYCL_F16)
@@ -490,9 +487,11 @@ if (GGML_SYCL)
        add_compile_definitions(GGML_SYCL_FORCE_MMQ)
    endif()

-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing -fsycl")
+    add_compile_options(-I./) #include DPCT

+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
    if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
        add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
    else()
        add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
@@ -505,14 +504,14 @@ if (GGML_SYCL)
    list(APPEND GGML_SOURCES_SYCL "ggml-sycl.cpp")

    if (WIN32)
-        find_package(IntelSYCL REQUIRED)
-        find_package(MKL REQUIRED)
        set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
    else()
+        add_compile_options(-I/${SYCL_INCLUDE_DIR})
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
+
        if (GGML_SYCL_TARGET STREQUAL "INTEL")
            set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
        elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
-            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
            set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl pthread m dl onemkl)
        endif()
    endif()
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -459,7 +459,7 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half

 static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2)
+#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
    c = __builtin_amdgcn_sdot4(a, b, c, false);
 #elif defined(RDNA3)
    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
--- a/ggml/src/ggml-sycl/softmax.cpp
+++ b/ggml/src/ggml-sycl/softmax.cpp
@@ -152,8 +152,7 @@ static void soft_max_f32_sycl(const float * x, const float * mask,

    const sycl::range<3> block_dims(1, 1, nth);
    const sycl::range<3> block_nums(1, 1, nrows_x);
-    const size_t n_val_tmp = nth / WARP_SIZE;
-    const size_t n_local_scratch = (GGML_PAD(ncols_x, WARP_SIZE) + n_val_tmp);
+    const size_t n_local_scratch = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE);

    const uint32_t n_head_kv   = nrows_x/nrows_y;
    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
--- a/ggml/src/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan.cpp
@@ -38,6 +38,8 @@
 #define VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI 1
 #define VK_DEVICE_DESCRIPTOR_POOL_MODE_SINGLE 2

+#define VK_NUM_TYPES 16
+
 #define GGML_VK_MAX_NODES 8192

 #define MAX_VK_BUFFERS 256
@@ -160,23 +162,23 @@ struct vk_device_struct {
    vk_matmul_pipeline pipeline_matmul_f16_f32;
    vk_pipeline pipeline_matmul_split_k_reduce;

-    vk_matmul_pipeline pipeline_dequant_mul_mat_mat[GGML_TYPE_COUNT];
+    vk_matmul_pipeline pipeline_dequant_mul_mat_mat[VK_NUM_TYPES];

    vk_matmul_pipeline pipeline_matmul_id_f32;
    vk_matmul_pipeline pipeline_matmul_id_f16;
    vk_matmul_pipeline pipeline_matmul_id_f16_f32;

-    vk_matmul_pipeline pipeline_dequant_mul_mat_mat_id[GGML_TYPE_COUNT];
+    vk_matmul_pipeline pipeline_dequant_mul_mat_mat_id[VK_NUM_TYPES];

-    vk_pipeline pipeline_dequant[GGML_TYPE_COUNT];
-    vk_pipeline pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_COUNT];
-    vk_pipeline pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_COUNT];
-    vk_pipeline pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_COUNT];
+    vk_pipeline pipeline_dequant[VK_NUM_TYPES];
+    vk_pipeline pipeline_dequant_mul_mat_vec_f32_f32[VK_NUM_TYPES];
+    vk_pipeline pipeline_dequant_mul_mat_vec_f16_f32[VK_NUM_TYPES];
+    vk_pipeline pipeline_dequant_mul_mat_vec_id_f32[VK_NUM_TYPES];

    vk_pipeline pipeline_mul_mat_vec_p021_f16_f32;
    vk_pipeline pipeline_mul_mat_vec_nc_f16_f32;
-    vk_pipeline pipeline_get_rows[GGML_TYPE_COUNT];
-    vk_pipeline pipeline_get_rows_f32[GGML_TYPE_COUNT];
+    vk_pipeline pipeline_get_rows[VK_NUM_TYPES];
+    vk_pipeline pipeline_get_rows_f32[VK_NUM_TYPES];
    vk_pipeline pipeline_mul_f32;
    vk_pipeline pipeline_div_f32;
    vk_pipeline pipeline_add_f32;
@@ -1057,6 +1059,25 @@ static void ggml_vk_wait_events(vk_context * ctx, std::vector<vk::Event>&& event
    );
 }

+static bool ggml_vk_build_shader(ggml_type type) {
+    switch(type) {
+    case GGML_TYPE_F16:
+    case GGML_TYPE_Q4_0:
+    case GGML_TYPE_Q4_1:
+    case GGML_TYPE_Q5_0:
+    case GGML_TYPE_Q5_1:
+    case GGML_TYPE_Q8_0:
+    case GGML_TYPE_Q2_K:
+    case GGML_TYPE_Q3_K:
+    case GGML_TYPE_Q4_K:
+    case GGML_TYPE_Q5_K:
+    case GGML_TYPE_Q6_K:
+        return true;
+    default:
+        return false;
+    }
+}
+
 static void ggml_vk_load_shaders(vk_device& device) {
    VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")");

@@ -1091,7 +1112,6 @@ static void ggml_vk_load_shaders(vk_device& device) {
    device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K] = std::make_shared<vk_matmul_pipeline_struct>();
    device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K] = std::make_shared<vk_matmul_pipeline_struct>();
    device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();
-    device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL] = std::make_shared<vk_matmul_pipeline_struct>();

    device->pipeline_matmul_id_f32 = std::make_shared<vk_matmul_pipeline_struct>();
    device->pipeline_matmul_id_f16_f32 = std::make_shared<vk_matmul_pipeline_struct>();
@@ -1106,7 +1126,6 @@ static void ggml_vk_load_shaders(vk_device& device) {
    device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K] = std::make_shared<vk_matmul_pipeline_struct>();
    device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K] = std::make_shared<vk_matmul_pipeline_struct>();
    device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();
-    device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL] = std::make_shared<vk_matmul_pipeline_struct>();

    if (device->fp16) {
        ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
@@ -1207,13 +1226,6 @@ static void ggml_vk_load_shaders(vk_device& device) {
        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);

-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->l, "matmul_iq4_nl_f32_l", matmul_iq4_nl_f32_len, matmul_iq4_nl_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->m, "matmul_iq4_nl_f32_m", matmul_iq4_nl_f32_len, matmul_iq4_nl_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->s, "matmul_iq4_nl_f32_s", matmul_iq4_nl_f32_len, matmul_iq4_nl_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_l, "matmul_iq4_nl_f32_aligned_l", matmul_iq4_nl_f32_aligned_len, matmul_iq4_nl_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_m, "matmul_iq4_nl_f32_aligned_m", matmul_iq4_nl_f32_aligned_len, matmul_iq4_nl_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_s, "matmul_iq4_nl_f32_aligned_s", matmul_iq4_nl_f32_aligned_len, matmul_iq4_nl_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-
        ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
        ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
        ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
@@ -1304,13 +1316,6 @@ static void ggml_vk_load_shaders(vk_device& device) {
        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_l, "matmul_id_q6_k_f32_aligned_l", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_m, "matmul_id_q6_k_f32_aligned_m", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_s, "matmul_id_q6_k_f32_aligned_s", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->l, "matmul_id_iq4_nl_f32_l", matmul_id_iq4_nl_f32_len, matmul_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->m, "matmul_id_iq4_nl_f32_m", matmul_id_iq4_nl_f32_len, matmul_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->s, "matmul_id_iq4_nl_f32_s", matmul_id_iq4_nl_f32_len, matmul_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_l, "matmul_id_iq4_nl_f32_aligned_l", matmul_id_iq4_nl_f32_aligned_len, matmul_id_iq4_nl_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_m, "matmul_id_iq4_nl_f32_aligned_m", matmul_id_iq4_nl_f32_aligned_len, matmul_id_iq4_nl_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_s, "matmul_id_iq4_nl_f32_aligned_s", matmul_id_iq4_nl_f32_aligned_len, matmul_id_iq4_nl_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
    } else {
        ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
        ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
@@ -1410,13 +1415,6 @@ static void ggml_vk_load_shaders(vk_device& device) {
        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);

-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->l, "matmul_iq4_nl_f32_l", matmul_iq4_nl_f32_fp32_len, matmul_iq4_nl_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->m, "matmul_iq4_nl_f32_m", matmul_iq4_nl_f32_fp32_len, matmul_iq4_nl_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->s, "matmul_iq4_nl_f32_s", matmul_iq4_nl_f32_fp32_len, matmul_iq4_nl_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_l, "matmul_iq4_nl_f32_aligned_l", matmul_iq4_nl_f32_aligned_fp32_len, matmul_iq4_nl_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_m, "matmul_iq4_nl_f32_aligned_m", matmul_iq4_nl_f32_aligned_fp32_len, matmul_iq4_nl_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_s, "matmul_iq4_nl_f32_aligned_s", matmul_iq4_nl_f32_aligned_fp32_len, matmul_iq4_nl_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-
        ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
        ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
        ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
@@ -1507,13 +1505,6 @@ static void ggml_vk_load_shaders(vk_device& device) {
        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_l, "matmul_id_q6_k_f32_aligned_l", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_m, "matmul_id_q6_k_f32_aligned_m", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_s, "matmul_id_q6_k_f32_aligned_s", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->l, "matmul_id_iq4_nl_f32_l", matmul_id_iq4_nl_f32_fp32_len, matmul_id_iq4_nl_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->m, "matmul_id_iq4_nl_f32_m", matmul_id_iq4_nl_f32_fp32_len, matmul_id_iq4_nl_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->s, "matmul_id_iq4_nl_f32_s", matmul_id_iq4_nl_f32_fp32_len, matmul_id_iq4_nl_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_l, "matmul_id_iq4_nl_f32_aligned_l", matmul_id_iq4_nl_f32_aligned_fp32_len, matmul_id_iq4_nl_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_m, "matmul_id_iq4_nl_f32_aligned_m", matmul_id_iq4_nl_f32_aligned_fp32_len, matmul_id_iq4_nl_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_s, "matmul_id_iq4_nl_f32_aligned_s", matmul_id_iq4_nl_f32_aligned_fp32_len, matmul_id_iq4_nl_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
    }

    // mul mat vec
@@ -1529,7 +1520,6 @@ static void ggml_vk_load_shaders(vk_device& device) {
    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);

    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32",  mul_mat_vec_f32_f16_f32_len,  mul_mat_vec_f32_f16_f32_data,  "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32",  mul_mat_vec_f16_f16_f32_len,  mul_mat_vec_f16_f16_f32_data,  "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
@@ -1543,7 +1533,6 @@ static void ggml_vk_load_shaders(vk_device& device) {
    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);

    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32",  mul_mat_vec_id_f32_f32_len,  mul_mat_vec_id_f32_f32_data,  "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32",  mul_mat_vec_id_f16_f32_len,  mul_mat_vec_id_f16_f32_data,  "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
@@ -1557,7 +1546,6 @@ static void ggml_vk_load_shaders(vk_device& device) {
    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);

    // dequant shaders
    ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16",   dequant_f32_len,  dequant_f32_data,  "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
@@ -1571,7 +1559,6 @@ static void ggml_vk_load_shaders(vk_device& device) {
    ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q4_K], "dequant_q4_k", dequant_q4_k_len, dequant_q4_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q5_K], "dequant_q5_k", dequant_q5_k_len, dequant_q5_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_k", dequant_q6_k_len, dequant_q6_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ4_NL], "dequant_iq4_nl", dequant_iq4_nl_len, dequant_iq4_nl_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);

    // get_rows
    ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_F32 ], "get_rows_f32",  get_rows_f32_len,  get_rows_f32_data,  "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
@@ -1581,7 +1568,6 @@ static void ggml_vk_load_shaders(vk_device& device) {
    ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q5_1], "get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q8_0], "get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl", get_rows_iq4_nl_len, get_rows_iq4_nl_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);

    ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32",  get_rows_f32_f32_len,  get_rows_f32_f32_data,  "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F16 ], "get_rows_f16_f32",  get_rows_f16_f32_len,  get_rows_f16_f32_data,  "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
@@ -1590,7 +1576,6 @@ static void ggml_vk_load_shaders(vk_device& device) {
    ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q5_1], "get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl_f32", get_rows_iq4_nl_f32_len, get_rows_iq4_nl_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);

    ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256, 1, 1}, {}, 1);

@@ -2102,7 +2087,6 @@ static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ4_NL:
            break;
        default:
            return nullptr;
@@ -2139,7 +2123,6 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ4_NL:
            break;
        default:
            return nullptr;
@@ -2165,7 +2148,6 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context *
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ4_NL:
            break;
        default:
            return nullptr;
@@ -2199,7 +2181,6 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_co
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ4_NL:
            break;
        default:
            return nullptr;
@@ -2225,7 +2206,6 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ4_NL:
            break;
        default:
            return nullptr;
@@ -3451,7 +3431,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *

    const uint64_t nei0 = ids->ne[0];
    const uint64_t nei1 = ids->ne[1];
-    GGML_ASSERT(nei0 * nei1 <= 3072);
+    GGML_ASSERT(nei0 * nei1 <= 2048);

    const uint32_t nbi1 = ids->nb[1];
    const uint32_t nbi2 = ids->nb[2];
@@ -3463,6 +3443,8 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *

    const uint64_t n_as = ne02;

+    GGML_ASSERT(n_as <= 8);
+
    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
    ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
    ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
@@ -4641,22 +4623,22 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
        }
    }

-    ggml_pipeline_allocate_descriptor_sets(ctx->device, p, num_it);
+    ggml_pipeline_allocate_descriptor_sets(ctx, p, num_it);
    if (split_k > 1) {
-        ggml_pipeline_allocate_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, num_it);
+        ggml_pipeline_allocate_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it);

        if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
            // Resize buffer
            if (ctx->prealloc_split_k != nullptr) {
                ggml_vk_destroy_buffer(ctx->prealloc_split_k);
            }
-            ctx->prealloc_split_k = ggml_vk_create_buffer_check(ctx->device, sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal);
+            ctx->prealloc_split_k = ggml_vk_create_buffer_check(ctx, sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal);
        }
    }

-    vk_buffer d_X = ggml_vk_create_buffer_check(ctx->device, sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
-    vk_buffer d_Y = ggml_vk_create_buffer_check(ctx->device, sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
-    vk_buffer d_D = ggml_vk_create_buffer_check(ctx->device, sizeof(float) * d_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
+    vk_buffer d_X = ggml_vk_create_buffer_check(ctx, sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
+    vk_buffer d_Y = ggml_vk_create_buffer_check(ctx, sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
+    vk_buffer d_D = ggml_vk_create_buffer_check(ctx, sizeof(float) * d_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);

    X_TYPE* x = (X_TYPE *) malloc(sizeof(X_TYPE) * x_ne);
    Y_TYPE* y = (Y_TYPE *) malloc(sizeof(Y_TYPE) * y_ne);
@@ -4683,12 +4665,12 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
        }
    }

-    ggml_vk_buffer_write(d_X, 0, x, sizeof(X_TYPE) * k * m * batch);
-    ggml_vk_buffer_write(d_Y, 0, y, sizeof(Y_TYPE) * k * n * batch);
+    ggml_vk_buffer_write(ctx, d_X, 0, x, sizeof(X_TYPE) * k * m * batch);
+    ggml_vk_buffer_write(ctx, d_Y, 0, y, sizeof(Y_TYPE) * k * n * batch);

    vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
    for (size_t i = 0; i < num_it; i++) {
-        ggml_vk_ctx_begin(ctx->device, subctx);
+        ggml_vk_ctx_begin(ctx, subctx);
        ggml_vk_matmul(
            ctx, subctx, p, ggml_vk_subbuffer(d_X), ggml_vk_subbuffer(d_Y), ggml_vk_subbuffer(d_D), ggml_vk_subbuffer(ctx->prealloc_split_k),
            m, n, k,
@@ -4707,7 +4689,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
    double time = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;

    // copy dst to host
-    ggml_vk_buffer_read(d_D, 0, d, sizeof(float) * d_ne);
+    ggml_vk_buffer_read(ctx, d_D, 0, d, sizeof(float) * d_ne);

    float * d_chk = (float *) malloc(sizeof(float) * d_ne);

@@ -4783,7 +4765,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t

        if (split_k > 1) {
            float * split_k_buf = (float *) malloc(sizeof(float) * d_ne * split_k);
-            ggml_vk_buffer_read(ctx->prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k);
+            ggml_vk_buffer_read(ctx, ctx->prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k);

            std::cerr << "d_buf0: " << std::endl << std::endl;
            ggml_vk_print_matrix_area(split_k_buf, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
@@ -4803,8 +4785,8 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t

    free(d_chk);

-    ggml_vk_queue_cleanup(ctx->device, ctx->device->transfer_queue);
-    ggml_vk_queue_cleanup(ctx->device, ctx->device->compute_queue);
+    ggml_vk_queue_cleanup(ctx, ctx->device->transfer_queue);
+    ggml_vk_queue_cleanup(ctx, ctx->device->compute_queue);

    ggml_vk_destroy_buffer(d_X);
    ggml_vk_destroy_buffer(d_Y);
@@ -4852,21 +4834,88 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, int i0, int i1
    }
 }

-static void ggml_vk_quantize_data(const float * from, void * to, size_t ne, ggml_type quant) {
-    ggml_quantize_chunk(quant, from, to, 0, 1, ne, nullptr);
-}
+static void ggml_vk_test_transfer(ggml_backend_vk_context * ctx, size_t ne, bool pinned) {
+    VK_LOG_DEBUG("ggml_vk_test_transfer(" << ne << ")");
+    // Check transfers are correct
+    vk_buffer buffer = ggml_vk_create_buffer_check(ctx, sizeof(float) * ne, vk::MemoryPropertyFlagBits::eDeviceLocal);

-static void ggml_vk_dequantize_data(const void * from, float * to, size_t ne, ggml_type quant) {
-    if (quant == GGML_TYPE_F32) {
-        memcpy(to, from, sizeof(float) * ne);
-        return;
+    float * x;
+    float * y;
+    if (pinned) {
+        x = (float *) ggml_vk_host_malloc(ctx, sizeof(float) * ne);
+        y = (float *) ggml_vk_host_malloc(ctx, sizeof(float) * ne);
+    } else {
+        x = (float *) malloc(sizeof(float) * ne);
+        y = (float *) malloc(sizeof(float) * ne);
    }

-    ggml_type_traits_t tt = ggml_internal_get_type_traits(quant);
+    for (size_t i = 0; i < ne; i++) {
+        x[i] = rand() / (float)RAND_MAX;
+    }

-    ggml_to_float_t dequant_fn = tt.to_float;
+    vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
+    ggml_vk_ctx_begin(ctx, subctx);

-    dequant_fn(from, to, ne);
+    auto begin = std::chrono::high_resolution_clock::now();
+
+    ggml_vk_buffer_write_async(ctx, subctx, buffer, 0, x, sizeof(float) * ne);
+
+    for (auto& cpy : subctx->in_memcpys) {
+        memcpy(cpy.dst, cpy.src, cpy.n);
+    }
+    subctx->in_memcpys.clear();
+
+    ggml_vk_ctx_end(subctx);
+    ggml_vk_submit(subctx, ctx->fence);
+    VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_transfer waitForFences");
+    ctx->device->device.resetFences({ ctx->fence });
+
+    auto end = std::chrono::high_resolution_clock::now();
+
+    double ms_to_gpu = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
+
+    ggml_vk_ctx_begin(ctx, subctx);
+
+    begin = std::chrono::high_resolution_clock::now();
+
+    ggml_vk_buffer_read_async(ctx, subctx, buffer, 0, y, sizeof(float) * ne);
+
+    ggml_vk_ctx_end(subctx);
+    ggml_vk_submit(subctx, ctx->fence);
+    VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_transfer waitForFences");
+    ctx->device->device.resetFences({ ctx->fence });
+
+    for (auto& cpy : subctx->out_memcpys) {
+        memcpy(cpy.dst, cpy.src, cpy.n);
+    }
+    subctx->out_memcpys.clear();
+
+    end = std::chrono::high_resolution_clock::now();
+
+    double ms_from_gpu = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
+
+    double avg_err = 0.0;
+    for (size_t i = 0; i < ne; i++) {
+        avg_err += std::fabs(x[i] - y[i]);
+    }
+
+    double kb = ne * sizeof(float) / 1024.0;
+
+    std::cerr << "TEST TRANSFER " << kb << " KB to_gpu " << ms_to_gpu << "ms (" << kb / ms_to_gpu * 1000.0 / 1024.0 << " MB/s) from_gpu " << ms_from_gpu << "ms (" << kb / ms_from_gpu * 1000.0 / 1024.0 << " MB/s) avg_err=" << avg_err / ne << std::endl;
+
+    ggml_vk_destroy_buffer(buffer);
+
+    if (pinned) {
+        ggml_vk_host_free(ctx, x);
+        ggml_vk_host_free(ctx, y);
+    } else {
+        free(x);
+        free(y);
+    }
+}
+
+static void ggml_vk_quantize_data(const float * from, void * to, size_t ne, ggml_type quant) {
+    ggml_quantize_chunk(quant, from, to, 0, 1, ne, nullptr);
 }

 static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) {
@@ -4876,26 +4925,24 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
    const size_t qx_sz = ne * ggml_type_size(quant)/ggml_blck_size(quant);
    float * x = (float *) malloc(x_sz);
    void * qx = malloc(qx_sz);
-    vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx->device, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
-    vk_buffer x_buf = ggml_vk_create_buffer_check(ctx->device, x_sz_f16, vk::MemoryPropertyFlagBits::eDeviceLocal);
-    float * x_ref = (float *) malloc(x_sz);
+    vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
+    vk_buffer x_buf = ggml_vk_create_buffer_check(ctx, x_sz_f16, vk::MemoryPropertyFlagBits::eDeviceLocal);
    ggml_fp16_t * x_chk = (ggml_fp16_t *) malloc(x_sz_f16);

    for (size_t i = 0; i < ne; i++) {
        x[i] = rand() / (float)RAND_MAX;
    }

-    vk_pipeline p = ggml_vk_get_to_fp16(ctx, quant);
+    vk_pipeline p = ctx->device->pipeline_dequant[quant];

    ggml_vk_quantize_data(x, qx, ne, quant);
-    ggml_vk_dequantize_data(qx, x_ref, ne, quant);

-    ggml_pipeline_allocate_descriptor_sets(ctx->device, p, 1);
+    ggml_pipeline_allocate_descriptor_sets(ctx, p, 1);

-    ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
+    ggml_vk_buffer_write(ctx, qx_buf, 0, qx, qx_sz);

    vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
-    ggml_vk_ctx_begin(ctx->device, subctx);
+    ggml_vk_ctx_begin(ctx, subctx);
    const std::vector<uint32_t> pc = { 1, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne };
    ggml_vk_dispatch_pipeline(ctx, subctx, p, { { qx_buf, 0, qx_sz }, { x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)ne, 1, 1});
    ggml_vk_ctx_end(subctx);
@@ -4909,13 +4956,13 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
    auto end = std::chrono::high_resolution_clock::now();

    double ms_dequant = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
-    ggml_vk_buffer_read(x_buf, 0, x_chk, x_sz_f16);
+    ggml_vk_buffer_read(ctx, x_buf, 0, x_chk, x_sz_f16);

    int first_err = -1;

    double avg_err = 0.0;
    for (size_t i = 0; i < ne; i++) {
-        double error = std::fabs(x_ref[i] - ggml_fp16_to_fp32(x_chk[i]));
+        double error = std::fabs(x[i] - ggml_fp16_to_fp32(x_chk[i]));
        avg_err += error;

        if (first_err < 0 && error > 0.05) {
@@ -4935,7 +4982,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
        }
        std::cerr << std::endl << "Expected result: " << std::endl << std::endl;
        for (int i = std::max(0, first_err - 5); i < std::min((int)ne, first_err + 5); i++) {
-            std::cerr << x_ref[i] << ", ";
+            std::cerr << x[i] << ", ";
        }
        std::cerr << std::endl;
    }
@@ -4945,7 +4992,6 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_

    free(x);
    free(qx);
-    free(x_ref);
    free(x_chk);
 }

@@ -4994,9 +5040,9 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
    float * x = (float *) malloc(x_sz);
    float * y = (float *) malloc(y_sz);
    void * qx = malloc(qx_sz);
-    vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx->device, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
-    vk_buffer y_buf = ggml_vk_create_buffer_check(ctx->device, y_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
-    vk_buffer d_buf = ggml_vk_create_buffer_check(ctx->device, d_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
+    vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
+    vk_buffer y_buf = ggml_vk_create_buffer_check(ctx, y_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
+    vk_buffer d_buf = ggml_vk_create_buffer_check(ctx, d_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
    float * d = (float *) malloc(d_sz);
    float * d_chk = (float *) malloc(d_sz);

@@ -5011,25 +5057,25 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
        y[i] = (i % k == i / k) ? 1.0f : 0.0f;
    }

-    ggml_pipeline_allocate_descriptor_sets(ctx->device, p, num_it);
+    ggml_pipeline_allocate_descriptor_sets(ctx, p, num_it);
    if (split_k > 1) {
-        ggml_pipeline_allocate_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, num_it);
+        ggml_pipeline_allocate_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it);

        if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
            // Resize buffer
            if (ctx->prealloc_split_k != nullptr) {
                ggml_vk_destroy_buffer(ctx->prealloc_split_k);
            }
-            ctx->prealloc_split_k = ggml_vk_create_buffer_check(ctx->device, sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal);
+            ctx->prealloc_split_k = ggml_vk_create_buffer_check(ctx, sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal);
        }
    }

-    ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
-    ggml_vk_buffer_write(y_buf, 0, y, y_sz);
+    ggml_vk_buffer_write(ctx, qx_buf, 0, qx, qx_sz);
+    ggml_vk_buffer_write(ctx, y_buf, 0, y, y_sz);

    vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
    for (size_t i = 0; i < num_it; i++) {
-        ggml_vk_ctx_begin(ctx->device, subctx);
+        ggml_vk_ctx_begin(ctx, subctx);
        ggml_vk_matmul(
            ctx, subctx, p, ggml_vk_subbuffer(qx_buf), ggml_vk_subbuffer(y_buf), ggml_vk_subbuffer(d_buf), ggml_vk_subbuffer(ctx->prealloc_split_k),
            m, n, k,
@@ -5048,7 +5094,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
    auto end = std::chrono::high_resolution_clock::now();

    double time_ms = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
-    ggml_vk_buffer_read(d_buf, 0, d, d_sz);
+    ggml_vk_buffer_read(ctx, d_buf, 0, d, d_sz);

    ggml_init_params iparams = {
        /*.mem_size   =*/ 1024*1024*1024,
@@ -5103,7 +5149,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,

        if (split_k > 1) {
            float * split_k_buf = (float *) malloc(sizeof(float) * d_ne * split_k);
-            ggml_vk_buffer_read(ctx->prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k);
+            ggml_vk_buffer_read(ctx, ctx->prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k);

            std::cerr << "d_buf0: " << std::endl << std::endl;
            ggml_vk_print_matrix_area(split_k_buf, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
@@ -5256,9 +5302,12 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm

 static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
 #if defined(GGML_VULKAN_RUN_TESTS)
-    ctx->staging = ggml_vk_create_buffer_check(ctx->device, 100ul * 1024ul * 1024ul,
+    ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul,
        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
+    ggml_vk_test_transfer(ctx, 8192 * 1000, false);
+    ggml_vk_test_transfer(ctx, 8192 * 1000, true);
+
    ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_F32);
    ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_0);
    ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_1);
@@ -5270,90 +5319,85 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
    ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_K);
    ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q5_K);
    ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q6_K);
-    ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_IQ4_NL);

    ggml_vk_test_matmul<ggml_fp16_t, ggml_fp16_t>(ctx, 512, 512, 100, 32, 100, 1, 2);

    ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 0);
    ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 1);
    ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 2);
-    // ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 0);
-    // ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 1);
-    // ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 2);
+    ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 0);
+    ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 1);
+    ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 2);

    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_0);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_0);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_0);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_0);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_0);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_0);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_0);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_0);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_0);

    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_1);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_1);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_1);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_1);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_1);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_1);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_1);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_1);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_1);

    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_0);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_0);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_0);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_0);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_0);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_0);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_0);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_0);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_0);

    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_1);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_1);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_1);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_1);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_1);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_1);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_1);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_1);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_1);

    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q8_0);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q8_0);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q8_0);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q8_0);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q8_0);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q8_0);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q8_0);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q8_0);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q8_0);

    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q2_K);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q2_K);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q2_K);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q2_K);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q2_K);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q2_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q2_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q2_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q2_K);

    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q3_K);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q3_K);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q3_K);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q3_K);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q3_K);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q3_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q3_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q3_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q3_K);

    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_K);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_K);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_K);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_K);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_K);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_K);

    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_K);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_K);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_K);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_K);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_K);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_K);

    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q6_K);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q6_K);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q6_K);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q6_K);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q6_K);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q6_K);
-
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_IQ4_NL);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_IQ4_NL);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_IQ4_NL);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q6_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q6_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q6_K);

    std::cerr << std::endl;

@@ -5385,9 +5429,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
        ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 0);
        ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 1);
        ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 2);
-        // ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 0);
-        // ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 1);
-        // ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 2);
+        ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 0);
+        ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 1);
+        ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 2);
        std::cerr << std::endl;
    }

@@ -6219,7 +6263,6 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
                    case GGML_TYPE_Q4_K:
                    case GGML_TYPE_Q5_K:
                    case GGML_TYPE_Q6_K:
-                    case GGML_TYPE_IQ4_NL:
                        break;
                    default:
                        return false;
@@ -6248,7 +6291,6 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
                    case GGML_TYPE_Q5_0:
                    case GGML_TYPE_Q5_1:
                    case GGML_TYPE_Q8_0:
-                    case GGML_TYPE_IQ4_NL:
                        return true;
                    default:
                        return false;
--- a/ggml/src/vulkan-shaders/dequant_funcs.comp
+++ b/ggml/src/vulkan-shaders/dequant_funcs.comp
@@ -58,11 +58,3 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) {
    return vec2(int(data_a[a_offset + ib].qs[iqs]), int(data_a[a_offset + ib].qs[iqs + 1])) * d;
 }
 #endif
-
-#if defined(DATA_A_IQ4_NL)
-vec2 dequantize(uint ib, uint iqs, uint a_offset) {
-    const float d = float(data_a[a_offset + ib].d);
-    const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
-    return vec2(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[vui >> 4]) * d;
-}
-#endif
--- a/ggml/src/vulkan-shaders/dequant_iq4_nl.comp
+++ b/ggml/src/vulkan-shaders/dequant_iq4_nl.comp
@@ -1,30 +0,0 @@
-#version 450
-
-#include "dequant_head.comp"
-
-layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer A {block_iq4_nl data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
-
-void main() {
-    const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64;
-
-    const uint tid = gl_LocalInvocationID.x % 64;
-    const uint il  = tid/32;
-    const uint ir  = tid%32;
-    const uint ib = 32*i + ir;
-    if (ib >= p.nel / 32) {
-        return;
-    }
-
-    const uint q_idx = 8*il;
-    const uint b_idx = 1024*i + 32*ir + q_idx;
-
-    const float d = float(data_a[ib].d);
-
-    [[unroll]] for (uint l = 0; l < 8; ++l) {
-        data_b[b_idx + l +  0] = D_TYPE(d * kvalues_iq4nl[data_a[ib].qs[q_idx + l] & 0xF]);
-        data_b[b_idx + l + 16] = D_TYPE(d * kvalues_iq4nl[data_a[ib].qs[q_idx + l] >>  4]);
-    }
-}
--- a/ggml/src/vulkan-shaders/dequant_q4_0.comp
+++ b/ggml/src/vulkan-shaders/dequant_q4_0.comp
@@ -18,13 +18,15 @@ void main() {
        return;
    }

-    const uint q_idx = 8*il;
-    const uint b_idx = 1024*i + 32*ir + q_idx;
+    const uint b_idx = 1024*i + 32*ir + 8*il;

    const float d = float(data_a[ib].d);
+    const float dm = -8.0f * d;
+
+    const uint q_idx = 8*il;

    [[unroll]] for (uint l = 0; l < 8; ++l) {
-        data_b[b_idx + l +  0] = D_TYPE(d * ((data_a[ib].qs[q_idx + l] & 0xF) - 8.0f));
-        data_b[b_idx + l + 16] = D_TYPE(d * ((data_a[ib].qs[q_idx + l] >>  4) - 8.0f));
+        data_b[b_idx + l +  0] = D_TYPE(d * (data_a[ib].qs[q_idx + l] & 0xF) + dm);
+        data_b[b_idx + l + 16] = D_TYPE(d * (data_a[ib].qs[q_idx + l] >>  4) + dm);
    }
 }
--- a/ggml/src/vulkan-shaders/mul_mm.comp
+++ b/ggml/src/vulkan-shaders/mul_mm.comp
@@ -71,7 +71,7 @@ shared FLOAT_TYPE buf_a[BM * (BK+1)];
 shared FLOAT_TYPE buf_b[BN * (BK+1)];

 #ifdef MUL_MAT_ID
-shared u16vec2 row_ids[3072];
+shared u16vec2 row_ids[2048];
 #endif

 void main() {
@@ -380,19 +380,6 @@ void main() {

            buf_a[buf_idx    ] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi    ] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi    ] >> qhshift) & 3) << 4)) - 32));
            buf_a[buf_idx + 1] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi + 1] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi + 1] >> qhshift) & 3) << 4)) - 32));
-#elif defined(DATA_A_IQ4_NL)
-            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
-            const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a;
-
-            const uint ib = idx / 16;
-            const uint iqs = idx & 0xF;
-
-            const float d = float(data_a[ib].d);
-            const uint vui = uint(data_a[ib].qs[iqs]);
-            const vec2 v = vec2(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[vui >> 4]) * d;
-
-            buf_a[buf_idx     ] = FLOAT_TYPE(v.x);
-            buf_a[buf_idx + 16] = FLOAT_TYPE(v.y);
 #endif
        }
        [[unroll]] for (uint l = 0; l < BN; l += loadstride_b) {
--- a/ggml/src/vulkan-shaders/types.comp
+++ b/ggml/src/vulkan-shaders/types.comp
@@ -177,24 +177,3 @@ struct block_q6_K

 #define A_TYPE block_q6_K
 #endif
-
-// IQuants
-
-#if defined(DATA_A_IQ4_NL)
-#extension GL_EXT_shader_16bit_storage : require
-#define QUANT_K 32
-#define QUANT_R 2
-
-struct block_iq4_nl
-{
-    float16_t d;
-    uint8_t qs[QUANT_K/2];
-};
-
-#define A_TYPE block_iq4_nl
-
-const int8_t kvalues_iq4nl[16] = {
-    int8_t(-127), int8_t(-104), int8_t(-83), int8_t(-65), int8_t(-49), int8_t(-35), int8_t(-22), int8_t(-10),
-    int8_t(1), int8_t(13), int8_t(25), int8_t(38), int8_t(53), int8_t(69), int8_t(89), int8_t(113)
-};
-#endif
--- a/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
+++ b/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -52,8 +52,7 @@ const std::vector<std::string> type_names = {
    "q3_k",
    "q4_k",
    "q5_k",
-    "q6_k",
-    "iq4_nl"
+    "q6_k"
 };

 void execute_command(const std::string& command, std::string& stdout_str, std::string& stderr_str) {
--- a/include/llama.h
+++ b/include/llama.h
@@ -529,16 +529,12 @@ extern "C" {
            struct llama_lora_adapter * adapter,
            float scale);

-    // Remove a specific LoRA adapter from given context
+    // Remove a LoRA adapter from given context
    // Return -1 if the adapter is not present in the context
    LLAMA_API int32_t llama_lora_adapter_remove(
            struct llama_context * ctx,
            struct llama_lora_adapter * adapter);

-    // Remove all LoRA adapters from given context
-    LLAMA_API void llama_lora_adapter_clear(
-            struct llama_context * ctx);
-
    // Manually free a LoRA adapter
    // Note: loaded adapters will be free when the associated model is deleted
    LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter);
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -4889,7 +4889,6 @@ static void llm_load_hparams(
            } break;
        case LLM_ARCH_PHI3:
            {
-                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                switch (hparams.n_layer) {
@@ -10749,7 +10748,7 @@ struct llm_build_context {
        struct ggml_tensor * inp_pos = build_inp_pos();

        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa();
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

        for (int il = 0; il < n_layer; ++il) {
            auto residual = inpL;
@@ -10807,7 +10806,7 @@ struct llm_build_context {

                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                        model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask_swa, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
            }

            if (il == n_layer - 1) {
@@ -14014,23 +14013,18 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
        "causal attention is not supported by this model"
    );

-    if (lctx.inp_KQ_mask || lctx.inp_KQ_mask_swa) {
+    if (lctx.inp_KQ_mask) {
        // NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache.
        if (cparams.causal_attn && !lctx.is_encoding) {
            const int64_t n_kv     = kv_self.n;
            const int64_t n_tokens = batch.n_tokens;

+            GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));

-            float * data     = nullptr;
+            float * data     = (float *) lctx.inp_KQ_mask->data;
            float * data_swa = nullptr;

-            if (lctx.inp_KQ_mask) {
-                GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
-                data = (float *) lctx.inp_KQ_mask->data;
-            }
-
            if (lctx.inp_KQ_mask_swa) {
-                GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask_swa->buffer));
                data_swa = (float *) lctx.inp_KQ_mask_swa->data;
            }

@@ -14053,10 +14047,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
                                f = 0.0f;
                            }
                        }
-
-                        if (data) {
-                            data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
-                        }
+                        data[h*(n_kv*n_tokens) + j*n_kv + i] = f;

                        // may need to cut off old tokens for sliding window
                        if (data_swa) {
@@ -14068,19 +14059,9 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
                    }
                }

-                if (data) {
-                    for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
-                        for (int j = 0; j < n_kv; ++j) {
-                            data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
-                        }
-                    }
-                }
-
-                if (data_swa) {
-                    for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
-                        for (int j = 0; j < n_kv; ++j) {
-                            data_swa[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
-                        }
+                for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
+                    for (int j = 0; j < n_kv; ++j) {
+                        data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
                    }
                }
            }
@@ -16220,10 +16201,6 @@ int32_t llama_lora_adapter_remove(
    return -1;
 }

-void llama_lora_adapter_clear(struct llama_context * ctx) {
-    ctx->lora_adapters.clear();
-}
-
 void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
    delete adapter;
 }
--- a/tests/test-chat-template.cpp
+++ b/tests/test-chat-template.cpp
@@ -1,3 +1,4 @@
+#include <iostream>
 #include <string>
 #include <vector>
 #include <sstream>
@@ -132,31 +133,13 @@ int main(void) {
        );
        formatted_chat.resize(res);
        std::string output(formatted_chat.data(), formatted_chat.size());
-        printf("%s\n", output.c_str());
-        printf("-------------------------\n");
+        std::cout << output << "\n-------------------------\n";
        assert(output == expected);
    }

-
-    // test llama_chat_format_single for system message
-    printf("\n\n=== llama_chat_format_single (system message) ===\n\n");
+    // test llama_chat_format_single
+    std::cout << "\n\n=== llama_chat_format_single ===\n\n";
    std::vector<llama_chat_msg> chat2;
-    llama_chat_msg sys_msg{"system", "You are a helpful assistant"};
-
-    auto fmt_sys = [&](std::string tmpl) {
-        auto output = llama_chat_format_single(nullptr, tmpl, chat2, sys_msg, false);
-        printf("fmt_sys(%s) : %s\n", tmpl.c_str(), output.c_str());
-        printf("-------------------------\n", output.c_str());
-        return output;
-    };
-    assert(fmt_sys("chatml") == "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n");
-    assert(fmt_sys("llama2") == "[INST] You are a helpful assistant\n");
-    assert(fmt_sys("gemma")  == ""); // for gemma, system message is merged with user message
-    assert(fmt_sys("llama3") == "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|>");
-
-
-    // test llama_chat_format_single for user message
-    printf("\n\n=== llama_chat_format_single (user message) ===\n\n");
    chat2.push_back({"system", "You are a helpful assistant"});
    chat2.push_back({"user", "Hello"});
    chat2.push_back({"assistant", "I am assistant"});
@@ -164,13 +147,12 @@ int main(void) {

    auto fmt_single = [&](std::string tmpl) {
        auto output = llama_chat_format_single(nullptr, tmpl, chat2, new_msg, true);
-        printf("fmt_single(%s) : %s\n", tmpl.c_str(), output.c_str());
-        printf("-------------------------\n", output.c_str());
+        std::cout << "fmt_single(" << tmpl << ")\n" << output << "\n-------------------------\n";
        return output;
    };
    assert(fmt_single("chatml") == "\n<|im_start|>user\nHow are you<|im_end|>\n<|im_start|>assistant\n");
    assert(fmt_single("llama2") == "[INST] How are you [/INST]");
-    assert(fmt_single("gemma")  == "\n<start_of_turn>user\nHow are you<end_of_turn>\n<start_of_turn>model\n");
+    assert(fmt_single("gemma") == "\n<start_of_turn>user\nHow are you<end_of_turn>\n<start_of_turn>model\n");
    assert(fmt_single("llama3") == "<|start_header_id|>user<|end_header_id|>\n\nHow are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n");

    return 0;
Author	SHA1	Message	Date
Georgi Gerganov	fe28a7b9d8	llama : clean-up Some checks failed flake8 Lint / Lint (push) Has been cancelled	2024-07-23 08:38:50 +03:00
Georgi Gerganov	dae3cae841	llama : suffix the internal APIs with "_impl" Some checks failed flake8 Lint / Lint (push) Waiting to run Python check requirements.txt / check-requirements (push) Has been cancelled Python Type-Check / pyright type-check (push) Has been cancelled ggml-ci	2024-07-22 19:59:00 +03:00
Georgi Gerganov	39fbaf9f50	llama : redirect external API to internal APIs ggml-ci	2024-07-22 19:46:13 +03:00
Georgi Gerganov	66ac80f5b9	make : update llama.cpp deps [no ci]	2024-07-22 19:46:13 +03:00
Georgi Gerganov	8fef5b1897	llama : move tokenizers into llama-vocab ggml-ci	2024-07-22 19:46:11 +03:00
Georgi Gerganov	e7dffa6bc7	llama : deprecate llama_sample_grammar	2024-07-22 19:44:12 +03:00
Georgi Gerganov	689d377916	cont ggml-ci	2024-07-22 19:44:12 +03:00
Georgi Gerganov	b4b242e6bd	cont : pre-fetch rules	2024-07-22 19:44:12 +03:00
Georgi Gerganov	5a71d1aefd	cont ggml-ci	2024-07-22 19:44:12 +03:00
Georgi Gerganov	675f305f31	llama : move grammar code into llama-grammar ggml-ci	2024-07-22 19:44:12 +03:00
Georgi Gerganov	0ddc8e361c	llama : move sampling code into llama-sampling ggml-ci	2024-07-22 19:44:10 +03:00