Compare commits

31 Commits
b2847 ... b2878

Author SHA1 Message Date
Brian
1265c670fd Revert "move ndk code to a new library (#6951)" (#7282)
This reverts commit efc8f767c8.
2024-05-14 16:10:39 +03:00
Radoslav Gerganov
5e31828d3e ggml : add RPC backend (#6829)
* ggml : add RPC backend

The RPC backend proxies all operations to a remote server which runs a
regular backend (CPU, CUDA, Metal, etc).

* set TCP_NODELAY

* add CI workflows

* Address review comments

* fix warning

* implement llama_max_devices() for RPC

* Address review comments

* Address review comments

* wrap sockfd into a struct

* implement get_alignment and get_max_size

* add get_device_memory

* fix warning

* win32 support

* add README

* readme : trim trailing whitespace

* Address review comments

* win32 fix

* Address review comments

* fix compile warnings on macos
2024-05-14 14:27:19 +03:00
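To make the new backend concrete, a minimal build-and-run sketch based on the CMake option and `--rpc` flag added in this range might look as follows; the `rpc-server` binary name, the port, and the `host:port` address format are assumptions for illustration, not details stated on this page:

```bash
# Build llama.cpp with the RPC backend enabled (mirrors the new CI job below).
mkdir build && cd build
cmake -DLLAMA_RPC=ON ..
cmake --build . --config Release -j "$(nproc)"

# Assumed usage: run an RPC server on the remote machine that hosts the real
# backend (CPU, CUDA, Metal, ...), then point a client at it with --rpc.
./bin/rpc-server -p 50052                                  # remote host
./bin/main -m model.gguf --rpc 192.168.1.2:50052 -ngl 99   # local client
```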
slaren
541600201e llama : disable pipeline parallelism with nkvo (#7265) 2024-05-14 17:33:42 +10:00
Elton Kola
efc8f767c8 move ndk code to a new library (#6951) 2024-05-14 17:30:30 +10:00
Haggai Nuchi
e0f556186b Add left recursion check: quit early instead of going into an infinite loop (#7083)
* Add left recursion check: quit early instead of going into an infinite loop

* Remove custom enum, rename left recursion check and move to "grammar internal" section, add handling for edge case where a leftmost nonterminal may be empty

* Remove unnecessary declaration
2024-05-14 15:25:56 +10:00
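For illustration, a grammar whose leftmost symbol refers back to itself is the case this check targets: previously the parser could recurse forever, now it should bail out early with an error. The `--grammar` flag and the GBNF-style rule below are assumptions about the existing CLI, not part of this change:

```bash
# Hypothetical left-recursive rule: "root" expands to itself in the leftmost position.
# With this change the grammar parser should reject it up front instead of looping.
./bin/main -m model.gguf -p "test" --grammar 'root ::= root "a" | "a"'
```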
Ryuei
27f65d6267 docs: Fix typo and update description for --embeddings flag (#7026)
- Change '--embedding' to '--embeddings' in the README
- Update the description to match the latest --help output
- Added a caution about defining physical batch size
2024-05-14 15:20:47 +10:00
compilade
ee52225067 convert-hf : support direct Q8_0 conversion (#7234)
* convert-hf : support q8_0 conversion

* convert-hf : add missing ftype

This was messing with the checksums otherwise.

* convert-hf : add missing ftype to Baichuan and Xverse

I didn't notice these on my first pass.
2024-05-13 14:10:51 -04:00
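A hedged sketch of the new direct Q8_0 path; the script name `convert-hf-to-gguf.py` and the paths are assumptions implied by the `convert-hf` prefix, not spelled out in this log:

```bash
# Convert a Hugging Face model directory straight to Q8_0,
# without a separate ./quantize step.
python3 convert-hf-to-gguf.py ./path/to/hf-model \
    --outtype q8_0 --outfile ./hf-model-Q8_0.gguf
```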
Georgi Gerganov
614d3b914e llama : less KV padding when FA is off (#7257)
ggml-ci
2024-05-13 17:15:15 +03:00
k.h.lai
30e70334f7 llava-cli: fix base64 prompt (#7248) 2024-05-14 00:02:36 +10:00
Johannes Gäßler
1c570d8bee perplexity: add BF16 vs. FP16 results (#7150) 2024-05-13 13:03:27 +02:00
Neo Zhang
948f4ec7c5 [SYCL] rm wait() (#7233) 2024-05-13 18:11:26 +08:00
Joan Fontanals
9aa672490c llama : rename jina tokenizers to v2 (#7249)
* refactor: rename jina tokenizers to v2

* refactor: keep refactoring non-breaking
2024-05-13 11:35:14 +03:00
Brian
b1f8af1886 convert.py: Outfile default name change and additional metadata support (#4858)
* convert.py: Outfile default name change and additional metadata support

* convert.py: don't stringify Metadata load method output

* convert.py: typo fix

* convert.py: fix metadata format to sync with LLM_KV_NAMES in llama.cpp
2024-05-13 12:56:47 +10:00
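A sketch of how the new metadata support might be used, based on the `--metadata` flag and the `general.*` keys that `Metadata.load()` reads in the diff further down; the file name and field values here are made up for illustration:

```bash
# Hypothetical metadata file using the keys read by Metadata.load().
cat > metadata.json <<'EOF'
{
  "general.name": "my-model",
  "general.author": "Example Author",
  "general.version": "v1.0",
  "general.description": "Example conversion with extra GGUF metadata"
}
EOF

# Attach the metadata during conversion.
python3 convert.py ./path/to/model --outtype f16 --metadata metadata.json
```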
Benjamin Findley
e586ee4259 change default temperature of OAI compat API from 0 to 1 (#7226)
* change default temperature of OAI compat API from 0 to 1

* make tests explicitly send temperature to OAI API
2024-05-13 12:40:08 +10:00
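Clients that depended on the old default of 0 should now send `temperature` explicitly; a hedged example against the server's OpenAI-compatible endpoint (host, port, and payload are placeholders):

```bash
# Explicitly request temperature 0 to keep the previous (greedy-leaning) behaviour.
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "temperature": 0,
        "messages": [{"role": "user", "content": "Say hello."}]
      }'
```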
Neo Zhang
cbf75894d2 [SYCL] Add oneapi runtime dll files to win release package (#7241)
* add oneapi running time dlls to release package

* fix path

* fix path

* fix path

* fix path

* fix path

---------

Co-authored-by: Zhang <jianyu.zhang@intel.com>
2024-05-13 08:04:29 +08:00
Neo Zhang
0d5cef78ae [SYCL] update CI with oneapi 2024.1 (#7235)
Co-authored-by: Zhang <jianyu.zhang@intel.com>
2024-05-13 08:02:55 +08:00
Johannes Gäßler
dc685be466 CUDA: add FP32 FlashAttention vector kernel (#7188)
* CUDA: add FP32 FlashAttention vector kernel

* fixup! CUDA: add FP32 FlashAttention vector kernel

* fixup! fixup! CUDA: add FP32 FlashAttention vector kernel

* fixup! fixup! fixup! CUDA: add FP32 FlashAttention vector kernel
2024-05-12 19:40:45 +02:00
Georgi Gerganov
6f1b63606f cmake : fix version cmp (#7227) 2024-05-12 18:30:23 +03:00
slaren
b228aba91a remove convert-lora-to-ggml.py (#7204) 2024-05-12 02:29:33 +02:00
Georgi Gerganov
7bd4ffb780 metal : fix warnings (skipme) (#0) 2024-05-11 21:38:13 +03:00
Georgi Gerganov
1622ac023f sync : ggml 2024-05-11 21:35:05 +03:00
Georgi Gerganov
6aeff24f8b metal : fix indent (ggml/0) 2024-05-11 21:34:21 +03:00
Georgi Gerganov
325756d28d ggml : resolve merge (ggml/0)
ggml-ci
2024-05-11 21:33:08 +03:00
Josh Ramer
fed0108491 Scripting & documenting debugging one test without anything else in the loop. (#7096)
* A little documentation that shares my quick tips for working in the repository.

* Update startup-testing-debugging.md

* script that shows a menu of tests to pick from & run the debugger on

* debug-test.sh: Refactor CLI help message

* debug-test.sh: documentation update

* debug-test.sh: CLI Help output corrections

* debug-test.sh: minor doc fix

---------

authored-by: Josh Ramer <ubuntu@ip-172-31-32-53.ec2.internal>
Assisted-by: brian khuu <mofosyne@gmail.com>
2024-05-12 03:26:35 +10:00
Xuan Son Nguyen
72c177c1f6 fix system prompt handling (#7153) 2024-05-11 17:28:10 +02:00
compilade
5a419926b0 convert-hf : support bfloat16 conversion (#7158)
* convert-hf : support bfloat16 conversion

* gguf-py : flake8 fixes

* convert-hf : add missing space after comma

* convert-hf : get bit-exact same output as ./quantize

The quantization version was missing.

* convert-hf : don't round bf16 NANs

* convert-hf : save some memory with np.int16 intermediate bf16 weights

* convert-hf : more closely match llama.cpp with which weights to keep in f32

* convert-hf : add --outtype auto-f16

A reason for this to exist is for model quantizers who want an initial
GGUF with the most fidelity to the original model while still using
a 16-bit float type instead of 32-bit floats.

* convert-hf : remove a semicolon because flake8 doesn't like it

It's a reflex from when programming in C/C++, I guess.

* convert-hf : support outtype templating in outfile name

* convert-hf : rename --outtype auto-f16 to --outtype auto
2024-05-11 11:06:26 -04:00
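A sketch of the two conveniences described above, `--outtype auto` and `{ftype}` templating of the output file name; the script name is assumed to be `convert-hf-to-gguf.py`:

```bash
# Pick bf16 or f16 automatically from the first tensor's dtype, and template
# the output name with the chosen ftype (expands to ...-f16.gguf or ...-bf16.gguf).
python3 convert-hf-to-gguf.py ./path/to/hf-model \
    --outtype auto --outfile './hf-model-{ftype}.gguf'
```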
Georgi Gerganov
fae9d234b6 sync : ggml
ggml-ci
2024-05-11 15:38:34 +03:00
Justina Cho
f5ef34e428 feat: implemented sigmoid function (ggml/806)
* added sigmoid function

* implemented metal kernel for sigmoid

* implemented cuda kernel for sigmoid

* added sigmoid unary op and incremented count
2024-05-11 15:38:34 +03:00
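For reference, the unary op added here is the standard logistic sigmoid, applied element-wise:

$$\sigma(x) = \frac{1}{1 + e^{-x}}$$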
Borislav Stanimirov
ef0d5e3ec9 build: fix and ignore msvc warnings (ggml/805) 2024-05-11 15:38:34 +03:00
CrispStrobe
3292733f95 convert : skip unaccessible HF repos (#7210) 2024-05-11 11:18:35 +03:00
Steve Grubb
988631335a server : free llama_batch on exit (#7212)
* [server] Cleanup a memory leak on exit

There are a couple memory leaks on exit of the server. This hides others.
After cleaning this up, you can see leaks on slots. But that is another
patch to be sent after this.

* make tab into spaces
2024-05-11 11:13:02 +03:00
52 changed files with 3611 additions and 1116 deletions

View File

@@ -340,6 +340,36 @@ jobs:
cd build
ctest -L main --verbose
ubuntu-latest-cmake-rpc:
runs-on: ubuntu-latest
continue-on-error: true
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential
- name: Build
id: cmake_build
run: |
mkdir build
cd build
cmake -DLLAMA_RPC=ON ..
cmake --build . --config Release -j $(nproc)
- name: Test
id: cmake_test
run: |
cd build
ctest -L main --verbose
ubuntu-22-cmake-vulkan:
runs-on: ubuntu-22.04
@@ -663,6 +693,8 @@ jobs:
strategy:
matrix:
include:
- build: 'rpc'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_RPC=ON -DBUILD_SHARED_LIBS=ON'
- build: 'noavx'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
- build: 'avx2'
@@ -898,9 +930,9 @@ jobs:
shell: bash
env:
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/62641e01-1e8d-4ace-91d6-ae03f7f8a71f/w_BaseKit_p_2024.0.0.49563_offline.exe
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7dff44ba-e3af-4448-841c-0d616c8da6e7/w_BaseKit_p_2024.1.0.595_offline.exe
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
steps:
- name: Clone
id: checkout
@@ -932,6 +964,17 @@ jobs:
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.4.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_win_proxy_loader.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_level_zero.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl7.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
echo "cp oneAPI running time dll files to ./build/bin done"
7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
- name: Upload artifacts

View File

@@ -123,6 +123,7 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)")
option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
option(LLAMA_MPI "llama: use MPI" OFF)
option(LLAMA_RPC "llama: use RPC" OFF)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
option(LLAMA_SYCL "llama: use SYCL" OFF)
option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
@@ -296,7 +297,7 @@ if (LLAMA_BLAS)
if (LLAMA_STATIC)
set(BLA_STATIC ON)
endif()
if ($(CMAKE_VERSION) VERSION_GREATER_EQUAL 3.22)
if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22)
set(BLA_SIZEOF_INTEGER 8)
endif()
@@ -494,6 +495,17 @@ if (LLAMA_MPI)
endif()
endif()
if (LLAMA_RPC)
add_compile_definitions(GGML_USE_RPC)
if (WIN32)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ws2_32)
endif()
set(GGML_HEADERS_RPC ggml-rpc.h)
set(GGML_SOURCES_RPC ggml-rpc.cpp)
endif()
if (LLAMA_CLBLAST)
find_package(CLBlast)
if (CLBlast_FOUND)
@@ -1176,6 +1188,7 @@ add_library(ggml OBJECT
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
${GGML_SOURCES_RPC} ${GGML_HEADERS_RPC}
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}
${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
@@ -1281,17 +1294,6 @@ install(
WORLD_READ
WORLD_EXECUTE
DESTINATION ${CMAKE_INSTALL_BINDIR})
install(
FILES convert-lora-to-ggml.py
PERMISSIONS
OWNER_READ
OWNER_WRITE
OWNER_EXECUTE
GROUP_READ
GROUP_EXECUTE
WORLD_READ
WORLD_EXECUTE
DESTINATION ${CMAKE_INSTALL_BINDIR})
if (LLAMA_METAL)
install(
FILES ggml-metal.metal

View File

@@ -365,47 +365,6 @@ function gg_run_open_llama_3b_v2 {
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
# lora
function compare_ppl {
qnt="$1"
ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
printf ' - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2"
return 20
fi
printf ' - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
return 0
}
path_lora="../models-mnt/open-llama/3B-v2/lora"
path_shakespeare="../models-mnt/shakespeare"
shakespeare="${path_shakespeare}/shakespeare.txt"
lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_config.json
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_model.bin
gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/shakespeare.txt
python3 ../convert-lora-to-ggml.py ${path_lora}
# f16
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
# q8_0
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
# q8_0 + f16 lora-base
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
set +e
}
@@ -416,7 +375,6 @@ function gg_sum_open_llama_3b_v2 {
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
@@ -429,11 +387,6 @@ function gg_sum_open_llama_3b_v2 {
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
}
# open_llama_7b_v2
@@ -549,48 +502,6 @@ function gg_run_open_llama_7b_v2 {
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
# lora
function compare_ppl {
qnt="$1"
ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
printf ' - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2"
return 20
fi
printf ' - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
return 0
}
path_lora="../models-mnt/open-llama/7B-v2/lora"
path_shakespeare="../models-mnt/shakespeare"
shakespeare="${path_shakespeare}/shakespeare.txt"
lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_config.json
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_model.bin
gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/shakespeare.txt
python3 ../convert-lora-to-ggml.py ${path_lora}
# f16
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
# currently not supported by the CUDA backend
# q8_0
#(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
#(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
#compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
# q8_0 + f16 lora-base
#(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
#compare_ppl "q8_0 / f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
set +e
}
@@ -601,7 +512,6 @@ function gg_sum_open_llama_7b_v2 {
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
@@ -614,11 +524,6 @@ function gg_sum_open_llama_7b_v2 {
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
#gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
#gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
#gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
}
# bge-small

View File

@@ -1060,6 +1060,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
#endif // GGML_USE_CUDA_SYCL_VULKAN
return true;
}
if (arg == "--rpc") {
if (++i >= argc) {
invalid_param = true;
return true;
}
params.rpc_servers = argv[i];
return true;
}
if (arg == "--no-mmap") {
params.use_mmap = false;
return true;
@@ -1557,6 +1565,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
}
printf(" --rpc SERVERS comma separated list of RPC servers\n");
printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
printf(" -gan N, --grp-attn-n N\n");
@@ -1830,6 +1839,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
if (params.n_gpu_layers != -1) {
mparams.n_gpu_layers = params.n_gpu_layers;
}
mparams.rpc_servers = params.rpc_servers.c_str();
mparams.main_gpu = params.main_gpu;
mparams.split_mode = params.split_mode;
mparams.tensor_split = params.tensor_split;

View File

@@ -82,6 +82,7 @@ struct gpt_params {
float yarn_beta_slow = 1.0f; // YaRN high correction dim
int32_t yarn_orig_ctx = 0; // YaRN original context length
float defrag_thold = -1.0f; // KV cache defragmentation threshold
std::string rpc_servers = ""; // comma separated list of RPC servers
ggml_backend_sched_eval_callback cb_eval = nullptr;
void * cb_eval_user_data = nullptr;

View File

@@ -74,9 +74,9 @@ models = [
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
{"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
{"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
{"name": "jina-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
{"name": "jina-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
{"name": "jina-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
{"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
]
# make directory "models/tokenizers" if it doesn't exist
@@ -145,8 +145,17 @@ for model in models:
if tokt == TOKENIZER_TYPE.SPM:
continue
# Skip if the tokenizer folder does not exist or there are other download issues previously
if not os.path.exists(f"models/tokenizers/{name}"):
logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
continue
# create the tokenizer
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
try:
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
except OSError as e:
logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
continue # Skip to the next model if the tokenizer can't be loaded
chktok = tokenizer.encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest()
@@ -287,8 +296,17 @@ for model in models:
name = model["name"]
tokt = model["tokt"]
# Skip if the tokenizer folder does not exist or there are other download issues previously
if not os.path.exists(f"models/tokenizers/{name}"):
logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
continue
# create the tokenizer
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
try:
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
except OSError as e:
logger.error(f"Failed to load tokenizer for model {name}. Error: {e}")
continue # Skip this model and continue with the next one in the loop
with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
for text in tests:

View File

@@ -12,7 +12,7 @@ import sys
from enum import IntEnum
from pathlib import Path
from hashlib import sha256
from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Sequence, TypeVar, cast, overload
from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Sequence, TypeVar, cast
import numpy as np
import torch
@@ -48,7 +48,6 @@ class Model:
dir_model: Path
ftype: int
fname_out: Path
is_big_endian: bool
endianess: gguf.GGUFEndian
use_temp_file: bool
@@ -56,20 +55,20 @@ class Model:
part_names: list[str]
is_safetensors: bool
hparams: dict[str, Any]
gguf_writer: gguf.GGUFWriter
block_count: int
tensor_map: gguf.TensorNameMap
tensor_names: set[str] | None
fname_out: Path
gguf_writer: gguf.GGUFWriter
# subclasses should define this!
model_arch: gguf.MODEL_ARCH
def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool):
if self.__class__ == Model:
raise TypeError(f"{self.__class__.__name__!r} should not be directly instantiated")
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool):
if type(self) is Model:
raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
self.dir_model = dir_model
self.ftype = ftype
self.fname_out = fname_out
self.is_big_endian = is_big_endian
self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
self.use_temp_file = use_temp_file
@@ -79,10 +78,23 @@ class Model:
if not self.is_safetensors:
self.part_names = Model.get_model_part_names(self.dir_model, ".bin")
self.hparams = Model.load_hparams(self.dir_model)
self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
self.tensor_names = None
if self.ftype == gguf.LlamaFileType.GUESSED:
# NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
_, first_tensor = next(self.get_tensors())
if first_tensor.dtype == torch.float16:
logger.info(f"choosing --outtype f16 from first tensor type ({first_tensor.dtype})")
self.ftype = gguf.LlamaFileType.MOSTLY_F16
else:
logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})")
self.ftype = gguf.LlamaFileType.MOSTLY_BF16
ftype_up: str = self.ftype.name.partition("_")[2].upper()
ftype_lw: str = ftype_up.lower()
# allow templating the file name with the output ftype, useful with the "auto" ftype
self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up)
self.gguf_writer = gguf.GGUFWriter(self.fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
@classmethod
def __init_subclass__(cls):
@@ -142,14 +154,27 @@ class Model:
raise ValueError(f"Mismatch between weight map and model parts for tensor names: {sym_diff}")
def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str:
name: str = gguf.TENSOR_NAMES[key]
if key not in gguf.MODEL_TENSORS[self.model_arch]:
raise ValueError(f"Missing {key!r} for MODEL_TENSORS of {self.model_arch!r}")
name: str = gguf.TENSOR_NAMES[key]
if "{bid}" in name:
assert bid is not None
name = name.format(bid=bid)
return name + suffix
def match_model_tensor_name(self, name: str, key: gguf.MODEL_TENSOR, bid: int | None, suffix: str = ".weight") -> bool:
if key not in gguf.MODEL_TENSORS[self.model_arch]:
return False
key_name: str = gguf.TENSOR_NAMES[key]
if "{bid}" in key_name:
if bid is None:
return False
key_name = key_name.format(bid=bid)
else:
if bid is not None:
return False
return name == (key_name + suffix)
def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes)
if new_name is None:
@@ -239,35 +264,64 @@ class Model:
data: np.ndarray = data # type hint
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if self.ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
data_qtype: gguf.GGMLQuantizationType | None = None
# when both are True, f32 should win
extra_f32 = self.extra_f32_tensors(name, new_name, bid, n_dims)
extra_f16 = self.extra_f16_tensors(name, new_name, bid, n_dims)
# Most of the codebase that takes in 1D tensors or norms only handles F32 tensors
extra_f32 = extra_f32 or n_dims == 1 or new_name.endswith("_norm.weight")
# Conditions should closely match those in llama_model_quantize_internal in llama.cpp
extra_f32 = any(cond for cond in (
extra_f32,
n_dims == 1,
new_name.endswith("_norm.weight"),
))
# Some tensor types are always in float32
extra_f32 = extra_f32 or any(self.match_model_tensor_name(new_name, key, bid) for key in (
gguf.MODEL_TENSOR.FFN_GATE_INP,
gguf.MODEL_TENSOR.POS_EMBD,
gguf.MODEL_TENSOR.TOKEN_TYPES,
))
# if f16 desired, convert any float32 2-dim weight tensors to float16
extra_f16 = extra_f16 or (name.endswith(".weight") and n_dims >= 2)
extra_f16 = any(cond for cond in (
extra_f16,
(name.endswith(".weight") and n_dims >= 2),
))
# when both extra_f32 and extra_f16 are False, convert to float32 by default
if self.ftype == 1 and data_dtype == np.float16 and (extra_f32 or not extra_f16):
data = data.astype(np.float32)
if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
if self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
data = gguf.quantize_bf16(data)
assert data.dtype == np.int16
data_qtype = gguf.GGMLQuantizationType.BF16
if self.ftype == 1 and data_dtype == np.float32 and extra_f16 and not extra_f32:
data = data.astype(np.float16)
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data):
data = gguf.quantize_q8_0(data)
assert data.dtype == np.uint8
data_qtype = gguf.GGMLQuantizationType.Q8_0
else: # default to float16 for quantized tensors
if data_dtype != np.float16:
data = data.astype(np.float16)
data_qtype = gguf.GGMLQuantizationType.F16
if data_qtype is None: # by default, convert to float32
if data_dtype != np.float32:
data = data.astype(np.float32)
data_qtype = gguf.GGMLQuantizationType.F32
block_size, type_size = gguf.GGML_QUANT_SIZES[data_qtype]
# reverse shape to make it similar to the internal ggml dimension order
shape_str = f"{{{', '.join(str(n) for n in reversed(data.shape))}}}"
shape_str = f"""{{{', '.join(str(n) for n in reversed(
(*data.shape[:-1], data.shape[-1] * data.dtype.itemsize // type_size * block_size))
)}}}"""
# n_dims is implicit in the shape
logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data.dtype}, shape = {shape_str}")
logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
self.gguf_writer.add_tensor(new_name, data)
self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype)
def write(self):
self.write_tensors()
@@ -408,13 +462,13 @@ class Model:
res = "dbrx"
if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en
res = "jina-en"
res = "jina-v2-en"
if chkhsh == "171aeeedd6fb548d418a7461d053f11b6f1f1fc9b387bd66640d28a4b9f5c643":
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-es
res = "jina-es"
res = "jina-v2-es"
if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
res = "jina-de"
res = "jina-v2-de"
if res is None:
logger.warning("\n")
@@ -792,6 +846,7 @@ class BaichuanModel(Model):
self.gguf_writer.add_head_count(head_count)
self.gguf_writer.add_head_count_kv(head_count_kv)
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
self.gguf_writer.add_file_type(self.ftype)
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
if self.hparams["rope_scaling"].get("type") == "linear":
@@ -914,6 +969,7 @@ class XverseModel(Model):
self.gguf_writer.add_head_count(head_count)
self.gguf_writer.add_head_count_kv(head_count_kv)
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
self.gguf_writer.add_file_type(self.ftype)
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
if self.hparams["rope_scaling"].get("type") == "linear":
@@ -1148,6 +1204,7 @@ class StableLMModel(Model):
self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"]))
self.gguf_writer.add_file_type(self.ftype)
_q_norms: list[dict[str, Tensor]] | None = None
_k_norms: list[dict[str, Tensor]] | None = None
@@ -1524,6 +1581,7 @@ class QwenModel(Model):
self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
self.gguf_writer.add_file_type(self.ftype)
@Model.register("Qwen2ForCausalLM")
@@ -1761,6 +1819,7 @@ class PlamoModel(Model):
self.gguf_writer.add_head_count(hparams["num_attention_heads"])
self.gguf_writer.add_head_count_kv(5) # hparams["num_key_value_heads"]) is wrong
self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
self.gguf_writer.add_file_type(self.ftype)
def shuffle_attn_q_weight(self, data_torch):
assert data_torch.size() == (5120, 5120)
@@ -1940,6 +1999,7 @@ in chat mode so that the conversation can end normally.")
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
self.gguf_writer.add_file_type(self.ftype)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
num_heads = self.hparams["num_attention_heads"]
@@ -2044,12 +2104,6 @@ class BertModel(Model):
return [(self.map_tensor_name(name), data_torch)]
def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
del new_name, bid, n_dims # unused
# not used with get_rows, must be F32
return name == "embeddings.token_type_embeddings.weight"
@Model.register("NomicBertModel")
class NomicBertModel(BertModel):
@@ -2339,92 +2393,30 @@ class JinaBertV2Model(BertModel):
# tree of lazy tensors
class LazyTorchTensor:
_meta: Tensor
_data: Tensor | None
_args: tuple
_func: Callable[[tuple], Tensor] | None
def __init__(self, *, meta: Tensor, data: Tensor | None = None, args: tuple = (), func: Callable[[tuple], Tensor] | None = None):
self._meta = meta
self._data = data
self._args = args
self._func = func
@staticmethod
def _recurse_apply(o: Any, fn: Callable[[Any], Any]) -> Any:
# TODO: dict and set
if isinstance(o, (list, tuple)):
L = []
for item in o:
L.append(LazyTorchTensor._recurse_apply(item, fn))
if isinstance(o, tuple):
L = tuple(L)
return L
elif isinstance(o, LazyTorchTensor):
return fn(o)
else:
return o
def _wrap_fn(self, fn: Callable, use_self: bool = False) -> Callable[[Any], LazyTorchTensor]:
def wrapped_fn(*args, **kwargs):
if kwargs is None:
kwargs = {}
args = ((self,) if use_self else ()) + args
meta_args = LazyTorchTensor._recurse_apply(args, lambda t: t._meta)
return LazyTorchTensor(meta=fn(*meta_args, **kwargs), args=args, func=lambda a: fn(*a, **kwargs))
return wrapped_fn
def __getattr__(self, __name: str) -> Any:
meta_attr = getattr(self._meta, __name)
if callable(meta_attr):
return self._wrap_fn(getattr(torch.Tensor, __name), use_self=True)
elif isinstance(meta_attr, torch.Tensor):
# for things like self.T
return self._wrap_fn(lambda s: getattr(s, __name))(self)
else:
return meta_attr
class LazyTorchTensor(gguf.LazyBase):
_tensor_type = torch.Tensor
# to keep the type-checker happy
dtype: torch.dtype
shape: torch.Size
# only used when converting a torch.Tensor to a np.ndarray
_dtype_map: dict[torch.dtype, type] = {
torch.float16: np.float16,
torch.float32: np.float32,
}
def numpy(self) -> gguf.LazyTensor:
def numpy(self) -> gguf.LazyNumpyTensor:
dtype = self._dtype_map[self.dtype]
return gguf.LazyTensor(lambda: LazyTorchTensor.to_eager(self).numpy(), dtype=dtype, shape=self.shape)
return gguf.LazyNumpyTensor(
meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape),
lazy=self._lazy,
args=(self,),
func=(lambda s: s[0].numpy())
)
@overload
@staticmethod
def to_eager(t: Tensor | LazyTorchTensor) -> Tensor: ...
@overload
@staticmethod
def to_eager(t: tuple) -> tuple: ...
@staticmethod
def to_eager(t: Any) -> Any:
def simple_to_eager(_t: LazyTorchTensor) -> Tensor:
# wake up the lazy tensor
if _t._data is None and _t._func is not None:
# recurse into its arguments
_t._args = LazyTorchTensor.to_eager(_t._args)
_t._data = _t._func(_t._args)
if _t._data is not None:
return _t._data
else:
raise ValueError(f"Could not compute lazy tensor {_t!r} with args {_t._args!r}")
# recurse into lists and/or tuples, keeping their structure
return LazyTorchTensor._recurse_apply(t, simple_to_eager)
@staticmethod
def from_eager(t: Tensor) -> Tensor:
if (t.__class__ == LazyTorchTensor):
return t
return LazyTorchTensor(meta=t.detach().to("meta"), data=t) # type: ignore
@classmethod
def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: torch.Size) -> Tensor:
return torch.empty(size=shape, dtype=dtype, device="meta")
@classmethod
def __torch_function__(cls, func, types, args=(), kwargs=None):
@@ -2435,28 +2427,8 @@ class LazyTorchTensor:
if func is torch.Tensor.numpy:
return args[0].numpy()
if func is torch.equal:
eager_args = LazyTorchTensor.to_eager(args)
return func(*eager_args, **kwargs)
return LazyTorchTensor._wrap_fn(args[0], func)(*args, **kwargs)
# special methods bypass __getattr__, so they need to be added manually
# ref: https://docs.python.org/3/reference/datamodel.html#special-lookup
# NOTE: LazyTorchTensor can't be a subclass of Tensor (and then be used
# as self._meta is currently used), because then the following
# operations would by default not be wrapped, and so not propagated
# when the tensor is made eager.
# It's better to get non-silent errors for not-yet-supported operators.
# TODO: add more when needed to avoid clutter, or find a more concise way
def __neg__(self, *args): # mamba
return self._wrap_fn(torch.Tensor.__neg__)(self, *args)
def __add__(self, *args): # gemma
return self._wrap_fn(torch.Tensor.__add__)(self, *args)
def __getitem__(self, *args): # bloom falcon refact internlm2
return self._wrap_fn(torch.Tensor.__getitem__)(self, *args)
return LazyTorchTensor._wrap_fn(func)(*args, **kwargs)
def parse_args() -> argparse.Namespace:
@@ -2472,11 +2444,11 @@ def parse_args() -> argparse.Namespace:
)
parser.add_argument(
"--outfile", type=Path,
help="path to write to; default: based on input",
help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
)
parser.add_argument(
"--outtype", type=str, choices=["f32", "f16"], default="f16",
help="output format - use f32 for float32, f16 for float16",
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
)
parser.add_argument(
"--bigendian", action="store_true",
@@ -2530,16 +2502,19 @@ def main() -> None:
logger.error(f'Error: {args.model} is not a directory')
sys.exit(1)
ftype_map = {
"f32": gguf.GGMLQuantizationType.F32,
"f16": gguf.GGMLQuantizationType.F16,
ftype_map: dict[str, gguf.LlamaFileType] = {
"f32": gguf.LlamaFileType.ALL_F32,
"f16": gguf.LlamaFileType.MOSTLY_F16,
"bf16": gguf.LlamaFileType.MOSTLY_BF16,
"q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
"auto": gguf.LlamaFileType.GUESSED,
}
if args.outfile is not None:
fname_out = args.outfile
else:
# output in the same directory as the model by default
fname_out = dir_model / f'ggml-model-{args.outtype}.gguf'
fname_out = dir_model / 'ggml-model-{ftype}.gguf'
logger.info(f"Loading model: {dir_model.name}")
@@ -2555,14 +2530,16 @@ def main() -> None:
logger.info("Set model tokenizer")
model_instance.set_vocab()
model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
if args.vocab_only:
logger.info(f"Exporting model vocab to '{fname_out}'")
logger.info(f"Exporting model vocab to '{model_instance.fname_out}'")
model_instance.write_vocab()
else:
logger.info(f"Exporting model to '{fname_out}'")
logger.info(f"Exporting model to '{model_instance.fname_out}'")
model_instance.write()
logger.info(f"Model successfully exported to '{fname_out}'")
logger.info(f"Model successfully exported to '{model_instance.fname_out}'")
if __name__ == '__main__':

View File

@@ -1,150 +0,0 @@
#!/usr/bin/env python3
from __future__ import annotations
import logging
import json
import os
import struct
import sys
from pathlib import Path
from typing import Any, BinaryIO, Sequence
import numpy as np
import torch
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("lora-to-gguf")
NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}
def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
fout.write(b"ggla"[::-1]) # magic (ggml lora)
fout.write(struct.pack("i", 1)) # file version
fout.write(struct.pack("i", params["r"]))
# https://opendelta.readthedocs.io/en/latest/modules/deltas.html says that `lora_alpha` is an int
# but some models ship a float value instead
# let's convert to int, but fail if lossless conversion is not possible
assert (
int(params["lora_alpha"]) == params["lora_alpha"]
), "cannot convert float to int losslessly"
fout.write(struct.pack("i", int(params["lora_alpha"])))
def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_type: np.dtype[Any]) -> None:
sname = name.encode("utf-8")
fout.write(
struct.pack(
"iii",
len(shape),
len(sname),
NUMPY_TYPE_TO_FTYPE[data_type.name],
)
)
fout.write(struct.pack("i" * len(shape), *shape[::-1]))
fout.write(sname)
fout.seek((fout.tell() + 31) & -32)
if __name__ == '__main__':
if len(sys.argv) < 2:
logger.info(f"Usage: python {sys.argv[0]} <path> [arch]")
logger.info("Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'")
logger.info(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)")
sys.exit(1)
input_json = os.path.join(sys.argv[1], "adapter_config.json")
input_model = os.path.join(sys.argv[1], "adapter_model.bin")
output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")
if os.path.exists(input_model):
model = torch.load(input_model, map_location="cpu")
else:
input_model = os.path.join(sys.argv[1], "adapter_model.safetensors")
# lazy import load_file only if lora is in safetensors format.
from safetensors.torch import load_file
model = load_file(input_model, device="cpu")
arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"
if arch_name not in gguf.MODEL_ARCH_NAMES.values():
logger.error(f"Error: unsupported architecture {arch_name}")
sys.exit(1)
arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)]
name_map = gguf.TensorNameMap(arch, 200) # 200 layers ought to be enough for anyone
with open(input_json, "r") as f:
params = json.load(f)
if params["peft_type"] != "LORA":
logger.error(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
sys.exit(1)
if params["fan_in_fan_out"] is True:
logger.error("Error: param fan_in_fan_out is not supported")
sys.exit(1)
if params["bias"] is not None and params["bias"] != "none":
logger.error("Error: param bias is not supported")
sys.exit(1)
# TODO: these seem to be layers that have been trained but without lora.
# doesn't seem widely used but eventually should be supported
if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0:
logger.error("Error: param modules_to_save is not supported")
sys.exit(1)
with open(output_path, "wb") as fout:
fout.truncate()
write_file_header(fout, params)
for k, v in model.items():
orig_k = k
if k.endswith(".default.weight"):
k = k.replace(".default.weight", ".weight")
if k in ["llama_proj.weight", "llama_proj.bias"]:
continue
if k.endswith("lora_A.weight"):
if v.dtype != torch.float16 and v.dtype != torch.float32:
v = v.float()
v = v.T
else:
v = v.float()
t = v.detach().numpy()
prefix = "base_model.model."
if k.startswith(prefix):
k = k[len(prefix) :]
lora_suffixes = (".lora_A.weight", ".lora_B.weight")
if k.endswith(lora_suffixes):
suffix = k[-len(lora_suffixes[0]):]
k = k[: -len(lora_suffixes[0])]
else:
logger.error(f"Error: unrecognized tensor name {orig_k}")
sys.exit(1)
tname = name_map.get_name(k)
if tname is None:
logger.error(f"Error: could not map tensor name {orig_k}")
logger.error(" Note: the arch parameter must be specified if the model is not llama")
sys.exit(1)
if suffix == ".lora_A.weight":
tname += ".weight.loraA"
elif suffix == ".lora_B.weight":
tname += ".weight.loraB"
else:
assert False
logger.info(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
write_tensor_header(fout, tname, t.shape, t.dtype)
t.tofile(fout)
logger.info(f"Converted {input_json} and {input_model} to {output_path}")

View File

@@ -24,7 +24,7 @@ from abc import ABC, abstractmethod
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable
from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable, Optional
import numpy as np
from sentencepiece import SentencePieceProcessor
@@ -344,10 +344,47 @@ class Params:
return params
@dataclass
class Metadata:
name: Optional[str] = None
author: Optional[str] = None
version: Optional[str] = None
url: Optional[str] = None
description: Optional[str] = None
licence: Optional[str] = None
source_url: Optional[str] = None
source_hf_repo: Optional[str] = None
@staticmethod
def load(metadata_path: Path) -> Metadata:
if metadata_path is None or not metadata_path.exists():
return Metadata()
with open(metadata_path, 'r') as file:
data = json.load(file)
# Create a new Metadata instance
metadata = Metadata()
# Assigning values to Metadata attributes if they exist in the JSON file
# This is based on LLM_KV_NAMES mapping in llama.cpp
metadata.name = data.get("general.name")
metadata.author = data.get("general.author")
metadata.version = data.get("general.version")
metadata.url = data.get("general.url")
metadata.description = data.get("general.description")
metadata.license = data.get("general.license")
metadata.source_url = data.get("general.source.url")
metadata.source_hf_repo = data.get("general.source.huggingface.repository")
return metadata
#
# vocab
#
@runtime_checkable
class BaseVocab(Protocol):
tokenizer_model: ClassVar[str]
@@ -1066,21 +1103,42 @@ class OutputFile:
def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
def add_meta_arch(self, params: Params) -> None:
def add_meta_model(self, params: Params, metadata: Metadata) -> None:
# Metadata About The Model And Its Provenence
name = "LLaMA"
# TODO: better logic to determine model name
if params.n_ctx == 4096:
name = "LLaMA v2"
if metadata is not None and metadata.name is not None:
name = metadata.name
elif params.path_model is not None:
name = str(params.path_model.parent).split('/')[-1]
name = str(params.path_model.parent).split("/")[-1]
elif params.n_ctx == 4096:
# Heuristic detection of LLaMA v2 model
name = "LLaMA v2"
self.gguf.add_name (name)
self.gguf.add_vocab_size (params.n_vocab)
self.gguf.add_context_length (params.n_ctx)
self.gguf.add_embedding_length (params.n_embd)
self.gguf.add_block_count (params.n_layer)
self.gguf.add_feed_forward_length (params.n_ff)
self.gguf.add_name(name)
if metadata is not None:
if metadata.author is not None:
self.gguf.add_author(metadata.author)
if metadata.version is not None:
self.gguf.add_version(metadata.version)
if metadata.url is not None:
self.gguf.add_url(metadata.url)
if metadata.description is not None:
self.gguf.add_description(metadata.description)
if metadata.licence is not None:
self.gguf.add_licence(metadata.licence)
if metadata.source_url is not None:
self.gguf.add_source_url(metadata.source_url)
if metadata.source_hf_repo is not None:
self.gguf.add_source_hf_repo(metadata.source_hf_repo)
def add_meta_arch(self, params: Params) -> None:
# Metadata About The Neural Architecture Itself
self.gguf.add_vocab_size(params.n_vocab)
self.gguf.add_context_length(params.n_ctx)
self.gguf.add_embedding_length(params.n_embd)
self.gguf.add_block_count(params.n_layer)
self.gguf.add_feed_forward_length(params.n_ff)
self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
self.gguf.add_head_count (params.n_head)
self.gguf.add_head_count_kv (params.n_head_kv)
@@ -1183,13 +1241,14 @@ class OutputFile:
@staticmethod
def write_vocab_only(
fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: Metadata = None,
) -> None:
check_vocab_size(params, vocab, pad_vocab=pad_vocab)
of = OutputFile(fname_out, endianess=endianess)
# meta data
of.add_meta_model(params, metadata)
of.add_meta_arch(params)
of.add_meta_vocab(vocab)
of.add_meta_special_vocab(svocab)
@@ -1216,12 +1275,14 @@ class OutputFile:
fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
pad_vocab: bool = False,
metadata: Metadata = None,
) -> None:
check_vocab_size(params, vocab, pad_vocab=pad_vocab)
of = OutputFile(fname_out, endianess=endianess)
# meta data
of.add_meta_model(params, metadata)
of.add_meta_arch(params)
if isinstance(vocab, Vocab):
of.add_meta_vocab(vocab)
@@ -1257,6 +1318,37 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileT
raise ValueError(f"Unexpected combination of types: {name_to_type}")
def model_parameter_count(model: LazyModel) -> int:
total_model_parameters = 0
for i, (name, lazy_tensor) in enumerate(model.items()):
sum_weights_in_tensor = 1
for dim in lazy_tensor.shape:
sum_weights_in_tensor *= dim
total_model_parameters += sum_weights_in_tensor
return total_model_parameters
def model_parameter_count_rounded_notation(model_params_count: int) -> str:
if model_params_count > 1e12 :
# Trillions Of Parameters
scaled_model_params = model_params_count * 1e-12
scale_suffix = "T"
elif model_params_count > 1e9 :
# Billions Of Parameters
scaled_model_params = model_params_count * 1e-9
scale_suffix = "B"
elif model_params_count > 1e6 :
# Millions Of Parameters
scaled_model_params = model_params_count * 1e-6
scale_suffix = "M"
else:
# Thousands Of Parameters
scaled_model_params = model_params_count * 1e-3
scale_suffix = "K"
return f"{round(scaled_model_params)}{scale_suffix}"
def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
for (name, tensor) in model.items()}
@@ -1436,13 +1528,35 @@ class VocabFactory:
return vocab, special_vocab
def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
namestr = {
GGMLFileType.AllF32: "f32",
GGMLFileType.MostlyF16: "f16",
GGMLFileType.MostlyQ8_0:"q8_0",
def default_convention_outfile(file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> str:
quantization = {
GGMLFileType.AllF32: "F32",
GGMLFileType.MostlyF16: "F16",
GGMLFileType.MostlyQ8_0: "Q8_0",
}[file_type]
ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf"
parameters = model_parameter_count_rounded_notation(model_params_count)
expert_count = ""
if params.n_experts is not None:
expert_count = f"{params.n_experts}x"
version = ""
if metadata is not None and metadata.version is not None:
version = f"-{metadata.version}"
name = "ggml-model"
if metadata is not None and metadata.name is not None:
name = metadata.name
elif params.path_model is not None:
name = params.path_model.name
return f"{name}{version}-{expert_count}{parameters}-{quantization}"
def default_outfile(model_paths: list[Path], file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> Path:
default_filename = default_convention_outfile(file_type, params, model_params_count, metadata)
ret = model_paths[0].parent / f"{default_filename}.gguf"
if ret in model_paths:
logger.error(
f"Error: Default output path ({ret}) would overwrite the input. "
@@ -1480,17 +1594,30 @@ def main(args_in: list[str] | None = None) -> None:
parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
parser.add_argument("--metadata", type=Path, help="Specify the path for a metadata file")
parser.add_argument("--get-outfile", action="store_true", help="get calculated default outfile name")
args = parser.parse_args(args_in)
if args.verbose:
logging.basicConfig(level=logging.DEBUG)
elif args.dump_single or args.dump:
elif args.dump_single or args.dump or args.get_outfile:
# Avoid printing anything besides the dump output
logging.basicConfig(level=logging.WARNING)
else:
logging.basicConfig(level=logging.INFO)
metadata = Metadata.load(args.metadata)
if args.get_outfile:
model_plus = load_some_model(args.model)
params = Params.load(model_plus)
model = convert_model_names(model_plus.model, params, args.skip_unknown)
model_params_count = model_parameter_count(model_plus.model)
ftype = pick_output_type(model, args.outtype)
print(f"{default_convention_outfile(ftype, params, model_params_count, metadata)}") # noqa: NP100
return
if args.no_vocab and args.vocab_only:
raise ValueError("--vocab-only does not make sense with --no-vocab")
@@ -1504,6 +1631,9 @@ def main(args_in: list[str] | None = None) -> None:
else:
model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None)
model_params_count = model_parameter_count(model_plus.model)
logger.info(f"model parameters count : {model_params_count} ({model_parameter_count_rounded_notation(model_params_count)})")
if args.dump:
do_dump_model(model_plus)
return
@@ -1557,7 +1687,7 @@ def main(args_in: list[str] | None = None) -> None:
f_norm_eps = 1e-5,
)
OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
endianess=endianess, pad_vocab=args.pad_vocab)
endianess=endianess, pad_vocab=args.pad_vocab, metadata=metadata)
logger.info(f"Wrote {outfile}")
return
@@ -1570,13 +1700,13 @@ def main(args_in: list[str] | None = None) -> None:
model = convert_model_names(model, params, args.skip_unknown)
ftype = pick_output_type(model, args.outtype)
model = convert_to_output_type(model, ftype)
outfile = args.outfile or default_outfile(model_plus.paths, ftype)
outfile = args.outfile or default_outfile(model_plus.paths, ftype, params, model_params_count, metadata)
params.ftype = ftype
logger.info(f"Writing {outfile}, format {ftype}")
OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab)
concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab, metadata=metadata)
logger.info(f"Wrote {outfile}")

docs/debugging-tests.md (new file, +88 lines)
View File

@@ -0,0 +1,88 @@
# Debugging Tests Tips
## How to run & debug a specific test without anything else to keep the feedback loop short?
There is a script called debug-test.sh in the scripts folder that takes a test name regex and an optional test number:
`debug-test.sh [OPTION]... <test_regex> <test_number>`
For example, running the command below prints an interactive list from which you can select a test; the script then builds that test and runs it in the debugger for you.
```bash
./scripts/debug-test.sh test-tokenizer
# Once in the debugger, i.e. at the chevrons prompt, setting a breakpoint could be as follows:
>>> b main
```
For further reference use `debug-test.sh -h` to print help.
&nbsp;
### How does the script work?
If you want to use the concepts from the script separately, the important ones are briefly outlined below.
#### Step 1: Reset and Setup folder context
From the base of this repository, let's create `build-ci-debug` as our build context.
```bash
rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug
```
#### Step 2: Setup Build Environment and Compile Test Binaries
Set up and trigger a build in debug mode. You may adapt the arguments as needed, but these are sane defaults for this case.
```bash
cmake -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_FATAL_WARNINGS=ON ..
make -j
```
#### Step 3.1: Identify Test Command for Debugging
The output of this command will give you the command & arguments needed to run GDB.
* `-R test-tokenizer` : looks for all the test files named `test-tokenizer*` (R=Regex)
* `-N` : "show-only" disables test execution & shows test commands that you can feed to GDB.
* `-V` : Verbose Mode
```bash
ctest -R "test-tokenizer" -V -N
```
This may return output similar to the following (only the key lines to pay attention to are shown):
```bash
...
1: Test command: ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf"
1: Working Directory: .
Labels: main
Test #1: test-tokenizer-0-llama-spm
...
4: Test command: ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-falcon.gguf"
4: Working Directory: .
Labels: main
Test #4: test-tokenizer-0-falcon
...
```
So for test #1 we can extract two pieces of relevant information:
* Test Binary: `~/llama.cpp/build-ci-debug/bin/test-tokenizer-0`
* Test GGUF Model: `~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf`
#### Step 3.2: Run GDB on test command
Based on the ctest 'Test command' output above, we can start a GDB session with:
```bash
gdb --args ${Test Binary} ${Test GGUF Model}
```
Example:
```bash
gdb --args ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf"
```

View File

@@ -49,4 +49,7 @@ else()
add_subdirectory(server)
endif()
add_subdirectory(export-lora)
if (LLAMA_RPC)
add_subdirectory(rpc)
endif()
endif()

View File

@@ -300,14 +300,10 @@ int main(int argc, char ** argv) {
return 1;
}
for (auto & image : params.image) {
if (prompt_contains_image(params.prompt)) {
auto ctx_llava = llava_init_context(&params, model);
auto image_embed = load_image(ctx_llava, &params, image);
if (!image_embed) {
std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
return 1;
}
auto image_embed = load_image(ctx_llava, &params, "");
// process the prompt
process_prompt(ctx_llava, image_embed, &params, params.prompt);
@@ -316,7 +312,26 @@ int main(int argc, char ** argv) {
llava_image_embed_free(image_embed);
ctx_llava->model = NULL;
llava_free(ctx_llava);
} else {
for (auto & image : params.image) {
auto ctx_llava = llava_init_context(&params, model);
auto image_embed = load_image(ctx_llava, &params, image);
if (!image_embed) {
std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
return 1;
}
// process the prompt
process_prompt(ctx_llava, image_embed, &params, params.prompt);
llama_print_timings(ctx_llava->ctx_llama);
llava_image_embed_free(image_embed);
ctx_llava->model = NULL;
llava_free(ctx_llava);
}
}
llama_free_model(model);
return 0;

View File

@@ -7,6 +7,8 @@ Also note that finetunes typically result in a higher perplexity value even thou
Within llama.cpp the perplexity of base models is used primarily to judge the quality loss from e.g. quantized models vs. FP16.
The convention among contributors is to use the Wikitext-2 test set for testing unless noted otherwise (can be obtained with `scripts/get-wikitext-2.sh`).
When numbers are listed all command line arguments and compilation options are left at their defaults unless noted otherwise.
llama.cpp numbers are **not** directly comparable to those of other projects because the exact values depend strongly on the implementation details.
By default only the mean perplexity value and the corresponding uncertainty are calculated.
The uncertainty is determined empirically by assuming a Gaussian distribution of the "correct" logits per token and then applying error propagation.
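In code terms the default statistic is just the standard error of the mean log-likelihood pushed through the exponential. A minimal NumPy sketch of that propagation (the per-token values below are made up, not llama.cpp output):
```python
import numpy as np

# Hypothetical per-token negative log-likelihoods (nats) collected during a perplexity run.
rng = np.random.default_rng(0)
nll = rng.normal(loc=1.8, scale=0.9, size=100_000)

mean_nll = nll.mean()
sem_nll = nll.std(ddof=1) / np.sqrt(len(nll))  # standard error of the mean (Gaussian assumption)

ppl = np.exp(mean_nll)
ppl_err = ppl * sem_nll  # error propagation through exp(): d(PPL) = PPL * d(mean NLL)
print(f"PPL = {ppl:.4f} +/- {ppl_err:.4f}")
```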
@@ -32,7 +34,13 @@ In addition to the KL divergence the following statistics are calculated with `-
## LLaMA 3 8b Scoreboard
Results are sorted by Kullback-Leibler divergence relative to FP16.
| Revision | f364eb6f |
|:---------|:-------------------|
| Backend | CUDA |
| CPU | AMD Epyc 7742 |
| GPU | 1x NVIDIA RTX 4090 |
Results were generated using the CUDA backend and are sorted by Kullback-Leibler divergence relative to FP16.
The "WT" importance matrices were created using varying numbers of Wikitext tokens and can be found [here](https://huggingface.co/JohannesGaessler/llama.cpp_importance_matrices/blob/main/imatrix-llama_3-8b-f16-2.7m_tokens.dat).
| Quantization | imatrix | Model size [GiB] | PPL | ΔPPL | KLD | Mean Δp | RMS Δp |
@@ -89,6 +97,12 @@ K-quants score better on mean Δp than the legacy quants than e.g. KL divergence
## LLaMA 2 vs. LLaMA 3 Quantization comparison
| Revision | f364eb6f |
|:---------|:-------------------|
| Backend | CUDA |
| CPU | AMD Epyc 7742 |
| GPU | 1x NVIDIA RTX 4090 |
| Metric | L2 7b q2_K | L3 8b q2_K | L2 7b q4_K_M | L3 8b q4_K_M | L2 7b q6_K | L3 8b q6_K | L2 7b q8_0 | L3 8b q8_0 |
|-----------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|
| Mean PPL | 5.794552 ± 0.032298 | 9.751568 ± 0.063312 | 5.877078 ± 0.032781 | 6.407115 ± 0.039119 | 5.808494 ± 0.032425 | 6.253382 ± 0.038078 | 5.798542 ± 0.032366 | 6.234284 ± 0.037878 |
@@ -107,6 +121,50 @@ K-quants score better on mean Δp than the legacy quants than e.g. KL divergence
| RMS Δp | 9.762 ± 0.053 % | 21.421 ± 0.079 % | 3.252 ± 0.024 % | 5.519 ± 0.050 % | 1.339 ± 0.010 % | 2.295 ± 0.019 % | 0.618 ± 0.011 % | 1.198 ± 0.007 % |
| Same top p | 85.584 ± 0.086 % | 71.138 ± 0.119 % | 94.665 ± 0.055 % | 91.901 ± 0.072 % | 97.520 ± 0.038 % | 96.031 ± 0.051 % | 98.846 ± 0.026 % | 97.674 ± 0.040 % |
## LLaMA 3 BF16 vs. FP16 comparison
| Revision | 83330d8c |
|:---------|:--------------|
| Backend | CPU |
| CPU | AMD Epyc 7742 |
| GPU | N/A |
Results were calculated with LLaMA 3 8b BF16 as `--kl-divergence-base` and LLaMA 3 8b FP16 as the `--model` for comparison.
| Metric | Value |
|--------------------------------|--------------------------|
| Mean PPL(Q) | 6.227711 ± 0.037833 |
| Mean PPL(base) | 6.225194 ± 0.037771 |
| Cor(ln(PPL(Q)), ln(PPL(base))) | 99.990% |
| Mean ln(PPL(Q)/PPL(base)) | 0.000404 ± 0.000086 |
| Mean PPL(Q)/PPL(base) | 1.000404 ± 0.000086 |
| Mean PPL(Q)-PPL(base) | 0.002517 ± 0.000536 |
| Mean KLD | 0.00002515 ± 0.00000020 |
| Maximum KLD | 0.012206 |
| 99.9% KLD | 0.000799 |
| 99.0% KLD | 0.000222 |
| Median KLD | 0.000013 |
| 10.0% KLD | -0.000002 |
| 5.0% KLD | -0.000008 |
| 1.0% KLD | -0.000023 |
| Minimum KLD | -0.000059 |
| Mean Δp | -0.0000745 ± 0.0003952 % |
| Maximum Δp | 4.186% |
| 99.9% Δp | 1.049% |
| 99.0% Δp | 0.439% |
| 95.0% Δp | 0.207% |
| 90.0% Δp | 0.125% |
| 75.0% Δp | 0.029% |
| Median Δp | 0.000% |
| 25.0% Δp | -0.030% |
| 10.0% Δp | -0.126% |
| 5.0% Δp | -0.207% |
| 1.0% Δp | -0.434% |
| 0.1% Δp | -1.016% |
| Minimum Δp | -4.672% |
| RMS Δp | 0.150 ± 0.001 % |
| Same top p | 99.739 ± 0.013 % |
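For reference, the KLD and "same top p" statistics in these tables reduce to simple per-token computations over the two models' output distributions. A rough NumPy sketch of my reading of them (the logits below are made up):
```python
import numpy as np

def softmax(logits):
    z = logits - logits.max(axis=-1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=-1, keepdims=True)

rng = np.random.default_rng(0)
base_logits  = rng.normal(size=(4, 32))                               # e.g. BF16 model: 4 tokens, vocab of 32
quant_logits = base_logits + rng.normal(scale=0.01, size=(4, 32))     # slightly perturbed, e.g. FP16/quantized

p = softmax(base_logits)
q = softmax(quant_logits)

kld = (p * (np.log(p) - np.log(q))).sum(axis=-1)      # per-token KL divergence, base vs. comparison model
same_top = (p.argmax(axis=-1) == q.argmax(axis=-1))   # "same top p": does the most likely token agree?
print(kld.mean(), same_top.mean())
```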
## Old Numbers

View File

@@ -0,0 +1,2 @@
add_executable(rpc-server rpc-server.cpp)
target_link_libraries(rpc-server PRIVATE ggml llama)

74
examples/rpc/README.md Normal file
View File

@@ -0,0 +1,74 @@
## Overview
The `rpc-server` allows running `ggml` backend on a remote host.
The RPC backend communicates with one or several instances of `rpc-server` and offloads computations to them.
This can be used for distributed LLM inference with `llama.cpp` in the following way:
```mermaid
flowchart TD
rpcb---|TCP|srva
rpcb---|TCP|srvb
rpcb-.-|TCP|srvn
subgraph hostn[Host N]
srvn[rpc-server]-.-backend3["Backend (CUDA,Metal,etc.)"]
end
subgraph hostb[Host B]
srvb[rpc-server]---backend2["Backend (CUDA,Metal,etc.)"]
end
subgraph hosta[Host A]
srva[rpc-server]---backend["Backend (CUDA,Metal,etc.)"]
end
subgraph host[Main Host]
ggml[llama.cpp]---rpcb[RPC backend]
end
style hostn stroke:#66,stroke-width:2px,stroke-dasharray: 5 5
```
Each host can run a different backend, e.g. one with CUDA and another with Metal.
You can also run multiple `rpc-server` instances on the same host, each with a different backend.
## Usage
On each host, build the corresponding backend with `cmake` and add `-DLLAMA_RPC=ON` to the build options.
For example, to build the CUDA backend with RPC support:
```bash
mkdir build-rpc-cuda
cd build-rpc-cuda
cmake .. -DLLAMA_CUDA=ON -DLLAMA_RPC=ON
cmake --build . --config Release
```
Then, start the `rpc-server` with the backend:
```bash
$ bin/rpc-server 0.0.0.0 50052
create_backend: using CUDA backend
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
ggml_cuda_init: CUDA_USE_TENSOR_CORES: yes
ggml_cuda_init: found 1 CUDA devices:
Device 0: NVIDIA T1200 Laptop GPU, compute capability 7.5, VMM: yes
Starting RPC server on 0.0.0.0:50052
```
When using the CUDA backend, you can specify the device with the `CUDA_VISIBLE_DEVICES` environment variable, e.g.:
```bash
$ CUDA_VISIBLE_DEVICES=0 bin/rpc-server 0.0.0.0 50052
```
This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device.
On the main host, build `llama.cpp` with just `-DLLAMA_RPC=ON`:
```bash
mkdir build-rpc
cd build-rpc
cmake .. -DLLAMA_RPC=ON
cmake --build . --config Release
```
Finally, use the `--rpc` option to specify the host and port of each `rpc-server`:
```bash
$ bin/main -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99
```

View File

@@ -0,0 +1,70 @@
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif
#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif
#include "ggml-rpc.h"
#include <string>
#include <stdio.h>
static ggml_backend_t create_backend() {
ggml_backend_t backend = NULL;
#ifdef GGML_USE_CUDA
fprintf(stderr, "%s: using CUDA backend\n", __func__);
backend = ggml_backend_cuda_init(0); // init device 0
if (!backend) {
fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
}
#elif GGML_USE_METAL
fprintf(stderr, "%s: using Metal backend\n", __func__);
backend = ggml_backend_metal_init();
if (!backend) {
fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
}
#endif
// if there is no GPU backend, fall back to the CPU backend
if (!backend) {
fprintf(stderr, "%s: using CPU backend\n", __func__);
backend = ggml_backend_cpu_init();
}
return backend;
}
static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
#ifdef GGML_USE_CUDA
ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
#else
// TODO: implement for other backends
*free_mem = 1;
*total_mem = 1;
#endif
}
int main(int argc, char * argv[]) {
if (argc < 3) {
fprintf(stderr, "Usage: %s <host> <port>\n", argv[0]);
return 1;
}
const char * host = argv[1];
int port = std::stoi(argv[2]);
if (port <= 0 || port > 65535) {
fprintf(stderr, "Invalid port number: %d\n", port);
return 1;
}
ggml_backend_t backend = create_backend();
if (!backend) {
fprintf(stderr, "Failed to create backend\n");
return 1;
}
printf("Starting RPC server on %s:%d\n", host, port);
size_t free_mem, total_mem;
get_backend_memory(&free_mem, &total_mem);
std::string endpoint = std::string(host) + ":" + std::to_string(port);
start_rpc_server(backend, endpoint.c_str(), free_mem, total_mem);
ggml_backend_free(backend);
return 0;
}

View File

@@ -48,7 +48,7 @@ page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/
- `--path`: Path from which to serve static files. Default: disabled
- `--api-key`: Set an api key for request authorization. By default, the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys.
- `--api-key-file`: Path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`s.
- `--embedding`: Enable embedding extraction. Default: disabled
- `--embeddings`: Enable embedding vector output and the OAI-compatible endpoint `/v1/embeddings`. The physical batch size (`--ubatch-size`) must be chosen with care (see the example request below). Default: disabled
- `-np N`, `--parallel N`: Set the number of slots for processing requests. Default: `1`
- `-cb`, `--cont-batching`: Enable continuous batching (a.k.a dynamic batching). Default: disabled
- `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load a system prompt (initial prompt of all slots). This is useful for chat applications. [See more](#change-system-prompt-on-runtime)
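A minimal request against the new endpoint might look like the sketch below (assuming a server started with `--embeddings` is listening on `localhost:8080`; the field names follow the OAI embeddings format):
```python
import json
import urllib.request

# Assumes ./server ... --embeddings is running on localhost:8080.
payload = {"input": "Hello, world!", "model": "default"}
req = urllib.request.Request(
    "http://localhost:8080/v1/embeddings",
    data=json.dumps(payload).encode(),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    body = json.load(resp)
print(len(body["data"][0]["embedding"]))  # dimensionality of the returned vector
```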

View File

@@ -651,9 +651,6 @@ struct server_context {
std::string system_prompt;
std::vector<llama_token> system_tokens;
std::string name_user; // this should be the antiprompt
std::string name_assistant;
// slots / clients
std::vector<server_slot> slots;
json default_generation_settings_for_props;
@@ -673,6 +670,8 @@ struct server_context {
llama_free_model(model);
model = nullptr;
}
llama_batch_free(batch);
}
bool load_model(const gpt_params & params_) {
@@ -1098,15 +1097,11 @@ struct server_context {
system_need_update = false;
}
void system_prompt_set(const json & sys_props) {
system_prompt = sys_props.value("prompt", "");
name_user = sys_props.value("anti_prompt", "");
name_assistant = sys_props.value("assistant_name", "");
bool system_prompt_set(const std::string & sys_prompt) {
system_prompt = sys_prompt;
LOG_VERBOSE("system prompt process", {
{"system_prompt", system_prompt},
{"name_user", name_user},
{"name_assistant", name_assistant},
});
// release all slots
@@ -1115,6 +1110,7 @@ struct server_context {
}
system_need_update = true;
return true;
}
bool process_token(completion_token_output & result, server_slot & slot) {
@@ -1534,7 +1530,8 @@ struct server_context {
}
if (task.data.contains("system_prompt")) {
system_prompt_set(task.data.at("system_prompt"));
std::string sys_prompt = json_value(task.data, "system_prompt", std::string());
system_prompt_set(sys_prompt);
for (server_slot & slot : slots) {
slot.n_past = 0;
@@ -2918,7 +2915,7 @@ int main(int argc, char ** argv) {
server_params_parse(argc, argv, sparams, params);
if (!sparams.system_prompt.empty()) {
ctx_server.system_prompt_set(json::parse(sparams.system_prompt));
ctx_server.system_prompt_set(sparams.system_prompt);
}
if (params.model_alias == "unknown") {
@@ -3407,8 +3404,7 @@ int main(int argc, char ** argv) {
const auto handle_props = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
json data = {
{ "user_name", ctx_server.name_user.c_str() },
{ "assistant_name", ctx_server.name_assistant.c_str() },
{ "system_prompt", ctx_server.system_prompt.c_str() },
{ "default_generation_settings", ctx_server.default_generation_settings_for_props },
{ "total_slots", ctx_server.params.n_parallel }
};

View File

@@ -887,6 +887,7 @@ async def oai_chat_completions(user_prompt,
base_path,
async_client,
debug=False,
temperature=None,
model=None,
n_predict=None,
enable_streaming=None,
@@ -913,7 +914,8 @@ async def oai_chat_completions(user_prompt,
"model": model,
"max_tokens": n_predict,
"stream": enable_streaming,
"seed": seed
"temperature": temperature if temperature is not None else 0.0,
"seed": seed,
}
if response_format is not None:
payload['response_format'] = response_format
@@ -978,7 +980,8 @@ async def oai_chat_completions(user_prompt,
max_tokens=n_predict,
stream=enable_streaming,
response_format=payload.get('response_format'),
seed=seed
seed=seed,
temperature=payload['temperature']
)
except openai.error.AuthenticationError as e:
if expect_api_error is not None and expect_api_error:

View File

@@ -371,7 +371,7 @@ static json oaicompat_completion_params_parse(
llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0);
llama_params["seed"] = json_value(body, "seed", LLAMA_DEFAULT_SEED);
llama_params["stream"] = json_value(body, "stream", false);
llama_params["temperature"] = json_value(body, "temperature", 0.0);
llama_params["temperature"] = json_value(body, "temperature", 1.0);
llama_params["top_p"] = json_value(body, "top_p", 1.0);
// Apply chat template to the list of messages
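The practical effect of this change is that an OAI-compatible request which omits `temperature` is now sampled at 1.0 instead of greedily at 0.0; a client that wants the old behaviour has to send the field explicitly. A hedged sketch (assuming the server's OAI endpoint is reachable on `localhost:8080`):
```python
import json
import urllib.request

def chat(messages, temperature=None):
    payload = {"messages": messages}
    if temperature is not None:
        payload["temperature"] = temperature  # omit it and the server now assumes 1.0
    req = urllib.request.Request(
        "http://localhost:8080/v1/chat/completions",
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        return json.load(resp)["choices"][0]["message"]["content"]

msgs = [{"role": "user", "content": "Say hi."}]
print(chat(msgs))                    # sampled with the new default temperature of 1.0
print(chat(msgs, temperature=0.0))   # explicit 0.0 restores the old deterministic behaviour
```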

View File

@@ -1182,9 +1182,9 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
static char * fmt_size(size_t size) {
static char buffer[128];
if (size >= 1024*1024) {
sprintf(buffer, "%zuM", size/1024/1024);
snprintf(buffer, sizeof(buffer), "%zuM", size/1024/1024);
} else {
sprintf(buffer, "%zuK", size/1024);
snprintf(buffer, sizeof(buffer), "%zuK", size/1024);
}
return buffer;
}

View File

@@ -2204,6 +2204,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_UNARY_OP_RELU:
ggml_cuda_op_relu(ctx, dst);
break;
case GGML_UNARY_OP_SIGMOID:
ggml_cuda_op_sigmoid(ctx, dst);
break;
case GGML_UNARY_OP_HARDSIGMOID:
ggml_cuda_op_hardsigmoid(ctx, dst);
break;
@@ -2710,12 +2713,14 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
}
GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
switch (op->op) {
case GGML_OP_UNARY:
switch (ggml_get_unary_op(op)) {
case GGML_UNARY_OP_GELU:
case GGML_UNARY_OP_SILU:
case GGML_UNARY_OP_RELU:
case GGML_UNARY_OP_SIGMOID:
case GGML_UNARY_OP_HARDSIGMOID:
case GGML_UNARY_OP_HARDSWISH:
case GGML_UNARY_OP_GELU_QUICK:
@@ -2836,8 +2841,16 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_LEAKY_RELU:
case GGML_OP_FLASH_ATTN_EXT:
return true;
case GGML_OP_FLASH_ATTN_EXT:
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
return op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128;
#else
if (op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128) {
return true;
}
return ggml_cuda_info().devices[cuda_ctx->device].cc >= CC_VOLTA;
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
default:
return false;
}

View File

@@ -321,6 +321,10 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
#define FP16_MMA_AVAILABLE !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
static bool fast_fp16_available(const int cc) {
return cc >= CC_PASCAL && cc != 610;
}
static bool fp16_mma_available(const int cc) {
return cc < CC_OFFSET_AMD && cc >= CC_VOLTA;
}

View File

@@ -0,0 +1,47 @@
#define FATTN_KQ_STRIDE 256
#define HALF_MAX_HALF __float2half(65504.0f/2) // Use neg. of this instead of -INFINITY to initialize KQ max vals to avoid NaN upon subtraction.
#define SOFTMAX_FTZ_THRESHOLD -20.0f // Softmax exp. of values smaller than this are flushed to zero to avoid NaNs.
template<int D, int parallel_blocks> // D == head size
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(D, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_combine_results(
const float * __restrict__ VKQ_parts,
const float2 * __restrict__ VKQ_meta,
float * __restrict__ dst) {
VKQ_parts += parallel_blocks*D * gridDim.y*blockIdx.x;
VKQ_meta += parallel_blocks * gridDim.y*blockIdx.x;
dst += D * gridDim.y*blockIdx.x;
const int tid = threadIdx.x;
__builtin_assume(tid < D);
__shared__ float2 meta[parallel_blocks];
if (tid < 2*parallel_blocks) {
((float *) meta)[threadIdx.x] = ((const float *)VKQ_meta) [blockIdx.y*(2*parallel_blocks) + tid];
}
__syncthreads();
float kqmax = meta[0].x;
#pragma unroll
for (int l = 1; l < parallel_blocks; ++l) {
kqmax = max(kqmax, meta[l].x);
}
float VKQ_numerator = 0.0f;
float VKQ_denominator = 0.0f;
#pragma unroll
for (int l = 0; l < parallel_blocks; ++l) {
const float diff = meta[l].x - kqmax;
const float KQ_max_scale = expf(diff);
const uint32_t ftz_mask = 0xFFFFFFFF * (diff > SOFTMAX_FTZ_THRESHOLD);
*((uint32_t *) &KQ_max_scale) &= ftz_mask;
VKQ_numerator += KQ_max_scale * VKQ_parts[l*gridDim.y*D + blockIdx.y*D + tid];
VKQ_denominator += KQ_max_scale * meta[l].y;
}
dst[blockIdx.y*D + tid] = VKQ_numerator / VKQ_denominator;
}
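The combination kernel above is a numerically safe weighted average of the per-block partial results: every partial numerator and denominator is rescaled to the common maximum before summing (the kernel additionally flushes very small scales to zero via `SOFTMAX_FTZ_THRESHOLD`). A NumPy sketch of the same idea, with made-up values and a single two-element output:
```python
import numpy as np

# Each parallel block contributes a partial numerator (VKQ part) plus (kqmax, kqsum) metadata.
parts = [  # (VKQ_part, kqmax, kqsum) per parallel block
    (np.array([0.10, 0.20]), -1.5, 2.0),
    (np.array([0.05, 0.30]), -0.5, 1.5),
]

kqmax = max(m for _, m, _ in parts)   # global running maximum across blocks
num = np.zeros(2)
den = 0.0
for vkq, m, s in parts:
    scale = np.exp(m - kqmax)         # <= 1, so the rescaling cannot overflow
    num += scale * vkq
    den += scale * s
print(num / den)                      # combined attention output for this row
```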

430
ggml-cuda/fattn-vec-f16.cu Normal file
View File

@@ -0,0 +1,430 @@
#include "common.cuh"
#include "fattn-common.cuh"
#include "fattn-vec-f16.cuh"
template<int D, int ncols, int parallel_blocks> // D == head size
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(D, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_vec_ext_f16(
const char * __restrict__ Q,
const char * __restrict__ K,
const char * __restrict__ V,
const char * __restrict__ mask,
float * __restrict__ dst,
float2 * __restrict__ dst_meta,
const float scale,
const float max_bias,
const float m0,
const float m1,
const uint32_t n_head_log2,
const int ne00,
const int ne01,
const int ne02,
const int ne03,
const int ne10,
const int ne11,
const int ne12,
const int ne13,
const int ne31,
const int nb31,
const int nb01,
const int nb02,
const int nb03,
const int nb11,
const int nb12,
const int nb13,
const int ne0,
const int ne1,
const int ne2,
const int ne3) {
#if FP16_AVAILABLE
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.y + nb01*ic0);
const half2 * K_h2 = (const half2 *) (K + nb12*(blockIdx.y / gqa_ratio));
const half * V_h = (const half *) (V + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
const half * maskh = (const half *) mask + ne11*ic0;
const int stride_KV = nb11 / sizeof(half);
const int stride_KV2 = nb11 / sizeof(half2);
half slopeh = __float2half(1.0f);
// ALiBi
if (max_bias > 0.0f) {
const int h = blockIdx.y;
const float base = h < n_head_log2 ? m0 : m1;
const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
slopeh = __float2half(powf(base, exph));
}
static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
constexpr int nwarps = D / WARP_SIZE;
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
__builtin_assume(tid < D);
__shared__ half KQ[ncols*D];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
KQ[j*D + tid] = -HALF_MAX_HALF;
}
half2 * KQ2 = (half2 *) KQ;
half kqmax[ncols];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
kqmax[j] = -HALF_MAX_HALF;
}
half kqsum[ncols] = {0.0f};
__shared__ half kqmax_shared[ncols][WARP_SIZE];
__shared__ half kqsum_shared[ncols][WARP_SIZE];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
if (threadIdx.y == 0) {
kqmax_shared[j][threadIdx.x] = -HALF_MAX_HALF;
kqsum_shared[j][threadIdx.x] = 0.0f;
}
}
__syncthreads();
// Convert Q to half2 and store in registers:
half2 Q_h2[ncols][D/(2*WARP_SIZE)];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
const float2 tmp = Q_f2[j*(nb01/sizeof(float2)) + i];
Q_h2[j][i0/WARP_SIZE] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y);
}
}
half2 VKQ[ncols] = {{0.0f, 0.0f}};
const int k_start = parallel_blocks == 1 ? 0 : ip*D;
for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) {
// Calculate KQ tile and keep track of new maximum KQ values:
// For unknown reasons using a half array of size 1 for kqmax_new causes a performance regression,
// see https://github.com/ggerganov/llama.cpp/pull/7061 .
// Therefore this variable is defined twice but only used once (so that the compiler can optimize out the unused variable).
half kqmax_new = kqmax[0];
half kqmax_new_arr[ncols];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
kqmax_new_arr[j] = kqmax[j];
}
#pragma unroll
for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) {
const int i_KQ = i_KQ_0 + threadIdx.y;
if ((i_KQ_0 + nwarps > D && i_KQ >= D) || (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + i_KQ >= ne11)) {
break;
}
half2 sum2[ncols] = {{0.0f, 0.0f}};
#pragma unroll
for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
const int k_KQ = k_KQ_0 + threadIdx.x;
const half2 K_ik = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
sum2[j] += K_ik * Q_h2[j][k_KQ_0/WARP_SIZE];
}
}
#pragma unroll
for (int j = 0; j < ncols; ++j) {
sum2[j] = warp_reduce_sum(sum2[j]);
half sum = __low2half(sum2[j]) + __high2half(sum2[j]);
sum += mask ? slopeh*maskh[j*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
if (ncols == 1) {
kqmax_new = ggml_cuda_hmax(kqmax_new, sum);
} else {
kqmax_new_arr[j] = ggml_cuda_hmax(kqmax_new_arr[j], sum);
}
if (threadIdx.x == 0) {
KQ[j*D + i_KQ] = sum;
}
}
}
#pragma unroll
for (int j = 0; j < ncols; ++j) {
half kqmax_new_j = ncols == 1 ? kqmax_new : kqmax_new_arr[j];
kqmax_new_j = warp_reduce_max(kqmax_new_j);
if (threadIdx.x == 0) {
kqmax_shared[j][threadIdx.y] = kqmax_new_j;
}
}
__syncthreads();
#pragma unroll
for (int j = 0; j < ncols; ++j) {
half kqmax_new_j = kqmax_shared[j][threadIdx.x];
kqmax_new_j = warp_reduce_max(kqmax_new_j);
const half KQ_max_scale = hexp(kqmax[j] - kqmax_new_j);
kqmax[j] = kqmax_new_j;
const half val = hexp(KQ[j*D + tid] - kqmax[j]);
kqsum[j] = kqsum[j]*KQ_max_scale + val;
KQ[j*D + tid] = val;
VKQ[j] *= __half2half2(KQ_max_scale);
}
__syncthreads();
#pragma unroll
for (int k0 = 0; k0 < D; k0 += 2) {
if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k0 >= ne11) {
break;
}
half2 V_k;
reinterpret_cast<half&>(V_k.x) = V_h[(k_VKQ_0 + k0 + 0)*stride_KV + tid];
reinterpret_cast<half&>(V_k.y) = V_h[(k_VKQ_0 + k0 + 1)*stride_KV + tid];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
VKQ[j] += V_k*KQ2[j*(D/2) + k0/2];
}
}
__syncthreads();
}
#pragma unroll
for (int j = 0; j < ncols; ++j) {
kqsum[j] = warp_reduce_sum(kqsum[j]);
if (threadIdx.x == 0) {
kqsum_shared[j][threadIdx.y] = kqsum[j];
}
}
__syncthreads();
#pragma unroll
for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);
half dst_val = (__low2half(VKQ[j_VKQ]) + __high2half(VKQ[j_VKQ]));
if (parallel_blocks == 1) {
dst_val /= kqsum[j_VKQ];
}
const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
}
if (parallel_blocks != 1 && tid != 0) {
#pragma unroll
for (int j = 0; j < ncols; ++j) {
dst_meta[(ic0 + j)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j], kqsum[j]);
}
}
#else
NO_DEVICE_CODE;
#endif // FP16_AVAILABLE
}
template <int D, int cols_per_block, int parallel_blocks> void launch_fattn_vec_f16(
const ggml_tensor * Q, const ggml_tensor * K, const ggml_tensor * V, ggml_tensor * KQV, const ggml_tensor * mask,
ggml_cuda_pool & pool, cudaStream_t main_stream
) {
ggml_cuda_pool_alloc<float> dst_tmp(pool);
ggml_cuda_pool_alloc<float2> dst_tmp_meta(pool);
if (parallel_blocks > 1) {
dst_tmp.alloc(parallel_blocks*ggml_nelements(KQV));
dst_tmp_meta.alloc(parallel_blocks*ggml_nrows(KQV));
}
constexpr int nwarps = (D + WARP_SIZE - 1) / WARP_SIZE;
const dim3 block_dim(WARP_SIZE, nwarps, 1);
const dim3 blocks_num(parallel_blocks*((Q->ne[1] + cols_per_block - 1) / cols_per_block), Q->ne[2], Q->ne[3]);
const int shmem = 0;
float scale = 1.0f;
float max_bias = 0.0f;
memcpy(&scale, (float *) KQV->op_params + 0, sizeof(float));
memcpy(&max_bias, (float *) KQV->op_params + 1, sizeof(float));
const uint32_t n_head = Q->ne[2];
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>
<<<blocks_num, block_dim, shmem, main_stream>>> (
(const char *) Q->data,
(const char *) K->data,
(const char *) V->data,
mask ? ((const char *) mask->data) : nullptr,
parallel_blocks == 1 ? (float *) KQV->data : dst_tmp.ptr, dst_tmp_meta.ptr,
scale, max_bias, m0, m1, n_head_log2,
Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
K->ne[0], K->ne[1], K->ne[2], K->ne[3],
mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0,
Q->nb[1], Q->nb[2], Q->nb[3],
K->nb[1], K->nb[2], K->nb[3],
KQV->ne[0], KQV->ne[1], KQV->ne[2], KQV->ne[3]
);
CUDA_CHECK(cudaGetLastError());
if (parallel_blocks == 1) {
return;
}
const dim3 block_dim_combine(D, 1, 1);
const dim3 blocks_num_combine(Q->ne[1], blocks_num.y, blocks_num.z);
const int shmem_combine = 0;
flash_attn_combine_results<D, parallel_blocks>
<<<blocks_num_combine, block_dim_combine, shmem_combine, main_stream>>>
(dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data);
CUDA_CHECK(cudaGetLastError());
}
void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * Q = dst->src[0];
const ggml_tensor * K = dst->src[1];
const ggml_tensor * V = dst->src[2];
const ggml_tensor * mask = dst->src[3];
ggml_tensor * KQV = dst;
const int32_t precision = KQV->op_params[2];
GGML_ASSERT(precision == GGML_PREC_DEFAULT);
constexpr int cols_per_block = 1;
constexpr int parallel_blocks = 4;
switch (Q->ne[0]) {
case 64:
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
case 128:
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
case 256:
launch_fattn_vec_f16<256, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
default:
GGML_ASSERT(false);
break;
}
}
void ggml_cuda_flash_attn_ext_vec_f16_no_mma(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * Q = dst->src[0];
const ggml_tensor * K = dst->src[1];
const ggml_tensor * V = dst->src[2];
const ggml_tensor * mask = dst->src[3];
ggml_tensor * KQV = dst;
const int32_t precision = KQV->op_params[2];
GGML_ASSERT(precision == GGML_PREC_DEFAULT);
GGML_ASSERT(Q->ne[0] == 64 || Q->ne[0] == 128 && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
if (Q->ne[1] == 1) {
constexpr int cols_per_block = 1;
constexpr int parallel_blocks = 4;
switch (Q->ne[0]) {
case 64:
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
case 128:
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
default:
GGML_ASSERT(false);
break;
}
return;
}
if (Q->ne[1] == 2) {
constexpr int cols_per_block = 2;
constexpr int parallel_blocks = 4;
switch (Q->ne[0]) {
case 64:
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
case 128:
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
default:
GGML_ASSERT(false);
break;
}
return;
}
if (Q->ne[1] <= 4) {
constexpr int cols_per_block = 4;
constexpr int parallel_blocks = 4;
switch (Q->ne[0]) {
case 64:
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
case 128:
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
default:
GGML_ASSERT(false);
break;
}
return;
}
if (Q->ne[1] <= 8) {
constexpr int cols_per_block = 8;
constexpr int parallel_blocks = 4;
switch (Q->ne[0]) {
case 64:
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
case 128:
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
default:
GGML_ASSERT(false);
break;
}
return;
}
constexpr int cols_per_block = 8;
constexpr int parallel_blocks = 1;
switch (Q->ne[0]) {
case 64:
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
case 128:
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
default:
GGML_ASSERT(false);
break;
}
}
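Both vector kernels apply ALiBi through a per-head slope derived from `max_bias`, `m0`, `m1` and `n_head_log2`. A NumPy sketch of that slope computation, mirroring the launch/kernel code above (the head count and bias values are made up):
```python
import numpy as np

def alibi_slopes(n_head: int, max_bias: float) -> np.ndarray:
    # Mirrors the m0/m1/n_head_log2 math in launch_fattn_vec_f16 and the kernel's per-head branch.
    n_head_log2 = 2 ** int(np.floor(np.log2(n_head)))
    m0 = 2.0 ** (-max_bias / n_head_log2)
    m1 = 2.0 ** (-(max_bias / 2.0) / n_head_log2)
    slopes = np.empty(n_head)
    for h in range(n_head):
        if h < n_head_log2:
            slopes[h] = m0 ** (h + 1)
        else:
            slopes[h] = m1 ** (2 * (h - n_head_log2) + 1)
    return slopes

# Example with a non-power-of-two head count, so both branches are exercised.
print(alibi_slopes(n_head=12, max_bias=8.0))
```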

View File

@@ -0,0 +1,5 @@
#include "common.cuh"
void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_flash_attn_ext_vec_f16_no_mma(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

384
ggml-cuda/fattn-vec-f32.cu Normal file
View File

@@ -0,0 +1,384 @@
#include "common.cuh"
#include "fattn-common.cuh"
#include "fattn-vec-f32.cuh"
template<int D, int ncols, int parallel_blocks> // D == head size
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(D, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_vec_ext_f32(
const char * __restrict__ Q,
const char * __restrict__ K,
const char * __restrict__ V,
const char * __restrict__ mask,
float * __restrict__ dst,
float2 * __restrict__ dst_meta,
const float scale,
const float max_bias,
const float m0,
const float m1,
const uint32_t n_head_log2,
const int ne00,
const int ne01,
const int ne02,
const int ne03,
const int ne10,
const int ne11,
const int ne12,
const int ne13,
const int ne31,
const int nb31,
const int nb01,
const int nb02,
const int nb03,
const int nb11,
const int nb12,
const int nb13,
const int ne0,
const int ne1,
const int ne2,
const int ne3) {
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.y + nb01*ic0);
const half2 * K_h2 = (const half2 *) (K + nb12*(blockIdx.y / gqa_ratio));
const half * V_h = (const half *) (V + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
const half * maskh = (const half *) mask + ne11*ic0;
const int stride_KV = nb11 / sizeof(half);
const int stride_KV2 = nb11 / sizeof(half2);
float slope = 1.0f;
// ALiBi
if (max_bias > 0.0f) {
const int h = blockIdx.y;
const float base = h < n_head_log2 ? m0 : m1;
const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
slope = powf(base, exph);
}
static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
constexpr int nwarps = D / WARP_SIZE;
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
__builtin_assume(tid < D);
__shared__ float KQ[ncols*D];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
KQ[j*D + tid] = -FLT_MAX/2.0f;
}
float kqmax[ncols];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
kqmax[j] = -FLT_MAX/2.0f;
}
float kqsum[ncols] = {0.0f};
__shared__ float kqmax_shared[ncols][WARP_SIZE];
__shared__ float kqsum_shared[ncols][WARP_SIZE];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
if (threadIdx.y == 0) {
kqmax_shared[j][threadIdx.x] = -FLT_MAX/2.0f;
kqsum_shared[j][threadIdx.x] = 0.0f;
}
}
__syncthreads();
// Convert Q to half2 and store in registers:
float2 Q_h2[ncols][D/(2*WARP_SIZE)];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
Q_h2[j][i0/WARP_SIZE] = Q_f2[j*(nb01/sizeof(float2)) + i];
Q_h2[j][i0/WARP_SIZE].x *= scale;
Q_h2[j][i0/WARP_SIZE].y *= scale;
}
}
float VKQ[ncols] = {0.0f};
const int k_start = parallel_blocks == 1 ? 0 : ip*D;
for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) {
// Calculate KQ tile and keep track of new maximum KQ values:
float kqmax_new_arr[ncols];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
kqmax_new_arr[j] = kqmax[j];
}
#pragma unroll
for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) {
const int i_KQ = i_KQ_0 + threadIdx.y;
if ((i_KQ_0 + nwarps > D && i_KQ >= D) || (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + i_KQ >= ne11)) {
break;
}
float sum[ncols] = {0.0f};
#pragma unroll
for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
const int k_KQ = k_KQ_0 + threadIdx.x;
const half2 K_ik = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
sum[j] += __low2float(K_ik) * Q_h2[j][k_KQ_0/WARP_SIZE].x;
sum[j] += __high2float(K_ik) * Q_h2[j][k_KQ_0/WARP_SIZE].y;
}
}
#pragma unroll
for (int j = 0; j < ncols; ++j) {
sum[j] = warp_reduce_sum(sum[j]);
sum[j] += mask ? slope*__half2float(maskh[j*ne11 + k_VKQ_0 + i_KQ]) : 0.0f;
kqmax_new_arr[j] = fmaxf(kqmax_new_arr[j], sum[j]);
if (threadIdx.x == 0) {
KQ[j*D + i_KQ] = sum[j];
}
}
}
#pragma unroll
for (int j = 0; j < ncols; ++j) {
float kqmax_new_j = kqmax_new_arr[j];
kqmax_new_j = warp_reduce_max(kqmax_new_j);
if (threadIdx.x == 0) {
kqmax_shared[j][threadIdx.y] = kqmax_new_j;
}
}
__syncthreads();
#pragma unroll
for (int j = 0; j < ncols; ++j) {
float kqmax_new_j = kqmax_shared[j][threadIdx.x];
kqmax_new_j = warp_reduce_max(kqmax_new_j);
const float KQ_max_scale = expf(kqmax[j] - kqmax_new_j);
kqmax[j] = kqmax_new_j;
const float val = expf(KQ[j*D + tid] - kqmax[j]);
kqsum[j] = kqsum[j]*KQ_max_scale + val;
KQ[j*D + tid] = val;
VKQ[j] *= KQ_max_scale;
}
__syncthreads();
#pragma unroll
for (int k = 0; k < D; ++k) {
if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k >= ne11) {
break;
}
const float V_ki = __half2float(V_h[(k_VKQ_0 + k)*stride_KV + tid]);
#pragma unroll
for (int j = 0; j < ncols; ++j) {
VKQ[j] += V_ki*KQ[j*D + k];
}
}
__syncthreads();
}
#pragma unroll
for (int j = 0; j < ncols; ++j) {
kqsum[j] = warp_reduce_sum(kqsum[j]);
if (threadIdx.x == 0) {
kqsum_shared[j][threadIdx.y] = kqsum[j];
}
}
__syncthreads();
#pragma unroll
for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);
float dst_val = VKQ[j_VKQ];
if (parallel_blocks == 1) {
dst_val /= kqsum[j_VKQ];
}
const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
}
if (parallel_blocks != 1 && tid != 0) {
#pragma unroll
for (int j = 0; j < ncols; ++j) {
dst_meta[(ic0 + j)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j], kqsum[j]);
}
}
}
template <int D, int cols_per_block, int parallel_blocks> void launch_fattn_vec_f32(
const ggml_tensor * Q, const ggml_tensor * K, const ggml_tensor * V, ggml_tensor * KQV, const ggml_tensor * mask,
ggml_cuda_pool & pool, cudaStream_t main_stream
) {
ggml_cuda_pool_alloc<float> dst_tmp(pool);
ggml_cuda_pool_alloc<float2> dst_tmp_meta(pool);
if (parallel_blocks > 1) {
dst_tmp.alloc(parallel_blocks*ggml_nelements(KQV));
dst_tmp_meta.alloc(parallel_blocks*ggml_nrows(KQV));
}
constexpr int nwarps = (D + WARP_SIZE - 1) / WARP_SIZE;
const dim3 block_dim(WARP_SIZE, nwarps, 1);
const dim3 blocks_num(parallel_blocks*((Q->ne[1] + cols_per_block - 1) / cols_per_block), Q->ne[2], Q->ne[3]);
const int shmem = 0;
float scale = 1.0f;
float max_bias = 0.0f;
memcpy(&scale, (float *) KQV->op_params + 0, sizeof(float));
memcpy(&max_bias, (float *) KQV->op_params + 1, sizeof(float));
const uint32_t n_head = Q->ne[2];
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
flash_attn_vec_ext_f32<D, cols_per_block, parallel_blocks>
<<<blocks_num, block_dim, shmem, main_stream>>> (
(const char *) Q->data,
(const char *) K->data,
(const char *) V->data,
mask ? ((const char *) mask->data) : nullptr,
parallel_blocks == 1 ? (float *) KQV->data : dst_tmp.ptr, dst_tmp_meta.ptr,
scale, max_bias, m0, m1, n_head_log2,
Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
K->ne[0], K->ne[1], K->ne[2], K->ne[3],
mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0,
Q->nb[1], Q->nb[2], Q->nb[3],
K->nb[1], K->nb[2], K->nb[3],
KQV->ne[0], KQV->ne[1], KQV->ne[2], KQV->ne[3]
);
CUDA_CHECK(cudaGetLastError());
if (parallel_blocks == 1) {
return;
}
const dim3 block_dim_combine(D, 1, 1);
const dim3 blocks_num_combine(Q->ne[1], blocks_num.y, blocks_num.z);
const int shmem_combine = 0;
flash_attn_combine_results<D, parallel_blocks>
<<<blocks_num_combine, block_dim_combine, shmem_combine, main_stream>>>
(dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data);
CUDA_CHECK(cudaGetLastError());
}
void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * Q = dst->src[0];
const ggml_tensor * K = dst->src[1];
const ggml_tensor * V = dst->src[2];
const ggml_tensor * mask = dst->src[3];
ggml_tensor * KQV = dst;
GGML_ASSERT(Q->ne[0] == 64 || Q->ne[0] == 128 && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
if (Q->ne[1] == 1) {
constexpr int cols_per_block = 1;
constexpr int parallel_blocks = 4;
switch (Q->ne[0]) {
case 64:
launch_fattn_vec_f32< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
case 128:
launch_fattn_vec_f32<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
default:
GGML_ASSERT(false);
break;
}
return;
}
if (Q->ne[1] == 2) {
constexpr int cols_per_block = 2;
constexpr int parallel_blocks = 4;
switch (Q->ne[0]) {
case 64:
launch_fattn_vec_f32< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
case 128:
launch_fattn_vec_f32<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
default:
GGML_ASSERT(false);
break;
}
return;
}
if (Q->ne[1] <= 4) {
constexpr int cols_per_block = 4;
constexpr int parallel_blocks = 4;
switch (Q->ne[0]) {
case 64:
launch_fattn_vec_f32< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
case 128:
launch_fattn_vec_f32<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
default:
GGML_ASSERT(false);
break;
}
return;
}
if (Q->ne[1] <= 8) {
constexpr int cols_per_block = 8;
constexpr int parallel_blocks = 4;
switch (Q->ne[0]) {
case 64:
launch_fattn_vec_f32< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
case 128:
launch_fattn_vec_f32<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
default:
GGML_ASSERT(false);
break;
}
return;
}
constexpr int cols_per_block = 8;
constexpr int parallel_blocks = 1;
switch (Q->ne[0]) {
case 64:
launch_fattn_vec_f32< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
case 128:
launch_fattn_vec_f32<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
default:
GGML_ASSERT(false);
break;
}
}

View File

@@ -0,0 +1,3 @@
#include "common.cuh"
void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View File

@@ -1,4 +1,7 @@
#include "common.cuh"
#include "fattn-common.cuh"
#include "fattn-vec-f16.cuh"
#include "fattn-vec-f32.cuh"
#include "fattn.cuh"
#include <cstdint>
@@ -7,251 +10,6 @@
#include <mma.h>
#endif
#define FATTN_KQ_STRIDE 256
#define HALF_MAX_HALF __float2half(65504.0f/2) // Use neg. of this instead of -INFINITY to initialize KQ max vals to avoid NaN upon subtraction.
#define SOFTMAX_FTZ_THRESHOLD -20.0f // Softmax exp. of values smaller than this are flushed to zero to avoid NaNs.
template<int D, int ncols, int parallel_blocks> // D == head size
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(D, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_vec_ext_f16(
const char * __restrict__ Q,
const char * __restrict__ K,
const char * __restrict__ V,
const char * __restrict__ mask,
float * __restrict__ dst,
float2 * __restrict__ dst_meta,
const float scale,
const float max_bias,
const float m0,
const float m1,
const uint32_t n_head_log2,
const int ne00,
const int ne01,
const int ne02,
const int ne03,
const int ne10,
const int ne11,
const int ne12,
const int ne13,
const int ne31,
const int nb31,
const int nb01,
const int nb02,
const int nb03,
const int nb11,
const int nb12,
const int nb13,
const int ne0,
const int ne1,
const int ne2,
const int ne3) {
#if FP16_AVAILABLE
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.y + nb01*ic0);
const half2 * K_h2 = (const half2 *) (K + nb12*(blockIdx.y / gqa_ratio));
const half * V_h = (const half *) (V + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
const half * maskh = (const half *) mask + ne11*ic0;
const int stride_KV = nb11 / sizeof(half);
const int stride_KV2 = nb11 / sizeof(half2);
half slopeh = __float2half(1.0f);
// ALiBi
if (max_bias > 0.0f) {
const int h = blockIdx.y;
const float base = h < n_head_log2 ? m0 : m1;
const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
slopeh = __float2half(powf(base, exph));
}
static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
constexpr int nwarps = D / WARP_SIZE;
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
__builtin_assume(tid < D);
__shared__ half KQ[ncols*D];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
KQ[j*D + tid] = -HALF_MAX_HALF;
}
half2 * KQ2 = (half2 *) KQ;
half kqmax[ncols];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
kqmax[j] = -HALF_MAX_HALF;
}
half kqsum[ncols] = {0.0f};
__shared__ half kqmax_shared[ncols][WARP_SIZE];
__shared__ half kqsum_shared[ncols][WARP_SIZE];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
if (threadIdx.y == 0) {
kqmax_shared[j][threadIdx.x] = -HALF_MAX_HALF;
kqsum_shared[j][threadIdx.x] = 0.0f;
}
}
__syncthreads();
// Convert Q to half2 and store in registers:
half2 Q_h2[ncols][D/(2*WARP_SIZE)];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
const float2 tmp = Q_f2[j*(nb01/sizeof(float2)) + i];
Q_h2[j][i0/WARP_SIZE] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y);
}
}
half2 VKQ[ncols] = {{0.0f, 0.0f}};
const int k_start = parallel_blocks == 1 ? 0 : ip*D;
for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) {
// Calculate KQ tile and keep track of new maximum KQ values:
// For unknown reasons using a half array of size 1 for kqmax_new causes a performance regression,
// see https://github.com/ggerganov/llama.cpp/pull/7061 .
// Therefore this variable is defined twice but only used once (so that the compiler can optimize out the unused variable).
half kqmax_new = kqmax[0];
half kqmax_new_arr[ncols];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
kqmax_new_arr[j] = kqmax[j];
}
#pragma unroll
for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) {
const int i_KQ = i_KQ_0 + threadIdx.y;
if ((i_KQ_0 + nwarps > D && i_KQ >= D) || (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + i_KQ >= ne11)) {
break;
}
half2 sum2[ncols] = {{0.0f, 0.0f}};
#pragma unroll
for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
const int k_KQ = k_KQ_0 + threadIdx.x;
const half2 K_ik = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
sum2[j] += K_ik * Q_h2[j][k_KQ_0/WARP_SIZE];
}
}
#pragma unroll
for (int j = 0; j < ncols; ++j) {
sum2[j] = warp_reduce_sum(sum2[j]);
half sum = __low2half(sum2[j]) + __high2half(sum2[j]);
sum += mask ? slopeh*maskh[j*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
if (ncols == 1) {
kqmax_new = ggml_cuda_hmax(kqmax_new, sum);
} else {
kqmax_new_arr[j] = ggml_cuda_hmax(kqmax_new_arr[j], sum);
}
if (threadIdx.x == 0) {
KQ[j*D + i_KQ] = sum;
}
}
}
#pragma unroll
for (int j = 0; j < ncols; ++j) {
half kqmax_new_j = ncols == 1 ? kqmax_new : kqmax_new_arr[j];
kqmax_new_j = warp_reduce_max(kqmax_new_j);
if (threadIdx.x == 0) {
kqmax_shared[j][threadIdx.y] = kqmax_new_j;
}
}
__syncthreads();
#pragma unroll
for (int j = 0; j < ncols; ++j) {
half kqmax_new_j = kqmax_shared[j][threadIdx.x];
kqmax_new_j = warp_reduce_max(kqmax_new_j);
const half KQ_max_scale = hexp(kqmax[j] - kqmax_new_j);
kqmax[j] = kqmax_new_j;
const half val = hexp(KQ[j*D + tid] - kqmax[j]);
kqsum[j] = kqsum[j]*KQ_max_scale + val;
KQ[j*D + tid] = val;
VKQ[j] *= __half2half2(KQ_max_scale);
}
__syncthreads();
#pragma unroll
for (int k0 = 0; k0 < D; k0 += 2) {
if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k0 >= ne11) {
break;
}
half2 V_k;
reinterpret_cast<half&>(V_k.x) = V_h[(k_VKQ_0 + k0 + 0)*stride_KV + tid];
reinterpret_cast<half&>(V_k.y) = V_h[(k_VKQ_0 + k0 + 1)*stride_KV + tid];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
VKQ[j] += V_k*KQ2[j*(D/2) + k0/2];
}
}
__syncthreads();
}
#pragma unroll
for (int j = 0; j < ncols; ++j) {
kqsum[j] = warp_reduce_sum(kqsum[j]);
if (threadIdx.x == 0) {
kqsum_shared[j][threadIdx.y] = kqsum[j];
}
}
__syncthreads();
#pragma unroll
for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);
half dst_val = (__low2half(VKQ[j_VKQ]) + __high2half(VKQ[j_VKQ]));
if (parallel_blocks == 1) {
dst_val /= kqsum[j_VKQ];
}
const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
}
if (parallel_blocks != 1 && tid != 0) {
#pragma unroll
for (int j = 0; j < ncols; ++j) {
dst_meta[(ic0 + j)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j], kqsum[j]);
}
}
#else
NO_DEVICE_CODE;
#endif // FP16_AVAILABLE
}
// D == head size, VKQ_stride == num VKQ rows calculated in parallel:
template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t>
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
@@ -655,54 +413,6 @@ static __global__ void flash_attn_ext_f16(
#endif // FP16_MMA_AVAILABLE
}
template<int D, int parallel_blocks> // D == head size
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(D, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_combine_results(
const float * __restrict__ VKQ_parts,
const float2 * __restrict__ VKQ_meta,
float * __restrict__ dst) {
#if FP16_AVAILABLE
VKQ_parts += parallel_blocks*D * gridDim.y*blockIdx.x;
VKQ_meta += parallel_blocks * gridDim.y*blockIdx.x;
dst += D * gridDim.y*blockIdx.x;
const int tid = threadIdx.x;
__builtin_assume(tid < D);
__shared__ float2 meta[parallel_blocks];
if (tid < 2*parallel_blocks) {
((float *) meta)[threadIdx.x] = ((const float *)VKQ_meta) [blockIdx.y*(2*parallel_blocks) + tid];
}
__syncthreads();
float kqmax = meta[0].x;
#pragma unroll
for (int l = 1; l < parallel_blocks; ++l) {
kqmax = max(kqmax, meta[l].x);
}
float VKQ_numerator = 0.0f;
float VKQ_denominator = 0.0f;
#pragma unroll
for (int l = 0; l < parallel_blocks; ++l) {
const float diff = meta[l].x - kqmax;
const float KQ_max_scale = expf(diff);
const uint32_t ftz_mask = 0xFFFFFFFF * (diff > SOFTMAX_FTZ_THRESHOLD);
*((uint32_t *) &KQ_max_scale) &= ftz_mask;
VKQ_numerator += KQ_max_scale * VKQ_parts[l*gridDim.y*D + blockIdx.y*D + tid];
VKQ_denominator += KQ_max_scale * meta[l].y;
}
dst[blockIdx.y*D + tid] = VKQ_numerator / VKQ_denominator;
#else
NO_DEVICE_CODE;
#endif // FP16_AVAILABLE
}
constexpr int get_max_power_of_2(int x) {
return x % 2 == 0 ? 2*get_max_power_of_2(x/2) : 1;
}
@@ -727,66 +437,6 @@ static_assert(get_VKQ_stride( 80, 1, 16) == 16, "Test failed.");
static_assert(get_VKQ_stride( 80, 2, 16) == 16, "Test failed.");
static_assert(get_VKQ_stride( 80, 4, 16) == 16, "Test failed.");
template <int D, int cols_per_block, int parallel_blocks> void launch_fattn_vec_f16(
const ggml_tensor * Q, const ggml_tensor * K, const ggml_tensor * V, ggml_tensor * KQV, const ggml_tensor * mask,
ggml_cuda_pool & pool, cudaStream_t main_stream
) {
ggml_cuda_pool_alloc<float> dst_tmp(pool);
ggml_cuda_pool_alloc<float2> dst_tmp_meta(pool);
if (parallel_blocks > 1) {
dst_tmp.alloc(parallel_blocks*ggml_nelements(KQV));
dst_tmp_meta.alloc(parallel_blocks*ggml_nrows(KQV));
}
constexpr int nwarps = (D + WARP_SIZE - 1) / WARP_SIZE;
const dim3 block_dim(WARP_SIZE, nwarps, 1);
const dim3 blocks_num(parallel_blocks*((Q->ne[1] + cols_per_block - 1) / cols_per_block), Q->ne[2], Q->ne[3]);
const int shmem = 0;
float scale = 1.0f;
float max_bias = 0.0f;
memcpy(&scale, (float *) KQV->op_params + 0, sizeof(float));
memcpy(&max_bias, (float *) KQV->op_params + 1, sizeof(float));
const uint32_t n_head = Q->ne[2];
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>
<<<blocks_num, block_dim, shmem, main_stream>>> (
(const char *) Q->data,
(const char *) K->data,
(const char *) V->data,
mask ? ((const char *) mask->data) : nullptr,
parallel_blocks == 1 ? (float *) KQV->data : dst_tmp.ptr, dst_tmp_meta.ptr,
scale, max_bias, m0, m1, n_head_log2,
Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
K->ne[0], K->ne[1], K->ne[2], K->ne[3],
mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0,
Q->nb[1], Q->nb[2], Q->nb[3],
K->nb[1], K->nb[2], K->nb[3],
KQV->ne[0], KQV->ne[1], KQV->ne[2], KQV->ne[3]
);
CUDA_CHECK(cudaGetLastError());
if (parallel_blocks == 1) {
return;
}
const dim3 block_dim_combine(D, 1, 1);
const dim3 blocks_num_combine(Q->ne[1], blocks_num.y, blocks_num.z);
const int shmem_combine = 0;
flash_attn_combine_results<D, parallel_blocks>
<<<blocks_num_combine, block_dim_combine, shmem_combine, main_stream>>>
(dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data);
CUDA_CHECK(cudaGetLastError());
}
template <int D, int cols_per_block, int nwarps, int parallel_blocks, typename KQ_acc_t> void launch_fattn_f16_impl(
const ggml_tensor * Q, const ggml_tensor * K, const ggml_tensor * V, ggml_tensor * KQV, const ggml_tensor * mask,
ggml_cuda_pool & pool, cudaStream_t main_stream
@@ -891,95 +541,22 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
const int32_t precision = KQV->op_params[2];
if (!fast_fp16_available(cc)) {
ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
return;
}
if (!fp16_mma_available(cc)) {
GGML_ASSERT(precision == GGML_PREC_DEFAULT);
GGML_ASSERT(Q->ne[0] == 64 || Q->ne[0] == 128 && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
if (Q->ne[1] == 1) {
constexpr int cols_per_block = 1;
constexpr int parallel_blocks = 4;
switch (Q->ne[0]) {
case 64:
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
case 128:
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
default:
GGML_ASSERT(false);
break;
}
return;
}
if (Q->ne[1] == 2) {
constexpr int cols_per_block = 2;
constexpr int parallel_blocks = 4;
switch (Q->ne[0]) {
case 64:
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
case 128:
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
default:
GGML_ASSERT(false);
break;
}
return;
}
if (Q->ne[1] <= 4) {
constexpr int cols_per_block = 4;
constexpr int parallel_blocks = 4;
switch (Q->ne[0]) {
case 64:
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
case 128:
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
default:
GGML_ASSERT(false);
break;
}
return;
}
if (Q->ne[1] <= 8) {
constexpr int cols_per_block = 8;
constexpr int parallel_blocks = 4;
switch (Q->ne[0]) {
case 64:
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
case 128:
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
default:
GGML_ASSERT(false);
break;
}
return;
}
constexpr int cols_per_block = 8;
constexpr int parallel_blocks = 1;
switch (Q->ne[0]) {
case 64:
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
case 128:
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
default:
GGML_ASSERT(false);
break;
}
ggml_cuda_flash_attn_ext_vec_f16_no_mma(ctx, dst);
return;
}
if (precision != GGML_PREC_DEFAULT) {
if (Q->ne[1] == 1 && (Q->ne[0] == 64 || Q->ne[0] == 128)) {
ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
return;
}
if (Q->ne[1] <= 32 || Q->ne[0] > 128) {
constexpr int cols_per_block = 16;
constexpr int nwarps = 4;
@@ -1037,22 +614,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
}
if (Q->ne[1] == 1 && Q->ne[0] % (2*WARP_SIZE) == 0) {
constexpr int cols_per_block = 1;
constexpr int parallel_blocks = 4;
switch (Q->ne[0]) {
case 64:
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
case 128:
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
case 256:
launch_fattn_vec_f16<256, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
default:
GGML_ASSERT(false);
break;
}
ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
return;
}


@@ -48,6 +48,15 @@ static __global__ void relu_f32(const float * x, float * dst, const int k) {
dst[i] = fmaxf(x[i], 0);
}
static __global__ void sigmoid_f32(const float * x, float * dst, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= k) {
return;
}
dst[i] = 1.0f / (1.0f + expf(-x[i]));
}
static __global__ void hardsigmoid_f32(const float * x, float * dst, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
@@ -108,6 +117,11 @@ static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}
static void sigmoid_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_SIGMOID_BLOCK_SIZE - 1) / CUDA_SIGMOID_BLOCK_SIZE;
sigmoid_f32<<<num_blocks, CUDA_SIGMOID_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}
static void hardsigmoid_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_HARDSIGMOID_BLOCK_SIZE - 1) / CUDA_HARDSIGMOID_BLOCK_SIZE;
hardsigmoid_f32<<<num_blocks, CUDA_HARDSIGMOID_BLOCK_SIZE, 0, stream>>>(x, dst, k);
@@ -188,6 +202,18 @@ void ggml_cuda_op_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
relu_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
}
void ggml_cuda_op_sigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const float * src0_d = (const float *)src0->data;
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
sigmoid_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
}
void ggml_cuda_op_hardsigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const float * src0_d = (const float *)src0->data;


@@ -4,6 +4,7 @@
#define CUDA_SILU_BLOCK_SIZE 256
#define CUDA_TANH_BLOCK_SIZE 256
#define CUDA_RELU_BLOCK_SIZE 256
#define CUDA_SIGMOID_BLOCK_SIZE 256
#define CUDA_HARDSIGMOID_BLOCK_SIZE 256
#define CUDA_HARDSWISH_BLOCK_SIZE 256
#define CUDA_SQR_BLOCK_SIZE 256
@@ -18,6 +19,8 @@ void ggml_cuda_op_tanh(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_sigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_hardsigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_hardswish(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
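For reference, the element-wise operation the new CUDA sigmoid kernel and launcher implement (and which the Metal and CPU paths further below mirror) is the standard logistic function. A minimal NumPy sketch of the same computation, handy for sanity-checking a backend's output:

import numpy as np

def sigmoid_f32(x: np.ndarray) -> np.ndarray:
    # same formula as the sigmoid_f32 kernel: 1 / (1 + exp(-x))
    x = x.astype(np.float32, copy=False)
    return 1.0 / (1.0 + np.exp(-x))

print(sigmoid_f32(np.array([-2.0, 0.0, 2.0])))  # approx [0.119, 0.5, 0.881]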


@@ -40,6 +40,7 @@ enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_CLAMP,
GGML_METAL_KERNEL_TYPE_TANH,
GGML_METAL_KERNEL_TYPE_RELU,
GGML_METAL_KERNEL_TYPE_SIGMOID,
GGML_METAL_KERNEL_TYPE_GELU,
GGML_METAL_KERNEL_TYPE_GELU_4,
GGML_METAL_KERNEL_TYPE_GELU_QUICK,
@@ -493,6 +494,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CLAMP, clamp, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TANH, tanh, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RELU, relu, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SIGMOID, sigmoid, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU, gelu, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_4, gelu_4, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK, gelu_quick, true);
@@ -730,6 +732,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
switch (ggml_get_unary_op(op)) {
case GGML_UNARY_OP_TANH:
case GGML_UNARY_OP_RELU:
case GGML_UNARY_OP_SIGMOID:
case GGML_UNARY_OP_GELU:
case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_SILU:
@@ -1192,24 +1195,24 @@ static enum ggml_status ggml_metal_graph_compute(
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_OP_CLAMP:
{
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CLAMP].pipeline;
float min;
float max;
memcpy(&min, ((int32_t *) dst->op_params) + 0, sizeof(float));
memcpy(&max, ((int32_t *) dst->op_params) + 1, sizeof(float));
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
[encoder setBytes:&min length:sizeof(min) atIndex:2];
[encoder setBytes:&max length:sizeof(max) atIndex:3];
const int64_t n = ggml_nelements(dst);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_OP_UNARY:
switch (ggml_get_unary_op(gf->nodes[i])) {
// we are not taking into account the strides, so for now require contiguous tensors
@@ -1237,6 +1240,18 @@ static enum ggml_status ggml_metal_graph_compute(
const int64_t n = ggml_nelements(dst);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_UNARY_OP_SIGMOID:
{
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SIGMOID].pipeline;
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_UNARY_OP_GELU:


@@ -229,6 +229,13 @@ kernel void kernel_relu(
dst[tpig] = max(0.0f, src0[tpig]);
}
kernel void kernel_sigmoid(
device const float * src0,
device float * dst,
uint tpig[[thread_position_in_grid]]) {
dst[tpig] = 1.0f / (1.0f + exp(-src0[tpig]));
}
kernel void kernel_tanh(
device const float * src0,
device float * dst,
@@ -2210,7 +2217,7 @@ kernel void kernel_flash_attn_ext_f16(
// ALiBi
if (max_bias > 0.0f) {
const short h = iq2;
const uint32_t h = iq2;
const float base = h < n_head_log2 ? m0 : m1;
const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
@@ -2466,7 +2473,7 @@ kernel void kernel_flash_attn_ext_vec_f16(
// ALiBi
if (max_bias > 0.0f) {
const short h = iq2;
const uint32_t h = iq2;
const float base = h < n_head_log2 ? m0 : m1;
const int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
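The two hunks above only widen h from short to uint32_t; the surrounding code computes a per-head ALiBi slope as a base raised to an integer exponent. The definitions of m0, m1 and n_head_log2 are not part of this diff, so the sketch below fills them in with the usual ALiBi convention used in ggml (n_head_log2 is the largest power of two not exceeding n_head, m0 = 2^(-max_bias / n_head_log2), m1 = 2^(-max_bias / 2 / n_head_log2)); treat those as assumptions rather than the kernel's exact code.

import math

def alibi_slope(h: int, n_head: int, max_bias: float = 8.0) -> float:
    # assumed definitions, not shown in this diff
    n_head_log2 = 2 ** math.floor(math.log2(n_head))
    m0 = 2.0 ** (-max_bias / n_head_log2)
    m1 = 2.0 ** (-max_bias / 2.0 / n_head_log2)
    # this part mirrors the kernel: pick base and exponent from the head index h
    base = m0 if h < n_head_log2 else m1
    exph = h + 1 if h < n_head_log2 else 2 * (h - n_head_log2) + 1
    return base ** exph

print([round(alibi_slope(h, n_head=8), 4) for h in range(8)])  # slopes decay per head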


@@ -14,6 +14,12 @@
#include <stdlib.h> // for qsort
#include <stdio.h> // for GGML_ASSERT
#if defined(_MSC_VER)
// disable "possible loss of data" to avoid warnings for hundreds of casts
// we should just be careful :)
#pragma warning(disable: 4244 4267)
#endif
#define UNUSED GGML_UNUSED
// some compilers don't provide _mm256_set_m128i, e.g. gcc 7

ggml-rpc.cpp (new file, 1023 lines): diff suppressed because it is too large

ggml-rpc.h (new file, 24 lines)

@@ -0,0 +1,24 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
#define GGML_RPC_MAX_SERVERS 16
// backend API
GGML_API GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
GGML_API GGML_CALL bool ggml_backend_is_rpc(ggml_backend_t backend);
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
GGML_API GGML_CALL void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
#ifdef __cplusplus
}
#endif


@@ -15564,26 +15564,6 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
const int64_t r2 = ne12/ne02;
const int64_t r3 = ne13/ne03;
#if 0
// use syclGemmEx
{
for (int i13 = 0; i13 < ne13; ++i13) {
for (int i12 = 0; i12 < ne12; ++i12) {
int i03 = i13 / r3;
int i02 = i12 / r2;
SYCL_CHECK(
syclGemmEx(g_sycl_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10,
alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , SYCL_R_16F, nb01/sizeof(half),
(const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, SYCL_R_16F, nb11/sizeof(float),
beta, ( char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01,
cu_compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
}
}
}
#else
if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
// there is no broadcast and src0, src1 are contiguous across dims 2, 3
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
@@ -15595,7 +15575,6 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
nb11 / nb10, nb12 / nb10, beta,
(char *)dst_t, cu_data_type, ne01, nb2 / nb0,
ne12 * ne13, cu_compute_type)));
g_sycl_handles[g_main_device]->wait();
} else {
const int ne23 = ne12*ne13;
@@ -15626,7 +15605,7 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
nb02, nb03, nb12_scaled, nb13_scaled,
nbd2, nbd3, r2, r3, item_ct1);
});
}).wait();
});
}
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
*g_sycl_handles[g_main_device], oneapi::mkl::transpose::trans,
@@ -15637,9 +15616,7 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
dpct::library_data_t::real_half, nb11 / nb10, beta,
(void **)(ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23,
cu_compute_type)));
g_sycl_handles[g_main_device]->wait();
}
#endif
if (no_mixed_dtypes) {
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);

ggml.c (78 lines changed)

@@ -4,7 +4,6 @@
#include "ggml-impl.h"
#include "ggml-quants.h"
#include "ggml.h"
#include "sgemm.h"
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW
@@ -37,6 +36,10 @@
#undef GGML_USE_LLAMAFILE
#endif
#ifdef GGML_USE_LLAMAFILE
#include "sgemm.h"
#endif
#if defined(_MSC_VER)
// disable "possible loss of data" to avoid hundreds of casts
// we should just be careful :)
@@ -1949,6 +1952,7 @@ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) {
inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
// TODO: optimize performance
inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
@@ -2329,6 +2333,7 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
"TANH",
"ELU",
"RELU",
"SIGMOID",
"GELU",
"GELU_QUICK",
"SILU",
@@ -2336,7 +2341,7 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
"HARDSIGMOID",
};
static_assert(GGML_UNARY_OP_COUNT == 12, "GGML_UNARY_OP_COUNT != 12");
static_assert(GGML_UNARY_OP_COUNT == 13, "GGML_UNARY_OP_COUNT != 13");
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
@@ -4561,6 +4566,20 @@ struct ggml_tensor * ggml_leaky_relu(
return result;
}
// ggml_sigmoid
struct ggml_tensor * ggml_sigmoid(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_unary(ctx, a, GGML_UNARY_OP_SIGMOID);
}
struct ggml_tensor * ggml_sigmoid_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SIGMOID);
}
// ggml_gelu
struct ggml_tensor * ggml_gelu(
@@ -10852,6 +10871,52 @@ static void ggml_compute_forward_relu(
}
}
// ggml_compute_forward_sigmoid
static void ggml_compute_forward_sigmoid_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert(dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
ggml_vec_sigmoid_f32(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
(float *) ((char *) src0->data + i*(src0->nb[1])));
}
}
static void ggml_compute_forward_sigmoid(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_sigmoid_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_gelu
static void ggml_compute_forward_gelu_f32(
@@ -16617,6 +16682,10 @@ static void ggml_compute_forward_unary(
{
ggml_compute_forward_relu(params, dst);
} break;
case GGML_UNARY_OP_SIGMOID:
{
ggml_compute_forward_sigmoid(params, dst);
} break;
case GGML_UNARY_OP_GELU:
{
ggml_compute_forward_gelu(params, dst);
@@ -18601,6 +18670,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
zero_table);
}
} break;
case GGML_UNARY_OP_SIGMOID:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_UNARY_OP_GELU:
{
GGML_ASSERT(false); // TODO: not implemented
@@ -19130,6 +19203,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
case GGML_UNARY_OP_TANH:
case GGML_UNARY_OP_ELU:
case GGML_UNARY_OP_RELU:
case GGML_UNARY_OP_SIGMOID:
case GGML_UNARY_OP_HARDSWISH: // to opt for multiple threads
case GGML_UNARY_OP_HARDSIGMOID: // to opt for multiple threads
{

ggml.h (9 lines changed)

@@ -519,6 +519,7 @@ extern "C" {
GGML_UNARY_OP_TANH,
GGML_UNARY_OP_ELU,
GGML_UNARY_OP_RELU,
GGML_UNARY_OP_SIGMOID,
GGML_UNARY_OP_GELU,
GGML_UNARY_OP_GELU_QUICK,
GGML_UNARY_OP_SILU,
@@ -1073,6 +1074,14 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_sigmoid(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_sigmoid_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_gelu(
struct ggml_context * ctx,
struct ggml_tensor * a);


@@ -1,5 +1,7 @@
from .constants import *
from .lazy import *
from .gguf_reader import *
from .gguf_writer import *
from .quants import *
from .tensor_mapping import *
from .vocab import *


@@ -10,6 +10,7 @@ from typing import Any
GGUF_MAGIC = 0x46554747 # "GGUF"
GGUF_VERSION = 3
GGUF_DEFAULT_ALIGNMENT = 32
GGML_QUANT_VERSION = 2 # GGML_QNT_VERSION from ggml.h
#
# metadata keys
@@ -838,6 +839,49 @@ class GGMLQuantizationType(IntEnum):
BF16 = 30
# TODO: add GGMLFileType from ggml_ftype in ggml.h
# from llama_ftype in llama.h
# ALL VALUES SHOULD BE THE SAME HERE AS THEY ARE OVER THERE.
class LlamaFileType(IntEnum):
ALL_F32 = 0
MOSTLY_F16 = 1 # except 1d tensors
MOSTLY_Q4_0 = 2 # except 1d tensors
MOSTLY_Q4_1 = 3 # except 1d tensors
MOSTLY_Q4_1_SOME_F16 = 4 # tok_embeddings.weight and output.weight are F16
# MOSTLY_Q4_2 = 5 # support has been removed
# MOSTLY_Q4_3 = 6 # support has been removed
MOSTLY_Q8_0 = 7 # except 1d tensors
MOSTLY_Q5_0 = 8 # except 1d tensors
MOSTLY_Q5_1 = 9 # except 1d tensors
MOSTLY_Q2_K = 10 # except 1d tensors
MOSTLY_Q3_K_S = 11 # except 1d tensors
MOSTLY_Q3_K_M = 12 # except 1d tensors
MOSTLY_Q3_K_L = 13 # except 1d tensors
MOSTLY_Q4_K_S = 14 # except 1d tensors
MOSTLY_Q4_K_M = 15 # except 1d tensors
MOSTLY_Q5_K_S = 16 # except 1d tensors
MOSTLY_Q5_K_M = 17 # except 1d tensors
MOSTLY_Q6_K = 18 # except 1d tensors
MOSTLY_IQ2_XXS = 19 # except 1d tensors
MOSTLY_IQ2_XS = 20 # except 1d tensors
MOSTLY_Q2_K_S = 21 # except 1d tensors
MOSTLY_IQ3_XS = 22 # except 1d tensors
MOSTLY_IQ3_XXS = 23 # except 1d tensors
MOSTLY_IQ1_S = 24 # except 1d tensors
MOSTLY_IQ4_NL = 25 # except 1d tensors
MOSTLY_IQ3_S = 26 # except 1d tensors
MOSTLY_IQ3_M = 27 # except 1d tensors
MOSTLY_IQ2_S = 28 # except 1d tensors
MOSTLY_IQ2_M = 29 # except 1d tensors
MOSTLY_IQ4_XS = 30 # except 1d tensors
MOSTLY_IQ1_M = 31 # except 1d tensors
MOSTLY_BF16 = 32 # except 1d tensors
GUESSED = 1024 # not specified in the model file
class GGUFEndian(IntEnum):
LITTLE = 0
BIG = 1


@@ -7,12 +7,13 @@ import struct
import tempfile
from enum import Enum, auto
from io import BufferedWriter
from typing import IO, Any, Callable, Sequence, Mapping
from typing import IO, Any, Sequence, Mapping
from string import ascii_letters, digits
import numpy as np
from .constants import (
GGML_QUANT_SIZES,
GGUF_DEFAULT_ALIGNMENT,
GGUF_MAGIC,
GGUF_VERSION,
@@ -28,47 +29,6 @@ from .constants import (
logger = logging.getLogger(__name__)
class LazyTensor:
data: Callable[[], np.ndarray[Any, Any]]
# to avoid too deep recursion
functions: list[Callable[[np.ndarray[Any, Any]], np.ndarray[Any, Any]]]
dtype: np.dtype[Any]
shape: tuple[int, ...]
def __init__(self, data: Callable[[], np.ndarray[Any, Any]], *, dtype: type, shape: tuple[int, ...]):
self.data = data
self.functions = []
self.dtype = np.dtype(dtype)
self.shape = shape
def astype(self, dtype: type, **kwargs) -> LazyTensor:
self.functions.append(lambda n: n.astype(dtype, **kwargs))
self.dtype = np.dtype(dtype)
return self
@property
def nbytes(self) -> int:
size = 1
for n in self.shape:
size *= n
return size * self.dtype.itemsize
def tofile(self, *args, **kwargs) -> None:
data = self.data()
for f in self.functions:
data = f(data)
assert data.shape == self.shape
assert data.dtype == self.dtype
assert data.nbytes == self.nbytes
self.functions = []
self.data = lambda: data
data.tofile(*args, **kwargs)
def byteswap(self, *args, **kwargs) -> LazyTensor:
self.functions.append(lambda n: n.byteswap(*args, **kwargs))
return self
class WriterState(Enum):
EMPTY = auto()
HEADER = auto()
@@ -79,7 +39,7 @@ class WriterState(Enum):
class GGUFWriter:
fout: BufferedWriter
temp_file: tempfile.SpooledTemporaryFile[bytes] | None
tensors: list[np.ndarray[Any, Any] | LazyTensor]
tensors: list[np.ndarray[Any, Any]]
_simple_value_packing = {
GGUFValueType.UINT8: "B",
GGUFValueType.INT8: "b",
@@ -236,7 +196,7 @@ class GGUFWriter:
return ((x + n - 1) // n) * n
def add_tensor_info(
self, name: str, tensor_shape: Sequence[int], tensor_dtype: np.dtype[np.float16] | np.dtype[np.float32],
self, name: str, tensor_shape: Sequence[int], tensor_dtype: np.dtype,
tensor_nbytes: int, raw_dtype: GGMLQuantizationType | None = None,
) -> None:
if self.state is not WriterState.EMPTY:
@@ -249,10 +209,6 @@ class GGUFWriter:
encoded_name = name.encode("utf-8")
self.ti_data += self._pack("Q", len(encoded_name))
self.ti_data += encoded_name
n_dims = len(tensor_shape)
self.ti_data += self._pack("I", n_dims)
for i in range(n_dims):
self.ti_data += self._pack("Q", tensor_shape[n_dims - 1 - i])
if raw_dtype is None:
if tensor_dtype == np.float16:
dtype = GGMLQuantizationType.F16
@@ -272,13 +228,22 @@ class GGUFWriter:
raise ValueError("Only F16, F32, F64, I8, I16, I32, I64 tensors are supported for now")
else:
dtype = raw_dtype
if tensor_dtype == np.uint8:
block_size, type_size = GGML_QUANT_SIZES[raw_dtype]
if tensor_shape[-1] % type_size != 0:
raise ValueError(f"Quantized tensor row size ({tensor_shape[-1]}) is not a multiple of {dtype.name} type size ({type_size})")
tensor_shape = tuple(tensor_shape[:-1]) + (tensor_shape[-1] // type_size * block_size,)
n_dims = len(tensor_shape)
self.ti_data += self._pack("I", n_dims)
for i in range(n_dims):
self.ti_data += self._pack("Q", tensor_shape[n_dims - 1 - i])
self.ti_data += self._pack("I", dtype)
self.ti_data += self._pack("Q", self.offset_tensor)
self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
self.ti_data_count += 1
def add_tensor(
self, name: str, tensor: np.ndarray[Any, Any] | LazyTensor, raw_shape: Sequence[int] | None = None,
self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None,
raw_dtype: GGMLQuantizationType | None = None,
) -> None:
if self.endianess == GGUFEndian.BIG:
@@ -303,7 +268,7 @@ class GGUFWriter:
if pad != 0:
fp.write(bytes([0] * pad))
def write_tensor_data(self, tensor: np.ndarray[Any, Any] | LazyTensor) -> None:
def write_tensor_data(self, tensor: np.ndarray[Any, Any]) -> None:
if self.state is not WriterState.TI_DATA:
raise ValueError(f'Expected output file to contain tensor info, got {self.state}')
@@ -391,7 +356,7 @@ class GGUFWriter:
def add_name(self, name: str) -> None:
self.add_string(Keys.General.NAME, name)
def add_quantization_version(self, quantization_version: GGMLQuantizationType) -> None:
def add_quantization_version(self, quantization_version: int) -> None:
self.add_uint32(
Keys.General.QUANTIZATION_VERSION, quantization_version)

gguf-py/gguf/lazy.py (new file, 236 lines)

@@ -0,0 +1,236 @@
from __future__ import annotations
from abc import ABC, ABCMeta, abstractmethod
import logging
from typing import Any, Callable
from collections import deque
import numpy as np
from numpy._typing import _Shape
from numpy.typing import DTypeLike
logger = logging.getLogger(__name__)
class LazyMeta(ABCMeta):
def __new__(cls, name: str, bases: tuple[type, ...], namespace: dict[str, Any], **kwargs):
def __getattr__(self, __name: str) -> Any:
meta_attr = getattr(self._meta, __name)
if callable(meta_attr):
return type(self)._wrap_fn(
(lambda s, *args, **kwargs: getattr(s, __name)(*args, **kwargs)),
use_self=self,
)
elif isinstance(meta_attr, self._tensor_type):
# e.g. self.T with torch.Tensor should still be wrapped
return type(self)._wrap_fn(lambda s: getattr(s, __name))(self)
else:
# no need to wrap non-tensor properties,
# and they likely don't depend on the actual contents of the tensor
return meta_attr
namespace["__getattr__"] = __getattr__
# need to make a builder for the wrapped wrapper to copy the name,
# or else it fails with very cryptic error messages,
# because somehow the same string would end up in every closure
def mk_wrap(op_name: str, *, meta_noop: bool = False):
# need to wrap the wrapper to get self
def wrapped_special_op(self, *args, **kwargs):
return type(self)._wrap_fn(
getattr(type(self)._tensor_type, op_name),
meta_noop=meta_noop,
)(self, *args, **kwargs)
return wrapped_special_op
# special methods bypass __getattr__, so they need to be added manually
# ref: https://docs.python.org/3/reference/datamodel.html#special-lookup
# NOTE: doing this from a metaclass is very convenient
# TODO: make this even more comprehensive
for binary_op in (
"lt", "le", "eq", "ne", "ge", "gt", "not"
"abs", "add", "and", "floordiv", "invert", "lshift", "mod", "mul", "matmul",
"neg", "or", "pos", "pow", "rshift", "sub", "truediv", "xor",
"iadd", "iand", "ifloordiv", "ilshift", "imod", "imul", "ior", "irshift", "isub", "ixor",
"radd", "rand", "rfloordiv", "rmul", "ror", "rpow", "rsub", "rtruediv", "rxor",
):
attr_name = f"__{binary_op}__"
# the result of these operators usually has the same shape and dtype as the input,
# so evaluation on the meta tensor can be skipped.
namespace[attr_name] = mk_wrap(attr_name, meta_noop=True)
for special_op in (
"getitem", "setitem", "len",
):
attr_name = f"__{special_op}__"
namespace[attr_name] = mk_wrap(attr_name, meta_noop=False)
return super().__new__(cls, name, bases, namespace, **kwargs)
# Tree of lazy tensors
class LazyBase(ABC, metaclass=LazyMeta):
_tensor_type: type
_meta: Any
_data: Any | None
_lazy: deque[LazyBase] # shared within a graph, to avoid deep recursion when making eager
_args: tuple
_func: Callable[[tuple], Any] | None
def __init__(self, *, meta: Any, data: Any | None = None, lazy: deque[LazyBase] | None = None, args: tuple = (), func: Callable[[tuple], Any] | None = None):
super().__init__()
self._meta = meta
self._data = data
self._lazy = lazy if lazy is not None else deque()
self._args = args
self._func = func
assert self._func is not None or self._data is not None
if self._data is None:
self._lazy.append(self)
def __init_subclass__(cls) -> None:
if "_tensor_type" not in cls.__dict__:
raise TypeError(f"property '_tensor_type' must be defined for {cls!r}")
return super().__init_subclass__()
@staticmethod
def _recurse_apply(o: Any, fn: Callable[[Any], Any]) -> Any:
# TODO: dict and set
if isinstance(o, (list, tuple)):
L = []
for item in o:
L.append(LazyBase._recurse_apply(item, fn))
if isinstance(o, tuple):
L = tuple(L)
return L
elif isinstance(o, LazyBase):
return fn(o)
else:
return o
@classmethod
def _wrap_fn(cls, fn: Callable, *, use_self: LazyBase | None = None, meta_noop: bool | DTypeLike | tuple[DTypeLike, Callable[[tuple[int, ...]], tuple[int, ...]]] = False) -> Callable[[Any], Any]:
def wrapped_fn(*args, **kwargs):
if kwargs is None:
kwargs = {}
args = ((use_self,) if use_self is not None else ()) + args
meta_args = LazyBase._recurse_apply(args, lambda t: t._meta)
if isinstance(meta_noop, bool) and not meta_noop:
try:
res = fn(*meta_args, **kwargs)
except NotImplementedError:
# running some operations on PyTorch's Meta tensors can cause this exception
res = None
else:
# some operators don't need to actually run on the meta tensors
assert len(args) > 0
res = args[0]
assert isinstance(res, cls)
res = res._meta
# allow operations to override the dtype and shape
if meta_noop is not True:
if isinstance(meta_noop, tuple):
dtype, shape = meta_noop
assert callable(shape)
res = cls.meta_with_dtype_and_shape(dtype, shape(res.shape))
else:
res = cls.meta_with_dtype_and_shape(meta_noop, res.shape)
if isinstance(res, cls._tensor_type):
def collect_replace(t: LazyBase):
if collect_replace.shared_lazy is None:
collect_replace.shared_lazy = t._lazy
else:
collect_replace.shared_lazy.extend(t._lazy)
t._lazy = collect_replace.shared_lazy
# emulating a static variable
collect_replace.shared_lazy = None
LazyBase._recurse_apply(args, collect_replace)
shared_lazy = collect_replace.shared_lazy
return cls(meta=cls.eager_to_meta(res), lazy=shared_lazy, args=args, func=lambda a: fn(*a, **kwargs))
else:
del res # not needed
# non-tensor return likely relies on the contents of the args
# (e.g. the result of torch.equal)
eager_args = cls.to_eager(args)
return fn(*eager_args, **kwargs)
return wrapped_fn
@classmethod
def to_eager(cls, t: Any) -> Any:
def simple_to_eager(_t: LazyBase) -> Any:
def already_eager_to_eager(_t: LazyBase) -> Any:
assert _t._data is not None
return _t._data
while _t._data is None:
lt = _t._lazy.popleft()
if lt._data is not None:
# Lazy tensor did not belong in the lazy queue.
# Weirdly only happens with Bloom models...
# likely because tensors aren't unique in the queue.
# The final output is still the same as in eager mode,
# so it's safe to ignore this.
continue
assert lt._func is not None
lt._args = cls._recurse_apply(lt._args, already_eager_to_eager)
lt._data = lt._func(lt._args)
# sanity check
assert lt._data.dtype == lt._meta.dtype
assert lt._data.shape == lt._meta.shape
return _t._data
# recurse into lists and/or tuples, keeping their structure
return cls._recurse_apply(t, simple_to_eager)
@classmethod
def eager_to_meta(cls, t: Any) -> Any:
return cls.meta_with_dtype_and_shape(t.dtype, t.shape)
# must be overridden, meta tensor init is backend-specific
@classmethod
@abstractmethod
def meta_with_dtype_and_shape(cls, dtype: Any, shape: Any) -> Any: pass
@classmethod
def from_eager(cls, t: Any) -> Any:
if type(t) is cls:
# already eager
return t
elif isinstance(t, cls._tensor_type):
return cls(meta=cls.eager_to_meta(t), data=t)
else:
raise TypeError(f"{type(t)!r} is not compatible with {cls._tensor_type!r}")
class LazyNumpyTensor(LazyBase):
_tensor_type = np.ndarray
@classmethod
def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: _Shape) -> np.ndarray[Any, Any]:
# The initial idea was to use np.nan as the fill value,
# but non-float types like np.int16 can't use that.
# So zero it is.
cheat = np.zeros(1, dtype)
return np.lib.stride_tricks.as_strided(cheat, shape, (0 for _ in shape))
def astype(self, dtype, *args, **kwargs):
meta = type(self).meta_with_dtype_and_shape(dtype, self._meta.shape)
full_args = (self, dtype,) + args
# very important to pass the shared _lazy deque, or else there's an infinite loop somewhere.
return type(self)(meta=meta, args=full_args, lazy=self._lazy, func=(lambda a: a[0].astype(*a[1:], **kwargs)))
def tofile(self, *args, **kwargs):
eager = LazyNumpyTensor.to_eager(self)
return eager.tofile(*args, **kwargs)
# TODO: __array_function__
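A short usage sketch of the lazy-tensor machinery added in this file (assuming the gguf package from this branch is importable): operations are recorded against a zero-strided meta array and only executed once the data is actually needed, for example via to_eager() or tofile().

import numpy as np
from gguf.lazy import LazyNumpyTensor

lazy = LazyNumpyTensor.from_eager(np.ones((4, 8), dtype=np.float32))
converted = lazy.astype(np.float16)            # recorded, not evaluated yet
result = LazyNumpyTensor.to_eager(converted)   # the queued astype runs here
print(result.dtype, result.shape)              # float16 (4, 8)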

gguf-py/gguf/quants.py (new file, 109 lines)

@@ -0,0 +1,109 @@
from __future__ import annotations
from typing import Callable
from numpy.typing import DTypeLike
from .constants import GGML_QUANT_SIZES, GGMLQuantizationType
from .lazy import LazyNumpyTensor
import numpy as np
# same as ggml_compute_fp32_to_bf16 in ggml-impl.h
def __compute_fp32_to_bf16(n: np.ndarray) -> np.ndarray:
n = n.astype(np.float32, copy=False).view(np.int32)
# force nan to quiet
n = np.where((n & 0x7fffffff) > 0x7f800000, (n & 0xffff0000) | (64 << 16), n)
# flush subnormals to zero
n = np.where((n & 0x7f800000) == 0, n & 0x80000000, n)
# round to nearest even
n = (n + (0x7fff + ((n >> 16) & 1))) >> 16
return n.astype(np.int16)
# This is faster than np.vectorize and np.apply_along_axis because it works on more than one row at a time
def __apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.ndarray, otype: DTypeLike, oshape: tuple[int, ...]) -> np.ndarray:
rows = arr.reshape((-1, arr.shape[-1]))
osize = 1
for dim in oshape:
osize *= dim
out = np.empty(shape=osize, dtype=otype)
# compute over groups of 16 rows (arbitrary, but seems good for performance)
n_groups = rows.shape[0] // 16
np.concatenate([func(group).ravel() for group in np.array_split(rows, n_groups)], axis=0, out=out)
return out.reshape(oshape)
def __quantize_bf16_array(n: np.ndarray) -> np.ndarray:
return __apply_over_grouped_rows(__compute_fp32_to_bf16, arr=n, otype=np.int16, oshape=n.shape)
__quantize_bf16_lazy = LazyNumpyTensor._wrap_fn(__quantize_bf16_array, meta_noop=np.int16)
def quantize_bf16(n: np.ndarray):
if type(n) is LazyNumpyTensor:
return __quantize_bf16_lazy(n)
else:
return __quantize_bf16_array(n)
__q8_block_size, __q8_type_size = GGML_QUANT_SIZES[GGMLQuantizationType.Q8_0]
def can_quantize_to_q8_0(n: np.ndarray) -> bool:
return n.shape[-1] % __q8_block_size == 0
# round away from zero
# ref: https://stackoverflow.com/a/59143326/22827863
def np_roundf(n: np.ndarray) -> np.ndarray:
a = abs(n)
floored = np.floor(a)
b = floored + np.floor(2 * (a - floored))
return np.sign(n) * b
def __quantize_q8_0_shape_change(s: tuple[int, ...]) -> tuple[int, ...]:
return (*s[:-1], s[-1] // __q8_block_size * __q8_type_size)
# Implementation of Q8_0 with bit-exact same results as reference implementation in ggml-quants.c
def __quantize_q8_0_rows(n: np.ndarray) -> np.ndarray:
shape = n.shape
assert shape[-1] % __q8_block_size == 0
n_blocks = n.size // __q8_block_size
blocks = n.reshape((n_blocks, __q8_block_size)).astype(np.float32, copy=False)
d = abs(blocks).max(axis=1, keepdims=True) / 127
with np.errstate(divide="ignore"):
id = np.where(d == 0, 0, 1 / d)
qs = np_roundf(blocks * id)
# (n_blocks, 2)
d = d.astype(np.float16).view(np.uint8)
# (n_blocks, block_size)
qs = qs.astype(np.int8).view(np.uint8)
assert d.shape[1] + qs.shape[1] == __q8_type_size
return np.concatenate([d, qs], axis=1).reshape(__quantize_q8_0_shape_change(shape))
def __quantize_q8_0_array(n: np.ndarray) -> np.ndarray:
return __apply_over_grouped_rows(__quantize_q8_0_rows, arr=n, otype=np.uint8, oshape=__quantize_q8_0_shape_change(n.shape))
__quantize_q8_0_lazy = LazyNumpyTensor._wrap_fn(
__quantize_q8_0_array,
meta_noop=(np.uint8, __quantize_q8_0_shape_change),
)
def quantize_q8_0(data: np.ndarray):
if type(data) is LazyNumpyTensor:
return __quantize_q8_0_lazy(data)
else:
return __quantize_q8_0_array(data)
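A hedged usage sketch of the two new quantization helpers (again assuming the gguf package is importable). The Q8_0 sizes follow GGML_QUANT_SIZES: a block of 32 floats becomes 34 bytes, a float16 scale plus 32 int8 values, which is also the byte-per-row layout that GGUFWriter.add_tensor_info converts back to an element count. Note that __apply_over_grouped_rows works on groups of 16 rows, so the input here deliberately has at least 16 rows.

import numpy as np
from gguf.quants import quantize_bf16, quantize_q8_0, can_quantize_to_q8_0

data = np.arange(16 * 32, dtype=np.float32).reshape(16, 32)

bf16 = quantize_bf16(data)          # int16 view of the bf16 bit patterns
print(bf16.dtype, bf16.shape)       # int16 (16, 32)

assert can_quantize_to_q8_0(data)   # last dimension must be a multiple of 32
q8 = quantize_q8_0(data)            # raw Q8_0 blocks as bytes
print(q8.dtype, q8.shape)           # uint8 (16, 34): 32 elements -> 34 bytes per row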

llama.cpp (334 lines changed)

@@ -7,6 +7,10 @@
#include "ggml-alloc.h"
#include "ggml-backend.h"
#ifdef GGML_USE_RPC
# include "ggml-rpc.h"
#endif
#ifdef GGML_USE_CUDA
# include "ggml-cuda.h"
#elif defined(GGML_USE_CLBLAST)
@@ -1685,91 +1689,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
GGML_UNUSED(host_buffer);
}
static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
ggml_backend_buffer_type_t buft = nullptr;
#ifdef GGML_USE_METAL
buft = ggml_backend_metal_buffer_type();
#elif defined(GGML_USE_CUDA)
buft = ggml_backend_cuda_buffer_type(gpu);
#elif defined(GGML_USE_VULKAN)
buft = ggml_backend_vk_buffer_type(gpu);
#elif defined(GGML_USE_SYCL)
buft = ggml_backend_sycl_buffer_type(gpu);
#elif defined(GGML_USE_CLBLAST)
buft = ggml_backend_opencl_buffer_type();
#elif defined(GGML_USE_KOMPUTE)
buft = ggml_backend_kompute_buffer_type(gpu);
if (buft == nullptr) {
LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
}
#endif
if (buft == nullptr) {
buft = llama_default_buffer_type_cpu(true);
}
return buft;
GGML_UNUSED(gpu);
}
static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
ggml_backend_buffer_type_t buft = nullptr;
#ifdef GGML_USE_CUDA
if (ggml_backend_cuda_get_device_count() > 1) {
buft = ggml_backend_cuda_split_buffer_type(tensor_split);
}
#endif
#ifdef GGML_USE_SYCL
if (ggml_backend_sycl_get_device_count() > 1) {
buft = ggml_backend_sycl_split_buffer_type(tensor_split);
}
#endif
if (buft == nullptr) {
buft = llama_default_buffer_type_offload(fallback_gpu);
}
return buft;
GGML_UNUSED(tensor_split);
}
static size_t llama_get_device_count() {
#if defined(GGML_USE_CUDA)
return ggml_backend_cuda_get_device_count();
#elif defined(GGML_USE_SYCL)
return ggml_backend_sycl_get_device_count();
#elif defined(GGML_USE_VULKAN)
return ggml_backend_vk_get_device_count();
#else
return 1;
#endif
}
static size_t llama_get_device_memory(int device) {
#if defined(GGML_USE_CUDA)
size_t total;
size_t free;
ggml_backend_cuda_get_device_memory(device, &free, &total);
return free;
#elif defined(GGML_USE_SYCL)
size_t total;
size_t free;
ggml_backend_sycl_get_device_memory(device, &free, &total);
return free;
#elif defined(GGML_USE_VULKAN)
size_t total;
size_t free;
ggml_backend_vk_get_device_memory(device, &free, &total);
return free;
#else
return 1;
GGML_UNUSED(device);
#endif
}
//
// globals
//
@@ -2210,6 +2129,8 @@ struct llama_model {
int main_gpu;
int n_gpu_layers;
std::vector<std::string> rpc_servers;
// gguf metadata
std::unordered_map<std::string, std::string> gguf_kv;
@@ -2353,6 +2274,104 @@ struct llama_context {
#endif
};
static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
ggml_backend_buffer_type_t buft = nullptr;
#ifdef GGML_USE_RPC
std::string endpoint = model.rpc_servers[gpu];
buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
#elif defined(GGML_USE_METAL)
buft = ggml_backend_metal_buffer_type();
#elif defined(GGML_USE_CUDA)
buft = ggml_backend_cuda_buffer_type(gpu);
#elif defined(GGML_USE_VULKAN)
buft = ggml_backend_vk_buffer_type(gpu);
#elif defined(GGML_USE_SYCL)
buft = ggml_backend_sycl_buffer_type(gpu);
#elif defined(GGML_USE_CLBLAST)
buft = ggml_backend_opencl_buffer_type();
#elif defined(GGML_USE_KOMPUTE)
buft = ggml_backend_kompute_buffer_type(gpu);
if (buft == nullptr) {
LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
}
#endif
if (buft == nullptr) {
buft = llama_default_buffer_type_cpu(true);
}
return buft;
GGML_UNUSED(model);
GGML_UNUSED(gpu);
}
static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
ggml_backend_buffer_type_t buft = nullptr;
#ifdef GGML_USE_CUDA
if (ggml_backend_cuda_get_device_count() > 1) {
buft = ggml_backend_cuda_split_buffer_type(tensor_split);
}
#endif
#ifdef GGML_USE_SYCL
if (ggml_backend_sycl_get_device_count() > 1) {
buft = ggml_backend_sycl_split_buffer_type(tensor_split);
}
#endif
if (buft == nullptr) {
buft = llama_default_buffer_type_offload(model, fallback_gpu);
}
return buft;
GGML_UNUSED(tensor_split);
}
static size_t llama_get_device_count(const llama_model & model) {
#if defined(GGML_USE_RPC)
return model.rpc_servers.size();
#elif defined(GGML_USE_CUDA)
return ggml_backend_cuda_get_device_count();
#elif defined(GGML_USE_SYCL)
return ggml_backend_sycl_get_device_count();
#elif defined(GGML_USE_VULKAN)
return ggml_backend_vk_get_device_count();
#else
return 1;
#endif
GGML_UNUSED(model);
}
static size_t llama_get_device_memory(const llama_model & model, int device) {
#if defined(GGML_USE_RPC)
size_t total;
size_t free;
std::string endpoint = model.rpc_servers[device];
ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
return free;
#elif defined(GGML_USE_CUDA)
size_t total;
size_t free;
ggml_backend_cuda_get_device_memory(device, &free, &total);
return free;
#elif defined(GGML_USE_SYCL)
size_t total;
size_t free;
ggml_backend_sycl_get_device_memory(device, &free, &total);
return free;
#elif defined(GGML_USE_VULKAN)
size_t total;
size_t free;
ggml_backend_vk_get_device_memory(device, &free, &total);
return free;
#else
return 1;
#endif
GGML_UNUSED(model);
GGML_UNUSED(device);
}
//
// kv cache helpers
//
@@ -2805,6 +2824,11 @@ static void llama_kv_cache_defrag(struct llama_kv_cache & cache) {
cache.do_defrag = true;
}
static uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams) {
// the FA kernels require padding to avoid extra runtime boundary checks
return cparams.flash_attn ? 256u : 32u;
}
//
// model loading and saving
//
@@ -4424,7 +4448,9 @@ static void llm_load_vocab(
} else if (
tokenizer_pre == "gpt-2" ||
tokenizer_pre == "jina-es" ||
tokenizer_pre == "jina-de") {
tokenizer_pre == "jina-de" ||
tokenizer_pre == "jina-v2-es" ||
tokenizer_pre == "jina-v2-de") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
} else if (
tokenizer_pre == "refact") {
@@ -4784,13 +4810,13 @@ static bool llm_load_tensors(
if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
// calculate the split points
int device_count = llama_get_device_count();
int device_count = llama_get_device_count(model);
bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
std::vector<float> splits(device_count);
if (all_zero) {
// default split, by free memory
for (int i = 0; i < device_count; ++i) {
splits[i] = llama_get_device_memory(i);
splits[i] = llama_get_device_memory(model, i);
}
} else {
std::copy(tensor_split, tensor_split + device_count, splits.begin());
@@ -4810,35 +4836,35 @@ static bool llm_load_tensors(
int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
for (int64_t i = i_gpu_start; i < n_layer; ++i) {
int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin();
model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu);
model.buft_layer[i] = llama_default_buffer_type_offload(model, layer_gpu);
}
// assign the output layer
if (n_gpu_layers > n_layer) {
int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
model.buft_output = llama_default_buffer_type_offload(layer_gpu);
model.buft_output = llama_default_buffer_type_offload(model, layer_gpu);
} else {
model.buft_output = llama_default_buffer_type_cpu(true);
}
} else {
ggml_backend_buffer_type_t split_buft;
if (split_mode == LLAMA_SPLIT_MODE_ROW) {
split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
split_buft = llama_default_buffer_type_split(model, main_gpu, tensor_split);
} else {
// LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
split_buft = llama_default_buffer_type_offload(main_gpu);
split_buft = llama_default_buffer_type_offload(model, main_gpu);
}
// assign the repeating layers
for (int64_t i = i_gpu_start; i < n_layer; ++i) {
model.buft_layer[i] = {
split_buft,
llama_default_buffer_type_offload(main_gpu)
llama_default_buffer_type_offload(model, main_gpu)
};
}
// assign the output layer
if (n_gpu_layers > n_layer) {
model.buft_output = {
split_buft,
llama_default_buffer_type_offload(main_gpu)
llama_default_buffer_type_offload(model, main_gpu)
};
} else {
model.buft_output = llama_default_buffer_type_cpu(true);
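For the LLAMA_SPLIT_MODE_LAYER path above, the default split is proportional to each device's free memory. The normalization of splits into cumulative fractions happens outside the visible hunks, so the sketch below assumes that step; the per-layer assignment then mirrors the std::upper_bound call.

import bisect

def assign_layers(free_mem: list[float], n_layers: int) -> list[int]:
    # assumed normalization into cumulative fractions ending at 1.0
    total = sum(free_mem)
    cum, acc = [], 0.0
    for f in free_mem:
        acc += f
        cum.append(acc / total)
    # mirrors: upper_bound(splits, float(i)/n_layers) for each offloaded layer i
    return [bisect.bisect_right(cum, i / n_layers) for i in range(n_layers)]

print(assign_layers([8.0, 4.0, 4.0], 8))  # [0, 0, 0, 0, 1, 1, 2, 2]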
@@ -11508,7 +11534,8 @@ static int llama_decode_internal(
// a heuristic, to avoid attending the full cache if it is not yet utilized
// after enough generations, the benefit from this heuristic disappears
// if we start defragmenting the cache, the benefit from this will be more important
kv_self.n = std::min(kv_self.size, std::max(256u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 256)));
const uint32_t pad = llama_kv_cache_get_padding(cparams);
kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(llama_kv_cache_cell_max(kv_self), pad)));
//kv_self.n = llama_kv_cache_cell_max(kv_self);
}
}
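The padding logic above is easy to check by hand; a small Python sketch of the same arithmetic, where ggml_pad mirrors the GGML_PAD macro (round x up to a multiple of n):

def ggml_pad(x: int, n: int) -> int:
    return ((x + n - 1) // n) * n

def kv_cache_get_padding(flash_attn: bool) -> int:
    # the FA kernels need more padding to avoid extra runtime boundary checks
    return 256 if flash_attn else 32

def kv_self_n(kv_size: int, cell_max: int, flash_attn: bool) -> int:
    pad = kv_cache_get_padding(flash_attn)
    return min(kv_size, max(pad, ggml_pad(cell_max, pad)))

print(kv_self_n(4096, 300, flash_attn=False))  # 320: 300 rounded up to a multiple of 32
print(kv_self_n(4096, 300, flash_attn=True))   # 512: 300 rounded up to a multiple of 256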
@@ -13174,6 +13201,58 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
return rejects;
}
static bool llama_grammar_detect_left_recursion(
const std::vector<std::vector<llama_grammar_element>> & rules,
size_t rule_index,
std::vector<bool> * rules_visited,
std::vector<bool> * rules_in_progress,
std::vector<bool> * rules_may_be_empty) {
if ((*rules_in_progress)[rule_index]) {
return true;
}
(*rules_in_progress)[rule_index] = true;
const std::vector<llama_grammar_element> & rule = rules[rule_index];
// First check if the rule might produce the empty string. This could be done combined with the second
// step but it's more readable as two steps.
bool at_rule_start = true;
for (size_t i = 0; i < rule.size(); i++) {
if (llama_grammar_is_end_of_sequence(&rule[i])) {
if (at_rule_start) {
(*rules_may_be_empty)[rule_index] = true;
break;
}
at_rule_start = true;
} else {
at_rule_start = false;
}
}
// Second, recurse into leftmost nonterminals (or next-leftmost as long as the previous nonterminal may
// be empty)
bool recurse_into_nonterminal = true;
for (size_t i = 0; i < rule.size(); i++) {
if (rule[i].type == LLAMA_GRETYPE_RULE_REF && recurse_into_nonterminal) {
if (llama_grammar_detect_left_recursion(rules, (size_t)rule[i].value, rules_visited, rules_in_progress, rules_may_be_empty)) {
return true;
}
if (!((*rules_may_be_empty)[(size_t)rule[i].value])) {
recurse_into_nonterminal = false;
}
} else if (llama_grammar_is_end_of_sequence(&rule[i])) {
recurse_into_nonterminal = true;
} else {
recurse_into_nonterminal = false;
}
}
(*rules_in_progress)[rule_index] = false;
(*rules_visited)[rule_index] = true;
return false;
}
//
// grammar - external
//
@@ -13193,6 +13272,19 @@ struct llama_grammar * llama_grammar_init(
vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
}
// Check for left recursion
std::vector<bool> rules_visited(n_rules);
std::vector<bool> rules_in_progress(n_rules);
std::vector<bool> rules_may_be_empty(n_rules);
for (size_t i = 0; i < n_rules; i++) {
if (rules_visited[i]) {
continue;
}
if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) {
throw std::runtime_error(format("unsupported grammar, left recursion detected for nonterminal at index %zu", i));
}
}
// loop over alternates of start rule to build initial stacks
std::vector<std::vector<const llama_grammar_element *>> stacks;
pos = vec_rules[start_rule_index].data();
@@ -13215,6 +13307,9 @@ struct llama_grammar * llama_grammar_init(
}
} while (true);
// Important: vec_rules has to be moved here, not copied, because stacks contains
// pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
// then the pointers would be invalidated when the local vec_rules goes out of scope.
return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} };
}
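A compact Python rendering of the detection pass above, using a simplified grammar representation (nonterminal name -> list of alternatives, each a list of symbols) instead of llama.cpp's flat element arrays. The traversal is the same: follow leftmost rule references, step past a reference only if the referenced rule may produce the empty string, and report a cycle when a rule is re-entered while still in progress. This is what now makes llama_grammar_init throw for grammars like root ::= "a" | root "a" instead of looping forever.

def detect_left_recursion(rules, name, visited, in_progress, may_be_empty):
    if in_progress.get(name):
        return True
    in_progress[name] = True
    alternatives = rules[name]
    # first pass: can this rule produce the empty string?
    if any(len(alt) == 0 for alt in alternatives):
        may_be_empty[name] = True
    # second pass: recurse into leftmost nonterminals, continuing past ones that may be empty
    for alt in alternatives:
        for sym in alt:
            if sym in rules:  # nonterminal reference
                if detect_left_recursion(rules, sym, visited, in_progress, may_be_empty):
                    return True
                if not may_be_empty.get(sym):
                    break
            else:             # a terminal ends the leftmost scan of this alternative
                break
    in_progress[name] = False
    visited[name] = True
    return False

# the "simple" case from the new test: root ::= "a" | root "a"
rules = {"root": [["a"], ["root", "a"]]}
print(detect_left_recursion(rules, "root", {}, {}, {}))  # True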
@@ -15314,6 +15409,7 @@ struct llama_model_params llama_model_default_params() {
/*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
/*.main_gpu =*/ 0,
/*.tensor_split =*/ nullptr,
/*.rpc_servers =*/ nullptr,
/*.progress_callback =*/ nullptr,
/*.progress_callback_user_data =*/ nullptr,
/*.kv_overrides =*/ nullptr,
@@ -15384,7 +15480,9 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
}
size_t llama_max_devices(void) {
#if defined(GGML_USE_METAL)
#if defined(GGML_USE_RPC)
return GGML_RPC_MAX_SERVERS;
#elif defined(GGML_USE_METAL)
return 1;
#elif defined(GGML_USE_CUDA)
return GGML_CUDA_MAX_DEVICES;
@@ -15407,7 +15505,7 @@ bool llama_supports_mlock(void) {
bool llama_supports_gpu_offload(void) {
#if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
return true;
#else
@@ -15470,7 +15568,17 @@ struct llama_model * llama_load_model_from_file(
return true;
};
}
if (params.rpc_servers != nullptr) {
// split the comma-separated list of servers and store them in model->rpc_servers
std::string servers(params.rpc_servers);
size_t pos = 0;
while ((pos = servers.find(",")) != std::string::npos) {
std::string server = servers.substr(0, pos);
model->rpc_servers.push_back(server);
servers.erase(0, pos + 1);
}
model->rpc_servers.push_back(servers);
}
int status = llama_model_load(path_model, *model, params);
GGML_ASSERT(status <= 0);
if (status < 0) {
@@ -15509,6 +15617,11 @@ struct llama_context * llama_new_context_with_model(
return nullptr;
}
if (params.flash_attn && model->arch == LLM_ARCH_GROK) {
LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
params.flash_attn = false;
}
llama_context * ctx = new llama_context(*model);
const auto & hparams = model->hparams;
@@ -15532,7 +15645,7 @@ struct llama_context * llama_new_context_with_model(
cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
// this is necessary due to kv_self.n being padded later during inference
cparams.n_ctx = GGML_PAD(cparams.n_ctx, 256);
cparams.n_ctx = GGML_PAD(cparams.n_ctx, llama_kv_cache_get_padding(cparams));
// with causal attention, the batch size is limited by the context size
cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
@@ -15577,11 +15690,6 @@ struct llama_context * llama_new_context_with_model(
}
}
if (cparams.flash_attn && model->arch == LLM_ARCH_GROK) {
LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
cparams.flash_attn = false;
}
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}
@@ -15617,7 +15725,17 @@ struct llama_context * llama_new_context_with_model(
if (!hparams.vocab_only) {
// initialize backends
#ifdef GGML_USE_METAL
#if defined(GGML_USE_RPC)
for (auto & server : model->rpc_servers) {
ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
llama_free(ctx);
return nullptr;
}
ctx->backends.push_back(backend);
}
#elif defined(GGML_USE_METAL)
if (model->n_gpu_layers > 0) {
ctx->backend_metal = ggml_backend_metal_init();
if (ctx->backend_metal == nullptr) {
@@ -15773,7 +15891,11 @@ struct llama_context * llama_new_context_with_model(
ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false));
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
bool pipeline_parallel = llama_get_device_count() > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER;
bool pipeline_parallel =
llama_get_device_count(*model) > 1 &&
model->n_gpu_layers > (int)model->hparams.n_layer &&
model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
params.offload_kqv;
#ifndef GGML_USE_CUDA
// pipeline parallelism requires support for async compute and events
// currently this is only implemented in the CUDA backend


@@ -242,6 +242,9 @@ extern "C" {
// proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
const float * tensor_split;
// comma separated list of RPC servers to use for offloading
const char * rpc_servers;
// Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
// If the provided progress_callback returns true, model loading continues.
// If it returns false, model loading is immediately aborted.


@@ -9,5 +9,4 @@
-r ./requirements/requirements-convert-hf-to-gguf.txt
-r ./requirements/requirements-convert-hf-to-gguf-update.txt
-r ./requirements/requirements-convert-llama-ggml-to-gguf.txt
-r ./requirements/requirements-convert-lora-to-ggml.txt
-r ./requirements/requirements-convert-persimmon-to-gguf.txt


@@ -1,2 +0,0 @@
-r ./requirements-convert.txt
torch~=2.1.1

scripts/debug-test.sh (new executable file, 117 lines)

@@ -0,0 +1,117 @@
#!/bin/bash
test_suite=${1:-}
test_number=${2:-}
PROG=${0##*/}
build_dir="build-ci-debug"
if [ x"$1" = x"-h" ] || [ x"$1" = x"--help" ]; then
echo "Usage: $PROG [OPTION]... <test_regex> (test_number)"
echo "Debug specific ctest program."
echo
echo "Options:"
echo " -h, --help Display this help and exit"
echo
echo "Arguments:"
echo " <test_regex> (Mandatory) Supply one regex to the script to filter tests"
echo " (test_number) (Optional) Test number to run a specific test"
echo
echo "Example:"
echo " $PROG test-tokenizer"
echo " $PROG test-tokenizer 3"
echo
exit 0
fi
# Function to select and debug a test
function select_test() {
test_suite=${1:-test}
test_number=${2:-}
# Sanity check that matching tests were detected
printf "\n\nGathering tests that fit REGEX: ${test_suite} ...\n"
tests=($(ctest -R ${test_suite} -V -N | grep -E " +Test +#[0-9]+*" | cut -d':' -f2 | awk '{$1=$1};1'))
if [ ${#tests[@]} -eq 0 ]
then
echo "No tests avaliable... check your compliation process..."
echo "Exiting."
exit 1
fi
if [ -z $test_number ]
then
# List out available tests
printf "Which test would you like to debug?\n"
id=0
for s in "${tests[@]}"
do
echo "Test# ${id}"
echo " $s"
((id++))
done
# Prompt user which test they wanted to run
printf "\nRun test#? "
read test_number
else
printf "\nUser Already Requested #${test_number}"
fi
# Start GDB with the requested test binary and arguments
printf "Debugging(GDB) test: ${tests[test_number]}\n"
# Change IFS (Internal Field Separator)
sIFS=$IFS
IFS=$'\n'
# Get test args
gdb_args=($(ctest -R ${test_suite} -V -N | grep "Test command" | cut -d':' -f3 | awk '{$1=$1};1' ))
IFS=$sIFS
printf "Debug arguments: ${gdb_args[test_number]}\n\n"
# Expand paths if needed
args=()
for x in $(echo ${gdb_args[test_number]} | sed -e 's/"\/\<//' -e 's/\>"//')
do
args+=($(echo $x | sed -e 's/.*\/..\//..\//'))
done
# Execute debugger
echo "gdb args: ${args[@]}"
gdb --args ${args[@]}
}
# Step 0: Check the args
if [ -z "$test_suite" ]
then
echo "Usage: $PROG [OPTION]... <test_regex> (test_number)"
echo "Supply one regex to the script to filter tests,"
echo "and optionally a test number to run a specific test."
echo "Use --help flag for full instructions"
exit 1
fi
# Step 1: Reset and Setup folder context
## Sanity check that we are actually in a git repo
repo_root=$(git rev-parse --show-toplevel)
if [ ! -d "$repo_root" ]; then
echo "Error: Not in a Git repository."
exit 1
fi
## Reset folder to root context of git repo
pushd "$repo_root" || exit 1
## Create and enter build directory
rm -rf "$build_dir" && mkdir "$build_dir" || exit 1
# Step 2: Setup Build Environment and Compile Test Binaries
cmake -B "./$build_dir" -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_FATAL_WARNINGS=ON || exit 1
pushd "$build_dir" && make -j || exit 1
# Step 3: Debug the Test
select_test "$test_suite" "$test_number"
# Step 4: Return to the directory from which the user ran the command.
popd || exit 1
popd || exit 1
popd || exit 1


@@ -1 +1 @@
98875cdb7e9ceeb726d1c196d2fecb3cbb59b93a
30f54cbb3ada3e4c5bc6924de3e5918e5be4ff11


@@ -2,6 +2,7 @@
#include <ggml-alloc.h>
#include <ggml-backend.h>
#include <ggml-backend-impl.h>
#include <algorithm>
#include <array>
#include <cfloat>
@@ -2173,11 +2174,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
test_cases.emplace_back(new test_timestep_embedding());
test_cases.emplace_back(new test_leaky_relu());
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
for (int hs : { 64, 128, }) { // other head sizes not implemented
#else
for (int hs : { 64, 80, 128, 256, }) {
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
for (float max_bias : {0.0f, 8.0f}) {
for (int nh : { 32, }) {
for (int kv : { 512, 1024, }) {


@@ -28,6 +28,19 @@ static llama_grammar* build_grammar(const std::string & grammar_str) {
return grammar;
}
static bool test_build_grammar_fails(const std::string & grammar_str) {
fprintf(stderr, "⚫ Testing failure for grammar: %s\n", grammar_str.c_str());
bool grammar_fails = false;
try {
build_grammar(grammar_str);
fprintf(stderr, " ❌ Expected build failure, but succeeded\n");
} catch (const std::exception & err) {
grammar_fails = true;
fprintf(stdout, " ✅︎\n");
}
return grammar_fails;
}
static bool match_string(const std::string & input, llama_grammar* grammar) {
auto decoded = decode_utf8(input, {});
@@ -320,6 +333,38 @@ number ::= [0-9]+)""";
fprintf(stderr, " ✅︎ Passed\n");
}
static void test_failure_left_recursion() {
fprintf(stderr, "⚫ Testing left recursion detection:\n");
// Test simple left recursion detection
const std::string simple_str = R"""(root ::= "a" | root "a")""";
assert(test_build_grammar_fails(simple_str));
// Test more complicated left recursion detection
const std::string medium_str = R"""(
root ::= asdf
asdf ::= "a" | asdf "a"
)""";
assert(test_build_grammar_fails(medium_str));
// Test even more complicated left recursion detection
const std::string hard_str = R"""(
root ::= asdf
asdf ::= "a" | foo "b"
foo ::= "c" | asdf "d" | "e")""";
assert(test_build_grammar_fails(hard_str));
// Test yet even more complicated left recursion detection
const std::string hardest_str = R"""(
root ::= asdf
asdf ::= "a" | foo "b"
foo ::= "c" | empty asdf "d" | "e"
empty ::= "blah" | )""";
assert(test_build_grammar_fails(hardest_str));
fprintf(stderr, " ✅︎ Passed\n");
}
int main() {
fprintf(stdout, "Running grammar integration tests...\n");
test_simple_grammar();
@@ -327,6 +372,7 @@ int main() {
test_quantifiers();
test_failure_missing_root();
test_failure_missing_reference();
test_failure_left_recursion();
fprintf(stdout, "All tests passed.\n");
return 0;
}