llama : better replace_all

baby-llama : remove duplicate vector include
flake.lock: Update (#8847)
2026-02-19 14:13:22 +02:00 · 2024-08-04 13:42:08 +03:00 · 2024-08-04 13:24:59 +03:00 · 2024-08-03 19:53:20 -07:00 · 2024-08-03 18:34:41 +02:00
8 changed files with 47 additions and 37 deletions
--- a/examples/baby-llama/baby-llama.cpp
+++ b/examples/baby-llama/baby-llama.cpp
@@ -1,7 +1,6 @@
 #include "ggml.h"
 #include "train.h"

-#include <vector>
 #include <cassert>
 #include <cstdlib>
 #include <cstring>
--- a/flake.lock
+++ b/flake.lock
@@ -5,11 +5,11 @@
        "nixpkgs-lib": "nixpkgs-lib"
      },
      "locked": {
-        "lastModified": 1719994518,
-        "narHash": "sha256-pQMhCCHyQGRzdfAkdJ4cIWiw+JNuWsTX7f0ZYSyz0VY=",
+        "lastModified": 1722555600,
+        "narHash": "sha256-XOQkdLafnb/p9ij77byFQjDf5m5QYl9b2REiVClC+x4=",
        "owner": "hercules-ci",
        "repo": "flake-parts",
-        "rev": "9227223f6d922fee3c7b190b2cc238a99527bbb7",
+        "rev": "8471fe90ad337a8074e957b69ca4d0089218391d",
        "type": "github"
      },
      "original": {
@@ -20,11 +20,11 @@
    },
    "nixpkgs": {
      "locked": {
-        "lastModified": 1722062969,
-        "narHash": "sha256-QOS0ykELUmPbrrUGmegAUlpmUFznDQeR4q7rFhl8eQg=",
+        "lastModified": 1722421184,
+        "narHash": "sha256-/DJBI6trCeVnasdjUo9pbnodCLZcFqnVZiLUfqLH4jA=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "b73c2221a46c13557b1b3be9c2070cc42cf01eb3",
+        "rev": "9f918d616c5321ad374ae6cb5ea89c9e04bf3e58",
        "type": "github"
      },
      "original": {
@@ -36,14 +36,14 @@
    },
    "nixpkgs-lib": {
      "locked": {
-        "lastModified": 1719876945,
-        "narHash": "sha256-Fm2rDDs86sHy0/1jxTOKB1118Q0O3Uc7EC0iXvXKpbI=",
+        "lastModified": 1722555339,
+        "narHash": "sha256-uFf2QeW7eAHlYXuDktm9c25OxOyCoUOQmh5SZ9amE5Q=",
        "type": "tarball",
-        "url": "https://github.com/NixOS/nixpkgs/archive/5daf0514482af3f97abaefc78a6606365c9108e2.tar.gz"
+        "url": "https://github.com/NixOS/nixpkgs/archive/a5d394176e64ab29c852d03346c1fc9b0b7d33eb.tar.gz"
      },
      "original": {
        "type": "tarball",
-        "url": "https://github.com/NixOS/nixpkgs/archive/5daf0514482af3f97abaefc78a6606365c9108e2.tar.gz"
+        "url": "https://github.com/NixOS/nixpkgs/archive/a5d394176e64ab29c852d03346c1fc9b0b7d33eb.tar.gz"
      }
    },
    "root": {
--- a/ggml/src/ggml-aarch64.c
+++ b/ggml/src/ggml-aarch64.c
@@ -384,8 +384,8 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
    UNUSED(blocklen);

 #if defined(__ARM_FEATURE_SVE)
-    if (svcntw() == 8) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
+    if (ggml_sve_cnt_b == QK8_0) {
+        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
                    "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
    }
 #endif
@@ -496,8 +496,8 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
    UNUSED(blocklen);

 #if defined(__ARM_FEATURE_SVE)
-    if (svcntw() == 8) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
+    if (ggml_sve_cnt_b == QK8_0) {
+        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
                    "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
    }
 #endif
@@ -614,7 +614,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
    UNUSED(blocklen);

 #if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
-    if (svcntw() == 8) {
+    if (ggml_sve_cnt_b == QK8_0) {
        const void * b_ptr = vx;
        const void * a_ptr = vy;
        float * res_ptr = s;
@@ -680,12 +680,12 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
        return;
    }
    else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
-        GGML_ASSERT((ggml_cpu_has_sve() && (svcntw() == 8)) &&
+        GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
                    "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
                    "performance");
    }
    else if (ggml_cpu_has_neon()) {
-        GGML_ASSERT(((ggml_cpu_has_sve() && (svcntw() == 8)) || ggml_cpu_has_matmul_int8()) &&
+        GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
                    "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
                    "quantization format for optimal performance");
    }
@@ -745,8 +745,8 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
    UNUSED(blocklen);

 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
-    if (svcntw() == 8) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
+    if (ggml_sve_cnt_b == QK8_0) {
+        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
                    "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
    }
 #endif
@@ -1266,8 +1266,8 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
    UNUSED(blocklen);

 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
-    if (svcntw() == 8) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
+    if (ggml_sve_cnt_b == QK8_0) {
+        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
                    "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
    }
 #endif
@@ -1728,7 +1728,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
    UNUSED(blocklen);

 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
-    if (svcntw() == 8) {
+    if (ggml_sve_cnt_b == QK8_0) {
        const void * b_ptr = vx;
        const void * a_ptr = vy;
        float * res_ptr = s;
@@ -2139,12 +2139,12 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
        return;
    }
    else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
-        GGML_ASSERT((ggml_cpu_has_sve() && (svcntw() == 8)) &&
+        GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
                    "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
                    "performance");
    }
    else if (ggml_cpu_has_neon()) {
-        GGML_ASSERT(((ggml_cpu_has_sve() && (svcntw() == 8)) || ggml_cpu_has_matmul_int8()) &&
+        GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
                    "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
                    "quantization format for optimal performance");
    }
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -143,6 +143,7 @@ extern "C" {

 #if defined(__ARM_FEATURE_SVE)
 #include <arm_sve.h>
+#include <sys/prctl.h>
 #endif

 // 16-bit float
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -3818,7 +3818,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
    float sumf = 0;

 #if defined(__ARM_FEATURE_SVE)
-    if (svcntb() == QK8_0) {
+    if (ggml_sve_cnt_b == QK8_0) {
        const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
        const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);

@@ -5303,7 +5303,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
    float sumf = 0;

 #if defined(__ARM_FEATURE_SVE)
-    if (svcntb() == QK8_0) {
+    if (ggml_sve_cnt_b == QK8_0) {
        svfloat32_t sumv0 = svdup_n_f32(0.0f);
        svfloat32_t sumv1 = svdup_n_f32(0.0f);

--- a/ggml/src/ggml-quants.h
+++ b/ggml/src/ggml-quants.h
@@ -127,6 +127,10 @@ void iq2xs_free_impl(enum ggml_type type);
 void iq3xs_init_impl(int grid_size);
 void iq3xs_free_impl(int grid_size);

+#if defined(__ARM_FEATURE_SVE)
+extern int ggml_sve_cnt_b;
+#endif
+
 #ifdef __cplusplus
 }
 #endif
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -37,6 +37,9 @@
 #include <unistd.h>
 #endif

+#if defined(__ARM_FEATURE_SVE)
+int ggml_sve_cnt_b = 0;
+#endif
 #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
 #undef GGML_USE_LLAMAFILE
 #endif
@@ -3558,6 +3561,12 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {

    GGML_ASSERT_ALIGNED(ctx->mem_buffer);

+#if defined(__ARM_FEATURE_SVE)
+    if (!ggml_sve_cnt_b) {
+        ggml_sve_cnt_b = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
+    }
+#endif
+
    GGML_PRINT_DEBUG("%s: context initialized\n", __func__);

    ggml_critical_section_end();
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -122,17 +122,14 @@ static std::string trim(const std::string & str) {
 }

 static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
-    std::string result;
-    for (size_t pos = 0; ; pos += search.length()) {
-        auto new_pos = s.find(search, pos);
-        if (new_pos == std::string::npos) {
-            result += s.substr(pos, s.size() - pos);
-            break;
-        }
-        result += s.substr(pos, new_pos - pos) + replace;
-        pos = new_pos;
+    if (search.empty()) {
+        return; // Avoid infinite loop if 'search' is an empty string
+    }
+    size_t pos = 0;
+    while ((pos = s.find(search, pos)) != std::string::npos) {
+        s.replace(pos, search.length(), replace);
+        pos += replace.length();
    }
-    s = std::move(result);
 }

 static bool is_float_close(float a, float b, float abs_tol) {
Author	SHA1	Message	Date
Georgi Gerganov	bddcc5f985	llama : better replace_all	2024-08-04 13:42:08 +03:00
Daniel Bevenius	01aae2b497	baby-llama : remove duplicate vector include	2024-08-04 13:24:59 +03:00
Georgi Gerganov	4b77ea95f5	flake.lock: Update (#8847) [Some checks failed: Nix CI / nix-eval (macos-latest) (push) Waiting to run; Nix CI / nix-eval (ubuntu-latest) (push) Waiting to run; Nix CI / nix-build (macos-latest) (push) Waiting to run; Nix CI / nix-build (ubuntu-latest) (push) Waiting to run; flake8 Lint / Lint (push) Waiting to run; Nix aarch64 builds / nix-build-aarch64 (push) Has been cancelled]	2024-08-03 19:53:20 -07:00
jdomke	76614f352e	ggml : reading the runtime sve config of the cpu (#8709 ) Some checks are pending Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full-cuda.Dockerfile platforms:linux/amd64 tag:full-cuda]) (push) Waiting to run Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full.Dockerfile platforms:linux/amd64,linux/arm64 tag:full]) (push) Waiting to run Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-cuda.Dockerfile platforms:linux/amd64 tag:light-cuda]) (push) Waiting to run Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-intel.Dockerfile platforms:linux/amd64 tag:light-intel]) (push) Waiting to run Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-rocm.Dockerfile platforms:linux/amd64,linux/arm64 tag:light-rocm]) (push) Waiting to run Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli.Dockerfile platforms:linux/amd64,linux/arm64 tag:light]) (push) Waiting to run Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-cuda.Dockerfile platforms:linux/amd64 tag:server-cuda]) (push) Waiting to run Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-intel.Dockerfile platforms:linux/amd64 tag:server-intel]) (push) Waiting to run Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-rocm.Dockerfile platforms:linux/amd64,linux/arm64 tag:server-rocm]) (push) Waiting to run Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server.Dockerfile platforms:linux/amd64,linux/arm64 tag:server]) (push) Waiting to run Nix CI / nix-eval (macos-latest) (push) Waiting to run Nix CI / nix-eval (ubuntu-latest) (push) Waiting to run Nix CI / nix-build (macos-latest) (push) Waiting to run Nix CI / nix-build (ubuntu-latest) 
(push) Waiting to run flake8 Lint / Lint (push) Waiting to run * ggml : reading the runtime sve config of the cpu * change to one time init to prevent performance drop * prefix variable to avoid possible conflicts * revert xxhash fix and add brackets --------- Co-authored-by: domke <673751-domke@users.noreply.gitlab.com>	2024-08-03 18:34:41 +02:00