Compare commits

1 commit

Author: Georgi Gerganov
SHA1: 36ddd12924
Message: llama : add flash attention (demo)
Date: 2023-04-05 22:12:04 +03:00

4 changed files with 35 additions and 43 deletions

README.md

@@ -350,22 +350,20 @@ We have two Docker images available for this project:
The easiest way to download the models, convert them to ggml, and optimize them is with the --all-in-one command, which is included in the full Docker image.
Replace `/path/to/models` below with the actual path where you downloaded the models.
```bash
-docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
+docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
```
On completion, you are ready to play!
```bash
-docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
+docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
```
or with the light image:
```bash
-docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
+docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
```
### Contributing

main.cpp

@@ -431,7 +431,7 @@ int main(int argc, char ** argv) {
}
// end of text token
-if (!embd.empty() && embd.back() == llama_token_eos()) {
+if (embd.back() == llama_token_eos()) {
if (params.instruct) {
is_interacting = true;
} else {

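Note on the main.cpp hunk above: `embd` is a `std::vector<llama_token>`, and calling `back()` on an empty vector is undefined behavior, which is what the `!embd.empty()` guard on the removed side of the pair protects against. A minimal, self-contained sketch of the safer form; the helper name and the EOS token constant are illustrative stand-ins, not taken from the diff:

```cpp
#include <cassert>
#include <vector>

using llama_token = int;                  // matches the typedef in llama.h
static const llama_token kEosToken = 2;   // stand-in for llama_token_eos()

// Checking empty() first avoids undefined behavior from back() on an empty vector.
static bool ends_with_eos(const std::vector<llama_token> & embd) {
    return !embd.empty() && embd.back() == kEosToken;
}

int main() {
    std::vector<llama_token> embd;
    assert(!ends_with_eos(embd));   // safe even before any token was generated
    embd.push_back(kEosToken);
    assert(ends_with_eos(embd));
    return 0;
}
```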
ggml.c (41 lines changed)

@@ -7238,6 +7238,7 @@ static void ggml_compute_forward_rope_f32(
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
+assert(params->ith == 0);
assert(src1->type == GGML_TYPE_I32);
assert(ggml_nelements(src1) == 3);
@@ -7264,28 +7265,11 @@ static void ggml_compute_forward_rope_f32(
assert(nb0 == sizeof(float));
-const int ith = params->ith;
-const int nth = params->nth;
-const int nr = ggml_nrows(src0);
-// rows per thread
-const int dr = (nr + nth - 1)/nth;
-// row range for this thread
-const int ir0 = dr*ith;
-const int ir1 = MIN(ir0 + dr, nr);
-// row index used to determine which thread to use
-int ir = 0;
// TODO: optimize
for (int64_t i3 = 0; i3 < ne3; i3++) {
for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {
const int p = (mode == 0 ? n_past + i2 : i2);
for (int64_t i1 = 0; i1 < ne1; i1++) {
-if (ir++ < ir0) continue;
-if (ir > ir1) break;
for (int i0 = 0; i0 < n_dims; i0 += 2) {
const float theta = powf(10000.0, ((float)-i0)/n_dims);
@@ -7311,6 +7295,7 @@ static void ggml_compute_forward_rope_f16(
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
+assert(params->ith == 0);
assert(src1->type == GGML_TYPE_I32);
assert(ggml_nelements(src1) == 3);
@@ -7337,28 +7322,10 @@ static void ggml_compute_forward_rope_f16(
assert(nb0 == sizeof(ggml_fp16_t));
-const int ith = params->ith;
-const int nth = params->nth;
-const int nr = ggml_nrows(src0);
-// rows per thread
-const int dr = (nr + nth - 1)/nth;
-// row range for this thread
-const int ir0 = dr*ith;
-const int ir1 = MIN(ir0 + dr, nr);
-// row index used to determine which thread to use
-int ir = 0;
for (int64_t i3 = 0; i3 < ne3; i3++) {
for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {
const int p = (mode == 0 ? n_past + i2 : i2);
for (int64_t i1 = 0; i1 < ne1; i1++) {
-if (ir++ < ir0) continue;
-if (ir > ir1) break;
for (int i0 = 0; i0 < n_dims; i0 += 2) {
const float theta = powf(10000.0, ((float)-i0)/n_dims);
@@ -9457,7 +9424,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
} break;
case GGML_OP_ROPE:
{
-node->n_tasks = n_threads;
+node->n_tasks = 1;
} break;
case GGML_OP_CONV_1D_1S:
case GGML_OP_CONV_1D_2S:

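Note on the ggml.c hunks above: the added `assert(params->ith == 0)` lines and the `n_tasks = 1` scheduler change go together. When the `GGML_OP_ROPE` node is dispatched as a single task, only thread 0 ever enters the kernel, so the removed per-thread row partition is unnecessary. For reference, the removed lines follow the usual ggml pattern of splitting rows evenly across threads; below is a standalone sketch of that partitioning, with made-up row and thread counts rather than values from the diff:

```cpp
#include <algorithm>
#include <cstdio>

// Each of nth threads gets a contiguous block of dr = ceil(nr/nth) rows,
// mirroring the dr/ir0/ir1 arithmetic in the removed rope code.
static void print_row_ranges(int nr /* total rows */, int nth /* threads */) {
    const int dr = (nr + nth - 1) / nth;          // rows per thread, rounded up
    for (int ith = 0; ith < nth; ++ith) {         // ith plays the role of params->ith
        const int ir0 = dr * ith;                 // first row owned by this thread
        const int ir1 = std::min(ir0 + dr, nr);   // one past the last owned row
        std::printf("thread %d: rows [%d, %d)\n", ith, ir0, ir1);
    }
}

int main() {
    print_row_ranges(10, 4);   // example output: rows [0,3) [3,6) [6,9) [9,10)
    return 0;
}
```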
llama.cpp

@@ -28,6 +28,8 @@
#define LLAMA_USE_SCRATCH
#define LLAMA_MAX_SCRATCH_BUFFERS 16
+#define LLAMA_USE_FLASH_ATTN
#define LLAMA_ASSERT(x) \
do { \
if (!(x)) { \
@@ -829,6 +831,30 @@ static bool llama_eval_internal(
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
}
+#ifdef LLAMA_USE_FLASH_ATTN
+struct ggml_tensor * Q =
+ggml_permute(ctx0,
+ggml_cpy(ctx0,
+Qcur,
+ggml_new_tensor_3d(ctx0, GGML_TYPE_F16, n_embd/n_head, n_head, N)),
+0, 2, 1, 3);
+struct ggml_tensor * K =
+ggml_permute(ctx0,
+ggml_reshape_3d(ctx0,
+ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
+n_embd/n_head, n_head, n_past + N),
+0, 2, 1, 3);
+struct ggml_tensor * V =
+ggml_view_3d(ctx0, kv_self.v,
+n_past + N, n_embd/n_head, n_head,
+n_ctx*ggml_element_size(kv_self.v),
+n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
+il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
+struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true);
+#else
struct ggml_tensor * Q =
ggml_permute(ctx0,
Qcur,
@@ -872,6 +898,7 @@ static bool llama_eval_internal(
// is there a better way?
struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
#endif
+#endif
// KQV_merged = KQV.permute(0, 2, 1, 3)
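Note on the llama.cpp hunk above: `ggml_flash_attn(ctx0, Q, K, V, true)` fuses into a single graph node what the `#else` path builds as an explicit chain of ops (K*Q matmul, scale by 1/sqrt(head_dim), causal mask, softmax, then the `ggml_mul_mat(ctx0, V_cont, KQ_soft_max)` shown above), so no separate KQ or softmax tensor appears in the graph; the final `true` argument requests the masked (causal) variant. Below is a rough sketch of the unfused chain, reconstructed from the llama.cpp code of this period rather than quoted from the hunk; the variable names and the scale factor are assumptions:

```cpp
#include <math.h>
#include "ggml.h"

// Builds the attention scores the way the non-flash #else branch does, up to the
// point where the result is multiplied with V. Q and K are expected in the
// permuted (head_dim, tokens, n_head) layout produced by the code in the hunk.
static struct ggml_tensor * attention_scores_sketch(
        struct ggml_context * ctx0,
        struct ggml_tensor  * Q,      // (n_embd/n_head, N,          n_head)
        struct ggml_tensor  * K,      // (n_embd/n_head, n_past + N, n_head)
        int n_embd, int n_head, int n_past) {
    // KQ = K * Q  -> one (n_past + N, N) score matrix per head
    struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);

    // scale by 1/sqrt(head_dim)
    struct ggml_tensor * KQ_scaled =
        ggml_scale(ctx0, KQ, ggml_new_f32(ctx0, 1.0f/sqrtf((float)n_embd/n_head)));

    // causal mask: keys beyond the current position are set to -inf
    struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);

    // softmax over the key dimension gives KQ_soft_max, which the #else path
    // then multiplies with the (contiguous copy of the) cached V
    return ggml_soft_max(ctx0, KQ_masked);
}
```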