metal : minor

common : don't do warm-up with more than n_batch tokens (close #3058)
ggml-ci
2026-04-23 16:37:33 +03:00 · 2023-09-07 15:33:21 +03:00 · 2023-09-07 15:32:38 +03:00 · 2023-09-07 15:20:07 +03:00 · 2023-09-07 14:59:48 +03:00 · 2023-09-07 14:11:55 +03:00
4 changed files with 49 additions and 46 deletions
--- a/30
+++ b/30
@@ -42,9 +42,9 @@ endif

 default: $(BUILD_TARGETS)

-test: $(TEST_TARGETS)
-	@failures=0; \
-	for test_target in $(TEST_TARGETS); do \
+test:
+	@echo "Running tests..."
+	@for test_target in $(TEST_TARGETS); do \
 		if [ "$$test_target" = "tests/test-tokenizer-0-llama" ]; then \
 			./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
 		elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
@@ -52,21 +52,10 @@ test: $(TEST_TARGETS)
 		elif [ "$$test_target" = "tests/test-tokenizer-1" ]; then \
 			continue; \
 		else \
-			echo "Running test $$test_target..."; \
 			./$$test_target; \
 		fi; \
-		if [ $$? -ne 0 ]; then \
-			printf 'Test $$test_target FAILED!\n\n' $$test_target; \
-			failures=$$(( failures + 1 )); \
-		else \
-			printf 'Test %s passed.\n\n' $$test_target; \
-		fi; \
-	done; \
-	if [ $$failures -gt 0 ]; then \
-		printf '\n%s tests failed.\n' $$failures; \
-		exit 1; \
-	fi
-	@echo 'All tests passed.'
+	done
+	@echo "All tests have been run."

 all: $(BUILD_TARGETS) $(TEST_TARGETS)

@@ -102,8 +91,8 @@ else
 OPT = -O3
 endif
 MK_CPPFLAGS = -I. -Icommon
-MK_CFLAGS   = $(OPT) -std=c11   -fPIC
-MK_CXXFLAGS = $(OPT) -std=c++11 -fPIC
+MK_CFLAGS   = $(CPPFLAGS) $(OPT) -std=c11   -fPIC
+MK_CXXFLAGS = $(CPPFLAGS) $(OPT) -std=c++11 -fPIC
 MK_LDFLAGS  =

 ifdef LLAMA_DEBUG
@@ -392,8 +381,9 @@ k_quants.o: k_quants.c k_quants.h
 endif # LLAMA_NO_K_QUANTS

 # combine build flags with cmdline overrides
-override CFLAGS   := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS)
-override CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
+override CPPFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS)
+override CFLAGS   := $(MK_CFLAGS) $(CFLAGS)
+override CXXFLAGS := $(MK_CXXFLAGS) $(CXXFLAGS)
 override LDFLAGS  := $(MK_LDFLAGS) $(LDFLAGS)

 #
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -986,12 +986,7 @@ int main(int argc, char ** argv) {
        test t(inst, lmodel, ctx);

        // warmup run
-        if (t.n_prompt > 0) {
-            test_prompt(ctx, std::min(2, t.n_batch), 0, t.n_batch, t.n_threads);
-        }
-        if (t.n_gen > 0) {
-            test_gen(ctx, 1, 0, t.n_threads);
-        }
+        test_gen(ctx, 1, 0, t.n_threads);

        for (int i = 0; i < params.reps; i++) {
            uint64_t t_start = get_time_ns();
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -1141,7 +1141,7 @@ void ggml_metal_graph_compute(
                            [encoder setBytes:&freq_base  length:sizeof(float) atIndex:21];
                            [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22];

-                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)];
+                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                        } break;
                    case GGML_OP_DUP:
                    case GGML_OP_CPY:
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -220,14 +220,29 @@ kernel void kernel_norm(
        }
        threadgroup_barrier(mem_flags::mem_threadgroup);
    }
-    const float mean  = sum[0] / ne00;
-
-    // recenter and VARIANCE
+    // broadcast
+    if (tpitg == 0) {
+        sum[0] /= ne00;
+    }
    threadgroup_barrier(mem_flags::mem_threadgroup);
+    const float mean  = sum[0];
+
+    // recenter
    device float * y = dst + tgpig*ne00;
-    sum[tpitg] = 0.0f;
    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
        y[i00] = x[i00] - mean;
+    }
+
+    // VARIANCE
+    // parallel sum
+    //
+    // WARNING: combining this loop with the one above will give you wrong results for nth == 256
+    //          I have no idea why, so for now I am keeping them separate. But this behavior is very concerning.
+    //          Tested with:
+    //          ./perplexity -m ./falcon-7b/ggml-model-q4_0.gguf -f wiki.test.raw -ngl 1 -t 4
+    //
+    sum[tpitg] = 0.0f;
+    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
        sum[tpitg] += y[i00] * y[i00];
    }

@@ -239,7 +254,12 @@ kernel void kernel_norm(
        }
        threadgroup_barrier(mem_flags::mem_threadgroup);
    }
-    const float variance = sum[0] / ne00;
+    // broadcast
+    if (tpitg == 0) {
+        sum[0] /= ne00;
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    const float variance = sum[0];

    const float scale = 1.0f/sqrt(variance + eps);
    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
@@ -682,27 +702,25 @@ kernel void kernel_rope(
        constant       int & mode,
        constant     float & freq_base,
        constant     float & freq_scale,
-        uint  tiitg[[thread_index_in_threadgroup]],
-        uint3 tptg[[threads_per_threadgroup]],
-        uint3 tgpig[[threadgroup_position_in_grid]]) {
-    const int64_t i3 = tgpig[2];
-    const int64_t i2 = tgpig[1];
-    const int64_t i1 = tgpig[0];
+        uint3 tpig[[thread_position_in_grid]]) {
+    const int64_t i3 = tpig[2];
+    const int64_t i2 = tpig[1];
+    const int64_t i1 = tpig[0];

    const bool is_neox = mode & 2;
+    const float theta_scale = pow(freq_base, -2.0f/n_dims);

    const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);

-    const float theta_0 = freq_scale * (float)p;
-    const float inv_ndims = -1.f/n_dims;
+    float theta = freq_scale * (float)p;

    if (!is_neox) {
-        for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) {
-
-            const float theta = theta_0 * pow(freq_base, inv_ndims*i0);
+        for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
            const float cos_theta = cos(theta);
            const float sin_theta = sin(theta);

+            theta *= theta_scale;
+
            device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
            device       float * dst_data  = (device float *)((device char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);

@@ -714,12 +732,12 @@ kernel void kernel_rope(
        }
    } else {
        for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
-            for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) {
-
-                const float theta = theta_0 * pow(freq_base, inv_ndims*ic - ib);
+            for (int64_t ic = 0; ic < n_dims; ic += 2) {
                const float cos_theta = cos(theta);
                const float sin_theta = sin(theta);

+                theta *= theta_scale;
+
                const int64_t i0 = ib*n_dims + ic/2;

                device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
Author	SHA1	Message	Date
Georgi Gerganov	2f689dee06	metal : minor	2023-09-07 15:33:21 +03:00
Georgi Gerganov	efac2d469f	common : don't do warm-up with more than n_batch tokens (close #3058) ggml-ci	2023-09-07 15:32:38 +03:00
Georgi Gerganov	783379670a	metal : restore original F16 mat-vec multiplication It works after the norm fixes	2023-09-07 15:20:07 +03:00
Georgi Gerganov	ed92c3d4b2	metal : put warning in kernel_norm to not combine the loops	2023-09-07 14:59:48 +03:00
Georgi Gerganov	5e1c4089d8	metal : fix kernel_norm ggml-ci	2023-09-07 14:11:55 +03:00