Support dup & cont ops on CUDA (#2242 )

llama : fix t_start_sample_us initialization warning (#2238 )
ggml : fixed runtime bugs and compile errors related to GGML_PERF and GGML_DEBUG (#2219 )
2026-04-23 16:37:33 +03:00 · 2023-07-17 20:39:29 +03:00 · 2023-07-17 00:01:45 +03:00 · 2023-07-16 22:57:28 +03:00 · 2023-07-16 22:54:47 +03:00
5 changed files with 27 additions and 13 deletions
--- a/README.md
+++ b/README.md
@@ -640,7 +640,7 @@ Please verify the [sha256 checksums](SHA256SUMS) of all downloaded model files t

 ```bash
 # run the verification script
-python3 .\scripts\verify-checksum-models.py
+./scripts/verify-checksum-models.py
 ```

 - On linux or macOS it is also possible to run the following commands to verify if you have all possible latest files in your self-installed `./models` subdirectory:
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -3537,6 +3537,11 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
    (void) dst;
 }

+void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_cpy(src0, dst, nullptr);
+    (void) src1;
+}
+
 void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true);
@@ -3670,7 +3675,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
    // recursively assign CUDA buffers until a compute tensor is found
    if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
        const ggml_op src0_op = tensor->src[0]->op;
-        if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
+        if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
            ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
        }
    }
@@ -3776,6 +3781,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);

    switch (tensor->op) {
+        case GGML_OP_DUP:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cuda_dup;
+            break;
        case GGML_OP_ADD:
            if (!any_on_device) {
                return false;
@@ -3830,6 +3841,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
            }
            func = ggml_cuda_cpy;
            break;
+        case GGML_OP_CONT:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cuda_dup;
+            break;
        case GGML_OP_RESHAPE:
        case GGML_OP_VIEW:
        case GGML_OP_PERMUTE:
--- a/ggml.c
+++ b/ggml.c
@@ -4412,8 +4412,8 @@ void ggml_free(struct ggml_context * ctx) {
        if (&g_state.contexts[i].context == ctx) {
            g_state.contexts[i].used = false;

-            GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n",
-                    __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size);
+            GGML_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n",
+                    __func__, i, ggml_used_mem(ctx));

            if (ctx->mem_buffer_owned) {
                GGML_ALIGNED_FREE(ctx->mem_buffer);
@@ -16317,8 +16317,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                if (GGML_OP_HAS_FINALIZE[node->op]) {
                    params.nth = n_tasks_arr[node_n];
                    ggml_compute_forward(&params, node);
-                    ggml_graph_compute_perf_stats_node(node, state->shared);
                }
+                ggml_graph_compute_perf_stats_node(node, state->shared);
            }

            // distribute new work or execute it direct if 1T
@@ -16348,8 +16348,9 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                    if (GGML_OP_HAS_FINALIZE[node->op]) {
                        params.type = GGML_TASK_FINALIZE;
                        ggml_compute_forward(&params, node);
-                        ggml_graph_compute_perf_stats_node(node, state->shared);
                    }
+
+                    ggml_graph_compute_perf_stats_node(node, state->shared);
                } else {
                    break;
                }
@@ -16891,9 +16892,6 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
 }

 void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
-    //assert(cgraph->work      == NULL);
-    //assert(cgraph->work_size == 0);
-
    uint64_t size_eval = 0;

    // compute size of intermediate results
@@ -17332,9 +17330,6 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {

    GGML_PRINT("=== GRAPH ===\n");

-    GGML_PRINT_DEBUG("n_threads       = %d\n",        cgraph->n_threads);
-    GGML_PRINT_DEBUG("total work size = %zu bytes\n", cgraph->work_size);
-
    GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes);
    for (int i = 0; i < cgraph->n_nodes; i++) {
        struct ggml_tensor * node = cgraph->nodes[i];
--- a/llama.cpp
+++ b/llama.cpp
@@ -2205,7 +2205,7 @@ void llama_sample_classifier_free_guidance(
          struct llama_context * guidance_ctx,
                         float   scale,
                         float   smooth_factor) {
-    int64_t t_start_sample_us = t_start_sample_us = ggml_time_us();
+    int64_t t_start_sample_us = ggml_time_us();

    assert(ctx);
    auto n_vocab = llama_n_vocab(ctx);
--- a/scripts/verify-checksum-models.py
+++ b/scripts/verify-checksum-models.py
@@ -1,3 +1,5 @@
+#!/bin/env python3
+
 import os
 import hashlib
Author	SHA1	Message	Date
Jiahao Li	7568d1a2b2	Support dup & cont ops on CUDA (#2242 )	2023-07-17 20:39:29 +03:00
Alex Klinkhamer	b7647436cc	llama : fix t_start_sample_us initialization warning (#2238 )	2023-07-17 00:01:45 +03:00
Qingyou Meng	672dda10e4	ggml : fixed runtime bugs and compile errors related to GGML_PERF and GGML_DEBUG (#2219 ) * fixed runtime bugs and compile errors related to GGML_PERF and GGML_DEBUG * remove ifdef GGML_PERF; update fmt	2023-07-16 22:57:28 +03:00
Jiří Podivín	27ab66e437	py : turn verify-checksum-models.py into executable (#2245 ) README.md was adjusted to reflect the change. Signed-off-by: Jiri Podivin <jpodivin@gmail.com>	2023-07-16 22:54:47 +03:00