Compare commits


4 Commits

Author           SHA1        Message                                                      Date
Georgi Gerganov  ab1b75166f  Merge branch 'master' into gg/ggml_scale (ggml-ci)           2023-12-21 22:35:11 +02:00
Georgi Gerganov  b784f881c3  tests : fix test-grad0 (ggml-ci)                             2023-12-21 21:34:31 +02:00
Georgi Gerganov  36c3f41f66  ggml : fix CPU implementation                                2023-12-21 21:02:23 +02:00
Georgi Gerganov  199f6bdc46  ggml : change ggml_scale to take a float instead of tensor  2023-12-21 20:50:24 +02:00
10 changed files with 23 additions and 87 deletions
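
For orientation, the central change on this branch is the `ggml_scale()` signature (commit 199f6bdc46: "ggml : change ggml_scale to take a float instead of tensor"). The sketch below contrasts the old and new call sites; the context setup and tensor names are illustrative only and are not taken from the diffs that follow.

```c
// Minimal sketch of the ggml_scale() signature change; the surrounding
// context/tensor setup is illustrative, not part of the compared diffs.
#include "ggml.h"

int main(void) {
    struct ggml_init_params iparams = {
        .mem_size   = 16 * 1024 * 1024,   // small scratch arena for this example
        .mem_buffer = NULL,
        .no_alloc   = false,
    };
    struct ggml_context * ctx = ggml_init(iparams);

    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);

    // before this branch: the scale factor had to be wrapped in a 1-element tensor
    // struct ggml_tensor * y = ggml_scale(ctx, x, ggml_new_f32(ctx, 0.5f));

    // after this branch: the scale factor is passed directly as a float
    struct ggml_tensor * y = ggml_scale(ctx, x, 0.5f);

    (void) y;
    ggml_free(ctx);
    return 0;
}
```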

.github/workflows/docker.yml

@@ -52,36 +52,6 @@ jobs:
username: ${{ github.repository_owner }}
password: ${{ secrets.GITHUB_TOKEN }}
# https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example
- name: Free Disk Space (Ubuntu)
uses: jlumbroso/free-disk-space@main
with:
# this might remove tools that are actually needed,
# if set to "true" but frees about 6 GB
tool-cache: false
# all of these default to true, but feel free to set to
# "false" if necessary for your workflow
android: true
dotnet: true
haskell: true
large-packages: true
docker-images: true
swap-storage: true
- name: Determine tag name
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
else
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi
- name: Build and push Docker image (versioned)
if: github.event_name == 'push'
uses: docker/build-push-action@v4
@@ -89,7 +59,7 @@ jobs:
context: .
push: true
platforms: ${{ matrix.config.platforms }}
tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
file: ${{ matrix.config.dockerfile }}
- name: Build and push Docker image (tagged)
@@ -98,5 +68,5 @@ jobs:
context: .
push: ${{ github.event_name == 'push' }}
platforms: ${{ matrix.config.platforms }}
tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}" , "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}"
file: ${{ matrix.config.dockerfile }}

Makefile

@@ -452,9 +452,6 @@ ifdef LLAMA_HIPBLAS
LLAMA_CUDA_MMV_Y ?= 1
LLAMA_CUDA_KQUANTS_ITER ?= 2
MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
ifdef LLAMA_HIP_UMA
MK_CPPFLAGS += -DGGML_HIP_UMA
endif # LLAMA_HIP_UMA
MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS))
@@ -609,7 +606,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual
gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)

README.md

@@ -123,7 +123,6 @@ as the main playground for developing new features for the [ggml](https://github
- Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
- React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
- Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
- Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
**UI:**
@@ -440,13 +439,7 @@ Building the program with BLAS support may lead to some performance improvements
&& cmake --build build -- -j 16
```
On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON`.
However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
- Using `make` (example for target gfx1030, build with 16 CPU threads):
```bash
make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
```
However, this hurts performance for non-integrated GPUs.
- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
```bash
set PATH=%HIP_PATH%\bin;%PATH%

examples/gguf/CMakeLists.txt

@@ -1,5 +1,5 @@
set(TARGET gguf)
add_executable(${TARGET} gguf.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

examples/gguf/gguf.cpp

@@ -1,4 +1,5 @@
#include "ggml.h"
#include "llama.h"
#include <cstdio>
#include <cinttypes>

ggml.c

@@ -15335,8 +15335,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
const int n_past = ((int32_t *) tensor->op_params)[0];
src0->grad =
ggml_add_or_set(ctx, src0->grad,
/* ggml_diag_mask_inf_impl() shouldn't be here */
/* ref: https://github.com/ggerganov/llama.cpp/pull/4203#discussion_r1412377992 */
ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false),
zero_table);
}

ggml.h

@@ -484,8 +484,7 @@ extern "C" {
enum ggml_log_level {
GGML_LOG_LEVEL_ERROR = 2,
GGML_LOG_LEVEL_WARN = 3,
GGML_LOG_LEVEL_INFO = 4,
GGML_LOG_LEVEL_DEBUG = 5
GGML_LOG_LEVEL_INFO = 4
};
// ggml object

gguf-py/README.md

@@ -3,7 +3,7 @@
This is a Python package for writing binary files in the [GGUF](https://github.com/ggerganov/ggml/pull/302)
(GGML Universal File) format.
See [convert-llama-hf-to-gguf.py](https://github.com/ggerganov/llama.cpp/blob/master/convert-hf-to-gguf.py)
See [convert-llama-hf-to-gguf.py](https://github.com/ggerganov/llama.cpp/blob/master/convert-llama-hf-to-gguf.py)
as an example for its usage.
## Installation

llama.cpp

@@ -2372,8 +2372,7 @@ struct llama_model_loader {
}
}
// Returns false if cancelled by progress_callback
bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const {
void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const {
size_t size_data = 0;
for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
@@ -2405,9 +2404,7 @@ struct llama_model_loader {
GGML_ASSERT(cur); // unused tensors should have been caught by load_data already
if (progress_callback) {
if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
return false;
}
progress_callback((float) size_done / size_data, progress_callback_user_data);
}
const size_t offs = file_offset(ggml_get_name(cur));
@@ -2469,11 +2466,8 @@ struct llama_model_loader {
}
if (progress_callback) {
// Even though the model is done loading, we still honor
// cancellation since we need to free allocations.
return progress_callback(1.0f, progress_callback_user_data);
progress_callback(1.0f, progress_callback_user_data);
}
return true;
}
};
@@ -3050,8 +3044,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
}
// Returns false if cancelled by progress_callback
static bool llm_load_tensors(
static void llm_load_tensors(
llama_model_loader & ml,
llama_model & model,
int n_gpu_layers,
@@ -3729,20 +3722,16 @@ static bool llm_load_tensors(
model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
}
if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL)) {
return false;
}
ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL);
model.mapping = std::move(ml.mapping);
// loading time will be recalculated after the first eval, so
// we take page faults deferred by mmap() into consideration
model.t_load_us = ggml_time_us() - model.t_start_us;
return true;
}
// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
static int llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
try {
llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
@@ -3760,21 +3749,19 @@ static int llama_model_load(const std::string & fname, llama_model & model, cons
if (params.vocab_only) {
LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
return 0;
return true;
}
if (!llm_load_tensors(
llm_load_tensors(
ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock,
params.progress_callback, params.progress_callback_user_data
)) {
return -2;
}
);
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
return -1;
return false;
}
return 0;
return true;
}
//
@@ -9154,18 +9141,11 @@ struct llama_model * llama_load_model_from_file(
LLAMA_LOG_INFO("\n");
}
}
return true;
};
}
int status = llama_model_load(path_model, *model, params);
GGML_ASSERT(status <= 0);
if (status < 0) {
if (status == -1) {
LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
} else if (status == -2) {
LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
}
if (!llama_model_load(path_model, *model, params)) {
LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
delete model;
return nullptr;
}

llama.h

@@ -127,7 +127,7 @@ extern "C" {
bool sorted;
} llama_token_data_array;
typedef bool (*llama_progress_callback)(float progress, void *ctx);
typedef void (*llama_progress_callback)(float progress, void *ctx);
// Input data for llama_decode
// A llama_batch object can contain input about one or many sequences
@@ -180,9 +180,7 @@ extern "C" {
int32_t main_gpu; // the GPU that is used for scratch and small tensors
const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
// Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
// If the provided progress_callback returns true, model loading continues.
// If it returns false, model loading is immediately aborted.
// called with a progress value between 0 and 1, pass NULL to disable
llama_progress_callback progress_callback;
// context pointer passed to the progress callback
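
As a usage reference for the hunk above, here is a minimal sketch of a cancelling progress callback written against the `bool`-returning `llama_progress_callback` variant (returning `false` aborts the load, as the comment states). The model path, the cancellation flag, and the surrounding `main()` are illustrative assumptions, not part of the diff.

```c
// Sketch: a progress callback that can cancel model loading.
// Assumes the bool-returning llama_progress_callback shown above.
#include <stdbool.h>
#include <stdio.h>

#include "llama.h"

static volatile bool g_cancel = false; // e.g. flipped by a SIGINT handler

static bool on_progress(float progress, void * ctx) {
    (void) ctx;
    fprintf(stderr, "\rloading: %3.0f%%", progress * 100.0f);
    return !g_cancel; // returning false aborts the load immediately
}

int main(void) {
    struct llama_model_params mparams = llama_model_default_params();
    mparams.progress_callback           = on_progress;
    mparams.progress_callback_user_data = NULL;

    // "model.gguf" is a placeholder path for this sketch
    struct llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model == NULL) {
        fprintf(stderr, "\nload failed or was cancelled\n");
        return 1;
    }

    llama_free_model(model);
    return 0;
}
```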