test [pack]

test
ci : allow creating artifacts on PRs on demand
2026-04-30 16:47:31 +03:00 · 2025-01-24 23:24:31 +01:00 · 2025-01-24 22:03:31 +01:00 · 2025-01-24 21:36:11 +01:00
10 changed files with 78 additions and 230 deletions
--- a/.devops/cpu.Dockerfile
+++ b/.devops/cpu.Dockerfile
@@ -2,10 +2,6 @@ ARG UBUNTU_VERSION=22.04

 FROM ubuntu:$UBUNTU_VERSION AS build

-ARG TARGETARCH
-
-ARG GGML_CPU_ARM_ARCH=armv8-a
-
 RUN apt-get update && \
    apt-get install -y build-essential git cmake libcurl4-openssl-dev

@@ -13,14 +9,7 @@ WORKDIR /app

 COPY . .

-RUN if [ "$TARGETARCH" = "amd64" ]; then \
-        cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
-    elif [ "$TARGETARCH" = "arm64" ]; then \
-        cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \
-    else \
-        echo "Unsupported architecture"; \
-        exit 1; \
-    fi && \
+RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \
    cmake --build build -j $(nproc)

 RUN mkdir -p /app/lib && \
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -31,6 +31,7 @@ env:
  LLAMA_LOG_COLORS: 1
  LLAMA_LOG_PREFIX: 1
  LLAMA_LOG_TIMESTAMPS: 1
+  CREATE_ARTIFACTS: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' || contains(github.event.pull_request.labels.*.name, 'artifacts') }}

 jobs:
  macOS-latest-cmake-arm64:
@@ -85,14 +86,14 @@ jobs:

      - name: Pack artifacts
        id: pack_artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        if: ${{ env.CREATE_ARTIFACTS == 'true' }}
        run: |
          cp LICENSE ./build/bin/
          cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
          zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*

      - name: Upload artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        if: ${{ env.CREATE_ARTIFACTS == 'true' }}
        uses: actions/upload-artifact@v4
        with:
          path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
@@ -149,14 +150,14 @@ jobs:

      - name: Pack artifacts
        id: pack_artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        if: ${{ env.CREATE_ARTIFACTS == 'true' }}
        run: |
          cp LICENSE ./build/bin/
          cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
          zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*

      - name: Upload artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        if: ${{ env.CREATE_ARTIFACTS == 'true' }}
        uses: actions/upload-artifact@v4
        with:
          path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
@@ -221,14 +222,14 @@ jobs:

      - name: Pack artifacts
        id: pack_artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        if: ${{ env.CREATE_ARTIFACTS == 'true' }}
        run: |
          cp LICENSE ./build/bin/
          cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/*

      - name: Upload artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        if: ${{ env.CREATE_ARTIFACTS == 'true' }}
        uses: actions/upload-artifact@v4
        with:
          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip
@@ -790,14 +791,14 @@ jobs:

      - name: Pack artifacts
        id: pack_artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        if: ${{ env.CREATE_ARTIFACTS == 'true' }}
        run: |
          Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
          Copy-Item .\examples\run\linenoise.cpp\LICENSE .\build\bin\Release\linenoise.cpp.txt
          7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*

      - name: Upload artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        if: ${{ env.CREATE_ARTIFACTS == 'true' }}
        uses: actions/upload-artifact@v4
        with:
          path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
@@ -916,10 +917,10 @@ jobs:
        shell: cmd
        run: |
          call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
-          cmake -S . -B build -G "Ninja Multi-Config" ^
-            -DLLAMA_BUILD_SERVER=ON ^
-            -DGGML_NATIVE=OFF ^
-            -DGGML_CUDA=ON ^
+          cmake -S . -B build -G "Ninja Multi-Config" ^
+            -DLLAMA_BUILD_SERVER=ON ^
+            -DGGML_NATIVE=OFF ^
+            -DGGML_CUDA=ON ^
            -DGGML_RPC=ON
          set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
          cmake --build build --config Release -j %NINJA_JOBS% -t ggml
@@ -940,19 +941,19 @@ jobs:

      - name: Pack artifacts
        id: pack_artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        if: ${{ env.CREATE_ARTIFACTS == 'true' }}
        run: |
          7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\*

      - name: Upload artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        if: ${{ env.CREATE_ARTIFACTS == 'true' }}
        uses: actions/upload-artifact@v4
        with:
          path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
          name: llama-bin-win-cu${{ matrix.cuda }}-x64.zip

      - name: Copy and pack Cuda runtime
-        if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+        if: ${{ env.CREATE_ARTIFACTS == 'true' }}
        run: |
          echo "Cuda install location: ${{ env.CUDA_PATH }}"
          $dst='.\build\bin\cudart\'
@@ -961,7 +962,7 @@ jobs:
          7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\*

      - name: Upload Cuda runtime
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        if: ${{ env.CREATE_ARTIFACTS == 'true' }}
        uses: actions/upload-artifact@v4
        with:
          path: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
@@ -1008,7 +1009,7 @@ jobs:

      - name: Build the release package
        id: pack_artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        if: ${{ env.CREATE_ARTIFACTS == 'true' }}
        run: |
          echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"

@@ -1033,7 +1034,7 @@ jobs:
          7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*

      - name: Upload the release package
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        if: ${{ env.CREATE_ARTIFACTS == 'true' }}
        uses: actions/upload-artifact@v4
        with:
          path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
@@ -1073,16 +1074,11 @@ jobs:
        run: |
          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
-          cmake -G "Unix Makefiles" -B build -S . `
-            -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
-            -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-            -DCMAKE_BUILD_TYPE=Release `
-            -DGGML_HIP=ON `
-            -DGGML_RPC=ON
+          cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release -DGGML_RPC=ON
          cmake --build build -j ${env:NUMBER_OF_PROCESSORS}

  windows-latest-cmake-hip-release:
-    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' || contains(github.event.pull_request.labels.*.name, 'artifacts') || contains(github.event.head_commit.message, '[pack]') }}
    runs-on: windows-latest

    strategy:
@@ -1116,13 +1112,7 @@ jobs:
        run: |
          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
-          cmake -G "Unix Makefiles" -B build -S . `
-            -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
-            -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-            -DCMAKE_BUILD_TYPE=Release `
-            -DAMDGPU_TARGETS=${{ matrix.gpu_target }} `
-            -DGGML_HIP=ON `
-            -DGGML_RPC=ON
+          cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=${{ matrix.gpu_target }} -DGGML_RPC=ON
          cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
          md "build\bin\rocblas\library\"
          cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
@@ -1475,37 +1465,3 @@ jobs:
 #          popd
 #          emcmake cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }}
 #          make
-
-  openEuler-latest-cmake-cann:
-    if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'Ascend NPU') }}
-    defaults:
-      run:
-       shell: bash -el {0}
-    runs-on: ubuntu-24.04-arm
-    strategy:
-      matrix:
-        cann:
-          - '8.0.rc3.beta1-910b-openeuler22.03-py3.10'
-        device:
-          - 'ascend910b3'
-        build:
-          - 'Release'
-    container: ascendai/cann:${{ matrix.cann }}
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-
-      - name: Dependencies
-        run: |
-          yum update -y
-          yum install -y git gcc gcc-c++ make cmake
-
-      - name: Build
-        run: |
-          export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
-
-          cmake -S . -B build \
-              -DCMAKE_BUILD_TYPE=${{ matrix.build }} \
-              -DGGML_CANN=on \
-              -DSOC_TYPE=${{ matrix.device }}
-          cmake --build build -j $(nproc)
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -50,8 +50,6 @@ endif()
 if (MSVC)
    add_compile_options("$<$<COMPILE_LANGUAGE:C>:/utf-8>")
    add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/utf-8>")
-    add_compile_options("$<$<COMPILE_LANGUAGE:C>:/bigobj>")
-    add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/bigobj>")
 endif()

 #
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1427,16 +1427,16 @@ struct server_queue {
    int post(server_task task, bool front = false) {
        std::unique_lock<std::mutex> lock(mutex_tasks);
        GGML_ASSERT(task.id != -1);
-        // if this is cancel task make sure to clean up pending tasks
-        if (task.type == SERVER_TASK_TYPE_CANCEL) {
-            cleanup_pending_task(task.id_target);
-        }
+        // cancel task: clean up pending tasks BEFORE queuing it - cleanup matches on id_target, so running it after the push would erase this cancel task too (and read a moved-from `task`)
+        if (task.type == SERVER_TASK_TYPE_CANCEL) {
+            cleanup_pending_task(task.id_target);
+        }
        QUE_DBG("new task, id = %d, front = %d\n", task.id, front);
        if (front) {
            queue_tasks.push_front(std::move(task));
        } else {
            queue_tasks.push_back(std::move(task));
        }
        condition_tasks.notify_one();
        return task.id;
    }
@@ -1448,16 +1448,16 @@ struct server_queue {
            if (task.id == -1) {
                task.id = id++;
            }
-            // if this is cancel task make sure to clean up pending tasks
-            if (task.type == SERVER_TASK_TYPE_CANCEL) {
-                cleanup_pending_task(task.id_target);
-            }
+            // cancel task: clean up pending tasks BEFORE queuing it - cleanup matches on id_target, so running it after the push would erase this cancel task too (and read a moved-from `task`)
+            if (task.type == SERVER_TASK_TYPE_CANCEL) {
+                cleanup_pending_task(task.id_target);
+            }
            QUE_DBG("new task, id = %d/%d, front = %d\n", task.id, (int) tasks.size(), front);
            if (front) {
                queue_tasks.push_front(std::move(task));
            } else {
                queue_tasks.push_back(std::move(task));
            }
        }
        condition_tasks.notify_one();
        return 0;
@@ -1554,10 +1554,10 @@ struct server_queue {
    }

 private:
-    void cleanup_pending_task(int id_target) {
+    void cleanup_pending_task(int id_task) {
        // no need lock because this is called exclusively by post()
-        auto rm_func = [id_target](const server_task & task) {
-            return task.id_target == id_target;
+        auto rm_func = [id_task](const server_task & task) {
+            return task.id_target == id_task;
        };
        queue_tasks.erase(
            std::remove_if(queue_tasks.begin(),          queue_tasks.end(),          rm_func),
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -154,8 +154,6 @@ option(GGML_CUDA_FA_ALL_QUANTS              "ggml: compile all quants for FlashA
 option(GGML_CUDA_GRAPHS                     "ggml: use CUDA graphs (llama.cpp only)"          ${GGML_CUDA_GRAPHS_DEFAULT})

 option(GGML_HIP                             "ggml: use HIP"                                   OFF)
-option(GGML_HIP_GRAPHS                      "ggml: use HIP graph, experimental, slow"         OFF)
-option(GGML_HIP_NO_VMM                      "ggml: do not try to use HIP VMM"                 ON)
 option(GGML_HIP_UMA                         "ggml: use HIP unified memory architecture"       OFF)
 option(GGML_VULKAN                          "ggml: use Vulkan"                                OFF)
 option(GGML_VULKAN_CHECK_RESULTS            "ggml: run Vulkan op checks"                      OFF)
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -131,10 +131,6 @@ typedef float dfloat; // dequantize float
 typedef float2 dfloat2;
 #endif // GGML_CUDA_F16

-#if (!defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)) || (defined(GGML_USE_HIP) && !defined(GGML_HIP_NO_VMM))
-#define GGML_USE_VMM
-#endif // (!defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)) || (defined(GGML_USE_HIP) && !defined(GGML_HIP_NO_VMM))
-
 #if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
 #define FP16_AVAILABLE
 #endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
@@ -592,7 +588,7 @@ struct ggml_tensor_extra_gpu {
 };


-#if ((CUDART_VERSION >= 12000) && defined(GGML_CUDA_USE_GRAPHS)) || defined(GGML_HIP_GRAPHS)
+#if (CUDART_VERSION >= 12000) && defined(GGML_CUDA_USE_GRAPHS)
 #define USE_CUDA_GRAPH
 #endif

--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -62,7 +62,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 [[noreturn]]
 void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
    int id = -1; // in case cudaGetDevice fails
-    (void)cudaGetDevice(&id);
+    cudaGetDevice(&id);

    GGML_LOG_ERROR(GGML_CUDA_NAME " error: %s\n", msg);
    GGML_LOG_ERROR("  current device: %d, in function %s at %s:%d\n", id, func, file, line);
@@ -152,7 +152,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
    for (int id = 0; id < info.device_count; ++id) {
        int device_vmm = 0;

-#if defined(GGML_USE_VMM)
+#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
        CUdevice device;
        CU_CHECK(cuDeviceGet(&device, id));
        CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
@@ -164,7 +164,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
            alloc_prop.location.id = id;
            CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
        }
-#endif // defined(GGML_USE_VMM)
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
        info.devices[id].vmm = !!device_vmm;

        cudaDeviceProp prop;
@@ -300,7 +300,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
 };

 // pool with virtual memory
-#if defined(GGML_USE_VMM)
+#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
 struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
    static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB

@@ -309,9 +309,6 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
    size_t pool_used = 0;
    size_t pool_size = 0;
    size_t granularity;
-#if defined(GGML_USE_HIP)
-    std::vector<std::pair<CUdeviceptr, size_t>> mappings;
-#endif

    explicit ggml_cuda_pool_vmm(int device) :
        device(device),
@@ -320,14 +317,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {

    ~ggml_cuda_pool_vmm() {
        if (pool_addr != 0) {
-#if defined(GGML_USE_HIP)
-            // Workaround for https://github.com/ROCm/ROCR-Runtime/issues/285
-            for (std::pair<CUdeviceptr, size_t> & mapping : mappings) {
-                CU_CHECK(cuMemUnmap(mapping.first, mapping.second));
-            }
-#else
            CU_CHECK(cuMemUnmap(pool_addr, pool_size));
-#endif
            CU_CHECK(cuMemAddressFree(pool_addr, CUDA_POOL_VMM_MAX_SIZE));
        }
    }
@@ -360,11 +350,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
            }

            // map at the end of the pool
-            CUdeviceptr start_ptr = (CUdeviceptr)((char *)(pool_addr) + pool_size);
-            CU_CHECK(cuMemMap(start_ptr, reserve_size, 0, handle, 0));
-#if defined(GGML_USE_HIP)
-            mappings.push_back({start_ptr, reserve_size});
-#endif
+            CU_CHECK(cuMemMap(pool_addr + pool_size, reserve_size, 0, handle, 0));

            // the memory allocation handle is no longer needed after mapping
            CU_CHECK(cuMemRelease(handle));
@@ -374,7 +360,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
            access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
            access.location.id = device;
            access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
-            CU_CHECK(cuMemSetAccess((CUdeviceptr)((char *)(pool_addr) + pool_size), reserve_size, &access, 1));
+            CU_CHECK(cuMemSetAccess(pool_addr + pool_size, reserve_size, &access, 1));

            // add to the pool
            pool_size += reserve_size;
@@ -386,7 +372,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {

        GGML_ASSERT(pool_addr != 0);

-        void * ptr = (void *) ((CUdeviceptr)((char *)(pool_addr) + pool_used));
+        void * ptr = (void *) (pool_addr + pool_used);
        *actual_size = size;
        pool_used += size;

@@ -405,17 +391,17 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
        pool_used -= size;

        // all deallocations must be in reverse order of the allocations
-        GGML_ASSERT(ptr == (void *) ((char *)(pool_addr) + pool_used));
+        GGML_ASSERT(ptr == (void *) (pool_addr + pool_used));
    }
 };
-#endif // defined(GGML_USE_VMM)
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)

 std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
-#if defined(GGML_USE_VMM)
+#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
    if (ggml_cuda_info().devices[device].vmm) {
        return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
    }
-#endif // defined(GGML_USE_VMM)
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
    return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
 }

@@ -561,7 +547,7 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
    cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
    if (err != cudaSuccess) {
        // clear the error
-        (void)cudaGetLastError();
+        cudaGetLastError();
        GGML_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
        return nullptr;
    }
@@ -976,7 +962,7 @@ static void * ggml_cuda_host_malloc(size_t size) {
    cudaError_t err = cudaMallocHost((void **) &ptr, size);
    if (err != cudaSuccess) {
        // clear the error
-        (void)cudaGetLastError();
+        cudaGetLastError();
        GGML_LOG_DEBUG("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
                           size / 1024.0 / 1024.0, cudaGetErrorString(err));
        return nullptr;
@@ -1223,7 +1209,7 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
                        CUDA_CHECK(err);
                    } else {
                        // reset the error
-                        (void)cudaGetLastError();
+                        cudaGetLastError();
                    }
                } else {
                    cudaError_t err = cudaDeviceDisablePeerAccess(id_other);
@@ -1231,7 +1217,7 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
                        CUDA_CHECK(err);
                    } else {
                        // reset the error
-                        (void)cudaGetLastError();
+                        cudaGetLastError();
                    }
                }
            }
@@ -2466,7 +2452,7 @@ static void maintain_cuda_graph(ggml_backend_cuda_context * cuda_ctx, std::vecto
                    if (stat == cudaErrorInvalidDeviceFunction) {
                        // Fails due to incorrect handling by CUDA runtime of CUDA BLAS node.
                        // We don't need to update blas nodes, so clear error and move on.
-                        (void)cudaGetLastError();
+                        cudaGetLastError();
                    } else {
                        GGML_ASSERT(stat == cudaSuccess);
                    }
@@ -2521,20 +2507,14 @@ static bool is_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx,
 static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {

    cudaGraphExecUpdateResultInfo result_info;
-#ifdef __HIP_PLATFORM_AMD__
-    hipGraphNode_t errorNode;
-    hipError_t stat = hipGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &errorNode, &result_info);
-#else
    cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
-#endif
    if (stat == cudaErrorGraphExecUpdateFailure) {
 #ifndef NDEBUG
        GGML_LOG_DEBUG("%s: CUDA graph update failed\n", __func__);
 #endif
-
        // The pre-existing graph exec cannot be updated due to violated constraints
        // so instead clear error and re-instantiate
-        (void)cudaGetLastError();
+        cudaGetLastError();
        CUDA_CHECK(cudaGraphExecDestroy(cuda_ctx->cuda_graph->instance));
        cuda_ctx->cuda_graph->instance = nullptr;
        CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
@@ -2762,7 +2742,7 @@ bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
    cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
    if (err != cudaSuccess) {
        // clear the error
-        (void)cudaGetLastError();
+        cudaGetLastError();

        GGML_LOG_DEBUG("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
                           size / 1024.0 / 1024.0, cudaGetErrorString(err));
@@ -2782,7 +2762,7 @@ void ggml_backend_cuda_unregister_host_buffer(void * buffer) {
    cudaError_t err = cudaHostUnregister(buffer);
    if (err != cudaSuccess) {
        // clear the error
-        (void)cudaGetLastError();
+        cudaGetLastError();
    }
 }

@@ -3250,7 +3230,7 @@ static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t
        features.push_back({ "FORCE_CUBLAS", "1" });
    #endif

-    #ifndef GGML_USE_VMM
+    #ifdef GGML_CUDA_NO_VMM
        features.push_back({ "NO_VMM", "1" });
    #endif

--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -19,12 +19,6 @@
 #define CUBLAS_TF32_TENSOR_OP_MATH 0
 #define CUDA_R_16F  HIPBLAS_R_16F
 #define CUDA_R_32F  HIPBLAS_R_32F
-#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED hipDeviceAttributeVirtualMemoryManagementSupported
-#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED hipMemAllocationGranularityRecommended
-#define CU_MEM_ALLOCATION_TYPE_PINNED hipMemAllocationTypePinned
-#define CU_MEM_LOCATION_TYPE_DEVICE hipMemLocationTypeDevice
-#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE hipMemAccessFlagsProtReadWrite
-#define CU_CHECK(fn) {hipError_t err = fn; if(err != hipSuccess) { GGML_ABORT("HipVMM Failure: %s\n", hipGetErrorString(err)); }}
 #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
 #define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
 #define cublasCreate hipblasCreate
@@ -80,21 +74,6 @@
 #define cudaMemGetInfo hipMemGetInfo
 #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
 #define cudaSetDevice hipSetDevice
-#define cuDeviceGet hipDeviceGet
-#define CUdevice hipDevice_t
-#define CUdeviceptr hipDeviceptr_t
-#define cuMemUnmap hipMemUnmap
-#define CUmemAccessDesc hipMemAccessDesc
-#define cuMemAddressFree hipMemAddressFree
-#define cuMemRelease hipMemRelease
-#define CUmemGenericAllocationHandle hipMemGenericAllocationHandle_t
-#define cuMemCreate hipMemCreate
-#define cuMemAddressReserve hipMemAddressReserve
-#define cuMemMap hipMemMap
-#define cuMemSetAccess hipMemSetAccess
-#define cuMemGetAllocationGranularity hipMemGetAllocationGranularity
-#define CUmemAllocationProp hipMemAllocationProp
-#define cuDeviceGetAttribute hipDeviceGetAttribute
 #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
 #define cudaStreamDestroy hipStreamDestroy
 #define cudaStreamFireAndForget hipStreamFireAndForget
@@ -102,28 +81,6 @@
 #define cudaStreamPerThread hipStreamPerThread
 #define cudaStreamSynchronize hipStreamSynchronize
 #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
-#define cudaGraphExec_t hipGraphExec_t
-#define cudaGraphNode_t hipGraphNode_t
-#define cudaKernelNodeParams hipKernelNodeParams
-#define cudaKernelNodeParams hipKernelNodeParams
-#define cudaGraphExecDestroy hipGraphExecDestroy
-#define cudaGraphLaunch hipGraphLaunch
-#define cudaErrorGraphExecUpdateFailure hipErrorGraphExecUpdateFailure
-#define cudaGraphExecUpdateResultInfo hipGraphExecUpdateResult
-#define cudaGraphNodeType hipGraphNodeType
-#define cudaGraphNodeTypeKernel hipGraphNodeTypeKernel
-#define cudaGraphInstantiate hipGraphInstantiate
-#define cudaStreamEndCapture hipStreamEndCapture
-#define cudaGraphDestroy hipGraphDestroy
-#define cudaGraphKernelNodeSetParams hipGraphKernelNodeSetParams
-#define cudaErrorInvalidDeviceFunction hipErrorInvalidDeviceFunction
-#define cudaGraphKernelNodeGetParams hipGraphKernelNodeGetParams
-#define cudaGraphNodeGetType hipGraphNodeGetType
-#define cudaGraphGetNodes hipGraphGetNodes
-#define cudaGraphExecUpdate hipGraphExecUpdate
-#define cudaStreamCaptureModeRelaxed hipStreamCaptureModeRelaxed
-#define cudaStreamBeginCapture hipStreamBeginCapture
-#define cudaGraph_t hipGraph_t
 #define cudaStream_t hipStream_t
 #define cudaSuccess hipSuccess
 #define __trap() do { abort(); __builtin_unreachable(); } while(0)
--- a/ggml/src/ggml-hip/CMakeLists.txt
+++ b/ggml/src/ggml-hip/CMakeLists.txt
@@ -92,14 +92,6 @@ if (GGML_CUDA_NO_PEER_COPY)
    add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
 endif()

-if (GGML_HIP_GRAPHS)
-    add_compile_definitions(GGML_HIP_GRAPHS)
-endif()
-
-if (GGML_HIP_NO_VMM)
-    add_compile_definitions(GGML_HIP_NO_VMM)
-endif()
-
 if (CXX_IS_HIPCC)
    set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
    target_link_libraries(ggml-hip PRIVATE hip::device)
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -85,10 +85,6 @@ struct vk_pipeline_struct {
    uint32_t parameter_count;
    std::array<uint32_t, 3> wg_denoms;
    uint32_t align;
-    // set to true to request the pipeline is compiled after the dryrun
-    bool needed {};
-    // set to true when the shader has been compiled
-    bool compiled {};
 };

 typedef std::shared_ptr<vk_pipeline_struct> vk_pipeline;
@@ -190,11 +186,8 @@ struct vk_device_struct {
    bool mul_mat_id_m;
    bool mul_mat_id_s;

-    // set to true to indicate that some shaders need to be compiled after the dryrun
-    bool need_compiles {};
-
-    vk_matmul_pipeline pipeline_matmul_f32 {};
-    vk_matmul_pipeline pipeline_matmul_f32_f16 {};
+    vk_matmul_pipeline pipeline_matmul_f32;
+    vk_matmul_pipeline pipeline_matmul_f32_f16;
    vk_matmul_pipeline2 pipeline_matmul_f16;
    vk_matmul_pipeline2 pipeline_matmul_f16_f32;
    vk_pipeline pipeline_matmul_split_k_reduce;
@@ -202,7 +195,7 @@ struct vk_device_struct {
    vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_COUNT];
    vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat[GGML_TYPE_COUNT];

-    vk_matmul_pipeline pipeline_matmul_id_f32 {};
+    vk_matmul_pipeline pipeline_matmul_id_f32;
    vk_matmul_pipeline2 pipeline_matmul_id_f16;
    vk_matmul_pipeline2 pipeline_matmul_id_f16_f32;

@@ -783,6 +776,13 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
    GGML_ASSERT(parameter_count > 0);
    GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT

+    pipeline = std::make_shared<vk_pipeline_struct>();
+    pipeline->name = name;
+    pipeline->parameter_count = parameter_count;
+    pipeline->push_constant_size = push_constant_size;
+    pipeline->wg_denoms = wg_denoms;
+    pipeline->align = align;
+
    vk::ShaderModuleCreateInfo shader_module_create_info({}, spv_size, reinterpret_cast<const uint32_t *>(spv_data));
    pipeline->shader_module = device->device.createShaderModule(shader_module_create_info);

@@ -865,7 +865,6 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
    }

    pipeline->pipeline = device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
-    pipeline->compiled = true;

    {
        std::lock_guard<std::mutex> guard(device->mutex);
@@ -876,6 +875,12 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
        std::lock_guard<std::mutex> guard(compile_count_mutex);
        assert(compile_count > 0);
        compile_count--;
+
+        // "Progress bar" for shader compiles
+        static uint32_t total_compile_count = 0;
+        if ((total_compile_count++ % 10) == 0) {
+            std::cerr << ".";
+        }
    }
    compile_count_cond.notify_all();
 }
@@ -901,10 +906,6 @@ static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline)
 static void ggml_pipeline_request_descriptor_sets(vk_device& device, vk_pipeline& pipeline, uint32_t n) {
    VK_LOG_DEBUG("ggml_pipeline_request_descriptor_sets(" << pipeline->name << ", " << n << ")");
    device->pipeline_descriptor_set_requirements[pipeline->name] += n;
-    if (!pipeline->compiled) {
-        pipeline->needed = true;
-        device->need_compiles = true;
-    }
 }

 static void ggml_pipeline_allocate_descriptor_sets(vk_device& device) {
@@ -1387,6 +1388,8 @@ static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vec
 static void ggml_vk_load_shaders(vk_device& device) {
    VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")");

+    std::cerr << "ggml_vulkan: Compiling shaders";
+
    // some shaders have a minimum subgroup size
    const uint32_t subgroup_size_16 = std::max(device->subgroup_size, 16u);
    const uint32_t subgroup_size_32 = std::max(device->subgroup_size, 32u);
@@ -1524,33 +1527,15 @@ static void ggml_vk_load_shaders(vk_device& device) {
        }
    }

-    if (!device->pipeline_matmul_f32) {
-        device->pipeline_matmul_f32 = std::make_shared<vk_matmul_pipeline_struct>();
-    }
-    if (!device->pipeline_matmul_f32_f16) {
-        device->pipeline_matmul_f32_f16 = std::make_shared<vk_matmul_pipeline_struct>();
-    }
-    if (!device->pipeline_matmul_id_f32) {
-        device->pipeline_matmul_id_f32 = std::make_shared<vk_matmul_pipeline_struct>();
-    }
+    device->pipeline_matmul_f32 = std::make_shared<vk_matmul_pipeline_struct>();
+    device->pipeline_matmul_f32_f16 = std::make_shared<vk_matmul_pipeline_struct>();
+
+    device->pipeline_matmul_id_f32 = std::make_shared<vk_matmul_pipeline_struct>();

    std::vector<std::future<void>> compiles;
    auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint,
                                              uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, const std::vector<uint32_t>& specialization_constants,
                                              uint32_t align, bool disable_robustness = false, bool require_full_subgroups = false, uint32_t required_subgroup_size = 0) {
-
-        if (!pipeline) {
-            pipeline = std::make_shared<vk_pipeline_struct>();
-            pipeline->name = name;
-            pipeline->parameter_count = parameter_count;
-            pipeline->push_constant_size = push_constant_size;
-            pipeline->wg_denoms = wg_denoms;
-            pipeline->align = align;
-        }
-
-        if (!pipeline->needed || pipeline->compiled) {
-            return;
-        }
        {
            // wait until fewer than N compiles are in progress
            uint32_t N = std::max(1u, std::thread::hardware_concurrency());
@@ -2065,7 +2050,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
    for (auto &c : compiles) {
        c.wait();
    }
-    device->need_compiles = false;
+    std::cerr << "Done!" << std::endl;
 }

 static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props);
@@ -7671,9 +7656,6 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
    for (int i = 0; i < cgraph->n_nodes; i++) {
        ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false);
    }
-    if (ctx->device->need_compiles) {
-        ggml_vk_load_shaders(ctx->device);
-    }
    ggml_vk_preallocate_buffers(ctx);
    ggml_pipeline_allocate_descriptor_sets(ctx->device);
Author	SHA1	Message	Date
slaren	de9d2c6f09	test [pack]	2025-01-24 23:24:31 +01:00
slaren	df0edbb0be	test	2025-01-24 22:03:31 +01:00
slaren	202b1e7105	ci : allow creating artifacts on PRs on demand	2025-01-24 21:36:11 +01:00