Compare commits

...

11 Commits
b7965 ... b7976

Author SHA1 Message Date
Georgi Gerganov
972f323e73 revert : "[Model] Qwen3.5 dense and MoE support (no vision) (#19435)" (#19453)
This reverts commit 39bf692af1.
2026-02-09 14:57:51 +02:00
Kevin Pouget
f5e7734ff2 ggml-virtgpu: add backend documentation (#19354)
* ggml-virtgpu: add backend documentation

Assisted-by-AI: Claude Code

* CODEOWNERS: add /docs/backend/GGML-VirtGPU/ -> kpouget

* README: add the link to docs/backend/GGML-VirtGPU/ggml-virt.md

* docs/ggml-virt: add link to testing + configuration

* Revert "CODEOWNERS: add /docs/backend/GGML-VirtGPU/ -> kpouget"

This reverts commit 8ece8e72e2.

* drop the ggml- prefix

* s/ggerganov/ggml-org

* Relocate VirtGPU.md

* reorganize the text

* turn the ASCII diagram into a mermaid diagram

* README.md: update the link to the main doc
2026-02-09 20:15:42 +08:00
Hugo
1e8924fd65 cmake : add variable to skip installing tests (#19370)
When packaging downstream, there's usually little point in installing
tests. The default behaviour remains the same.
2026-02-09 07:12:02 +01:00
Piotr Wilkin (ilintar)
39bf692af1 [Model] Qwen3.5 dense and MoE support (no vision) (#19435)
* Unified delta net handling

* Remove old methods.

* Refactor and optimize

* Adapt autoregressive version from @ymcki

* Change to decay mask approach

* Fix bad permute

* Qwen 3.5 support

* Apply suggestions from code review

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Further fixes

* Use inheritance, remove unneeded conts

* Not like this!

* Remove ggml.h explicit import

* Remove transformers, fix the views

* ACTUALLY fix views, make super calls explicit in conversion.

* Fix conversion again

* Remove extra ggml.h imports

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2026-02-09 00:24:08 +01:00
Oliver Simons
e06088da0f CUDA: Fix non-contig rope (#19338)
* Rename variables + fix rope_neox

Seems the memory layout is shared with Vulkan, so we can port the fix from
https://github.com/ggml-org/llama.cpp/pull/19299

* Fix rope_multi

* Fix rope_vision

* Fix rope_norm

* Rename ne* to ne0* for consistent variable naming

* cont : consistent stride names

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2026-02-08 15:12:51 +02:00
Adrien Gallouët
5fa1c190d9 rpc : update from common.cpp (#19400)
Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2026-02-08 09:06:45 +01:00
Georgi Gerganov
eb449cdfa4 server : improve context checkpoint logic (#19408) 2026-02-08 09:40:04 +02:00
ddh0
5999b50eb0 llama-quantize : cleanup --help output (#19317)
* cleanup `llama-quantize --help` output

some much needed TLC

* remove future argument

oops, spoiler

* cleanup of cleanup
2026-02-08 09:22:38 +02:00
Sigbjørn Skjæret
9a5f57795c ci : remove server job from webui and move slow test (#19424)
* remove server job from webui and move slow test

* use pip-install option
2026-02-08 01:20:00 +01:00
Georgi Gerganov
96441c955e ci : use -j param correctly when building with sanitizers (#19411)
* ci : use less jobs when building with sanitizers

* cont : fix nproc

* cont : fix the fix

* cont : simplify
2026-02-07 23:50:47 +01:00
Georgi Gerganov
8872ad2125 metal : consolidate bin kernels (#19390)
* metal : refactor bin kernels

* cont

* cont : fix cv
2026-02-07 10:35:56 +02:00
19 changed files with 1142 additions and 595 deletions

View File

@@ -295,6 +295,7 @@ jobs:
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
- name: Build (no OpenMP)
@@ -307,6 +308,7 @@ jobs:
-DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DGGML_OPENMP=OFF
cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
- name: Test

View File

@@ -8,10 +8,6 @@ on:
description: 'Commit SHA1 to build'
required: false
type: string
slow_tests:
description: 'Run slow tests'
required: true
type: boolean
push:
branches:
- master
@@ -101,119 +97,3 @@ jobs:
if: ${{ always() && steps.playwright.conclusion == 'success' }}
run: npm run test:e2e
working-directory: tools/server/webui
server-build:
runs-on: ubuntu-latest
strategy:
matrix:
sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
build_type: [RelWithDebInfo]
include:
- build_type: Release
sanitizer: ""
fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
steps:
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get -y install \
build-essential \
xxd \
git \
cmake \
curl \
wget \
language-pack-en \
libssl-dev
- name: Clone
id: checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Python setup
id: setup_python
uses: actions/setup-python@v6
with:
python-version: '3.11'
- name: Tests dependencies
id: test_dependencies
run: |
pip install -r tools/server/tests/requirements.txt
- name: Setup Node.js for WebUI
uses: actions/setup-node@v6
with:
node-version: "22"
cache: "npm"
cache-dependency-path: "tools/server/webui/package-lock.json"
- name: Install WebUI dependencies
run: npm ci
working-directory: tools/server/webui
- name: Build WebUI
run: npm run build
working-directory: tools/server/webui
- name: Build (no OpenMP)
id: cmake_build_no_openmp
if: ${{ matrix.sanitizer == 'THREAD' }}
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DGGML_OPENMP=OFF ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Build (sanitizers)
id: cmake_build_sanitizers
if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }}
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Build (sanitizers)
id: cmake_build
if: ${{ matrix.sanitizer == '' }}
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Tests
id: server_integration_tests
if: ${{ matrix.sanitizer == '' }}
env:
GITHUB_ACTIONS: "true"
run: |
cd tools/server/tests
./tests.sh
- name: Tests (sanitizers)
id: server_integration_tests_sanitizers
if: ${{ matrix.sanitizer != '' }}
run: |
cd tools/server/tests
LLAMA_SANITIZE=1 ./tests.sh
- name: Slow tests
id: server_integration_tests_slow
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
run: |
cd tools/server/tests
SLOW_TESTS=1 ./tests.sh

View File

@@ -81,18 +81,14 @@ jobs:
-DLLAMA_SANITIZE_ADDRESS=${{ matrix.sanitizer == 'ADDRESS' }} \
-DLLAMA_SANITIZE_THREAD=${{ matrix.sanitizer == 'THREAD' }} \
-DLLAMA_SANITIZE_UNDEFINED=${{ matrix.sanitizer == 'UNDEFINED' }}
cmake --build build --config ${{ matrix.build_type }} -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Python setup
id: setup_python
uses: actions/setup-python@v6
with:
python-version: '3.11'
- name: Tests dependencies
id: test_dependencies
run: |
pip install -r tools/server/tests/requirements.txt
pip-install: -r tools/server/tests/requirements.txt
- name: Tests
id: server_integration_tests
@@ -102,6 +98,14 @@ jobs:
export ${{ matrix.extra_args }}
pytest -v -x -m "not slow"
- name: Slow tests
id: server_integration_tests_slow
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
run: |
cd tools/server/tests
export ${{ matrix.extra_args }}
SLOW_TESTS=1 pytest -v -x
server-windows:
runs-on: windows-2022
@@ -124,11 +128,7 @@ jobs:
uses: actions/setup-python@v6
with:
python-version: '3.11'
- name: Tests dependencies
id: test_dependencies
run: |
pip install -r tools/server/tests/requirements.txt
pip-install: -r tools/server/tests/requirements.txt
- name: Tests
id: server_integration_tests

View File

@@ -109,6 +109,7 @@ option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT})
option(LLAMA_TESTS_INSTALL "llama: install tests" ON)
# 3rd party libs
option(LLAMA_HTTPLIB "llama: httplib for downloading functionality" ON)
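For downstream packagers, using the new option might look like this (an illustrative invocation; the staging prefix is a placeholder):
```bash
# Configure with test installation disabled (LLAMA_TESTS_INSTALL defaults to ON)
cmake -B build -DLLAMA_TESTS_INSTALL=OFF -DCMAKE_BUILD_TYPE=Release
cmake --build build
# Tests still build and run locally; they are only skipped at install time
cmake --install build --prefix /tmp/llama-stage
```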

View File

@@ -288,6 +288,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
| [WebGPU [In Progress]](docs/build.md#webgpu) | All |
| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
| [Hexagon [In Progress]](docs/backend/hexagon/README.md) | Snapdragon |
| [VirtGPU](docs/backend/VirtGPU.md) | VirtGPU APIR |
## Obtaining and quantizing models

docs/backend/VirtGPU.md (new file, 180 lines)
View File

@@ -0,0 +1,180 @@
# GGML-VirtGPU Backend
The GGML-VirtGPU backend enables GGML applications to run machine
learning computations on host hardware while the application itself
runs inside a virtual machine. It uses host-guest shared memory to
efficiently share data buffers between the two sides.
This backend relies on virtio-gpu and the VirglRenderer API Remoting
(APIR) component. The backend is split into two libraries:
- a GGML implementation (the "remoting frontend"), running in the
guest and interacting with the virtgpu device
- a VirglRenderer APIR-compatible library (the "remoting backend"),
running on the host and interacting with VirglRenderer and an actual
GGML device backend.
## OS support
| OS | Status | Backend | CI testing | Notes |
| -------- | ----------------- | ----------- | ----------- | ----- |
| MacOS 14 | Supported | ggml-metal | X | Working when compiled on MacOS 14 |
| MacOS 15 | Supported | ggml-metal | X | Working when compiled on MacOS 14 or MacOS 15 |
| MacOS 26 | Not tested | | | |
| Linux | Under development | ggml-vulkan | not working | Working locally, CI running into deadlocks |
## Architecture Overview
The GGML-VirtGPU backend consists of three main components:
```mermaid
graph TD
%% Nodes
subgraph GuestVM ["Guest VM - Frontend"]
App([GGML Application<br/>llama.cpp, etc.])
direction TB
Interface[GGML Backend Interface]
Comm["GGML-VirtGPU<br/>(hypercalls + shared mem)"]
App --> Interface
Interface --> Comm
end
API[virtio-gpu / virglrenderer API]
subgraph HostSystem [Host System - Backend]
direction TB
Dispatcher[GGML-VirtGPU-Backend]
BackendLib[GGML Backend library<br/>Metal / Vulkan / CPU / ...]
Dispatcher --> BackendLib
end
%% Connections
Comm --> API
API --> HostSystem
```
### Key Components
1. **Guest-side Frontend** (`ggml-virtgpu/`): Implements the GGML backend interface and forwards operations to the host
2. **Host-side Backend** (`ggml-virtgpu/backend/`): Receives forwarded operations and executes them on actual hardware backends
3. **Communication Layer**: Uses virtio-gpu hypercalls and shared memory for efficient data transfer
## Features
- **Dynamic backend loading** on the host side (CPU, CUDA, Metal, etc.)
- **Zero-copy data transfer** via host-guest shared memory pages
## Communication Protocol
### Hypercalls and Shared Memory
The backend uses two primary communication mechanisms:
1. **Hypercalls (`DRM_IOCTL_VIRTGPU_EXECBUFFER`)**: Trigger remote execution from guest to host
2. **Shared Memory Pages**: Zero-copy data transfer for tensors and parameters
#### Shared Memory Layout
Each connection uses two fixed shared memory buffers:
- **Data Buffer** (24 MiB): For command/response data and tensor transfers
- **Reply Buffer** (16 KiB): For command replies and status information
In addition, **Data Buffers** are dynamically allocated host-guest
shared buffers, used directly as GGML buffers.
### APIR Protocol
The VirglRenderer API Remoting (APIR) protocol defines three command types:
- `HANDSHAKE`: Protocol version negotiation and capability discovery
- `LOADLIBRARY`: Dynamic loading of backend libraries on the host
- `FORWARD`: API function call forwarding
### Binary Serialization
Commands and data are serialized using a custom binary protocol with:
- Fixed-size encoding for basic types
- Variable-length arrays with size prefixes
- Buffer bounds checking
- Error recovery mechanisms
## Supported Operations
### Device Operations
- Device enumeration and capability queries
- Memory information (total/free)
- Backend type detection
### Buffer Operations
- Buffer allocation and deallocation
- Tensor data transfer (host ↔ guest)
- Memory copying and clearing
### Computation Operations
- Graph execution forwarding
## Build Requirements
### Guest-side Dependencies
- `libdrm` for DRM/virtio-gpu communication
- C++20 compatible compiler
- CMake 3.14+
### Host-side Dependencies
- virglrenderer with APIR support (pending upstream review)
- Target backend libraries (libggml-metal, libggml-vulkan, etc.)
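As a sketch, the guest-side dependencies can be installed on Fedora with the package names used by the container recipe in the development guide:
```bash
# Build prerequisites for the remoting frontend (Fedora)
sudo dnf install -y git cmake gcc gcc-c++ libcurl-devel libdrm-devel
```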
## Configuration
### Environment Variables
- `GGML_VIRTGPU_BACKEND_LIBRARY`: Path to the host-side backend library
- `GGML_VIRTGPU_DEBUG`: Enable debug logging
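For illustration (the library path below is a hypothetical location):
```bash
export GGML_VIRTGPU_BACKEND_LIBRARY="/opt/llama.cpp/lib/libggml-virtgpu-backend.dylib"
export GGML_VIRTGPU_DEBUG=1
```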
### Build Options
- `GGML_VIRTGPU`: Enable the VirtGPU backend (`ON` or `OFF`, default: `OFF`)
- `GGML_VIRTGPU_BACKEND`: Build the host-side backend component (`ON`, `OFF` or `ONLY`, default: `OFF`)
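A minimal sketch of the two build configurations described above:
```bash
# Guest side: build llama.cpp with the VirtGPU frontend
cmake -B build-guest -DGGML_VIRTGPU=ON
cmake --build build-guest

# Host side: build only the host-side backend component
cmake -B build-host -DGGML_VIRTGPU_BACKEND=ONLY
cmake --build build-host
```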
### System Requirements
- VM with virtio-gpu support
- VirglRenderer with APIR patches
- Compatible backend libraries on host
## Limitations
- **VM-specific**: Only works in virtual machines with virtio-gpu support
- **Host dependency**: Requires properly configured host-side backend
- **Latency**: Each forwarded operation incurs a small VM-exit overhead
- This work is pending upstream changes in the VirglRenderer
project.
- The backend can be tested with VirglRenderer compiled from source
using this PR:
https://gitlab.freedesktop.org/virgl/virglrenderer/-/merge_requests/1590
- This work is pending changes in the VMM/hypervisor running the
virtual machine, which needs to know how to route the newly
introduced APIR capset.
- The environment variable `VIRGL_ROUTE_VENUS_TO_APIR=1` allows
using the Venus capset until the relevant hypervisors have been
patched. However, setting this flag breaks the normal Vulkan/Venus
behavior.
- The environment variable `GGML_REMOTING_USE_APIR_CAPSET` tells the
`ggml-virtgpu` backend to use the APIR capset. This will become
the default once the relevant hypervisors have been patched.
- This work focused on improving the performance of llama.cpp running
in MacOS containers, and is mainly tested on this platform. The
Linux support (via `krun`) is in progress.
## See Also
- [Development and Testing](VirtGPU/development.md)
- [Backend configuration](VirtGPU/configuration.md)

View File

@@ -0,0 +1,174 @@
# GGML-VirtGPU Backend Configuration
This document describes the environment variables used by the ggml-virtgpu backend system, covering both the frontend (guest-side) and backend (host-side) components.
## Environment Variables Overview
The ggml-virtgpu backend uses environment variables for configuration across three main components:
- **Frontend (Guest)**: GGML applications running in VMs
- **Hypervisor**: Virglrenderer/APIR system
- **Backend (Host)**: Host-side GGML backend integration
## Frontend (Guest-side) Configuration
### GGML_REMOTING_USE_APIR_CAPSET
- **Location**: `ggml/src/ggml-virtgpu/virtgpu.cpp`
- **Type**: Boolean flag (presence-based)
- **Purpose**: Controls which virtio-gpu capability set to use for communication
- **Values**:
- Set (any value): Use the APIR capset (long-term setup)
- Unset: Use the Venus capset (easier for testing with an unmodified hypervisor)
- **Default**: Unset (Venus capset)
- **Usage**:
```bash
export GGML_REMOTING_USE_APIR_CAPSET=1 # Use APIR capset
# or leave unset for Venus capset
```
## Hypervisor (Virglrenderer/APIR) Configuration
These environment variables are used during the transition phase for
running with an unmodified hypervisor (not supporting the
VirglRenderer APIR component). They will be removed in the future, and
the hypervisor will instead configure VirglRenderer with the APIR
_Configuration Key_.
### VIRGL_APIR_BACKEND_LIBRARY
- **Location**: `virglrenderer/src/apir/apir-context.c`
- **Configuration Key**: `apir.load_library.path`
- **Type**: File path string
- **Purpose**: Path to the APIR backend library that virglrenderer should dynamically load
- **Required**: Yes
- **Example**:
```bash
export VIRGL_APIR_BACKEND_LIBRARY="/path/to/libggml-remotingbackend.so"
```
### VIRGL_ROUTE_VENUS_TO_APIR
- **Location**: `virglrenderer/src/apir/apir-renderer.h`
- **Type**: Boolean flag (presence-based)
- **Purpose**: Temporary workaround to route Venus capset calls to APIR during hypervisor transition period
- **Status**: Will be removed once hypervisors support APIR natively
- **Warning**: Breaks normal Vulkan/Venus functionality
- **Usage**:
```bash
export VIRGL_ROUTE_VENUS_TO_APIR=1 # For testing with an unmodified hypervisor
```
### VIRGL_APIR_LOG_TO_FILE
- **Location**: `virglrenderer/src/apir/apir-renderer.c`
- **Environment Variable**: `VIRGL_APIR_LOG_TO_FILE`
- **Type**: File path string
- **Purpose**: Enable debug logging from the VirglRenderer APIR component to the specified file
- **Required**: No (optional debugging)
- **Default**: Logging to `stderr`
- **Usage**:
```bash
export VIRGL_APIR_LOG_TO_FILE="/tmp/apir-debug.log"
```
## Backend (Host-side) Configuration
These environment variables are used during the transition phase for
running with an unmodified hypervisor (not supporting the
VirglRenderer APIR component). They will be removed in the future, and
the hypervisor will instead configure VirglRenderer with the APIR
_Configuration Key_.
### APIR_LLAMA_CPP_GGML_LIBRARY_PATH
- **Location**: `ggml/src/ggml-virtgpu/backend/backend.cpp`
- **Environment Variable**: `APIR_LLAMA_CPP_GGML_LIBRARY_PATH`
- **Configuration Key**: `ggml.library.path`
- **Type**: File path string
- **Purpose**: Path to the actual GGML backend library (Metal, CUDA, Vulkan, etc.)
- **Required**: **Yes** - backend initialization fails without this
- **Examples**:
```bash
# macOS with Metal backend
export APIR_LLAMA_CPP_GGML_LIBRARY_PATH="/opt/llama.cpp/lib/libggml-metal.dylib"
# Linux with CUDA backend
export APIR_LLAMA_CPP_GGML_LIBRARY_PATH="/opt/llama.cpp/lib/libggml-cuda.so"
# macOS or Linux with Vulkan backend
export APIR_LLAMA_CPP_GGML_LIBRARY_PATH="/opt/llama.cpp/lib/libggml-vulkan.so"
```
### APIR_LLAMA_CPP_GGML_LIBRARY_REG
- **Location**: `ggml/src/ggml-virtgpu/backend/backend.cpp`
- **Environment Variable**: `APIR_LLAMA_CPP_GGML_LIBRARY_REG`
- **Configuration Key**: `ggml.library.reg`
- **Type**: Function symbol name string
- **Purpose**: Name of the backend registration function to call after loading the library
- **Required**: No (defaults to `ggml_backend_init`)
- **Default**: `ggml_backend_init`
- **Examples**:
```bash
# Metal backend
export APIR_LLAMA_CPP_GGML_LIBRARY_REG="ggml_backend_metal_reg"
# CUDA backend
export APIR_LLAMA_CPP_GGML_LIBRARY_REG="ggml_backend_cuda_reg"
# Vulkan backend
export APIR_LLAMA_CPP_GGML_LIBRARY_REG="ggml_backend_vulkan_reg"
# Generic fallback (default)
# export APIR_LLAMA_CPP_GGML_LIBRARY_REG="ggml_backend_init"
```
### APIR_LLAMA_CPP_LOG_TO_FILE
- **Location**: `ggml/src/ggml-virtgpu/backend/backend.cpp:62`
- **Environment Variable**: `APIR_LLAMA_CPP_LOG_TO_FILE`
- **Type**: File path string
- **Purpose**: Enable debug logging from the GGML backend to the specified file
- **Required**: No (optional debugging)
- **Usage**:
```bash
export APIR_LLAMA_CPP_LOG_TO_FILE="/tmp/ggml-backend-debug.log"
```
## Configuration Flow
The configuration system works as follows:
1. **Hypervisor Setup**: Virglrenderer loads the APIR backend library specified by `VIRGL_APIR_BACKEND_LIBRARY`
2. **Context Creation**: When an APIR context is created, it populates a configuration table with environment variables:
- `apir.load_library.path` ← `VIRGL_APIR_BACKEND_LIBRARY`
- `ggml.library.path` ← `APIR_LLAMA_CPP_GGML_LIBRARY_PATH`
- `ggml.library.reg` ← `APIR_LLAMA_CPP_GGML_LIBRARY_REG`
- this step will eventually be performed by the hypervisor itself, with command-line arguments instead of environment variables.
3. **Backend Initialization**: The backend queries the configuration via callbacks:
- `virgl_cbs->get_config(ctx_id, "ggml.library.path")` returns the library path
- `virgl_cbs->get_config(ctx_id, "ggml.library.reg")` returns the registration function
4. **Library Loading**: The backend dynamically loads and initializes the specified GGML library
## Error Messages
Common error scenarios and their messages:
- **Missing library path**: `"cannot open the GGML library: env var 'APIR_LLAMA_CPP_GGML_LIBRARY_PATH' not defined"`
- **Missing registration function**: `"cannot register the GGML library: env var 'APIR_LLAMA_CPP_GGML_LIBRARY_REG' not defined"`
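A pre-flight check along these lines can catch both cases before launching the hypervisor (illustrative shell snippet, not part of the backend):
```bash
# Warn about missing required configuration (variable names from the sections above)
for v in VIRGL_APIR_BACKEND_LIBRARY APIR_LLAMA_CPP_GGML_LIBRARY_PATH; do
    if [ -z "${!v:-}" ]; then
        echo "warning: $v is not set" >&2
    fi
done
```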
## Example Complete Configuration
Here's an example configuration for a macOS host with Metal backend:
```bash
# Hypervisor environment
export VIRGL_APIR_BACKEND_LIBRARY="/opt/llama.cpp/lib/libggml-virtgpu-backend.dylib"
# Backend configuration
export APIR_LLAMA_CPP_GGML_LIBRARY_PATH="/opt/llama.cpp/lib/libggml-metal.dylib"
export APIR_LLAMA_CPP_GGML_LIBRARY_REG="ggml_backend_metal_reg"
# Optional logging
export VIRGL_APIR_LOG_TO_FILE="/tmp/apir.log"
export APIR_LLAMA_CPP_LOG_TO_FILE="/tmp/ggml.log"
# Guest configuration
export GGML_REMOTING_USE_APIR_CAPSET=1
```

View File

@@ -0,0 +1,220 @@
# Development and Testing
## Development
### Code Generation
The backend uses code generation from YAML configuration:
```bash
# Regenerate protocol code
cd ggml-virtgpu/
python regenerate_remoting.py
```
### Adding New Operations
1. Add function definition to `ggmlremoting_functions.yaml`
2. Regenerate code with `regenerate_remoting.py`
3. Implement guest-side forwarding in `virtgpu-forward-*.cpp`
4. Implement host-side handling in `backend-dispatched-*.cpp` (a shell sketch of the full workflow follows)
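As a shell sketch of that workflow; the two `.cpp` file names below are hypothetical placeholders, the real files depend on which API group the operation belongs to:
```bash
cd ggml-virtgpu/
$EDITOR ggmlremoting_functions.yaml            # 1. declare the new function
python regenerate_remoting.py                  # 2. regenerate the protocol code
$EDITOR virtgpu-forward-device.cpp             # 3. guest-side forwarding (placeholder name)
$EDITOR backend/backend-dispatched-device.cpp  # 4. host-side handling (placeholder name)
```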
## Testing
This section provides instructions for building and testing the GGML-VirtGPU backend on macOS with containers.
### Prerequisites
The testing setup requires:
- macOS host system
- Container runtime with `libkrun` provider (podman machine)
- Access to development patchset for VirglRenderer
### Required Patchsets
The backend requires patches that are currently under review:
- **Virglrenderer APIR upstream PR**: https://gitlab.freedesktop.org/virgl/virglrenderer/-/merge_requests/1590 (for reference)
- **MacOS Virglrenderer (for krunkit)**: https://gitlab.freedesktop.org/kpouget/virglrenderer/-/tree/main-macos
- **Linux Virglrenderer (for krun)**: https://gitlab.freedesktop.org/kpouget/virglrenderer/-/tree/main-linux
### Build Instructions
#### 1. Build ggml-virtgpu-backend (Host-side, macOS)
```bash
# Build the backend that runs natively on macOS
mkdir llama.cpp
cd llama.cpp
git clone https://github.com/ggml-org/llama.cpp.git src
cd src
LLAMA_MAC_BUILD=$PWD/build/ggml-virtgpu-backend
cmake -S . -B $LLAMA_MAC_BUILD \
-DGGML_NATIVE=OFF \
-DLLAMA_CURL=ON \
-DGGML_REMOTINGBACKEND=ONLY \
-DGGML_METAL=ON
TARGETS="ggml-metal"
cmake --build $LLAMA_MAC_BUILD --parallel 8 --target $TARGETS
# Build additional tools for native benchmarking
EXTRA_TARGETS="llama-run llama-bench"
cmake --build $LLAMA_MAC_BUILD --parallel 8 --target $EXTRA_TARGETS
```
#### 2. Build virglrenderer (Host-side, macOS)
```bash
# Build virglrenderer with APIR support
mkdir virglrenderer
cd virglrenderer
git clone https://gitlab.freedesktop.org/kpouget/virglrenderer -b main-macos src
cd src
VIRGL_BUILD_DIR=$PWD/build
# -Dvenus=true and VIRGL_ROUTE_VENUS_TO_APIR=1 route the APIR requests via the Venus backend, for easier testing without a patched hypervisor
meson setup $VIRGL_BUILD_DIR \
-Dvenus=true \
-Dapir=true
ninja -C $VIRGL_BUILD_DIR
```
#### 3. Build ggml-virtgpu (Guest-side, Linux)
Option A: Build directly in the guest:
```bash
# Inside a Linux container
mkdir llama.cpp
cd llama.cpp
git clone https://github.com/ggml-org/llama.cpp.git src
cd src
LLAMA_LINUX_BUILD=$PWD/build-virtgpu
cmake -S . -B $LLAMA_LINUX_BUILD \
-DGGML_VIRTGPU=ON
cmake --build $LLAMA_LINUX_BUILD
```
Option B: Build container image with frontend:
```bash
cat << EOF > remoting.containerfile
FROM quay.io/fedora/fedora:43
USER 0
WORKDIR /app/remoting
ARG LLAMA_CPP_REPO="https://github.com/ggml-org/llama.cpp.git"
ARG LLAMA_CPP_VERSION="master"
ARG LLAMA_CPP_CMAKE_FLAGS="-DGGML_VIRTGPU=ON"
ARG LLAMA_CPP_CMAKE_BUILD_FLAGS="--parallel 4"
RUN dnf install -y git cmake gcc gcc-c++ libcurl-devel libdrm-devel
RUN git clone "\${LLAMA_CPP_REPO}" src \\
&& git -C src fetch origin \${LLAMA_CPP_VERSION} \\
&& git -C src reset --hard FETCH_HEAD
RUN mkdir -p build \\
&& cd src \\
&& set -o pipefail \\
&& cmake -S . -B ../build \${LLAMA_CPP_CMAKE_FLAGS} \\
&& cmake --build ../build/ \${LLAMA_CPP_CMAKE_BUILD_FLAGS}
ENTRYPOINT ["/app/remoting/build/bin/llama-server"]
EOF
mkdir -p empty_dir
podman build -f remoting.containerfile ./empty_dir -t localhost/llama-cpp.virtgpu
```
### Environment Setup
#### Set krunkit Environment Variables
```bash
# Define the base directories (adapt these paths to your system)
VIRGL_BUILD_DIR=$HOME/remoting/virglrenderer/build
LLAMA_MAC_BUILD=$HOME/remoting/llama.cpp/build-backend
# For krunkit to load the custom virglrenderer library
export DYLD_LIBRARY_PATH=$VIRGL_BUILD_DIR/src
# For Virglrenderer to load the ggml-remotingbackend library
export VIRGL_APIR_BACKEND_LIBRARY="$LLAMA_MAC_BUILD/bin/libggml-virtgpu-backend.dylib"
# For llama.cpp remotingbackend to load the ggml-metal backend
export APIR_LLAMA_CPP_GGML_LIBRARY_PATH="$LLAMA_MAC_BUILD/bin/libggml-metal.dylib"
export APIR_LLAMA_CPP_GGML_LIBRARY_REG=ggml_backend_metal_reg
```
#### Launch Container Environment
```bash
# Set container provider to libkrun
export CONTAINERS_MACHINE_PROVIDER=libkrun
podman machine start
```
#### Verify Environment
Confirm that krunkit is using the correct virglrenderer library:
```bash
lsof -c krunkit | grep virglrenderer
# Expected output:
# krunkit 50574 user txt REG 1,14 2273912 10849442 ($VIRGL_BUILD_DIR/src)/libvirglrenderer.1.dylib
```
### Running Tests
#### Launch Test Container
```bash
# Optional model caching
mkdir -p models
PODMAN_CACHE_ARGS="-v models:/models --user root:root --cgroupns host --security-opt label=disable -w /models"
podman run $PODMAN_CACHE_ARGS -it --rm --device /dev/dri localhost/llama-cpp.virtgpu
```
#### Test llama.cpp in Container
```bash
# Run performance benchmark
/app/remoting/build/bin/llama-bench -m ./llama3.2
```
Expected output (performance may vary):
```
| model | size | params | backend | ngl | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | ------------: | -------------------: |
| llama 3B Q4_K - Medium | 1.87 GiB | 3.21 B | ggml-virtgpu | 99 | pp512 | 991.30 ± 0.66 |
| llama 3B Q4_K - Medium | 1.87 GiB | 3.21 B | ggml-virtgpu | 99 | tg128 | 85.71 ± 0.11 |
```
### Troubleshooting
#### SSH Environment Variable Issues
⚠️ **Warning**: Setting `DYLD_LIBRARY_PATH` from SSH doesn't work on macOS. Here is a workaround:
**Workaround 1: Replace system library**
```bash
VIRGL_BUILD_DIR=$HOME/remoting/virglrenderer/build # ⚠️ adapt to your system
BREW_VIRGL_DIR=/opt/homebrew/Cellar/virglrenderer/0.10.4d/lib
VIRGL_LIB=libvirglrenderer.1.dylib
cd $BREW_VIRGL_DIR
mv $VIRGL_LIB ${VIRGL_LIB}.orig
ln -s $VIRGL_BUILD_DIR/src/$VIRGL_LIB
```

View File

@@ -43,10 +43,15 @@ static __device__ void rope_yarn(
template <bool forward, bool has_ff, typename T, typename D>
static __global__ void rope_norm(const T * x,
D * dst,
const int ne0,
const int ne1,
const int ne00,
const int ne01,
const int ne02,
const int s01,
const int s02,
const int s03,
const int s1,
const int s2,
const int s3,
const int n_dims,
const int32_t * pos,
const float freq_scale,
@@ -59,23 +64,23 @@ static __global__ void rope_norm(const T * x,
const int set_rows_stride) {
const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
if (i0 >= ne0) {
if (i0 >= ne00) {
return;
}
const int row_dst = blockDim.x*blockIdx.x + threadIdx.x;
const int row_x = row_dst % ne1;
const int channel_x = row_dst / ne1;
int idst = row_dst * ne0 + i0;
const int ix = channel_x*s2 + row_x*s1 + i0;
const uint32_t i3 = row_dst / (ne01 * ne02);
const uint32_t i2 = (row_dst - i3 * ne01 * ne02) / ne01;
const uint32_t i1 = row_dst - i3 * ne01 * ne02 - i2 * ne01;
int idst = i0 + i1 * s1 + i2 * s2 + i3 * s3;
const int ix = i0 + i1 * s01 + i2 * s02 + i3 * s03;
// Fusion optimization: ROPE + VIEW + SET_ROWS.
// The rope output is viewed as a 1D tensor and offset based on a row index in row_indices.
if (set_rows_stride != 0) {
idst = row_x * ne0 + i0;
idst += row_indices[channel_x] * set_rows_stride;
idst = i1 * s1 + i0;
idst += row_indices[i2] * set_rows_stride;
}
const auto & store_coaelsced = [&](float x0, float x1) {
@@ -92,7 +97,7 @@ static __global__ void rope_norm(const T * x,
return;
}
const float theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f);
const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
@@ -110,10 +115,15 @@ static __global__ void rope_norm(const T * x,
template <bool forward, bool has_ff, typename T, typename D>
static __global__ void rope_neox(const T * x,
D * dst,
const int ne0,
const int ne1,
const int ne00,
const int ne01,
const int ne02,
const int s01,
const int s02,
const int s03,
const int s1,
const int s2,
const int s3,
const int n_dims,
const int32_t * pos,
const float freq_scale,
@@ -126,23 +136,24 @@ static __global__ void rope_neox(const T * x,
const int set_rows_stride) {
const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
if (i0 >= ne0) {
if (i0 >= ne00) {
return;
}
const int row_dst = blockDim.x*blockIdx.x + threadIdx.x;
const int row_x = row_dst % ne1;
const int channel_x = row_dst / ne1;
const uint32_t i3 = row_dst / (ne01 * ne02);
const uint32_t i2 = (row_dst - i3 * ne01 * ne02) / ne01;
const uint32_t i1 = row_dst - i3 * ne01 * ne02 - i2 * ne01;
int idst = row_dst * ne0 + i0 / 2;
const int ix = channel_x*s2 + row_x*s1 + i0/2;
int idst = i0 / 2 + i1 * s1 + i2 * s2 + i3 * s3;
const int ix = i0 / 2 + i1 * s01 + i2 * s02 + i3 * s03;
// Fusion optimization: ROPE + VIEW + SET_ROWS.
// The rope output is viewed as a 1D tensor and offset based on a row index in row_indices.
if (set_rows_stride != 0) {
idst = row_x * ne0 + i0 / 2;
idst += row_indices[channel_x] * set_rows_stride;
idst = i1 * s1 + i0 / 2;
idst += row_indices[i2] * set_rows_stride;
}
if (i0 >= n_dims) {
@@ -152,7 +163,7 @@ static __global__ void rope_neox(const T * x,
return;
}
const float theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f);
const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
@@ -168,24 +179,42 @@ static __global__ void rope_neox(const T * x,
dst[idst + n_dims / 2] = ggml_cuda_cast<D>(x0 * sin_theta + x1 * cos_theta);
}
template<bool forward, bool has_ff, typename T>
static __global__ void rope_multi(
const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2,
const int n_dims, const int32_t * pos, const float freq_scale, const float ext_factor, const float attn_factor,
const rope_corr_dims corr_dims, const float theta_scale, const float * freq_factors, const mrope_sections sections, const bool is_imrope) {
const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
template <bool forward, bool has_ff, typename T>
static __global__ void rope_multi(const T * x,
T * dst,
const int ne00,
const int ne01,
const int ne02,
const int s01,
const int s02,
const int s03,
const int s1,
const int s2,
const int s3,
const int n_dims,
const int32_t * pos,
const float freq_scale,
const float ext_factor,
const float attn_factor,
const rope_corr_dims corr_dims,
const float theta_scale,
const float * freq_factors,
const mrope_sections sections,
const bool is_imrope) {
const int i0 = 2 * (blockDim.y * blockIdx.y + threadIdx.y);
if (i0 >= ne0) {
if (i0 >= ne00) {
return;
}
const int row_dst = blockDim.x*blockIdx.x + threadIdx.x;
const int row_x = row_dst % ne1;
const int channel_x = row_dst / ne1;
const uint32_t i3 = row_dst / (ne01 * ne02);
const uint32_t i2 = (row_dst - i3 * ne01 * ne02) / ne01;
const uint32_t i1 = row_dst - i3 * ne01 * ne02 - i2 * ne01;
const int idst = row_dst*ne0 + i0/2;
const int ix = channel_x*s2 + row_x*s1 + i0/2;
int idst = i0 / 2 + i1 * s1 + i2 * s2 + i3 * s3;
const int ix = i0 / 2 + i1 * s01 + i2 * s02 + i3 * s03;
if (i0 >= n_dims) {
dst[idst + i0/2 + 0] = x[ix + i0/2 + 0];
@@ -200,27 +229,24 @@ static __global__ void rope_multi(
float theta_base = 0.0;
if (is_imrope) {
if (sector % 3 == 1 && sector < 3 * sections.v[1]) { // h
theta_base = pos[channel_x + ne2 * 1]*powf(theta_scale, i0/2.0f);
} else if (sector % 3 == 2 && sector < 3 * sections.v[2]) { // w
theta_base = pos[channel_x + ne2 * 2]*powf(theta_scale, i0/2.0f);
} else if (sector % 3 == 0 && sector < 3 * sections.v[0]) { // t
theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
if (sector % 3 == 1 && sector < 3 * sections.v[1]) { // h
theta_base = pos[i2 + ne02 * 1] * powf(theta_scale, i0 / 2.0f);
} else if (sector % 3 == 2 && sector < 3 * sections.v[2]) { // w
theta_base = pos[i2 + ne02 * 2] * powf(theta_scale, i0 / 2.0f);
} else if (sector % 3 == 0 && sector < 3 * sections.v[0]) { // t
theta_base = pos[i2] * powf(theta_scale, i0 / 2.0f);
} else {
theta_base = pos[channel_x + ne2 * 3]*powf(theta_scale, i0/2.0f);
theta_base = pos[i2 + ne02 * 3] * powf(theta_scale, i0 / 2.0f);
}
} else {
if (sector < sections.v[0]) {
theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
}
else if (sector >= sections.v[0] && sector < sec_w) {
theta_base = pos[channel_x + ne2 * 1]*powf(theta_scale, i0/2.0f);
}
else if (sector >= sec_w && sector < sec_w + sections.v[2]) {
theta_base = pos[channel_x + ne2 * 2]*powf(theta_scale, i0/2.0f);
}
else if (sector >= sec_w + sections.v[2]) {
theta_base = pos[channel_x + ne2 * 3]*powf(theta_scale, i0/2.0f);
theta_base = pos[i2] * powf(theta_scale, i0 / 2.0f);
} else if (sector >= sections.v[0] && sector < sec_w) {
theta_base = pos[i2 + ne02 * 1] * powf(theta_scale, i0 / 2.0f);
} else if (sector >= sec_w && sector < sec_w + sections.v[2]) {
theta_base = pos[i2 + ne02 * 2] * powf(theta_scale, i0 / 2.0f);
} else if (sector >= sec_w + sections.v[2]) {
theta_base = pos[i2 + ne02 * 3] * powf(theta_scale, i0 / 2.0f);
}
}
@@ -238,37 +264,53 @@ static __global__ void rope_multi(
dst[idst + n_dims/2] = x0*sin_theta + x1*cos_theta;
}
template<bool forward, bool has_ff, typename T>
static __global__ void rope_vision(
const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims,
const int32_t * pos, const float freq_scale, const float ext_factor, const float attn_factor, const rope_corr_dims corr_dims,
const float theta_scale, const float * freq_factors, const mrope_sections sections) {
template <bool forward, bool has_ff, typename T>
static __global__ void rope_vision(const T * x,
T * dst,
const int ne00,
const int ne01,
const int ne02,
const int s01,
const int s02,
const int s03,
const int s1,
const int s2,
const int s3,
const int n_dims,
const int32_t * pos,
const float freq_scale,
const float ext_factor,
const float attn_factor,
const rope_corr_dims corr_dims,
const float theta_scale,
const float * freq_factors,
const mrope_sections sections) {
const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
if (i0 >= ne0) {
if (i0 >= ne00) {
return;
}
const int row_dst = blockDim.x*blockIdx.x + threadIdx.x;
const int row_x = row_dst % ne1;
const int channel_x = row_dst / ne1;
const uint32_t i3 = row_dst / (ne01 * ne02);
const uint32_t i2 = (row_dst - i3 * ne01 * ne02) / ne01;
const uint32_t i1 = row_dst - i3 * ne01 * ne02 - i2 * ne01;
const int idst = row_dst*ne0 + i0/2;
const int ix = channel_x*s2 + row_x*s1 + i0/2;
int idst = i0 / 2 + i1 * s1 + i2 * s2 + i3 * s3;
const int ix = i0 / 2 + i1 * s01 + i2 * s02 + i3 * s03;
const int sect_dims = sections.v[0] + sections.v[1];
const int sec_w = sections.v[1] + sections.v[0];
const int sector = (i0 / 2) % sect_dims;
const int sec_w = sections.v[1] + sections.v[0];
const int sector = (i0 / 2) % sect_dims;
float theta_base = 0.0;
if (sector < sections.v[0]) {
const int p = sector;
theta_base = pos[channel_x]*powf(theta_scale, p);
}
else if (sector >= sections.v[0] && sector < sec_w) {
theta_base = pos[i2] * powf(theta_scale, p);
} else if (sector >= sections.v[0] && sector < sec_w) {
const int p = sector - sections.v[0];
theta_base = pos[channel_x + ne2]*powf(theta_scale, p);
theta_base = pos[i2 + ne02] * powf(theta_scale, p);
}
const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
@@ -288,10 +330,15 @@ static __global__ void rope_vision(
template <bool forward, typename T, typename D>
static void rope_norm_cuda(const T * x,
D * dst,
const int ne0,
const int ne1,
const int ne00,
const int ne01,
const int ne02,
const int s01,
const int s02,
const int s03,
const int s1,
const int s2,
const int s3,
const int n_dims,
const int nr,
const int32_t * pos,
@@ -304,31 +351,36 @@ static void rope_norm_cuda(const T * x,
const int64_t * row_indices,
const int set_rows_stride,
cudaStream_t stream) {
GGML_ASSERT(ne0 % 2 == 0);
GGML_ASSERT(ne00 % 2 == 0);
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
const int n_blocks_x = (ne00 + 2 * CUDA_ROPE_BLOCK_SIZE - 1) / (2 * CUDA_ROPE_BLOCK_SIZE);
const dim3 block_nums(nr, n_blocks_x, 1);
const float theta_scale = powf(freq_base, -2.0f/n_dims);
const float theta_scale = powf(freq_base, -2.0f / n_dims);
if (freq_factors == nullptr) {
rope_norm<forward, false><<<block_nums, block_dims, 0, stream>>>(
x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale,
freq_factors, row_indices, set_rows_stride);
x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor,
attn_factor, corr_dims, theta_scale, freq_factors, row_indices, set_rows_stride);
} else {
rope_norm<forward, true><<<block_nums, block_dims, 0, stream>>>(
x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale,
freq_factors, row_indices, set_rows_stride);
x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor,
attn_factor, corr_dims, theta_scale, freq_factors, row_indices, set_rows_stride);
}
}
template <bool forward, typename T, typename D>
static void rope_neox_cuda(const T * x,
D * dst,
const int ne0,
const int ne1,
const int ne00,
const int ne01,
const int ne02,
const int s01,
const int s02,
const int s03,
const int s1,
const int s2,
const int s3,
const int n_dims,
const int nr,
const int32_t * pos,
@@ -341,55 +393,92 @@ static void rope_neox_cuda(const T * x,
const int64_t * row_indices,
const int set_rows_stride,
cudaStream_t stream) {
GGML_ASSERT(ne0 % 2 == 0);
GGML_ASSERT(ne00 % 2 == 0);
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
const int n_blocks_x = (ne00 + 2 * CUDA_ROPE_BLOCK_SIZE - 1) / (2 * CUDA_ROPE_BLOCK_SIZE);
const dim3 block_nums(nr, n_blocks_x, 1);
const float theta_scale = powf(freq_base, -2.0f/n_dims);
const float theta_scale = powf(freq_base, -2.0f / n_dims);
if (freq_factors == nullptr) {
rope_neox<forward, false><<<block_nums, block_dims, 0, stream>>>(
x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale,
freq_factors, row_indices, set_rows_stride);
x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor,
attn_factor, corr_dims, theta_scale, freq_factors, row_indices, set_rows_stride);
} else {
rope_neox<forward, true><<<block_nums, block_dims, 0, stream>>>(
x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale,
freq_factors, row_indices, set_rows_stride);
x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor,
attn_factor, corr_dims, theta_scale, freq_factors, row_indices, set_rows_stride);
}
}
template<bool forward, typename T>
static void rope_multi_cuda(
const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims, const int nr,
const int32_t * pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor,
const rope_corr_dims corr_dims, const float * freq_factors, const mrope_sections sections, const bool is_imrope, cudaStream_t stream) {
GGML_ASSERT(ne0 % 2 == 0);
template <bool forward, typename T>
static void rope_multi_cuda(const T * x,
T * dst,
const int ne00,
const int ne01,
const int ne02,
const int s01,
const int s02,
const int s03,
const int s1,
const int s2,
const int s3,
const int n_dims,
const int nr,
const int32_t * pos,
const float freq_scale,
const float freq_base,
const float ext_factor,
const float attn_factor,
const rope_corr_dims corr_dims,
const float * freq_factors,
const mrope_sections sections,
const bool is_imrope,
cudaStream_t stream) {
GGML_ASSERT(ne00 % 2 == 0);
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
const int n_blocks_x = (ne00 + 2 * CUDA_ROPE_BLOCK_SIZE - 1) / (2 * CUDA_ROPE_BLOCK_SIZE);
const dim3 block_nums(nr, n_blocks_x, 1);
const float theta_scale = powf(freq_base, -2.0f/n_dims);
const float theta_scale = powf(freq_base, -2.0f / n_dims);
if (freq_factors == nullptr) {
rope_multi<forward, false, T><<<block_nums, block_dims, 0, stream>>>(
x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor,
x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor,
attn_factor, corr_dims, theta_scale, freq_factors, sections, is_imrope);
} else {
rope_multi<forward, true, T><<<block_nums, block_dims, 0, stream>>>(
x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor,
x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor,
attn_factor, corr_dims, theta_scale, freq_factors, sections, is_imrope);
}
}
template<bool forward, typename T>
static void rope_vision_cuda(
const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims, const int nr,
const int32_t * pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor,
const rope_corr_dims corr_dims, const float * freq_factors, const mrope_sections sections, cudaStream_t stream) {
GGML_ASSERT(ne0 % 2 == 0);
template <bool forward, typename T>
static void rope_vision_cuda(const T * x,
T * dst,
const int ne00,
const int ne01,
const int ne02,
const int s01,
const int s02,
const int s03,
const int s1,
const int s2,
const int s3,
const int n_dims,
const int nr,
const int32_t * pos,
const float freq_scale,
const float freq_base,
const float ext_factor,
const float attn_factor,
const rope_corr_dims corr_dims,
const float * freq_factors,
const mrope_sections sections,
cudaStream_t stream) {
GGML_ASSERT(ne00 % 2 == 0);
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
const int n_blocks_x = (ne00 + 2 * CUDA_ROPE_BLOCK_SIZE - 1) / (2 * CUDA_ROPE_BLOCK_SIZE);
const dim3 block_nums(nr, n_blocks_x, 1);
// break down (head_dim, heads, seq) into (CUDA_ROPE_BLOCK_SIZE, x, heads * seq)
// where x ~= ceil(head_dim / CUDA_ROPE_BLOCK_SIZE);
@@ -398,11 +487,11 @@ static void rope_vision_cuda(
if (freq_factors == nullptr) {
rope_vision<forward, false, T><<<block_nums, block_dims, 0, stream>>>(
x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor,
x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor,
attn_factor, corr_dims, theta_scale, freq_factors, sections);
} else {
rope_vision<forward, true, T><<<block_nums, block_dims, 0, stream>>>(
x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor,
x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor,
attn_factor, corr_dims, theta_scale, freq_factors, sections);
}
}
@@ -445,6 +534,11 @@ void ggml_cuda_op_rope_impl(ggml_backend_cuda_context & ctx,
const size_t s01 = src0->nb[1] / ggml_type_size(src0->type);
const size_t s02 = src0->nb[2] / ggml_type_size(src0->type);
const size_t s03 = src0->nb[3] / ggml_type_size(src0->type);
const size_t s1 = dst->nb[1] / ggml_type_size(dst->type);
const size_t s2 = dst->nb[2] / ggml_type_size(dst->type);
const size_t s3 = dst->nb[3] / ggml_type_size(dst->type);
//const int n_past = ((int32_t *) dst->op_params)[0];
const int n_dims = ((int32_t *) dst->op_params)[1];
@@ -495,57 +589,63 @@ void ggml_cuda_op_rope_impl(ggml_backend_cuda_context & ctx,
// compute
if (is_neox) {
if (src0->type == GGML_TYPE_F32 && dst_type == GGML_TYPE_F32) {
rope_neox_cuda<forward, float, float>((const float *) src0_d, (float *) dst_d, ne00, ne01, s01, s02, n_dims,
nr, pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims,
freq_factors, row_indices, set_rows_stride, stream);
rope_neox_cuda<forward, float, float>((const float *) src0_d, (float *) dst_d, ne00, ne01, ne02, s01, s02,
s03, s1, s2, s3, n_dims, nr, pos, freq_scale, freq_base,
ext_factor, attn_factor, corr_dims, freq_factors, row_indices,
set_rows_stride, stream);
} else if (src0->type == GGML_TYPE_F32 && dst_type == GGML_TYPE_F16) {
rope_neox_cuda<forward, float, half>((const float *) src0_d, (half *) dst_d, ne00, ne01, s01, s02, n_dims,
nr, pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims,
freq_factors, row_indices, set_rows_stride, stream);
rope_neox_cuda<forward, float, half>((const float *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02,
s03, s1, s2, s3, n_dims, nr, pos, freq_scale, freq_base,
ext_factor, attn_factor, corr_dims, freq_factors, row_indices,
set_rows_stride, stream);
} else if (src0->type == GGML_TYPE_F16 && dst_type == GGML_TYPE_F16) {
rope_neox_cuda<forward, half, half>((const half *) src0_d, (half *) dst_d, ne00, ne01, s01, s02, n_dims, nr,
pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims,
freq_factors, row_indices, set_rows_stride, stream);
rope_neox_cuda<forward, half, half>((const half *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02,
s03, s1, s2, s3, n_dims, nr, pos, freq_scale, freq_base,
ext_factor, attn_factor, corr_dims, freq_factors, row_indices,
set_rows_stride, stream);
} else {
GGML_ABORT("fatal error");
}
} else if (is_mrope && !is_vision) {
if (src0->type == GGML_TYPE_F32) {
rope_multi_cuda<forward>(
(const float *) src0_d, (float *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale,
freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, is_imrope, stream);
rope_multi_cuda<forward>((const float *) src0_d, (float *) dst_d, ne00, ne01, ne02, s01, s02, s03, s1,
s2, s3, n_dims, nr, pos, freq_scale, freq_base, ext_factor, attn_factor,
corr_dims, freq_factors, sections, is_imrope, stream);
} else if (src0->type == GGML_TYPE_F16) {
rope_multi_cuda<forward>(
(const half *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale,
freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, is_imrope, stream);
rope_multi_cuda<forward>((const half *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02, s03, s1,
s2, s3, n_dims, nr, pos, freq_scale, freq_base, ext_factor, attn_factor,
corr_dims, freq_factors, sections, is_imrope, stream);
} else {
GGML_ABORT("fatal error");
}
} else if (is_vision) {
if (src0->type == GGML_TYPE_F32) {
rope_vision_cuda<forward>(
(const float *) src0_d, (float *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale,
freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
rope_vision_cuda<forward>((const float *) src0_d, (float *) dst_d, ne00, ne01, ne02, s01, s02, s03, s1,
s2, s3, n_dims, nr, pos, freq_scale, freq_base, ext_factor, attn_factor,
corr_dims, freq_factors, sections, stream);
} else if (src0->type == GGML_TYPE_F16) {
rope_vision_cuda<forward>(
(const half *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale,
freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
rope_vision_cuda<forward>((const half *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02, s03, s1,
s2, s3, n_dims, nr, pos, freq_scale, freq_base, ext_factor, attn_factor,
corr_dims, freq_factors, sections, stream);
} else {
GGML_ABORT("fatal error");
}
} else {
if (src0->type == GGML_TYPE_F32 && dst_type == GGML_TYPE_F32) {
rope_norm_cuda<forward, float, float>((const float *) src0_d, (float *) dst_d, ne00, ne01, s01, s02, n_dims,
nr, pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims,
freq_factors, row_indices, set_rows_stride, stream);
rope_norm_cuda<forward, float, float>((const float *) src0_d, (float *) dst_d, ne00, ne01, ne02, s01, s02,
s03, s1, s2, s3, n_dims, nr, pos, freq_scale, freq_base,
ext_factor, attn_factor, corr_dims, freq_factors, row_indices,
set_rows_stride, stream);
} else if (src0->type == GGML_TYPE_F32 && dst_type == GGML_TYPE_F16) {
rope_norm_cuda<forward, float, half>((const float *) src0_d, (half *) dst_d, ne00, ne01, s01, s02, n_dims,
nr, pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims,
freq_factors, row_indices, set_rows_stride, stream);
rope_norm_cuda<forward, float, half>((const float *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02,
s03, s1, s2, s3, n_dims, nr, pos, freq_scale, freq_base,
ext_factor, attn_factor, corr_dims, freq_factors, row_indices,
set_rows_stride, stream);
} else if (src0->type == GGML_TYPE_F16 && dst_type == GGML_TYPE_F16) {
rope_norm_cuda<forward, half, half>((const half *) src0_d, (half *) dst_d, ne00, ne01, s01, s02, n_dims, nr,
pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims,
freq_factors, row_indices, set_rows_stride, stream);
rope_norm_cuda<forward, half, half>((const half *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02,
s03, s1, s2, s3, n_dims, nr, pos, freq_scale, freq_base,
ext_factor, attn_factor, corr_dims, freq_factors, row_indices,
set_rows_stride, stream);
} else {
GGML_ABORT("fatal error");
}

View File

@@ -1392,34 +1392,78 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_flash_attn_ext_v
GGML_UNUSED(op);
}
ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_bin(
ggml_metal_library_t lib,
ggml_op op,
int32_t n_fuse,
bool row) {
ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_bin(ggml_metal_library_t lib, const ggml_tensor * op, int32_t n_fuse) {
char base[256];
char name[256];
const char * op_str = "undefined";
switch (op) {
case GGML_OP_ADD: op_str = "add"; break;
case GGML_OP_SUB: op_str = "sub"; break;
case GGML_OP_MUL: op_str = "mul"; break;
case GGML_OP_DIV: op_str = "div"; break;
int op_num = -1;
switch (op->op) {
case GGML_OP_ADD: op_num = 0; break;
case GGML_OP_SUB: op_num = 1; break;
case GGML_OP_MUL: op_num = 2; break;
case GGML_OP_DIV: op_num = 3; break;
default: GGML_ABORT("fatal error");
};
if (row) {
snprintf(base, 256, "kernel_%s_row_c4_fuse_%d", op_str, n_fuse);
} else {
snprintf(base, 256, "kernel_%s_fuse_%d", op_str, n_fuse);
}
const char * t0_str = ggml_type_name(op->src[0]->type);
const char * t1_str = ggml_type_name(op->src[1]->type);
const char * t_str = ggml_type_name(op->type);
snprintf(name, 256, "%s", base);
const bool is_c4 = (op->src[0]->ne[0] % 4 == 0) && (op->src[1]->ne[0] % 4 == 0);
const bool is_rb = ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]) && (ggml_nrows(op->src[1]) == 1) && ggml_nelements(op) < 65536;
snprintf(base, 256, "kernel_bin_fuse_%s_%s_%s%s", t0_str, t1_str, t_str, is_c4 ? "_4" : "");
snprintf(name, 256, "%s_op=%d_nf=%d_rb=%d", base, op_num, n_fuse, is_rb);
ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
if (!res.pipeline) {
res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr);
ggml_metal_cv_t cv = ggml_metal_cv_init();
ggml_metal_cv_set_int16(cv, op_num, FC_BIN + 0);
ggml_metal_cv_set_int16(cv, n_fuse, FC_BIN + 1);
ggml_metal_cv_set_bool (cv, is_rb, FC_BIN + 2);
res = ggml_metal_library_compile_pipeline(lib, base, name, cv);
ggml_metal_cv_free(cv);
}
res.c4 = is_c4;
res.cnt = is_rb;
return res;
}
ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_bin_one(ggml_metal_library_t lib, ggml_op op) {
char base[256];
char name[256];
int op_num = -1;
switch (op) {
case GGML_OP_ADD: op_num = 0; break;
case GGML_OP_SUB: op_num = 1; break;
case GGML_OP_MUL: op_num = 2; break;
case GGML_OP_DIV: op_num = 3; break;
default: GGML_ABORT("fatal error");
};
snprintf(base, 256, "kernel_bin_fuse_%s_%s_%s", "f32", "f32", "f32");
snprintf(name, 256, "%s_op=%d_nf=%d", base, op_num, 1);
ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
if (!res.pipeline) {
ggml_metal_cv_t cv = ggml_metal_cv_init();
ggml_metal_cv_set_int16(cv, op_num, FC_BIN + 0);
ggml_metal_cv_set_int16(cv, 1, FC_BIN + 1);
ggml_metal_cv_set_bool (cv, false, FC_BIN + 2);
res = ggml_metal_library_compile_pipeline(lib, base, name, cv);
ggml_metal_cv_free(cv);
}
return res;

View File

@@ -53,6 +53,9 @@ struct ggml_metal_pipeline_with_params {
int nr1;
size_t smem;
bool c4;
bool cnt;
};
int ggml_metal_pipeline_max_theads_per_threadgroup(struct ggml_metal_pipeline_with_params pipeline);
@@ -134,7 +137,8 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_argsort
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_argsort_merge (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_top_k (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_top_k_merge (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_bin (ggml_metal_library_t lib, enum ggml_op op, int32_t n_fuse, bool row);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_bin (ggml_metal_library_t lib, const struct ggml_tensor * op, int32_t n_fuse );
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_bin_one (ggml_metal_library_t lib, enum ggml_op op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_l2_norm (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_group_norm (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_norm (ggml_metal_library_t lib, const struct ggml_tensor * op, int32_t n_fuse);

View File

@@ -346,10 +346,12 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline(ggml_meta
struct ggml_metal_pipeline_with_params res = {
/*.pipeline =*/ nil,
/*.nsg =*/ 0,
/*.nr0 =*/ 0,
/*.nr1 =*/ 0,
/*.nsg =*/ 0,
/*.smem =*/ 0,
/*.c4 =*/ false,
/*.cnt =*/ false,
};
res.pipeline = ggml_metal_pipelines_get(lib->pipelines, name);
@@ -362,10 +364,12 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline(ggml_meta
struct ggml_metal_pipeline_with_params ggml_metal_library_compile_pipeline(ggml_metal_library_t lib, const char * base, const char * name, ggml_metal_cv_t cv) {
struct ggml_metal_pipeline_with_params res = {
/*.pipeline =*/ nil,
/*.nsg =*/ 0,
/*.nr0 =*/ 0,
/*.nr1 =*/ 0,
/*.nsg =*/ 0,
/*.smem =*/ 0,
/*.c4 =*/ false,
/*.cnt =*/ false,
};
[lib->lock lock];
@@ -1054,7 +1058,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
case GGML_OP_MUL:
case GGML_OP_DIV:
case GGML_OP_ADD_ID:
return op->src[0]->type == GGML_TYPE_F32;
return ggml_is_contiguous_rows(op->src[0]) && ggml_is_contiguous_rows(op->src[1]) && op->src[0]->type == GGML_TYPE_F32;
case GGML_OP_ACC:
case GGML_OP_REPEAT:
case GGML_OP_SCALE:

View File

@@ -80,6 +80,7 @@
#define FC_SSM_CONV 900
#define FC_SOLVE_TRI 1000
#define FC_COUNT_EQUAL 1100
#define FC_BIN 1200
// op-specific constants
#define OP_FLASH_ATTN_EXT_NQPSG 8

View File

@@ -707,7 +707,7 @@ int ggml_metal_op_acc(ggml_metal_op_t ctx, int idx) {
/*.o1 =*/ { 0 },
};
auto pipeline = ggml_metal_library_get_pipeline_bin(lib, GGML_OP_ADD, 1, false);
auto pipeline = ggml_metal_library_get_pipeline_bin_one(lib, GGML_OP_ADD);
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
@@ -2895,8 +2895,6 @@ int ggml_metal_op_bin(ggml_metal_op_t ctx, int idx) {
GGML_ASSERT(ggml_is_contiguous_rows(op->src[0]));
GGML_ASSERT(ggml_is_contiguous_rows(op->src[1]));
bool bcast_row = false;
ggml_metal_buffer_id bid_src0 = ggml_metal_get_buffer_id(op->src[0]);
ggml_metal_buffer_id bid_src1 = ggml_metal_get_buffer_id(op->src[1]);
ggml_metal_buffer_id bid_dst = ggml_metal_get_buffer_id(op);
@@ -2990,18 +2988,7 @@ int ggml_metal_op_bin(ggml_metal_op_t ctx, int idx) {
struct ggml_metal_pipeline_with_params pipeline;
if (ggml_nelements(op->src[1]) == ne10 && ggml_is_contiguous(op->src[1]) && ne00 % 4 == 0 && ne10 % 4 == 0) {
GGML_ASSERT(ggml_is_contiguous(op->src[0]));
// src1 is a row
GGML_ASSERT(ne11 == 1);
pipeline = ggml_metal_library_get_pipeline_bin(lib, op->op, n_fuse, true);
bcast_row = true;
} else {
pipeline = ggml_metal_library_get_pipeline_bin(lib, op->op, n_fuse, false);
}
pipeline = ggml_metal_library_get_pipeline_bin(lib, op, n_fuse);
if (n_fuse > 1) {
bid_dst = ggml_metal_get_buffer_id(ctx->node(idx + n_fuse - 1));
@@ -3015,20 +3002,28 @@ int ggml_metal_op_bin(ggml_metal_op_t ctx, int idx) {
}
}
if (pipeline.c4) {
args.ne00 = ne00/4;
args.ne10 = ne10/4;
args.ne0 = ne0/4;
}
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
ggml_metal_encoder_set_buffer (enc, bid_src0, 1);
ggml_metal_encoder_set_buffer (enc, bid_src1, 2);
ggml_metal_encoder_set_buffer (enc, bid_dst, 3);
if (bcast_row) {
const int64_t n = ggml_nelements(op)/4;
if (pipeline.cnt) {
const int n = pipeline.c4 ? ggml_nelements(op)/4 : ggml_nelements(op);
ggml_metal_encoder_dispatch_threadgroups(enc, n, 1, 1, 1, 1, 1);
} else {
int nth = 32;
const int nth_max = MIN(256, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
while (16*nth < ne0 && nth < ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
int nth = 1;
while (2*nth < args.ne0 && nth < nth_max) {
nth *= 2;
}
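
The dispatch now keys off the returned pipeline flags instead of the removed bcast_row local: cnt pipelines get one thread per element (quartered when c4), while row pipelines size the threadgroup by doubling nth until each thread covers at most two elements, capped at MIN(256, max_threads) rather than the old 16-elements-per-thread heuristic. A worked example of the sizing loop, assuming nth_max = 256:

// ne0 = 100 : nth doubles 1 -> 2 -> 4 -> 8 -> 16 -> 32 -> 64, then stops (2*64 >= 100)
// ne0 = 8192: doubling hits the cap, so nth = 256
int nth = 1;
const int nth_max = 256;
while (2*nth < ne0 && nth < nth_max) {
    nth *= 2;
}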

View File

@@ -895,11 +895,13 @@ enum ggml_sort_order {
GGML_SORT_ORDER_DESC,
};
// general-purpose kernel for addition, subtraction, multiplication and division of two tensors
// pros: works for non-contiguous tensors, supports broadcast across all dims
// cons: not very efficient
template <int F>
kernel void kernel_add_fuse_impl(
// OP: 0 - add, 1 - sub, 2 - mul, 3 - div
constant short FC_bin_op [[function_constant(FC_BIN + 0)]];
constant short FC_bin_f [[function_constant(FC_BIN + 1)]];
constant bool FC_bin_rb [[function_constant(FC_BIN + 2)]];
template <typename T0, typename T1, typename T>
kernel void kernel_bin_fuse_impl(
constant ggml_metal_kargs_bin & args,
device const char * src0,
device const char * src1,
@@ -907,138 +909,152 @@ kernel void kernel_add_fuse_impl(
uint3 tgpig[[threadgroup_position_in_grid]],
ushort3 tpitg[[thread_position_in_threadgroup]],
ushort3 ntg[[threads_per_threadgroup]]) {
const int i03 = tgpig.z;
const int i02 = tgpig.y;
const int i01 = tgpig.x;
#define FC_OP FC_bin_op
#define FC_F FC_bin_f
#define FC_RB FC_bin_rb
const int i13 = i03%args.ne13;
const int i12 = i02%args.ne12;
const int i11 = i01%args.ne11;
if (FC_RB) {
// row broadcast
const uint i0 = tgpig.x;
const uint i1 = i0%args.ne10;
device const float * src0_ptr = (device const float *) (src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + args.offs);
device float * dst_ptr = (device float *) (dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1 + args.offs);
device const T0 * src0_row = (device const T0 *) (src0);
device T * dst_row = (device T *) (dst);
device const float * src1_ptr[F];
for (short j = 0; j < F; ++j) {
src1_ptr[j] = (device const float *) (src1 + args.o1[j] + i13*args.nb13 + i12*args.nb12 + i11*args.nb11);
}
if (FC_F == 1) {
device const T1 * src1_row = (device const T1 *) (src1 + args.o1[0]);
for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
const int i10 = i0%args.ne10;
if (FC_OP == 0) {
dst_row[i0] = src0_row[i0] + src1_row[i1];
}
float res = src0_ptr[i0];
if (FC_OP == 1) {
dst_row[i0] = src0_row[i0] - src1_row[i1];
}
#pragma unroll
for (short j = 0; j < F; ++j) {
res += src1_ptr[j][i10];
}
if (FC_OP == 2) {
dst_row[i0] = src0_row[i0] * src1_row[i1];
}
dst_ptr[i0] = res;
}
}
if (FC_OP == 3) {
dst_row[i0] = src0_row[i0] / src1_row[i1];
}
} else {
T0 res = src0_row[i0];
typedef decltype(kernel_add_fuse_impl<2>) kernel_add_fuse_t;
if (FC_OP == 0) {
FOR_UNROLL (short j = 0; j < FC_F; ++j) {
res += ((device const T1 *) (src1 + args.o1[j]))[i1];
}
}
template [[host_name("kernel_add_fuse_1")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<1>;
template [[host_name("kernel_add_fuse_2")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<2>;
template [[host_name("kernel_add_fuse_3")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<3>;
template [[host_name("kernel_add_fuse_4")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<4>;
template [[host_name("kernel_add_fuse_5")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<5>;
template [[host_name("kernel_add_fuse_6")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<6>;
template [[host_name("kernel_add_fuse_7")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<7>;
template [[host_name("kernel_add_fuse_8")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<8>;
if (FC_OP == 1) {
FOR_UNROLL (short j = 0; j < FC_F; ++j) {
res -= ((device const T1 *) (src1 + args.o1[j]))[i1];
}
}
kernel void kernel_sub_fuse_1(
constant ggml_metal_kargs_bin & args,
device const char * src0,
device const char * src1,
device char * dst,
uint3 tgpig[[threadgroup_position_in_grid]],
ushort3 tpitg[[thread_position_in_threadgroup]],
ushort3 ntg[[threads_per_threadgroup]]) {
const int i03 = tgpig.z;
const int i02 = tgpig.y;
const int i01 = tgpig.x;
if (FC_OP == 2) {
FOR_UNROLL (short j = 0; j < FC_F; ++j) {
res *= ((device const T1 *) (src1 + args.o1[j]))[i1];
}
}
const int i13 = i03%args.ne13;
const int i12 = i02%args.ne12;
const int i11 = i01%args.ne11;
if (FC_OP == 3) {
FOR_UNROLL (short j = 0; j < FC_F; ++j) {
res /= ((device const T1 *) (src1 + args.o1[j]))[i1];
}
}
device const char * src0_ptr = src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + args.offs;
device const char * src1_ptr = src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11 + args.o1[0];
device char * dst_ptr = dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1 + args.offs;
for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
const int i10 = i0%args.ne10;
*((device float *)(dst_ptr + i0*args.nb0)) = *((device float *)(src0_ptr + i0*args.nb00)) - *((device float *)(src1_ptr + i10*args.nb10));
}
}
kernel void kernel_mul_fuse_1(
constant ggml_metal_kargs_bin & args,
device const char * src0,
device const char * src1,
device char * dst,
uint3 tgpig[[threadgroup_position_in_grid]],
ushort3 tpitg[[thread_position_in_threadgroup]],
ushort3 ntg[[threads_per_threadgroup]]) {
const int i03 = tgpig.z;
const int i02 = tgpig.y;
const int i01 = tgpig.x;
const int i13 = i03%args.ne13;
const int i12 = i02%args.ne12;
const int i11 = i01%args.ne11;
device const char * src0_ptr = src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + args.offs;
device const char * src1_ptr = src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11 + args.o1[0];
device char * dst_ptr = dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1 + args.offs;
if (args.ne10 == 1) {
const float x = *((device float *)(src1_ptr));
for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
*((device float *)(dst_ptr + i0*args.nb0)) = *((device float *)(src0_ptr + i0*args.nb00)) * x;
dst_row[i0] = res;
}
} else {
for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
const int i10 = i0%args.ne10;
*((device float *)(dst_ptr + i0*args.nb0)) = *((device float *)(src0_ptr + i0*args.nb00)) * *((device float *)(src1_ptr + i10*args.nb10));
const int i03 = tgpig.z;
const int i02 = tgpig.y;
const int i01 = tgpig.x;
if (i01 >= args.ne01) {
return;
}
const int i13 = i03%args.ne13;
const int i12 = i02%args.ne12;
const int i11 = i01%args.ne11;
device const T0 * src0_ptr = (device const T0 *) (src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + args.offs);
device T * dst_ptr = (device T *) (dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1 + args.offs);
if (FC_F == 1) {
device const T1 * src1_ptr = (device const T1 *) (src1 + args.o1[0] + i13*args.nb13 + i12*args.nb12 + i11*args.nb11);
for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
const int i10 = i0%args.ne10;
if (FC_OP == 0) {
dst_ptr[i0] = src0_ptr[i0] + src1_ptr[i10];
}
if (FC_OP == 1) {
dst_ptr[i0] = src0_ptr[i0] - src1_ptr[i10];
}
if (FC_OP == 2) {
dst_ptr[i0] = src0_ptr[i0] * src1_ptr[i10];
}
if (FC_OP == 3) {
dst_ptr[i0] = src0_ptr[i0] / src1_ptr[i10];
}
}
} else {
device const T1 * src1_ptr[8];
FOR_UNROLL (short j = 0; j < FC_F; ++j) {
src1_ptr[j] = (device const T1 *) (src1 + args.o1[j] + i13*args.nb13 + i12*args.nb12 + i11*args.nb11);
}
for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
const int i10 = i0%args.ne10;
T res = src0_ptr[i0];
if (FC_OP == 0) {
FOR_UNROLL (short j = 0; j < FC_F; ++j) {
res += src1_ptr[j][i10];
}
}
if (FC_OP == 1) {
FOR_UNROLL (short j = 0; j < FC_F; ++j) {
res -= src1_ptr[j][i10];
}
}
if (FC_OP == 2) {
FOR_UNROLL (short j = 0; j < FC_F; ++j) {
res *= src1_ptr[j][i10];
}
}
if (FC_OP == 3) {
FOR_UNROLL (short j = 0; j < FC_F; ++j) {
res /= src1_ptr[j][i10];
}
}
dst_ptr[i0] = res;
}
}
}
#undef FC_OP
#undef FC_F
#undef FC_RB
}
kernel void kernel_div_fuse_1(
constant ggml_metal_kargs_bin & args,
device const char * src0,
device const char * src1,
device char * dst,
uint3 tgpig[[threadgroup_position_in_grid]],
ushort3 tpitg[[thread_position_in_threadgroup]],
ushort3 ntg[[threads_per_threadgroup]]) {
const int i03 = tgpig.z;
const int i02 = tgpig.y;
const int i01 = tgpig.x;
typedef decltype(kernel_bin_fuse_impl<float, float, float>) kernel_bin_fuse_t;
const int i13 = i03%args.ne13;
const int i12 = i02%args.ne12;
const int i11 = i01%args.ne11;
device const char * src0_ptr = src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + args.offs;
device const char * src1_ptr = src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11 + args.o1[0];
device char * dst_ptr = dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1 + args.offs;
if (args.ne10 == 1) {
const float x = 1.0f / *((device float *)(src1_ptr));
for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
*((device float *)(dst_ptr + i0*args.nb0)) = *((device float *)(src0_ptr + i0*args.nb00)) * x;
}
} else {
for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
const int i10 = i0%args.ne10;
*((device float *)(dst_ptr + i0*args.nb0)) = *((device float *)(src0_ptr + i0*args.nb00)) / *((device float *)(src1_ptr + i10*args.nb10));
}
}
}
template [[host_name("kernel_bin_fuse_f32_f32_f32")]] kernel kernel_bin_fuse_t kernel_bin_fuse_impl<float, float, float>;
template [[host_name("kernel_bin_fuse_f32_f32_f32_4")]] kernel kernel_bin_fuse_t kernel_bin_fuse_impl<float4, float4, float4>;
kernel void kernel_add_id(
constant ggml_metal_kargs_add_id & args,
@@ -1057,7 +1073,7 @@ kernel void kernel_add_id(
const size_t nb1 = args.ne0 * sizeof(float);
const size_t nb2 = args.ne1 * nb1;
device float * dst_row = (device float *)((device char *)dst + i1*nb1 + i2*nb2);
device const float * src0_row = (device const float *)((device char *)src0 + i1*args.nb01 + i2*args.nb02);
device const float * src1_row = (device const float *)((device char *)src1 + i11*args.nb11);
@@ -1098,141 +1114,6 @@ template [[host_name("kernel_repeat_f16")]] kernel kernel_repeat_t kernel_repeat
template [[host_name("kernel_repeat_i32")]] kernel kernel_repeat_t kernel_repeat<int>;
template [[host_name("kernel_repeat_i16")]] kernel kernel_repeat_t kernel_repeat<short>;
// assumption: src1 is a row
// broadcast src1 into src0
template <short F>
kernel void kernel_add_row_c4_fuse_impl(
constant ggml_metal_kargs_bin & args,
device const char * src0,
device const char * src1,
device char * dst,
uint tpig[[thread_position_in_grid]]) {
const uint nb = args.ne00/4;
const uint i = tpig % nb;
device const float4 * src0_row = (device const float4 *) (src0);
device float4 * dst_row = (device float4 *) (dst);
float4 res = src0_row[tpig];
#pragma unroll(F)
for (short j = 0; j < F; ++j) {
res += ((device const float4 *) (src1 + args.o1[j]))[i];
}
dst_row[tpig] = res;
}
typedef decltype(kernel_add_row_c4_fuse_impl<1>) kernel_add_row_c4_fuse_t;
template [[host_name("kernel_add_row_c4_fuse_1")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<1>;
template [[host_name("kernel_add_row_c4_fuse_2")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<2>;
template [[host_name("kernel_add_row_c4_fuse_3")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<3>;
template [[host_name("kernel_add_row_c4_fuse_4")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<4>;
template [[host_name("kernel_add_row_c4_fuse_5")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<5>;
template [[host_name("kernel_add_row_c4_fuse_6")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<6>;
template [[host_name("kernel_add_row_c4_fuse_7")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<7>;
template [[host_name("kernel_add_row_c4_fuse_8")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<8>;
template <short F>
kernel void kernel_sub_row_c4_fuse_impl(
constant ggml_metal_kargs_bin & args,
device const char * src0,
device const char * src1,
device char * dst,
uint tpig[[thread_position_in_grid]]) {
const uint nb = args.ne00/4;
const uint i = tpig % nb;
device const float4 * src0_row = (device const float4 *) (src0);
device float4 * dst_row = (device float4 *) (dst);
device const float4 * src1_row[F];
for (short j = 0; j < F; ++j) {
src1_row[j] = (device const float4 *) (src1 + args.o1[j]);
}
float4 res = src0_row[tpig];
#pragma unroll(F)
for (short j = 0; j < F; ++j) {
res -= src1_row[j][i];
}
dst_row[tpig] = res;
}
typedef decltype(kernel_sub_row_c4_fuse_impl<1>) kernel_sub_row_c4_fuse_t;
template [[host_name("kernel_sub_row_c4_fuse_1")]] kernel kernel_sub_row_c4_fuse_t kernel_sub_row_c4_fuse_impl<1>;
template <short F>
kernel void kernel_mul_row_c4_fuse_impl(
constant ggml_metal_kargs_bin & args,
device const char * src0,
device const char * src1,
device char * dst,
uint tpig[[thread_position_in_grid]]) {
const uint nb = args.ne00/4;
const uint i = tpig % nb;
device const float4 * src0_row = (device const float4 *) (src0);
device float4 * dst_row = (device float4 *) (dst);
device const float4 * src1_row[F];
for (short j = 0; j < F; ++j) {
src1_row[j] = (device const float4 *) (src1 + args.o1[j]);
}
float4 res = src0_row[tpig];
#pragma unroll(F)
for (short j = 0; j < F; ++j) {
res *= src1_row[j][i];
}
dst_row[tpig] = res;
}
typedef decltype(kernel_mul_row_c4_fuse_impl<1>) kernel_mul_row_c4_fuse_t;
template [[host_name("kernel_mul_row_c4_fuse_1")]] kernel kernel_mul_row_c4_fuse_t kernel_mul_row_c4_fuse_impl<1>;
template <short F>
kernel void kernel_div_row_c4_fuse_impl(
constant ggml_metal_kargs_bin & args,
device const char * src0,
device const char * src1,
device char * dst,
uint tpig[[thread_position_in_grid]]) {
const uint nb = args.ne00/4;
const uint i = tpig % nb;
device const float4 * src0_row = (device const float4 *) (src0);
device float4 * dst_row = (device float4 *) (dst);
device const float4 * src1_row[F];
for (short j = 0; j < F; ++j) {
src1_row[j] = (device const float4 *) (src1 + args.o1[j]);
}
float4 res = src0_row[tpig];
#pragma unroll(F)
for (short j = 0; j < F; ++j) {
res /= src1_row[j][i];
}
dst_row[tpig] = res;
}
typedef decltype(kernel_div_row_c4_fuse_impl<1>) kernel_div_row_c4_fuse_t;
template [[host_name("kernel_div_row_c4_fuse_1")]] kernel kernel_div_row_c4_fuse_t kernel_div_row_c4_fuse_impl<1>;
kernel void kernel_scale_f32(
constant ggml_metal_kargs_scale & args,
device const float * src0,
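
The block deleted above (kernel_add_row_c4_fuse_1..8 plus the single sub/mul/div row_c4 instantiations) is fully subsumed: the row-broadcast fast path now lives behind FC_bin_rb inside kernel_bin_fuse_impl, and the float4 layout behind the _4 instantiation. An illustrative mapping from old pipelines to the new (op, f, rb) triples:

// old host_name             -> new pipeline + function constants
// kernel_add_fuse_3         -> kernel_bin_fuse_f32_f32_f32   (op=0, f=3, rb=0)
// kernel_mul_row_c4_fuse_1  -> kernel_bin_fuse_f32_f32_f32_4 (op=2, f=1, rb=1)
// kernel_div_fuse_1         -> kernel_bin_fuse_f32_f32_f32   (op=3, f=1, rb=0)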

View File

@@ -11,7 +11,9 @@ function(llama_build source)
add_executable(${TEST_TARGET} ${TEST_SOURCES})
target_link_libraries(${TEST_TARGET} PRIVATE common)
install(TARGETS ${TEST_TARGET} RUNTIME)
if (LLAMA_TESTS_INSTALL)
install(TARGETS ${TEST_TARGET} RUNTIME)
endif()
endfunction()
function(llama_test target)
@@ -100,7 +102,9 @@ function(llama_build_and_test source)
endif()
add_executable(${TEST_TARGET} ${TEST_SOURCES})
install(TARGETS ${TEST_TARGET} RUNTIME)
if (LLAMA_TESTS_INSTALL)
install(TARGETS ${TEST_TARGET} RUNTIME)
endif()
target_link_libraries(${TEST_TARGET} PRIVATE common)
add_test(
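
Both test helpers now gate their install() call on LLAMA_TESTS_INSTALL, so downstream packagers can skip shipping test binaries while the default behaviour is unchanged. A typical packaging invocation (illustrative):

cmake -B build -DLLAMA_TESTS_INSTALL=OFF
cmake --build build
cmake --install build   # test executables are no longer installed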

View File

@@ -119,27 +119,48 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
[[noreturn]]
static void usage(const char * executable) {
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights]\n", executable);
printf(" [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--tensor-type-file] [--prune-layers] [--keep-split] [--override-kv]\n");
printf(" [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--tensor-type-file]\n");
printf(" [--prune-layers] [--keep-split] [--override-kv]\n");
printf(" model-f32.gguf [model-quant.gguf] type [nthreads]\n\n");
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
printf(" --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n");
printf(" --include-weights tensor_name: use importance matrix for this/these tensor(s)\n");
printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
printf(" --tensor-type TENSOR=TYPE: quantize this tensor to this ggml_type. example: --tensor-type attn_q=q8_0\n");
printf(" Advanced option to selectively quantize tensors. May be specified multiple times.\n");
printf(" --tensor-type-file tensor_type.txt: list of tensors to quantize to specific ggml_type. example: --tensor-type-file tensor_type_list.txt\n");
printf(" Advanced option to selectively quantize a long list of tensors. Format to be tensor_name=ggml_type, separated by spaces/newline.\n");
printf(" --prune-layers L0,L1,L2...comma-separated list of layer numbers to prune from the model\n");
printf(" Advanced option to remove all tensors from the given layers\n");
printf(" --keep-split: will generate quantized model in the same shards as input\n");
printf(" --allow-requantize\n");
printf(" allow requantizing tensors that have already been quantized\n");
printf(" WARNING: this can severely reduce quality compared to quantizing\n");
printf(" from 16bit or 32bit!\n");
printf(" --leave-output-tensor\n");
printf(" leave output.weight un(re)quantized\n");
printf(" increases model size but may also increase quality, especially when requantizing\n");
printf(" --pure\n");
printf(" disable k-quant mixtures and quantize all tensors to the same type\n");
printf(" --imatrix file_name\n");
printf(" use data in file_name as importance matrix for quant optimizations\n");
printf(" --include-weights tensor_name\n");
printf(" use importance matrix for this/these tensor(s)\n");
printf(" --exclude-weights tensor_name\n");
printf(" do not use importance matrix for this/these tensor(s)\n");
printf(" --output-tensor-type ggml_type\n");
printf(" use this ggml_type for the output.weight tensor\n");
printf(" --token-embedding-type ggml_type\n");
printf(" use this ggml_type for the token embeddings tensor\n");
printf(" --tensor-type tensor_name=ggml_type\n");
printf(" quantize this tensor to this ggml_type\n");
printf(" this is an advanced option to selectively quantize tensors. may be specified multiple times.\n");
printf(" example: --tensor-type attn_q=q8_0\n");
printf(" --tensor-type-file tensor_types.txt\n");
printf(" list of tensors to quantize to a specific ggml_type\n");
printf(" this is an advanced option to selectively quantize a long list of tensors.\n");
printf(" the file should use the same format as above, separated by spaces or newlines.\n");
printf(" --prune-layers L0,L1,L2...\n");
printf(" comma-separated list of layer numbers to prune from the model\n");
printf(" WARNING: this is an advanced option, use with care.\n");
printf(" --keep-split\n");
printf(" generate quantized model in the same shards as input\n");
printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
printf("Note: --include-weights and --exclude-weights cannot be used together\n");
printf("\nAllowed quantization types:\n");
printf(" override model metadata by key in the quantized model. may be specified multiple times.\n");
printf(" WARNING: this is an advanced option, use with care.\n\n");
printf("note: --include-weights and --exclude-weights cannot be used together\n\n");
printf("-----------------------------------------------------------------------------\n");
printf(" allowed quantization types\n");
printf("-----------------------------------------------------------------------------\n\n");
for (const auto & it : QUANT_OPTIONS) {
if (it.name != "COPY") {
printf(" %2d or ", it.ftype);

View File

@@ -1,12 +1,7 @@
#if defined(_MSC_VER)
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
#endif
#include "ggml-rpc.h"
#ifdef _WIN32
# define NOMINMAX
# define DIRECTORY_SEPARATOR '\\'
# include <locale>
# include <windows.h>
# include <fcntl.h>
# include <io.h>
@@ -15,23 +10,43 @@
# include <unistd.h>
# include <sys/stat.h>
#endif
#include <codecvt>
#include <string>
#include <stdio.h>
#include <vector>
#include <filesystem>
#include <algorithm>
#include <thread>
#include <regex>
namespace fs = std::filesystem;
#if defined(__linux__)
#include <sys/types.h>
#include <pwd.h>
#endif
// NOTE: this is copied from common.cpp to avoid linking with libcommon
#ifdef _WIN32
static std::wstring utf8_to_wstring(const std::string & str) {
if (str.empty()) {
return std::wstring();
}
int size = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), (int)str.size(), NULL, 0);
if (size <= 0) {
return std::wstring();
}
std::wstring wstr(size, 0);
MultiByteToWideChar(CP_UTF8, 0, str.c_str(), (int)str.size(), &wstr[0], size);
return wstr;
}
#endif
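
The hand-rolled MultiByteToWideChar helper replaces the std::wstring_convert/<codecvt> path, which has been deprecated since C++17 (hence both the _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING define and the <codecvt> include can go). The reverse direction is not needed by this file, but for symmetry a sketch would look like this (illustrative, not part of the change):

#ifdef _WIN32
static std::string wstring_to_utf8(const std::wstring & wstr) {
    if (wstr.empty()) {
        return std::string();
    }
    int size = WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), (int)wstr.size(), NULL, 0, NULL, NULL);
    if (size <= 0) {
        return std::string();
    }
    std::string str(size, 0);
    WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), (int)wstr.size(), &str[0], size, NULL, NULL);
    return str;
}
#endif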
// NOTE: this is copied from common.cpp to avoid linking with libcommon
// returns true if successful, false otherwise
static bool fs_create_directory_with_parents(const std::string & path) {
#ifdef _WIN32
std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
std::wstring wpath = converter.from_bytes(path);
std::wstring wpath = utf8_to_wstring(path);
// if the path already exists, check whether it's a directory
const DWORD attributes = GetFileAttributesW(wpath.c_str());
@@ -44,9 +59,16 @@ static bool fs_create_directory_with_parents(const std::string & path) {
// process path from front to back, procedurally creating directories
while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
const std::wstring subpath = wpath.substr(0, pos_slash);
const wchar_t * test = subpath.c_str();
const bool success = CreateDirectoryW(test, NULL);
pos_slash += 1;
// skip the drive letter; on some systems it can return an access-denied error
if (subpath.length() == 2 && subpath[1] == ':') {
continue;
}
const bool success = CreateDirectoryW(subpath.c_str(), NULL);
if (!success) {
const DWORD error = GetLastError();
@@ -60,8 +82,6 @@ static bool fs_create_directory_with_parents(const std::string & path) {
return false;
}
}
pos_slash += 1;
}
return true;
@@ -115,13 +135,27 @@ static std::string fs_get_cache_directory() {
#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) || defined(__OpenBSD__)
if (std::getenv("XDG_CACHE_HOME")) {
cache_directory = std::getenv("XDG_CACHE_HOME");
} else {
} else if (std::getenv("HOME")) {
cache_directory = std::getenv("HOME") + std::string("/.cache/");
} else {
#if defined(__linux__)
/* no $HOME is defined, fallback to getpwuid */
struct passwd *pw = getpwuid(getuid());
if ((!pw) || (!pw->pw_dir)) {
throw std::runtime_error("Failed to find $HOME directory");
}
cache_directory = std::string(pw->pw_dir) + std::string("/.cache/");
#else /* defined(__linux__) */
throw std::runtime_error("Failed to find $HOME directory");
#endif /* defined(__linux__) */
}
#elif defined(__APPLE__)
cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
#elif defined(_WIN32)
cache_directory = std::getenv("LOCALAPPDATA");
#elif defined(__EMSCRIPTEN__)
GGML_ABORT("not implemented on this platform");
#else
# error Unknown architecture
#endif

View File

@@ -2507,7 +2507,8 @@ private:
slot.n_prompt_tokens_processed++;
// process the last few tokens of the prompt separately in order to allow for a checkpoint to be created.
if (do_checkpoint && slot.task->n_tokens() - slot.prompt.n_tokens() == 64) {
const int n_last = std::min(n_batch, 512);
if (do_checkpoint && slot.task->n_tokens() == slot.prompt.n_tokens() + n_last) {
break;
}
}
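
Instead of a hard-coded 64-token tail, the cut-over point now scales with the batch size, capped at 512, so a checkpoint is created one full (or capped) batch before the prompt ends and the remaining tokens are processed separately. A worked example under assumed values:

// assumed: n_batch = 2048, prompt of 3000 tokens, none cached yet
const int n_last = std::min(2048, 512);   // -> 512
// the loop breaks once slot.prompt.n_tokens() reaches 3000 - 512 = 2488;
// the checkpoint is taken there and the final 512 tokens form the next batch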