Mirror of https://github.com/ggerganov/llama.cpp.git, synced 2026-02-12 14:03:20 +02:00

Compare commits

11 Commits
| SHA1 |
|---|
| 972f323e73 |
| f5e7734ff2 |
| 1e8924fd65 |
| 39bf692af1 |
| e06088da0f |
| 5fa1c190d9 |
| eb449cdfa4 |
| 5999b50eb0 |
| 9a5f57795c |
| 96441c955e |
| 8872ad2125 |
2 .github/workflows/build.yml vendored
@@ -295,6 +295,7 @@ jobs:
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}

cmake --build build --config ${{ matrix.build_type }} -j $(nproc)

- name: Build (no OpenMP)
@@ -307,6 +308,7 @@ jobs:
-DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DGGML_OPENMP=OFF

cmake --build build --config ${{ matrix.build_type }} -j $(nproc)

- name: Test

120 .github/workflows/server-webui.yml vendored
@@ -8,10 +8,6 @@ on:
description: 'Commit SHA1 to build'
required: false
type: string
slow_tests:
description: 'Run slow tests'
required: true
type: boolean
push:
branches:
- master
@@ -101,119 +97,3 @@ jobs:
if: ${{ always() && steps.playwright.conclusion == 'success' }}
run: npm run test:e2e
working-directory: tools/server/webui

server-build:
runs-on: ubuntu-latest

strategy:
matrix:
sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
build_type: [RelWithDebInfo]
include:
- build_type: Release
sanitizer: ""
fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken

steps:
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get -y install \
build-essential \
xxd \
git \
cmake \
curl \
wget \
language-pack-en \
libssl-dev

- name: Clone
id: checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

- name: Python setup
id: setup_python
uses: actions/setup-python@v6
with:
python-version: '3.11'

- name: Tests dependencies
id: test_dependencies
run: |
pip install -r tools/server/tests/requirements.txt

- name: Setup Node.js for WebUI
uses: actions/setup-node@v6
with:
node-version: "22"
cache: "npm"
cache-dependency-path: "tools/server/webui/package-lock.json"

- name: Install WebUI dependencies
run: npm ci
working-directory: tools/server/webui

- name: Build WebUI
run: npm run build
working-directory: tools/server/webui

- name: Build (no OpenMP)
id: cmake_build_no_openmp
if: ${{ matrix.sanitizer == 'THREAD' }}
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DGGML_OPENMP=OFF ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server

- name: Build (sanitizers)
id: cmake_build_sanitizers
if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }}
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server

- name: Build (sanitizers)
id: cmake_build
if: ${{ matrix.sanitizer == '' }}
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server

- name: Tests
id: server_integration_tests
if: ${{ matrix.sanitizer == '' }}
env:
GITHUB_ACTIONS: "true"
run: |
cd tools/server/tests
./tests.sh

- name: Tests (sanitizers)
id: server_integration_tests_sanitizers
if: ${{ matrix.sanitizer != '' }}
run: |
cd tools/server/tests
LLAMA_SANITIZE=1 ./tests.sh

- name: Slow tests
id: server_integration_tests_slow
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
run: |
cd tools/server/tests
SLOW_TESTS=1 ./tests.sh

22 .github/workflows/server.yml vendored
@@ -81,18 +81,14 @@ jobs:
-DLLAMA_SANITIZE_ADDRESS=${{ matrix.sanitizer == 'ADDRESS' }} \
-DLLAMA_SANITIZE_THREAD=${{ matrix.sanitizer == 'THREAD' }} \
-DLLAMA_SANITIZE_UNDEFINED=${{ matrix.sanitizer == 'UNDEFINED' }}
cmake --build build --config ${{ matrix.build_type }} -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server

- name: Python setup
id: setup_python
uses: actions/setup-python@v6
with:
python-version: '3.11'

- name: Tests dependencies
id: test_dependencies
run: |
pip install -r tools/server/tests/requirements.txt
pip-install: -r tools/server/tests/requirements.txt

- name: Tests
id: server_integration_tests
@@ -102,6 +98,14 @@ jobs:
export ${{ matrix.extra_args }}
pytest -v -x -m "not slow"

- name: Slow tests
id: server_integration_tests_slow
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
run: |
cd tools/server/tests
export ${{ matrix.extra_args }}
SLOW_TESTS=1 pytest -v -x

server-windows:
runs-on: windows-2022

@@ -124,11 +128,7 @@ jobs:
uses: actions/setup-python@v6
with:
python-version: '3.11'

- name: Tests dependencies
id: test_dependencies
run: |
pip install -r tools/server/tests/requirements.txt
pip-install: -r tools/server/tests/requirements.txt

- name: Tests
id: server_integration_tests

CMakeLists.txt

@@ -109,6 +109,7 @@ option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT})
option(LLAMA_TESTS_INSTALL "llama: install tests" ON)

# 3rd party libs
option(LLAMA_HTTPLIB "llama: httplib for downloading functionality" ON)

README.md

@@ -288,6 +288,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
| [WebGPU [In Progress]](docs/build.md#webgpu) | All |
| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
| [Hexagon [In Progress]](docs/backend/hexagon/README.md) | Snapdragon |
| [VirtGPU](docs/backend/VirtGPU.md) | VirtGPU APIR |

## Obtaining and quantizing models

180 docs/backend/VirtGPU.md Normal file
@@ -0,0 +1,180 @@
# GGML-VirtGPU Backend

The GGML-VirtGPU backend enables GGML applications to run machine learning computations on host hardware while the application itself runs inside a virtual machine. It uses host-guest shared memory to efficiently share data buffers between the two sides.

This backend relies on virtio-gpu and the VirglRenderer API Remoting (APIR) component. The backend is split into two libraries:
- a GGML implementation (the "remoting frontend"), running in the guest and interacting with the virtgpu device
- a VirglRenderer APIR-compatible library (the "remoting backend"), running on the host and interacting with VirglRenderer and an actual GGML device backend

## OS support

| OS | Status | Backend | CI testing | Notes |
| -------- | ----------------- | ----------- | ----------- | ----- |
| MacOS 14 | Supported | ggml-metal | X | Working when compiled on MacOS 14 |
| MacOS 15 | Supported | ggml-metal | X | Working when compiled on MacOS 14 or MacOS 15 |
| MacOS 26 | Not tested | | | |
| Linux | Under development | ggml-vulkan | not working | Working locally, CI running into deadlocks |

## Architecture Overview

The GGML-VirtGPU backend consists of three main components:

```mermaid
graph TD
    %% Nodes

    subgraph GuestVM ["Guest VM - Frontend"]
        App([GGML Application<br/>llama.cpp, etc.])

        direction TB
        Interface[GGML Backend Interface]
        Comm["GGML-VirtGPU<br/>(hypercalls + shared mem)"]

        App --> Interface
        Interface --> Comm
    end

    API[virtio-gpu / virglrenderer API]

    subgraph HostSystem [Host System - Backend]
        direction TB
        Dispatcher[GGML-VirtGPU-Backend]
        BackendLib[GGML Backend library<br/>Metal / Vulkan / CPU / ...]

        Dispatcher --> BackendLib
    end

    %% Connections
    Comm --> API
    API --> HostSystem
```

### Key Components

1. **Guest-side Frontend** (`ggml-virtgpu/`): Implements the GGML backend interface and forwards operations to the host
2. **Host-side Backend** (`ggml-virtgpu/backend/`): Receives forwarded operations and executes them on actual hardware backends
3. **Communication Layer**: Uses virtio-gpu hypercalls and shared memory for efficient data transfer

## Features

- **Dynamic backend loading** on the host side (CPU, CUDA, Metal, etc.)
- **Zero-copy data transfer** via host-guest shared memory pages

## Communication Protocol

### Hypercalls and Shared Memory

The backend uses two primary communication mechanisms:

1. **Hypercalls (`DRM_IOCTL_VIRTGPU_EXECBUFFER`)**: Trigger remote execution from guest to host (see the sketch below)
2. **Shared Memory Pages**: Zero-copy data transfer for tensors and parameters
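
For illustration, here is a minimal sketch of how a guest-side hypercall can be issued through the virtio-gpu DRM interface. The `apir_command` payload layout is a hypothetical simplification; only `drmIoctl`, `DRM_IOCTL_VIRTGPU_EXECBUFFER`, and `struct drm_virtgpu_execbuffer` come from the standard libdrm/virtio-gpu uAPI.

```cpp
// Minimal sketch, not the actual frontend code: issue a guest-to-host
// hypercall via virtio-gpu. The apir_command layout is hypothetical.
#include <cstdint>
#include <xf86drm.h>          // drmIoctl() from libdrm
#include <drm/virtgpu_drm.h>  // DRM_IOCTL_VIRTGPU_EXECBUFFER

struct apir_command {    // hypothetical command header
    uint32_t type;       // HANDSHAKE / LOADLIBRARY / FORWARD
    uint32_t data_size;  // payload bytes placed in the shared data buffer
};

static int apir_hypercall(int drm_fd, uint32_t type, uint32_t data_size) {
    apir_command cmd = { type, data_size };

    drm_virtgpu_execbuffer exec = {};
    exec.command = reinterpret_cast<uintptr_t>(&cmd);  // pointer to the command
    exec.size    = sizeof(cmd);                        // command length in bytes

    // Traps into the hypervisor; virglrenderer dispatches the buffer
    // to the APIR context on the host side.
    return drmIoctl(drm_fd, DRM_IOCTL_VIRTGPU_EXECBUFFER, &exec);
}
```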

#### Shared Memory Layout

Each connection uses two fixed shared memory buffers, plus dynamically allocated data buffers:

- **Data Buffer** (24 MiB): For command/response data and tensor transfers
- **Reply Buffer** (16 KiB): For command replies and status information
- **Data Buffers**: Dynamically allocated host-guest shared buffers, exposed as GGML buffers

### APIR Protocol

The VirglRenderer API Remoting protocol defines three command types:

- `HANDSHAKE`: Protocol version negotiation and capability discovery
- `LOADLIBRARY`: Dynamic loading of backend libraries on the host
- `FORWARD`: API function call forwarding

### Binary Serialization

Commands and data are serialized using a custom binary protocol (sketched below) with:

- Fixed-size encoding for basic types
- Variable-length arrays with size prefixes
- Buffer bounds checking
- Error recovery mechanisms
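
As an illustration of the encoding scheme described above, here is a minimal sketch of writing a fixed-size value and a size-prefixed variable-length array with bounds checking. The `enc_buffer` type and function names are hypothetical, not the actual protocol implementation.

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>

// Hypothetical encoder state over one of the shared memory buffers.
struct enc_buffer {
    uint8_t * data;
    size_t    capacity;
    size_t    offset;
};

// Fixed-size encoding for a basic type, with bounds checking.
static bool enc_write_u32(enc_buffer & b, uint32_t v) {
    if (b.offset + sizeof(v) > b.capacity) {
        return false; // out of space: caller reports an error to the peer
    }
    std::memcpy(b.data + b.offset, &v, sizeof(v));
    b.offset += sizeof(v);
    return true;
}

// Variable-length array: a u32 size prefix followed by the payload bytes.
static bool enc_write_array(enc_buffer & b, const void * p, uint32_t n) {
    if (!enc_write_u32(b, n)) {
        return false;
    }
    if (b.offset + n > b.capacity) {
        return false;
    }
    std::memcpy(b.data + b.offset, p, n);
    b.offset += n;
    return true;
}
```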

## Supported Operations

### Device Operations
- Device enumeration and capability queries
- Memory information (total/free)
- Backend type detection

### Buffer Operations
- Buffer allocation and deallocation
- Tensor data transfer (host ↔ guest)
- Memory copying and clearing

### Computation Operations
- Graph execution forwarding

## Build Requirements

### Guest-side Dependencies
- `libdrm` for DRM/virtio-gpu communication
- C++20 compatible compiler
- CMake 3.14+

### Host-side Dependencies
- virglrenderer with APIR support (pending upstream review)
- Target backend libraries (libggml-metal, libggml-vulkan, etc.)

## Configuration

### Environment Variables

- `GGML_VIRTGPU_BACKEND_LIBRARY`: Path to the host-side backend library
- `GGML_VIRTGPU_DEBUG`: Enable debug logging

### Build Options

- `GGML_VIRTGPU`: Enable the VirtGPU backend (`ON` or `OFF`, default: `OFF`)
- `GGML_VIRTGPU_BACKEND`: Build the host-side backend component (`ON`, `OFF` or `ONLY`, default: `OFF`)

### System Requirements

- VM with virtio-gpu support
- VirglRenderer with APIR patches
- Compatible backend libraries on the host

## Limitations

- **VM-specific**: Only works in virtual machines with virtio-gpu support
- **Host dependency**: Requires a properly configured host-side backend
- **Latency**: Small overhead from a VM exit for each operation

* This work is pending upstream changes in the VirglRenderer project.
* The backend can be tested with VirglRenderer compiled from source using this PR: https://gitlab.freedesktop.org/virgl/virglrenderer/-/merge_requests/1590
* This work is pending changes in the VMM/hypervisor running the virtual machine, which needs to know how to route the newly introduced APIR capset.
* The environment variable `VIRGL_ROUTE_VENUS_TO_APIR=1` allows using the Venus capset until the relevant hypervisors have been patched. However, setting this flag breaks normal Vulkan/Venus behavior.
* The environment variable `GGML_REMOTING_USE_APIR_CAPSET` tells the `ggml-virtgpu` backend to use the APIR capset. This will become the default when the relevant hypervisors have been patched.

* This work focused on improving the performance of llama.cpp running in MacOS containers, and is mainly tested on this platform. Linux support (via `krun`) is in progress.

## See Also

- [Development and Testing](VirtGPU/development.md)
- [Backend configuration](VirtGPU/configuration.md)
174 docs/backend/VirtGPU/configuration.md Normal file
@@ -0,0 +1,174 @@
# GGML-VirtGPU Backend Configuration

This document describes the environment variables used by the ggml-virtgpu backend system, covering both the frontend (guest-side) and backend (host-side) components.

## Environment Variables Overview

The ggml-virtgpu backend uses environment variables for configuration across three main components:
- **Frontend (Guest)**: GGML applications running in VMs
- **Hypervisor**: Virglrenderer/APIR system
- **Backend (Host)**: Host-side GGML backend integration

## Frontend (Guest-side) Configuration

### GGML_REMOTING_USE_APIR_CAPSET
- **Location**: `ggml/src/ggml-virtgpu/virtgpu.cpp`
- **Type**: Boolean flag (presence-based)
- **Purpose**: Controls which virtio-gpu capability set to use for communication
- **Values**:
  - Set (any value): Use the APIR capset (long-term setup)
  - Unset: Use the Venus capset (easier for testing with an unmodified hypervisor)
- **Default**: Unset (Venus capset)
- **Usage**:
  ```bash
  export GGML_REMOTING_USE_APIR_CAPSET=1 # Use APIR capset
  # or leave unset for Venus capset
  ```

## Hypervisor (Virglrenderer/APIR) Configuration

These environment variables are used during the transition phase for running with an unmodified hypervisor (not supporting the VirglRenderer APIR component). They will be removed in the future, and the hypervisor will instead configure VirglRenderer with the APIR _Configuration Key_.

### VIRGL_APIR_BACKEND_LIBRARY
- **Location**: `virglrenderer/src/apir/apir-context.c`
- **Configuration Key**: `apir.load_library.path`
- **Type**: File path string
- **Purpose**: Path to the APIR backend library that virglrenderer should dynamically load
- **Required**: Yes
- **Example**:
  ```bash
  export VIRGL_APIR_BACKEND_LIBRARY="/path/to/libggml-remotingbackend.so"
  ```

### VIRGL_ROUTE_VENUS_TO_APIR
- **Location**: `virglrenderer/src/apir/apir-renderer.h`
- **Type**: Boolean flag (presence-based)
- **Purpose**: Temporary workaround to route Venus capset calls to APIR during the hypervisor transition period
- **Status**: Will be removed once hypervisors support APIR natively
- **Warning**: Breaks normal Vulkan/Venus functionality
- **Usage**:
  ```bash
  export VIRGL_ROUTE_VENUS_TO_APIR=1 # For testing with an unmodified hypervisor
  ```

### VIRGL_APIR_LOG_TO_FILE
- **Location**: `virglrenderer/src/apir/apir-renderer.c`
- **Environment Variable**: `VIRGL_APIR_LOG_TO_FILE`
- **Type**: File path string
- **Purpose**: Enable debug logging from the VirglRenderer APIR component to the specified file
- **Required**: No (optional debugging)
- **Default**: Logging to `stderr`
- **Usage**:
  ```bash
  export VIRGL_APIR_LOG_TO_FILE="/tmp/apir-debug.log"
  ```

## Backend (Host-side) Configuration

These environment variables are used during the transition phase for running with an unmodified hypervisor (not supporting the VirglRenderer APIR component). They will be removed in the future, and the hypervisor will instead configure VirglRenderer with the APIR _Configuration Key_.

### APIR_LLAMA_CPP_GGML_LIBRARY_PATH
- **Location**: `ggml/src/ggml-virtgpu/backend/backend.cpp`
- **Environment Variable**: `APIR_LLAMA_CPP_GGML_LIBRARY_PATH`
- **Configuration Key**: `ggml.library.path`
- **Type**: File path string
- **Purpose**: Path to the actual GGML backend library (Metal, CUDA, Vulkan, etc.)
- **Required**: **Yes** - backend initialization fails without this
- **Examples**:
  ```bash
  # macOS with Metal backend
  export APIR_LLAMA_CPP_GGML_LIBRARY_PATH="/opt/llama.cpp/lib/libggml-metal.dylib"

  # Linux with CUDA backend
  export APIR_LLAMA_CPP_GGML_LIBRARY_PATH="/opt/llama.cpp/lib/libggml-cuda.so"

  # macOS or Linux with Vulkan backend
  export APIR_LLAMA_CPP_GGML_LIBRARY_PATH="/opt/llama.cpp/lib/libggml-vulkan.so"
  ```

### APIR_LLAMA_CPP_GGML_LIBRARY_REG
- **Location**: `ggml/src/ggml-virtgpu/backend/backend.cpp`
- **Environment Variable**: `APIR_LLAMA_CPP_GGML_LIBRARY_REG`
- **Configuration Key**: `ggml.library.reg`
- **Type**: Function symbol name string
- **Purpose**: Name of the backend registration function to call after loading the library
- **Required**: No (defaults to `ggml_backend_init`)
- **Default**: `ggml_backend_init`
- **Examples**:
  ```bash
  # Metal backend
  export APIR_LLAMA_CPP_GGML_LIBRARY_REG="ggml_backend_metal_reg"

  # CUDA backend
  export APIR_LLAMA_CPP_GGML_LIBRARY_REG="ggml_backend_cuda_reg"

  # Vulkan backend
  export APIR_LLAMA_CPP_GGML_LIBRARY_REG="ggml_backend_vulkan_reg"

  # Generic fallback (default)
  # export APIR_LLAMA_CPP_GGML_LIBRARY_REG="ggml_backend_init"
  ```

### APIR_LLAMA_CPP_LOG_TO_FILE
- **Location**: `ggml/src/ggml-virtgpu/backend/backend.cpp:62`
- **Environment Variable**: `APIR_LLAMA_CPP_LOG_TO_FILE`
- **Type**: File path string
- **Purpose**: Enable debug logging from the GGML backend to the specified file
- **Required**: No (optional debugging)
- **Usage**:
  ```bash
  export APIR_LLAMA_CPP_LOG_TO_FILE="/tmp/ggml-backend-debug.log"
  ```

## Configuration Flow

The configuration system works as follows:

1. **Hypervisor Setup**: Virglrenderer loads the APIR backend library specified by `VIRGL_APIR_BACKEND_LIBRARY`

2. **Context Creation**: When an APIR context is created, it populates a configuration table with environment variables:
   - `apir.load_library.path` ← `VIRGL_APIR_BACKEND_LIBRARY`
   - `ggml.library.path` ← `APIR_LLAMA_CPP_GGML_LIBRARY_PATH`
   - `ggml.library.reg` ← `APIR_LLAMA_CPP_GGML_LIBRARY_REG`
   - This step will eventually be performed by the hypervisor itself, with command-line arguments instead of environment variables.

3. **Backend Initialization**: The backend queries the configuration via callbacks:
   - `virgl_cbs->get_config(ctx_id, "ggml.library.path")` returns the library path
   - `virgl_cbs->get_config(ctx_id, "ggml.library.reg")` returns the registration function

4. **Library Loading**: The backend dynamically loads and initializes the specified GGML library (a sketch of this step follows)
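
For illustration, here is a minimal sketch of step 4, assuming loading is done with plain `dlopen`/`dlsym`; the `load_ggml_backend` function and the return type are hypothetical simplifications of the actual `backend.cpp` logic.

```cpp
#include <cstdio>
#include <dlfcn.h>

// Hypothetical registration entry point type; actual ggml backends
// export a registration function (e.g. ggml_backend_metal_reg).
typedef void * (*ggml_reg_fn)(void);

// Sketch of the library-loading step, after the library path and
// registration symbol were resolved from the configuration table
// (ggml.library.path / ggml.library.reg).
static void * load_ggml_backend(const char * lib_path, const char * reg_name) {
    void * handle = dlopen(lib_path, RTLD_NOW | RTLD_LOCAL);
    if (!handle) {
        fprintf(stderr, "cannot open the GGML library: %s\n", dlerror());
        return nullptr;
    }

    ggml_reg_fn reg = (ggml_reg_fn) dlsym(handle, reg_name);
    if (!reg) {
        fprintf(stderr, "cannot find symbol '%s': %s\n", reg_name, dlerror());
        dlclose(handle);
        return nullptr;
    }

    return reg(); // register the backend, e.g. ggml_backend_metal_reg()
}
```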

## Error Messages

Common error scenarios and their messages:

- **Missing library path**: `"cannot open the GGML library: env var 'APIR_LLAMA_CPP_GGML_LIBRARY_PATH' not defined"`
- **Missing registration function**: `"cannot register the GGML library: env var 'APIR_LLAMA_CPP_GGML_LIBRARY_REG' not defined"`

## Example Complete Configuration

Here's an example configuration for a macOS host with the Metal backend:

```bash
# Hypervisor environment
export VIRGL_APIR_BACKEND_LIBRARY="/opt/llama.cpp/lib/libggml-virtgpu-backend.dylib"

# Backend configuration
export APIR_LLAMA_CPP_GGML_LIBRARY_PATH="/opt/llama.cpp/lib/libggml-metal.dylib"
export APIR_LLAMA_CPP_GGML_LIBRARY_REG="ggml_backend_metal_reg"

# Optional logging
export VIRGL_APIR_LOG_TO_FILE="/tmp/apir.log"
export APIR_LLAMA_CPP_LOG_TO_FILE="/tmp/ggml.log"

# Guest configuration
export GGML_REMOTING_USE_APIR_CAPSET=1
```
220 docs/backend/VirtGPU/development.md Normal file
@@ -0,0 +1,220 @@
# Development and Testing

## Development

### Code Generation

The backend generates its protocol code from a YAML configuration:

```bash
# Regenerate protocol code
cd ggml-virtgpu/
python regenerate_remoting.py
```

### Adding New Operations

1. Add the function definition to `ggmlremoting_functions.yaml`
2. Regenerate the code with `regenerate_remoting.py`
3. Implement guest-side forwarding in `virtgpu-forward-*.cpp`
4. Implement host-side handling in `backend-dispatched-*.cpp` (a sketch of the forwarding pattern follows)
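
To make the forwarding pattern concrete, here is a hypothetical sketch of what a guest-side forwarding function might look like; every name in it (`apir_encoder`, `enc_write_u64`, `apir_forward`, the command id) is illustrative, not the generated API.

```cpp
#include <cstdint>
#include <vector>

// Hypothetical stand-ins for the generated protocol helpers; the real
// ones serialize into the shared data buffer and issue the FORWARD
// hypercall described in the VirtGPU backend documentation.
struct apir_encoder {
    std::vector<uint8_t> buf;
};

static void enc_write_u64(apir_encoder & enc, uint64_t v) {
    const uint8_t * p = reinterpret_cast<const uint8_t *>(&v);
    enc.buf.insert(enc.buf.end(), p, p + sizeof(v));
}

// Stub: the real version triggers DRM_IOCTL_VIRTGPU_EXECBUFFER and
// decodes the result from the reply buffer.
static uint64_t apir_forward(uint32_t /*func_id*/, apir_encoder & /*enc*/) {
    return 0;
}

// Sketch of a guest-side forwarding function for a device-memory query;
// the real functions live in virtgpu-forward-*.cpp, each paired with a
// host-side handler in backend-dispatched-*.cpp.
static uint64_t virtgpu_device_get_free_memory(apir_encoder & enc, uint64_t device_handle) {
    const uint32_t APIR_FUNC_DEVICE_GET_FREE_MEMORY = 42; // illustrative command id

    enc_write_u64(enc, device_handle);                          // serialize arguments
    return apir_forward(APIR_FUNC_DEVICE_GET_FREE_MEMORY, enc); // FORWARD hypercall
}
```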

## Testing

This section provides instructions for building and testing the GGML-VirtGPU backend on macOS with containers.

### Prerequisites

The testing setup requires:

- macOS host system
- Container runtime with the `libkrun` provider (podman machine)
- Access to the development patchsets for VirglRenderer

### Required Patchsets

The backend requires patches that are currently under review:

- **Virglrenderer APIR upstream PR**: https://gitlab.freedesktop.org/virgl/virglrenderer/-/merge_requests/1590 (for reference)
- **MacOS Virglrenderer (for krunkit)**: https://gitlab.freedesktop.org/kpouget/virglrenderer/-/tree/main-macos
- **Linux Virglrenderer (for krun)**: https://gitlab.freedesktop.org/kpouget/virglrenderer/-/tree/main-linux

### Build Instructions

#### 1. Build ggml-virtgpu-backend (Host-side, macOS)

```bash
# Build the backend that runs natively on macOS
mkdir llama.cpp
cd llama.cpp
git clone https://github.com/ggml-org/llama.cpp.git src
cd src

LLAMA_MAC_BUILD=$PWD/build/ggml-virtgpu-backend

cmake -S . -B $LLAMA_MAC_BUILD \
    -DGGML_NATIVE=OFF \
    -DLLAMA_CURL=ON \
    -DGGML_REMOTINGBACKEND=ONLY \
    -DGGML_METAL=ON

TARGETS="ggml-metal"
cmake --build $LLAMA_MAC_BUILD --parallel 8 --target $TARGETS

# Build additional tools for native benchmarking
EXTRA_TARGETS="llama-run llama-bench"
cmake --build $LLAMA_MAC_BUILD --parallel 8 --target $EXTRA_TARGETS
```

#### 2. Build virglrenderer (Host-side, macOS)

```bash
# Build virglrenderer with APIR support
mkdir virglrenderer
cd virglrenderer
git clone https://gitlab.freedesktop.org/kpouget/virglrenderer -b main-macos src
cd src

VIRGL_BUILD_DIR=$PWD/build

# -Dvenus=true and VIRGL_ROUTE_VENUS_TO_APIR=1 route the APIR requests via the Venus backend, for easier testing without a patched hypervisor

meson setup $VIRGL_BUILD_DIR \
    -Dvenus=true \
    -Dapir=true

ninja -C $VIRGL_BUILD_DIR
```

#### 3. Build ggml-virtgpu (Guest-side, Linux)

Option A: Build from a script:

```bash
# Inside a Linux container
mkdir llama.cpp
cd llama.cpp
git clone https://github.com/ggml-org/llama.cpp.git src
cd src

LLAMA_LINUX_BUILD=$PWD/build-virtgpu

cmake -S . -B $LLAMA_LINUX_BUILD \
    -DGGML_VIRTGPU=ON

cmake --build $LLAMA_LINUX_BUILD
```

Option B: Build a container image with the frontend:

```bash
cat << EOF > remoting.containerfile
FROM quay.io/fedora/fedora:43
USER 0

WORKDIR /app/remoting

ARG LLAMA_CPP_REPO="https://github.com/ggml-org/llama.cpp.git"
ARG LLAMA_CPP_VERSION="master"
ARG LLAMA_CPP_CMAKE_FLAGS="-DGGML_VIRTGPU=ON"
ARG LLAMA_CPP_CMAKE_BUILD_FLAGS="--parallel 4"

RUN dnf install -y git cmake gcc gcc-c++ libcurl-devel libdrm-devel

RUN git clone "\${LLAMA_CPP_REPO}" src \\
    && git -C src fetch origin \${LLAMA_CPP_VERSION} \\
    && git -C src reset --hard FETCH_HEAD

RUN mkdir -p build \\
    && cd src \\
    && set -o pipefail \\
    && cmake -S . -B ../build \${LLAMA_CPP_CMAKE_FLAGS} \\
    && cmake --build ../build/ \${LLAMA_CPP_CMAKE_BUILD_FLAGS}

ENTRYPOINT ["/app/remoting/build/bin/llama-server"]
EOF

mkdir -p empty_dir
podman build -f remoting.containerfile ./empty_dir -t localhost/llama-cpp.virtgpu
```

### Environment Setup

#### Set krunkit Environment Variables

```bash
# Define the base directories (adapt these paths to your system)
VIRGL_BUILD_DIR=$HOME/remoting/virglrenderer/build
LLAMA_MAC_BUILD=$HOME/remoting/llama.cpp/build-backend

# For krunkit to load the custom virglrenderer library
export DYLD_LIBRARY_PATH=$VIRGL_BUILD_DIR/src

# For Virglrenderer to load the ggml-remotingbackend library
export VIRGL_APIR_BACKEND_LIBRARY="$LLAMA_MAC_BUILD/bin/libggml-virtgpu-backend.dylib"

# For the llama.cpp remotingbackend to load the ggml-metal backend
export APIR_LLAMA_CPP_GGML_LIBRARY_PATH="$LLAMA_MAC_BUILD/bin/libggml-metal.dylib"
export APIR_LLAMA_CPP_GGML_LIBRARY_REG=ggml_backend_metal_reg
```

#### Launch Container Environment

```bash
# Set the container provider to libkrun
export CONTAINERS_MACHINE_PROVIDER=libkrun
podman machine start
```

#### Verify Environment

Confirm that krunkit is using the correct virglrenderer library:

```bash
lsof -c krunkit | grep virglrenderer
# Expected output:
# krunkit 50574 user txt REG 1,14 2273912 10849442 ($VIRGL_BUILD_DIR/src)/libvirglrenderer.1.dylib
```

### Running Tests

#### Launch Test Container

```bash
# Optional model caching
mkdir -p models
PODMAN_CACHE_ARGS="-v models:/models --user root:root --cgroupns host --security-opt label=disable -w /models"

podman run $PODMAN_CACHE_ARGS -it --rm --device /dev/dri localhost/llama-cpp.virtgpu
```

#### Test llama.cpp in Container

```bash
# Run performance benchmark
/app/remoting/build/bin/llama-bench -m ./llama3.2
```

Expected output (performance may vary):

```
| model                  | size     | params | backend      | ngl | test  | t/s           |
| ---------------------- | -------: | -----: | ------------ | --: | ----: | ------------: |
| llama 3B Q4_K - Medium | 1.87 GiB | 3.21 B | ggml-virtgpu |  99 | pp512 | 991.30 ± 0.66 |
| llama 3B Q4_K - Medium | 1.87 GiB | 3.21 B | ggml-virtgpu |  99 | tg128 |  85.71 ± 0.11 |
```

### Troubleshooting

#### SSH Environment Variable Issues

⚠️ **Warning**: Setting `DYLD_LIBRARY_PATH` from SSH doesn't work on macOS. Here is a workaround:

**Workaround 1: Replace system library**
```bash
VIRGL_BUILD_DIR=$HOME/remoting/virglrenderer/build # ⚠️ adapt to your system
BREW_VIRGL_DIR=/opt/homebrew/Cellar/virglrenderer/0.10.4d/lib
VIRGL_LIB=libvirglrenderer.1.dylib

cd $BREW_VIRGL_DIR
mv $VIRGL_LIB ${VIRGL_LIB}.orig
ln -s $VIRGL_BUILD_DIR/src/$VIRGL_LIB
```
@@ -43,10 +43,15 @@ static __device__ void rope_yarn(
|
||||
template <bool forward, bool has_ff, typename T, typename D>
|
||||
static __global__ void rope_norm(const T * x,
|
||||
D * dst,
|
||||
const int ne0,
|
||||
const int ne1,
|
||||
const int ne00,
|
||||
const int ne01,
|
||||
const int ne02,
|
||||
const int s01,
|
||||
const int s02,
|
||||
const int s03,
|
||||
const int s1,
|
||||
const int s2,
|
||||
const int s3,
|
||||
const int n_dims,
|
||||
const int32_t * pos,
|
||||
const float freq_scale,
|
||||
@@ -59,23 +64,23 @@ static __global__ void rope_norm(const T * x,
|
||||
const int set_rows_stride) {
|
||||
const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
|
||||
|
||||
if (i0 >= ne0) {
|
||||
if (i0 >= ne00) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int row_dst = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
const int row_x = row_dst % ne1;
|
||||
const int channel_x = row_dst / ne1;
|
||||
|
||||
int idst = row_dst * ne0 + i0;
|
||||
const int ix = channel_x*s2 + row_x*s1 + i0;
|
||||
const uint32_t i3 = row_dst / (ne01 * ne02);
|
||||
const uint32_t i2 = (row_dst - i3 * ne01 * ne02) / ne01;
|
||||
const uint32_t i1 = row_dst - i3 * ne01 * ne02 - i2 * ne01;
|
||||
|
||||
int idst = i0 + i1 * s1 + i2 * s2 + i3 * s3;
|
||||
const int ix = i0 + i1 * s01 + i2 * s02 + i3 * s03;
|
||||
// Fusion optimization: ROPE + VIEW + SET_ROWS.
|
||||
// The rope output is viewed as a 1D tensor and offset based on a row index in row_indices.
|
||||
if (set_rows_stride != 0) {
|
||||
idst = row_x * ne0 + i0;
|
||||
idst += row_indices[channel_x] * set_rows_stride;
|
||||
idst = i1 * s1 + i0;
|
||||
idst += row_indices[i2] * set_rows_stride;
|
||||
}
|
||||
|
||||
const auto & store_coaelsced = [&](float x0, float x1) {
|
||||
@@ -92,7 +97,7 @@ static __global__ void rope_norm(const T * x,
|
||||
return;
|
||||
}
|
||||
|
||||
const float theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
|
||||
const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f);
|
||||
|
||||
const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
|
||||
|
||||
@@ -110,10 +115,15 @@ static __global__ void rope_norm(const T * x,
|
||||
template <bool forward, bool has_ff, typename T, typename D>
|
||||
static __global__ void rope_neox(const T * x,
|
||||
D * dst,
|
||||
const int ne0,
|
||||
const int ne1,
|
||||
const int ne00,
|
||||
const int ne01,
|
||||
const int ne02,
|
||||
const int s01,
|
||||
const int s02,
|
||||
const int s03,
|
||||
const int s1,
|
||||
const int s2,
|
||||
const int s3,
|
||||
const int n_dims,
|
||||
const int32_t * pos,
|
||||
const float freq_scale,
|
||||
@@ -126,23 +136,24 @@ static __global__ void rope_neox(const T * x,
|
||||
const int set_rows_stride) {
|
||||
const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
|
||||
|
||||
if (i0 >= ne0) {
|
||||
if (i0 >= ne00) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int row_dst = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
const int row_x = row_dst % ne1;
|
||||
const int channel_x = row_dst / ne1;
|
||||
const uint32_t i3 = row_dst / (ne01 * ne02);
|
||||
const uint32_t i2 = (row_dst - i3 * ne01 * ne02) / ne01;
|
||||
const uint32_t i1 = row_dst - i3 * ne01 * ne02 - i2 * ne01;
|
||||
|
||||
int idst = row_dst * ne0 + i0 / 2;
|
||||
const int ix = channel_x*s2 + row_x*s1 + i0/2;
|
||||
int idst = i0 / 2 + i1 * s1 + i2 * s2 + i3 * s3;
|
||||
const int ix = i0 / 2 + i1 * s01 + i2 * s02 + i3 * s03;
|
||||
|
||||
// Fusion optimization: ROPE + VIEW + SET_ROWS.
|
||||
// The rope output is viewed as a 1D tensor and offset based on a row index in row_indices.
|
||||
if (set_rows_stride != 0) {
|
||||
idst = row_x * ne0 + i0 / 2;
|
||||
idst += row_indices[channel_x] * set_rows_stride;
|
||||
idst = i1 * s1 + i0 / 2;
|
||||
idst += row_indices[i2] * set_rows_stride;
|
||||
}
|
||||
|
||||
if (i0 >= n_dims) {
|
||||
@@ -152,7 +163,7 @@ static __global__ void rope_neox(const T * x,
|
||||
return;
|
||||
}
|
||||
|
||||
const float theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
|
||||
const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f);
|
||||
|
||||
const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
|
||||
|
||||
@@ -168,24 +179,42 @@ static __global__ void rope_neox(const T * x,
|
||||
dst[idst + n_dims / 2] = ggml_cuda_cast<D>(x0 * sin_theta + x1 * cos_theta);
|
||||
}
|
||||
|
||||
template<bool forward, bool has_ff, typename T>
|
||||
static __global__ void rope_multi(
|
||||
const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2,
|
||||
const int n_dims, const int32_t * pos, const float freq_scale, const float ext_factor, const float attn_factor,
|
||||
const rope_corr_dims corr_dims, const float theta_scale, const float * freq_factors, const mrope_sections sections, const bool is_imrope) {
|
||||
const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
|
||||
template <bool forward, bool has_ff, typename T>
|
||||
static __global__ void rope_multi(const T * x,
|
||||
T * dst,
|
||||
const int ne00,
|
||||
const int ne01,
|
||||
const int ne02,
|
||||
const int s01,
|
||||
const int s02,
|
||||
const int s03,
|
||||
const int s1,
|
||||
const int s2,
|
||||
const int s3,
|
||||
const int n_dims,
|
||||
const int32_t * pos,
|
||||
const float freq_scale,
|
||||
const float ext_factor,
|
||||
const float attn_factor,
|
||||
const rope_corr_dims corr_dims,
|
||||
const float theta_scale,
|
||||
const float * freq_factors,
|
||||
const mrope_sections sections,
|
||||
const bool is_imrope) {
|
||||
const int i0 = 2 * (blockDim.y * blockIdx.y + threadIdx.y);
|
||||
|
||||
if (i0 >= ne0) {
|
||||
if (i0 >= ne00) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int row_dst = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
const int row_x = row_dst % ne1;
|
||||
const int channel_x = row_dst / ne1;
|
||||
const uint32_t i3 = row_dst / (ne01 * ne02);
|
||||
const uint32_t i2 = (row_dst - i3 * ne01 * ne02) / ne01;
|
||||
const uint32_t i1 = row_dst - i3 * ne01 * ne02 - i2 * ne01;
|
||||
|
||||
const int idst = row_dst*ne0 + i0/2;
|
||||
const int ix = channel_x*s2 + row_x*s1 + i0/2;
|
||||
int idst = i0 / 2 + i1 * s1 + i2 * s2 + i3 * s3;
|
||||
const int ix = i0 / 2 + i1 * s01 + i2 * s02 + i3 * s03;
|
||||
|
||||
if (i0 >= n_dims) {
|
||||
dst[idst + i0/2 + 0] = x[ix + i0/2 + 0];
|
||||
@@ -200,27 +229,24 @@ static __global__ void rope_multi(
|
||||
|
||||
float theta_base = 0.0;
|
||||
if (is_imrope) {
|
||||
if (sector % 3 == 1 && sector < 3 * sections.v[1]) { // h
|
||||
theta_base = pos[channel_x + ne2 * 1]*powf(theta_scale, i0/2.0f);
|
||||
} else if (sector % 3 == 2 && sector < 3 * sections.v[2]) { // w
|
||||
theta_base = pos[channel_x + ne2 * 2]*powf(theta_scale, i0/2.0f);
|
||||
} else if (sector % 3 == 0 && sector < 3 * sections.v[0]) { // t
|
||||
theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
|
||||
if (sector % 3 == 1 && sector < 3 * sections.v[1]) { // h
|
||||
theta_base = pos[i2 + ne02 * 1] * powf(theta_scale, i0 / 2.0f);
|
||||
} else if (sector % 3 == 2 && sector < 3 * sections.v[2]) { // w
|
||||
theta_base = pos[i2 + ne02 * 2] * powf(theta_scale, i0 / 2.0f);
|
||||
} else if (sector % 3 == 0 && sector < 3 * sections.v[0]) { // t
|
||||
theta_base = pos[i2] * powf(theta_scale, i0 / 2.0f);
|
||||
} else {
|
||||
theta_base = pos[channel_x + ne2 * 3]*powf(theta_scale, i0/2.0f);
|
||||
theta_base = pos[i2 + ne02 * 3] * powf(theta_scale, i0 / 2.0f);
|
||||
}
|
||||
} else {
|
||||
if (sector < sections.v[0]) {
|
||||
theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
|
||||
}
|
||||
else if (sector >= sections.v[0] && sector < sec_w) {
|
||||
theta_base = pos[channel_x + ne2 * 1]*powf(theta_scale, i0/2.0f);
|
||||
}
|
||||
else if (sector >= sec_w && sector < sec_w + sections.v[2]) {
|
||||
theta_base = pos[channel_x + ne2 * 2]*powf(theta_scale, i0/2.0f);
|
||||
}
|
||||
else if (sector >= sec_w + sections.v[2]) {
|
||||
theta_base = pos[channel_x + ne2 * 3]*powf(theta_scale, i0/2.0f);
|
||||
theta_base = pos[i2] * powf(theta_scale, i0 / 2.0f);
|
||||
} else if (sector >= sections.v[0] && sector < sec_w) {
|
||||
theta_base = pos[i2 + ne02 * 1] * powf(theta_scale, i0 / 2.0f);
|
||||
} else if (sector >= sec_w && sector < sec_w + sections.v[2]) {
|
||||
theta_base = pos[i2 + ne02 * 2] * powf(theta_scale, i0 / 2.0f);
|
||||
} else if (sector >= sec_w + sections.v[2]) {
|
||||
theta_base = pos[i2 + ne02 * 3] * powf(theta_scale, i0 / 2.0f);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -238,37 +264,53 @@ static __global__ void rope_multi(
|
||||
dst[idst + n_dims/2] = x0*sin_theta + x1*cos_theta;
|
||||
}
|
||||
|
||||
template<bool forward, bool has_ff, typename T>
|
||||
static __global__ void rope_vision(
|
||||
const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims,
|
||||
const int32_t * pos, const float freq_scale, const float ext_factor, const float attn_factor, const rope_corr_dims corr_dims,
|
||||
const float theta_scale, const float * freq_factors, const mrope_sections sections) {
|
||||
template <bool forward, bool has_ff, typename T>
|
||||
static __global__ void rope_vision(const T * x,
|
||||
T * dst,
|
||||
const int ne00,
|
||||
const int ne01,
|
||||
const int ne02,
|
||||
const int s01,
|
||||
const int s02,
|
||||
const int s03,
|
||||
const int s1,
|
||||
const int s2,
|
||||
const int s3,
|
||||
const int n_dims,
|
||||
const int32_t * pos,
|
||||
const float freq_scale,
|
||||
const float ext_factor,
|
||||
const float attn_factor,
|
||||
const rope_corr_dims corr_dims,
|
||||
const float theta_scale,
|
||||
const float * freq_factors,
|
||||
const mrope_sections sections) {
|
||||
const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
|
||||
|
||||
if (i0 >= ne0) {
|
||||
if (i0 >= ne00) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int row_dst = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
const int row_x = row_dst % ne1;
|
||||
const int channel_x = row_dst / ne1;
|
||||
const uint32_t i3 = row_dst / (ne01 * ne02);
|
||||
const uint32_t i2 = (row_dst - i3 * ne01 * ne02) / ne01;
|
||||
const uint32_t i1 = row_dst - i3 * ne01 * ne02 - i2 * ne01;
|
||||
|
||||
const int idst = row_dst*ne0 + i0/2;
|
||||
const int ix = channel_x*s2 + row_x*s1 + i0/2;
|
||||
int idst = i0 / 2 + i1 * s1 + i2 * s2 + i3 * s3;
|
||||
const int ix = i0 / 2 + i1 * s01 + i2 * s02 + i3 * s03;
|
||||
|
||||
const int sect_dims = sections.v[0] + sections.v[1];
|
||||
const int sec_w = sections.v[1] + sections.v[0];
|
||||
const int sector = (i0 / 2) % sect_dims;
|
||||
const int sec_w = sections.v[1] + sections.v[0];
|
||||
const int sector = (i0 / 2) % sect_dims;
|
||||
|
||||
float theta_base = 0.0;
|
||||
if (sector < sections.v[0]) {
|
||||
const int p = sector;
|
||||
theta_base = pos[channel_x]*powf(theta_scale, p);
|
||||
}
|
||||
else if (sector >= sections.v[0] && sector < sec_w) {
|
||||
theta_base = pos[i2] * powf(theta_scale, p);
|
||||
} else if (sector >= sections.v[0] && sector < sec_w) {
|
||||
const int p = sector - sections.v[0];
|
||||
theta_base = pos[channel_x + ne2]*powf(theta_scale, p);
|
||||
theta_base = pos[i2 + ne02] * powf(theta_scale, p);
|
||||
}
|
||||
|
||||
const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
|
||||
@@ -288,10 +330,15 @@ static __global__ void rope_vision(
|
||||
template <bool forward, typename T, typename D>
|
||||
static void rope_norm_cuda(const T * x,
|
||||
D * dst,
|
||||
const int ne0,
|
||||
const int ne1,
|
||||
const int ne00,
|
||||
const int ne01,
|
||||
const int ne02,
|
||||
const int s01,
|
||||
const int s02,
|
||||
const int s03,
|
||||
const int s1,
|
||||
const int s2,
|
||||
const int s3,
|
||||
const int n_dims,
|
||||
const int nr,
|
||||
const int32_t * pos,
|
||||
@@ -304,31 +351,36 @@ static void rope_norm_cuda(const T * x,
|
||||
const int64_t * row_indices,
|
||||
const int set_rows_stride,
|
||||
cudaStream_t stream) {
|
||||
GGML_ASSERT(ne0 % 2 == 0);
|
||||
GGML_ASSERT(ne00 % 2 == 0);
|
||||
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
|
||||
const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
|
||||
const int n_blocks_x = (ne00 + 2 * CUDA_ROPE_BLOCK_SIZE - 1) / (2 * CUDA_ROPE_BLOCK_SIZE);
|
||||
const dim3 block_nums(nr, n_blocks_x, 1);
|
||||
|
||||
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
||||
const float theta_scale = powf(freq_base, -2.0f / n_dims);
|
||||
|
||||
if (freq_factors == nullptr) {
|
||||
rope_norm<forward, false><<<block_nums, block_dims, 0, stream>>>(
|
||||
x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale,
|
||||
freq_factors, row_indices, set_rows_stride);
|
||||
x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor,
|
||||
attn_factor, corr_dims, theta_scale, freq_factors, row_indices, set_rows_stride);
|
||||
} else {
|
||||
rope_norm<forward, true><<<block_nums, block_dims, 0, stream>>>(
|
||||
x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale,
|
||||
freq_factors, row_indices, set_rows_stride);
|
||||
x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor,
|
||||
attn_factor, corr_dims, theta_scale, freq_factors, row_indices, set_rows_stride);
|
||||
}
|
||||
}
|
||||
|
||||
template <bool forward, typename T, typename D>
|
||||
static void rope_neox_cuda(const T * x,
|
||||
D * dst,
|
||||
const int ne0,
|
||||
const int ne1,
|
||||
const int ne00,
|
||||
const int ne01,
|
||||
const int ne02,
|
||||
const int s01,
|
||||
const int s02,
|
||||
const int s03,
|
||||
const int s1,
|
||||
const int s2,
|
||||
const int s3,
|
||||
const int n_dims,
|
||||
const int nr,
|
||||
const int32_t * pos,
|
||||
@@ -341,55 +393,92 @@ static void rope_neox_cuda(const T * x,
|
||||
const int64_t * row_indices,
|
||||
const int set_rows_stride,
|
||||
cudaStream_t stream) {
|
||||
GGML_ASSERT(ne0 % 2 == 0);
|
||||
GGML_ASSERT(ne00 % 2 == 0);
|
||||
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
|
||||
const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
|
||||
const int n_blocks_x = (ne00 + 2 * CUDA_ROPE_BLOCK_SIZE - 1) / (2 * CUDA_ROPE_BLOCK_SIZE);
|
||||
const dim3 block_nums(nr, n_blocks_x, 1);
|
||||
|
||||
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
||||
const float theta_scale = powf(freq_base, -2.0f / n_dims);
|
||||
|
||||
if (freq_factors == nullptr) {
|
||||
rope_neox<forward, false><<<block_nums, block_dims, 0, stream>>>(
|
||||
x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale,
|
||||
freq_factors, row_indices, set_rows_stride);
|
||||
x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor,
|
||||
attn_factor, corr_dims, theta_scale, freq_factors, row_indices, set_rows_stride);
|
||||
} else {
|
||||
rope_neox<forward, true><<<block_nums, block_dims, 0, stream>>>(
|
||||
x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale,
|
||||
freq_factors, row_indices, set_rows_stride);
|
||||
x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor,
|
||||
attn_factor, corr_dims, theta_scale, freq_factors, row_indices, set_rows_stride);
|
||||
}
|
||||
}
|
||||
|
||||
template<bool forward, typename T>
|
||||
static void rope_multi_cuda(
|
||||
const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims, const int nr,
|
||||
const int32_t * pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor,
|
||||
const rope_corr_dims corr_dims, const float * freq_factors, const mrope_sections sections, const bool is_imrope, cudaStream_t stream) {
|
||||
GGML_ASSERT(ne0 % 2 == 0);
|
||||
template <bool forward, typename T>
|
||||
static void rope_multi_cuda(const T * x,
|
||||
T * dst,
|
||||
const int ne00,
|
||||
const int ne01,
|
||||
const int ne02,
|
||||
const int s01,
|
||||
const int s02,
|
||||
const int s03,
|
||||
const int s1,
|
||||
const int s2,
|
||||
const int s3,
|
||||
const int n_dims,
|
||||
const int nr,
|
||||
const int32_t * pos,
|
||||
const float freq_scale,
|
||||
const float freq_base,
|
||||
const float ext_factor,
|
||||
const float attn_factor,
|
||||
const rope_corr_dims corr_dims,
|
||||
const float * freq_factors,
|
||||
const mrope_sections sections,
|
||||
const bool is_imrope,
|
||||
cudaStream_t stream) {
|
||||
GGML_ASSERT(ne00 % 2 == 0);
|
||||
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
|
||||
const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
|
||||
const int n_blocks_x = (ne00 + 2 * CUDA_ROPE_BLOCK_SIZE - 1) / (2 * CUDA_ROPE_BLOCK_SIZE);
|
||||
const dim3 block_nums(nr, n_blocks_x, 1);
|
||||
|
||||
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
||||
const float theta_scale = powf(freq_base, -2.0f / n_dims);
|
||||
|
||||
if (freq_factors == nullptr) {
|
||||
rope_multi<forward, false, T><<<block_nums, block_dims, 0, stream>>>(
|
||||
x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor,
|
||||
x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor,
|
||||
attn_factor, corr_dims, theta_scale, freq_factors, sections, is_imrope);
|
||||
} else {
|
||||
rope_multi<forward, true, T><<<block_nums, block_dims, 0, stream>>>(
|
||||
x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor,
|
||||
x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor,
|
||||
attn_factor, corr_dims, theta_scale, freq_factors, sections, is_imrope);
|
||||
}
|
||||
}
|
||||
|
||||
template<bool forward, typename T>
|
||||
static void rope_vision_cuda(
|
||||
const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims, const int nr,
|
||||
const int32_t * pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor,
|
||||
const rope_corr_dims corr_dims, const float * freq_factors, const mrope_sections sections, cudaStream_t stream) {
|
||||
GGML_ASSERT(ne0 % 2 == 0);
|
||||
template <bool forward, typename T>
|
||||
static void rope_vision_cuda(const T * x,
|
||||
T * dst,
|
||||
const int ne00,
|
||||
const int ne01,
|
||||
const int ne02,
|
||||
const int s01,
|
||||
const int s02,
|
||||
const int s03,
|
||||
const int s1,
|
||||
const int s2,
|
||||
const int s3,
|
||||
const int n_dims,
|
||||
const int nr,
|
||||
const int32_t * pos,
|
||||
const float freq_scale,
|
||||
const float freq_base,
|
||||
const float ext_factor,
|
||||
const float attn_factor,
|
||||
const rope_corr_dims corr_dims,
|
||||
const float * freq_factors,
|
||||
const mrope_sections sections,
|
||||
cudaStream_t stream) {
|
||||
GGML_ASSERT(ne00 % 2 == 0);
|
||||
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
|
||||
const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
|
||||
const int n_blocks_x = (ne00 + 2 * CUDA_ROPE_BLOCK_SIZE - 1) / (2 * CUDA_ROPE_BLOCK_SIZE);
|
||||
const dim3 block_nums(nr, n_blocks_x, 1);
|
||||
// break down (head_dim, heads, seq) into (CUDA_ROPE_BLOCK_SIZE, x, heads * seq)
|
||||
// where x ~= ceil(head_dim / CUDA_ROPE_BLOCK_SIZE);
|
||||
@@ -398,11 +487,11 @@ static void rope_vision_cuda(
|
||||
|
||||
if (freq_factors == nullptr) {
|
||||
rope_vision<forward, false, T><<<block_nums, block_dims, 0, stream>>>(
|
||||
x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor,
|
||||
x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor,
|
||||
attn_factor, corr_dims, theta_scale, freq_factors, sections);
|
||||
} else {
|
||||
rope_vision<forward, true, T><<<block_nums, block_dims, 0, stream>>>(
|
||||
x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor,
|
||||
x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor,
|
||||
attn_factor, corr_dims, theta_scale, freq_factors, sections);
|
||||
}
|
||||
}
|
||||
@@ -445,6 +534,11 @@ void ggml_cuda_op_rope_impl(ggml_backend_cuda_context & ctx,
|
||||
|
||||
const size_t s01 = src0->nb[1] / ggml_type_size(src0->type);
|
||||
const size_t s02 = src0->nb[2] / ggml_type_size(src0->type);
|
||||
const size_t s03 = src0->nb[3] / ggml_type_size(src0->type);
|
||||
|
||||
const size_t s1 = dst->nb[1] / ggml_type_size(dst->type);
|
||||
const size_t s2 = dst->nb[2] / ggml_type_size(dst->type);
|
||||
const size_t s3 = dst->nb[3] / ggml_type_size(dst->type);
|
||||
|
||||
//const int n_past = ((int32_t *) dst->op_params)[0];
|
||||
const int n_dims = ((int32_t *) dst->op_params)[1];
|
||||
@@ -495,57 +589,63 @@ void ggml_cuda_op_rope_impl(ggml_backend_cuda_context & ctx,
|
||||
// compute
|
||||
if (is_neox) {
|
||||
if (src0->type == GGML_TYPE_F32 && dst_type == GGML_TYPE_F32) {
|
||||
rope_neox_cuda<forward, float, float>((const float *) src0_d, (float *) dst_d, ne00, ne01, s01, s02, n_dims,
|
||||
nr, pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims,
|
||||
freq_factors, row_indices, set_rows_stride, stream);
|
||||
rope_neox_cuda<forward, float, float>((const float *) src0_d, (float *) dst_d, ne00, ne01, ne02, s01, s02,
|
||||
s03, s1, s2, s3, n_dims, nr, pos, freq_scale, freq_base,
|
||||
ext_factor, attn_factor, corr_dims, freq_factors, row_indices,
|
||||
set_rows_stride, stream);
|
||||
} else if (src0->type == GGML_TYPE_F32 && dst_type == GGML_TYPE_F16) {
|
||||
rope_neox_cuda<forward, float, half>((const float *) src0_d, (half *) dst_d, ne00, ne01, s01, s02, n_dims,
|
||||
nr, pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims,
|
||||
freq_factors, row_indices, set_rows_stride, stream);
|
||||
rope_neox_cuda<forward, float, half>((const float *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02,
|
||||
s03, s1, s2, s3, n_dims, nr, pos, freq_scale, freq_base,
|
||||
ext_factor, attn_factor, corr_dims, freq_factors, row_indices,
|
||||
set_rows_stride, stream);
|
||||
} else if (src0->type == GGML_TYPE_F16 && dst_type == GGML_TYPE_F16) {
|
||||
rope_neox_cuda<forward, half, half>((const half *) src0_d, (half *) dst_d, ne00, ne01, s01, s02, n_dims, nr,
|
||||
pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims,
|
||||
freq_factors, row_indices, set_rows_stride, stream);
|
||||
rope_neox_cuda<forward, half, half>((const half *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02,
|
||||
s03, s1, s2, s3, n_dims, nr, pos, freq_scale, freq_base,
|
||||
ext_factor, attn_factor, corr_dims, freq_factors, row_indices,
|
||||
set_rows_stride, stream);
|
||||
} else {
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
    } else if (is_mrope && !is_vision) {
        if (src0->type == GGML_TYPE_F32) {
            rope_multi_cuda<forward>(
                (const float *) src0_d, (float *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale,
                freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, is_imrope, stream);
            rope_multi_cuda<forward>((const float *) src0_d, (float *) dst_d, ne00, ne01, ne02, s01, s02, s03, s1,
                                     s2, s3, n_dims, nr, pos, freq_scale, freq_base, ext_factor, attn_factor,
                                     corr_dims, freq_factors, sections, is_imrope, stream);
        } else if (src0->type == GGML_TYPE_F16) {
            rope_multi_cuda<forward>(
                (const half *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale,
                freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, is_imrope, stream);
            rope_multi_cuda<forward>((const half *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02, s03, s1,
                                     s2, s3, n_dims, nr, pos, freq_scale, freq_base, ext_factor, attn_factor,
                                     corr_dims, freq_factors, sections, is_imrope, stream);
        } else {
            GGML_ABORT("fatal error");
        }
    } else if (is_vision) {
        if (src0->type == GGML_TYPE_F32) {
            rope_vision_cuda<forward>(
                (const float *) src0_d, (float *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale,
                freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
            rope_vision_cuda<forward>((const float *) src0_d, (float *) dst_d, ne00, ne01, ne02, s01, s02, s03, s1,
                                      s2, s3, n_dims, nr, pos, freq_scale, freq_base, ext_factor, attn_factor,
                                      corr_dims, freq_factors, sections, stream);
        } else if (src0->type == GGML_TYPE_F16) {
            rope_vision_cuda<forward>(
                (const half *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale,
                freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
            rope_vision_cuda<forward>((const half *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02, s03, s1,
                                      s2, s3, n_dims, nr, pos, freq_scale, freq_base, ext_factor, attn_factor,
                                      corr_dims, freq_factors, sections, stream);
        } else {
            GGML_ABORT("fatal error");
        }
    } else {
        if (src0->type == GGML_TYPE_F32 && dst_type == GGML_TYPE_F32) {
            rope_norm_cuda<forward, float, float>((const float *) src0_d, (float *) dst_d, ne00, ne01, s01, s02, n_dims,
                                                  nr, pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims,
                                                  freq_factors, row_indices, set_rows_stride, stream);
            rope_norm_cuda<forward, float, float>((const float *) src0_d, (float *) dst_d, ne00, ne01, ne02, s01, s02,
                                                  s03, s1, s2, s3, n_dims, nr, pos, freq_scale, freq_base,
                                                  ext_factor, attn_factor, corr_dims, freq_factors, row_indices,
                                                  set_rows_stride, stream);
        } else if (src0->type == GGML_TYPE_F32 && dst_type == GGML_TYPE_F16) {
            rope_norm_cuda<forward, float, half>((const float *) src0_d, (half *) dst_d, ne00, ne01, s01, s02, n_dims,
                                                 nr, pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims,
                                                 freq_factors, row_indices, set_rows_stride, stream);
            rope_norm_cuda<forward, float, half>((const float *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02,
                                                 s03, s1, s2, s3, n_dims, nr, pos, freq_scale, freq_base,
                                                 ext_factor, attn_factor, corr_dims, freq_factors, row_indices,
                                                 set_rows_stride, stream);
        } else if (src0->type == GGML_TYPE_F16 && dst_type == GGML_TYPE_F16) {
            rope_norm_cuda<forward, half, half>((const half *) src0_d, (half *) dst_d, ne00, ne01, s01, s02, n_dims, nr,
                                                pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims,
                                                freq_factors, row_indices, set_rows_stride, stream);
            rope_norm_cuda<forward, half, half>((const half *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02,
                                                s03, s1, s2, s3, n_dims, nr, pos, freq_scale, freq_base,
                                                ext_factor, attn_factor, corr_dims, freq_factors, row_indices,
                                                set_rows_stride, stream);
        } else {
            GGML_ABORT("fatal error");
        }
    }

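In each pair of launcher calls above, the first form is the removed signature and the second the added one: the rope launchers now take the full source strides (s01, s02, s03) plus explicit destination strides (s1, s2, s3) instead of assuming a contiguous destination. A minimal sketch of what per-dimension strides buy, using a hypothetical helper (not part of the actual kernels):

    // sketch: map a 4-D index to memory via element strides; dim 0 is contiguous
    template <typename T>
    static T * index4d(T * base, int i0, int i1, int i2, int i3,
                       int s1, int s2, int s3) {
        return base + i0 + i1*s1 + i2*s2 + i3*s3;
    }

With separate destination strides the same kernel can write into a view of a larger tensor rather than only into a freshly allocated contiguous buffer.
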
@@ -1392,34 +1392,78 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_flash_attn_ext_v

    GGML_UNUSED(op);
}

ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_bin(
        ggml_metal_library_t lib,
        ggml_op op,
        int32_t n_fuse,
        bool row) {
ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_bin(ggml_metal_library_t lib, const ggml_tensor * op, int32_t n_fuse) {
    char base[256];
    char name[256];

    const char * op_str = "undefined";
    switch (op) {
        case GGML_OP_ADD: op_str = "add"; break;
        case GGML_OP_SUB: op_str = "sub"; break;
        case GGML_OP_MUL: op_str = "mul"; break;
        case GGML_OP_DIV: op_str = "div"; break;
    int op_num = -1;

    switch (op->op) {
        case GGML_OP_ADD: op_num = 0; break;
        case GGML_OP_SUB: op_num = 1; break;
        case GGML_OP_MUL: op_num = 2; break;
        case GGML_OP_DIV: op_num = 3; break;
        default: GGML_ABORT("fatal error");
    };

    if (row) {
        snprintf(base, 256, "kernel_%s_row_c4_fuse_%d", op_str, n_fuse);
    } else {
        snprintf(base, 256, "kernel_%s_fuse_%d", op_str, n_fuse);
    }
    const char * t0_str = ggml_type_name(op->src[0]->type);
    const char * t1_str = ggml_type_name(op->src[1]->type);
    const char * t_str  = ggml_type_name(op->type);

    snprintf(name, 256, "%s", base);
    const bool is_c4 = (op->src[0]->ne[0] % 4 == 0) && (op->src[1]->ne[0] % 4 == 0);

    const bool is_rb = ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]) && (ggml_nrows(op->src[1]) == 1) && ggml_nelements(op) < 65536;

    snprintf(base, 256, "kernel_bin_fuse_%s_%s_%s%s", t0_str, t1_str, t_str, is_c4 ? "_4" : "");
    snprintf(name, 256, "%s_op=%d_nf=%d_rb=%d", base, op_num, n_fuse, is_rb);

    ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
    if (!res.pipeline) {
        res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr);
        ggml_metal_cv_t cv = ggml_metal_cv_init();

        ggml_metal_cv_set_int16(cv, op_num, FC_BIN + 0);
        ggml_metal_cv_set_int16(cv, n_fuse, FC_BIN + 1);
        ggml_metal_cv_set_bool (cv, is_rb,  FC_BIN + 2);

        res = ggml_metal_library_compile_pipeline(lib, base, name, cv);

        ggml_metal_cv_free(cv);
    }

    res.c4  = is_c4;
    res.cnt = is_rb;

    return res;
}

ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_bin_one(ggml_metal_library_t lib, ggml_op op) {
    char base[256];
    char name[256];

    int op_num = -1;

    switch (op) {
        case GGML_OP_ADD: op_num = 0; break;
        case GGML_OP_SUB: op_num = 1; break;
        case GGML_OP_MUL: op_num = 2; break;
        case GGML_OP_DIV: op_num = 3; break;
        default: GGML_ABORT("fatal error");
    };

    snprintf(base, 256, "kernel_bin_fuse_%s_%s_%s", "f32", "f32", "f32");
    snprintf(name, 256, "%s_op=%d_nf=%d", base, op_num, 1);

    ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
    if (!res.pipeline) {
        ggml_metal_cv_t cv = ggml_metal_cv_init();

        ggml_metal_cv_set_int16(cv, op_num, FC_BIN + 0);
        ggml_metal_cv_set_int16(cv, 1,      FC_BIN + 1);
        ggml_metal_cv_set_bool (cv, false,  FC_BIN + 2);

        res = ggml_metal_library_compile_pipeline(lib, base, name, cv);

        ggml_metal_cv_free(cv);
    }

    return res;

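The rewrite replaces name-mangled kernel variants (kernel_add_fuse_3, kernel_mul_row_c4_fuse_1, ...) with a single kernel_bin_fuse source function specialized at pipeline-compile time: the op selector, fuse count, and row-broadcast flag are baked in through Metal function constants (the ggml_metal_cv_set_* calls at slots FC_BIN + 0..2), and each specialization is cached under a name that encodes those values. A rough host-side analogue of the cache keying, with a hypothetical helper name:

    #include <cstdio>
    #include <string>

    // one kernel source + one tuple of constant values => one cached pipeline;
    // mirrors the name format "kernel_bin_fuse_<t0>_<t1>_<t>[_4]_op=<op>_nf=<n>_rb=<rb>"
    static std::string bin_pipeline_key(const char * t0, const char * t1, const char * t,
                                        bool c4, int op_num, int n_fuse, bool rb) {
        char buf[256];
        snprintf(buf, sizeof(buf), "kernel_bin_fuse_%s_%s_%s%s_op=%d_nf=%d_rb=%d",
                 t0, t1, t, c4 ? "_4" : "", op_num, n_fuse, rb ? 1 : 0);
        return buf;
    }

Keying the cache on the constant values rather than on distinct function names keeps the Metal library small while still letting the compiler emit fully specialized code for each combination.
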
@@ -53,6 +53,9 @@ struct ggml_metal_pipeline_with_params {
    int nr1;

    size_t smem;

    bool c4;
    bool cnt;
};

int ggml_metal_pipeline_max_theads_per_threadgroup(struct ggml_metal_pipeline_with_params pipeline);

@@ -134,7 +137,8 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_argsort
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_argsort_merge (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_top_k         (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_top_k_merge   (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_bin           (ggml_metal_library_t lib, enum ggml_op op, int32_t n_fuse, bool row);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_bin           (ggml_metal_library_t lib, const struct ggml_tensor * op, int32_t n_fuse);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_bin_one       (ggml_metal_library_t lib, enum ggml_op op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_l2_norm       (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_group_norm    (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_norm          (ggml_metal_library_t lib, const struct ggml_tensor * op, int32_t n_fuse);

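The two new fields carry the specialization decisions from pipeline creation back to the encoder: c4 records that the pipeline was built for 4-wide (float4-style) access, and cnt that it should be dispatched as a flat grid with one thread per (packed) element. A tiny illustrative helper, hypothetical rather than part of the API:

    struct pipeline_flags { bool c4; bool cnt; };

    // with c4, host-side element counts must be divided by 4 before dispatch
    static long packed_elements(pipeline_flags f, long nelements) {
        return f.c4 ? nelements/4 : nelements;
    }
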
@@ -346,10 +346,12 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline(ggml_meta

    struct ggml_metal_pipeline_with_params res = {
        /*.pipeline =*/ nil,
        /*.nsg      =*/ 0,
        /*.nr0      =*/ 0,
        /*.nr1      =*/ 0,
        /*.nsg      =*/ 0,
        /*.smem     =*/ 0,
        /*.c4       =*/ false,
        /*.cnt      =*/ false,
    };

    res.pipeline = ggml_metal_pipelines_get(lib->pipelines, name);

@@ -362,10 +364,12 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline(ggml_meta
struct ggml_metal_pipeline_with_params ggml_metal_library_compile_pipeline(ggml_metal_library_t lib, const char * base, const char * name, ggml_metal_cv_t cv) {
    struct ggml_metal_pipeline_with_params res = {
        /*.pipeline =*/ nil,
        /*.nsg      =*/ 0,
        /*.nr0      =*/ 0,
        /*.nr1      =*/ 0,
        /*.nsg      =*/ 0,
        /*.smem     =*/ 0,
        /*.c4       =*/ false,
        /*.cnt      =*/ false,
    };

    [lib->lock lock];

@@ -1054,7 +1058,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
        case GGML_OP_MUL:
        case GGML_OP_DIV:
        case GGML_OP_ADD_ID:
            return op->src[0]->type == GGML_TYPE_F32;
            return ggml_is_contiguous_rows(op->src[0]) && ggml_is_contiguous_rows(op->src[1]) && op->src[0]->type == GGML_TYPE_F32;
        case GGML_OP_ACC:
        case GGML_OP_REPEAT:
        case GGML_OP_SCALE:

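The tightened predicate reflects that the unified binary kernel walks rows with plain pointer arithmetic, so both sources must have contiguous rows. A guess at what ggml_is_contiguous_rows boils down to for non-quantized types (an assumption, not ggml's actual definition):

    // assumption: elements within every row are densely packed
    static bool rows_are_contiguous(size_t nb0, size_t type_size) {
        return nb0 == type_size;
    }
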
@@ -80,6 +80,7 @@
#define FC_SSM_CONV    900
#define FC_SOLVE_TRI   1000
#define FC_COUNT_EQUAL 1100
#define FC_BIN         1200

// op-specific constants
#define OP_FLASH_ATTN_EXT_NQPSG 8

@@ -707,7 +707,7 @@ int ggml_metal_op_acc(ggml_metal_op_t ctx, int idx) {
        /*.o1 =*/ { 0 },
    };

    auto pipeline = ggml_metal_library_get_pipeline_bin(lib, GGML_OP_ADD, 1, false);
    auto pipeline = ggml_metal_library_get_pipeline_bin_one(lib, GGML_OP_ADD);

    ggml_metal_encoder_set_pipeline(enc, pipeline);
    ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);

@@ -2895,8 +2895,6 @@ int ggml_metal_op_bin(ggml_metal_op_t ctx, int idx) {
    GGML_ASSERT(ggml_is_contiguous_rows(op->src[0]));
    GGML_ASSERT(ggml_is_contiguous_rows(op->src[1]));

    bool bcast_row = false;

    ggml_metal_buffer_id bid_src0 = ggml_metal_get_buffer_id(op->src[0]);
    ggml_metal_buffer_id bid_src1 = ggml_metal_get_buffer_id(op->src[1]);
    ggml_metal_buffer_id bid_dst  = ggml_metal_get_buffer_id(op);

@@ -2990,18 +2988,7 @@ int ggml_metal_op_bin(ggml_metal_op_t ctx, int idx) {

    struct ggml_metal_pipeline_with_params pipeline;

    if (ggml_nelements(op->src[1]) == ne10 && ggml_is_contiguous(op->src[1]) && ne00 % 4 == 0 && ne10 % 4 == 0) {
        GGML_ASSERT(ggml_is_contiguous(op->src[0]));

        // src1 is a row
        GGML_ASSERT(ne11 == 1);

        pipeline = ggml_metal_library_get_pipeline_bin(lib, op->op, n_fuse, true);

        bcast_row = true;
    } else {
        pipeline = ggml_metal_library_get_pipeline_bin(lib, op->op, n_fuse, false);
    }
    pipeline = ggml_metal_library_get_pipeline_bin(lib, op, n_fuse);

    if (n_fuse > 1) {
        bid_dst = ggml_metal_get_buffer_id(ctx->node(idx + n_fuse - 1));

@@ -3015,20 +3002,28 @@ int ggml_metal_op_bin(ggml_metal_op_t ctx, int idx) {
        }
    }

    if (pipeline.c4) {
        args.ne00 = ne00/4;
        args.ne10 = ne10/4;
        args.ne0  = ne0/4;
    }

    ggml_metal_encoder_set_pipeline(enc, pipeline);
    ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
    ggml_metal_encoder_set_buffer  (enc, bid_src0, 1);
    ggml_metal_encoder_set_buffer  (enc, bid_src1, 2);
    ggml_metal_encoder_set_buffer  (enc, bid_dst,  3);

    if (bcast_row) {
        const int64_t n = ggml_nelements(op)/4;
    if (pipeline.cnt) {
        const int n = pipeline.c4 ? ggml_nelements(op)/4 : ggml_nelements(op);

        ggml_metal_encoder_dispatch_threadgroups(enc, n, 1, 1, 1, 1, 1);
    } else {
        int nth = 32;
        const int nth_max = MIN(256, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));

        while (16*nth < ne0 && nth < ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
        int nth = 1;

        while (2*nth < args.ne0 && nth < nth_max) {
            nth *= 2;
        }

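The dispatch now branches on the flags computed at pipeline-creation time instead of a bcast_row local: the cnt path launches one threadgroup per (packed) element, while the general path sizes the threadgroup as a power of two, capped both by the row length and by the pipeline limit (at most 256). The sizing loop, extracted into a standalone sketch:

    // nth doubles until it covers at least half of ne0 or hits the cap
    static int pick_nth(int ne0, int pipeline_max) {
        const int nth_max = pipeline_max < 256 ? pipeline_max : 256;
        int nth = 1;
        while (2*nth < ne0 && nth < nth_max) {
            nth *= 2;
        }
        return nth;
    }

Note the heuristic also changed: the old loop started at nth = 32 and doubled while 16*nth < ne0, whereas the new one starts at 1 and doubles while 2*nth < args.ne0, so small rows now get proportionally small threadgroups.
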
@@ -895,11 +895,13 @@ enum ggml_sort_order {
    GGML_SORT_ORDER_DESC,
};

// general-purpose kernel for addition, subtraction, multiplication and division of two tensors
// pros: works for non-contiguous tensors, supports broadcast across all dims
// cons: not very efficient
template <int F>
kernel void kernel_add_fuse_impl(
// OP: 0 - add, 1 - sub, 2 - mul, 3 - div
constant short FC_bin_op [[function_constant(FC_BIN + 0)]];
constant short FC_bin_f  [[function_constant(FC_BIN + 1)]];
constant bool  FC_bin_rb [[function_constant(FC_BIN + 2)]];

template <typename T0, typename T1, typename T>
kernel void kernel_bin_fuse_impl(
        constant ggml_metal_kargs_bin & args,
        device const char * src0,
        device const char * src1,
@@ -907,138 +909,152 @@ kernel void kernel_add_fuse_impl(
        uint3   tgpig[[threadgroup_position_in_grid]],
        ushort3 tpitg[[thread_position_in_threadgroup]],
        ushort3 ntg[[threads_per_threadgroup]]) {
    const int i03 = tgpig.z;
    const int i02 = tgpig.y;
    const int i01 = tgpig.x;
#define FC_OP FC_bin_op
#define FC_F  FC_bin_f
#define FC_RB FC_bin_rb

    const int i13 = i03%args.ne13;
    const int i12 = i02%args.ne12;
    const int i11 = i01%args.ne11;
    if (FC_RB) {
        // row broadcast
        const uint i0 = tgpig.x;
        const uint i1 = i0%args.ne10;

    device const float * src0_ptr = (device const float *) (src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + args.offs);
    device       float * dst_ptr  = (device       float *) (dst  + i03*args.nb3  + i02*args.nb2  + i01*args.nb1  + args.offs);
        device const T0 * src0_row = (device const T0 *) (src0);
        device       T  * dst_row  = (device       T  *) (dst);

    device const float * src1_ptr[F];
    for (short j = 0; j < F; ++j) {
        src1_ptr[j] = (device const float *) (src1 + args.o1[j] + i13*args.nb13 + i12*args.nb12 + i11*args.nb11);
    }
        if (FC_F == 1) {
            device const T1 * src1_row = (device const T1 *) (src1 + args.o1[0]);

    for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
        const int i10 = i0%args.ne10;
            if (FC_OP == 0) {
                dst_row[i0] = src0_row[i0] + src1_row[i1];
            }

        float res = src0_ptr[i0];
            if (FC_OP == 1) {
                dst_row[i0] = src0_row[i0] - src1_row[i1];
            }

#pragma unroll
        for (short j = 0; j < F; ++j) {
            res += src1_ptr[j][i10];
        }
            if (FC_OP == 2) {
                dst_row[i0] = src0_row[i0] * src1_row[i1];
            }

        dst_ptr[i0] = res;
    }
}
            if (FC_OP == 3) {
                dst_row[i0] = src0_row[i0] / src1_row[i1];
            }
        } else {
            T0 res = src0_row[i0];

typedef decltype(kernel_add_fuse_impl<2>) kernel_add_fuse_t;
            if (FC_OP == 0) {
                FOR_UNROLL (short j = 0; j < FC_F; ++j) {
                    res += ((device const T1 *) (src1 + args.o1[j]))[i1];
                }
            }

template [[host_name("kernel_add_fuse_1")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<1>;
template [[host_name("kernel_add_fuse_2")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<2>;
template [[host_name("kernel_add_fuse_3")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<3>;
template [[host_name("kernel_add_fuse_4")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<4>;
template [[host_name("kernel_add_fuse_5")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<5>;
template [[host_name("kernel_add_fuse_6")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<6>;
template [[host_name("kernel_add_fuse_7")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<7>;
template [[host_name("kernel_add_fuse_8")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<8>;
            if (FC_OP == 1) {
                FOR_UNROLL (short j = 0; j < FC_F; ++j) {
                    res -= ((device const T1 *) (src1 + args.o1[j]))[i1];
                }
            }

kernel void kernel_sub_fuse_1(
        constant ggml_metal_kargs_bin & args,
        device const char * src0,
        device const char * src1,
        device       char * dst,
        uint3   tgpig[[threadgroup_position_in_grid]],
        ushort3 tpitg[[thread_position_in_threadgroup]],
        ushort3 ntg[[threads_per_threadgroup]]) {
    const int i03 = tgpig.z;
    const int i02 = tgpig.y;
    const int i01 = tgpig.x;
            if (FC_OP == 2) {
                FOR_UNROLL (short j = 0; j < FC_F; ++j) {
                    res *= ((device const T1 *) (src1 + args.o1[j]))[i1];
                }
            }

    const int i13 = i03%args.ne13;
    const int i12 = i02%args.ne12;
    const int i11 = i01%args.ne11;
            if (FC_OP == 3) {
                FOR_UNROLL (short j = 0; j < FC_F; ++j) {
                    res /= ((device const T1 *) (src1 + args.o1[j]))[i1];
                }
            }

    device const char * src0_ptr = src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + args.offs;
    device const char * src1_ptr = src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11 + args.o1[0];
    device       char * dst_ptr  = dst  + i03*args.nb3  + i02*args.nb2  + i01*args.nb1  + args.offs;

    for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
        const int i10 = i0%args.ne10;
        *((device float *)(dst_ptr + i0*args.nb0)) = *((device float *)(src0_ptr + i0*args.nb00)) - *((device float *)(src1_ptr + i10*args.nb10));
    }
}

kernel void kernel_mul_fuse_1(
        constant ggml_metal_kargs_bin & args,
        device const char * src0,
        device const char * src1,
        device       char * dst,
        uint3   tgpig[[threadgroup_position_in_grid]],
        ushort3 tpitg[[thread_position_in_threadgroup]],
        ushort3 ntg[[threads_per_threadgroup]]) {
    const int i03 = tgpig.z;
    const int i02 = tgpig.y;
    const int i01 = tgpig.x;

    const int i13 = i03%args.ne13;
    const int i12 = i02%args.ne12;
    const int i11 = i01%args.ne11;

    device const char * src0_ptr = src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + args.offs;
    device const char * src1_ptr = src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11 + args.o1[0];
    device       char * dst_ptr  = dst  + i03*args.nb3  + i02*args.nb2  + i01*args.nb1  + args.offs;

    if (args.ne10 == 1) {
        const float x = *((device float *)(src1_ptr));
        for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
            *((device float *)(dst_ptr + i0*args.nb0)) = *((device float *)(src0_ptr + i0*args.nb00)) * x;
            dst_row[i0] = res;
        }
    } else {
        for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
            const int i10 = i0%args.ne10;
            *((device float *)(dst_ptr + i0*args.nb0)) = *((device float *)(src0_ptr + i0*args.nb00)) * *((device float *)(src1_ptr + i10*args.nb10));
        const int i03 = tgpig.z;
        const int i02 = tgpig.y;
        const int i01 = tgpig.x;

        if (i01 >= args.ne01) {
            return;
        }

        const int i13 = i03%args.ne13;
        const int i12 = i02%args.ne12;
        const int i11 = i01%args.ne11;

        device const T0 * src0_ptr = (device const T0 *) (src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + args.offs);
        device       T  * dst_ptr  = (device       T  *) (dst  + i03*args.nb3  + i02*args.nb2  + i01*args.nb1  + args.offs);

        if (FC_F == 1) {
            device const T1 * src1_ptr = (device const T1 *) (src1 + args.o1[0] + i13*args.nb13 + i12*args.nb12 + i11*args.nb11);

            for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
                const int i10 = i0%args.ne10;

                if (FC_OP == 0) {
                    dst_ptr[i0] = src0_ptr[i0] + src1_ptr[i10];
                }

                if (FC_OP == 1) {
                    dst_ptr[i0] = src0_ptr[i0] - src1_ptr[i10];
                }

                if (FC_OP == 2) {
                    dst_ptr[i0] = src0_ptr[i0] * src1_ptr[i10];
                }

                if (FC_OP == 3) {
                    dst_ptr[i0] = src0_ptr[i0] / src1_ptr[i10];
                }
            }
        } else {
            device const T1 * src1_ptr[8];
            FOR_UNROLL (short j = 0; j < FC_F; ++j) {
                src1_ptr[j] = (device const T1 *) (src1 + args.o1[j] + i13*args.nb13 + i12*args.nb12 + i11*args.nb11);
            }

            for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
                const int i10 = i0%args.ne10;

                T res = src0_ptr[i0];

                if (FC_OP == 0) {
                    FOR_UNROLL (short j = 0; j < FC_F; ++j) {
                        res += src1_ptr[j][i10];
                    }
                }

                if (FC_OP == 1) {
                    FOR_UNROLL (short j = 0; j < FC_F; ++j) {
                        res -= src1_ptr[j][i10];
                    }
                }

                if (FC_OP == 2) {
                    FOR_UNROLL (short j = 0; j < FC_F; ++j) {
                        res *= src1_ptr[j][i10];
                    }
                }

                if (FC_OP == 3) {
                    FOR_UNROLL (short j = 0; j < FC_F; ++j) {
                        res /= src1_ptr[j][i10];
                    }
                }

                dst_ptr[i0] = res;
            }
        }
    }

#undef FC_OP
#undef FC_F
#undef FC_RB
}

kernel void kernel_div_fuse_1(
        constant ggml_metal_kargs_bin & args,
        device const char * src0,
        device const char * src1,
        device       char * dst,
        uint3   tgpig[[threadgroup_position_in_grid]],
        ushort3 tpitg[[thread_position_in_threadgroup]],
        ushort3 ntg[[threads_per_threadgroup]]) {
    const int i03 = tgpig.z;
    const int i02 = tgpig.y;
    const int i01 = tgpig.x;
typedef decltype(kernel_bin_fuse_impl<float, float, float>) kernel_bin_fuse_t;

    const int i13 = i03%args.ne13;
    const int i12 = i02%args.ne12;
    const int i11 = i01%args.ne11;

    device const char * src0_ptr = src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + args.offs;
    device const char * src1_ptr = src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11 + args.o1[0];
    device       char * dst_ptr  = dst  + i03*args.nb3  + i02*args.nb2  + i01*args.nb1  + args.offs;

    if (args.ne10 == 1) {
        const float x = 1.0f / *((device float *)(src1_ptr));
        for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
            *((device float *)(dst_ptr + i0*args.nb0)) = *((device float *)(src0_ptr + i0*args.nb00)) * x;
        }
    } else {
        for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
            const int i10 = i0%args.ne10;
            *((device float *)(dst_ptr + i0*args.nb0)) = *((device float *)(src0_ptr + i0*args.nb00)) / *((device float *)(src1_ptr + i10*args.nb10));
        }
    }
}
template [[host_name("kernel_bin_fuse_f32_f32_f32")]]   kernel kernel_bin_fuse_t kernel_bin_fuse_impl<float,  float,  float>;
template [[host_name("kernel_bin_fuse_f32_f32_f32_4")]] kernel kernel_bin_fuse_t kernel_bin_fuse_impl<float4, float4, float4>;

kernel void kernel_add_id(
        constant ggml_metal_kargs_add_id & args,

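Because FC_OP, FC_F, and FC_RB are Metal function constants, every if on them is resolved when the pipeline is compiled and the dead branches are stripped, so the single kernel above costs no more at run time than the four hand-written kernels it replaces. The same idea in C++ terms, as an analogy only:

    // C++ analogue of function-constant branching: OP is a compile-time value,
    // so only one arm survives in the generated code
    template <int OP, typename T>
    static T apply_bin(T a, T b) {
        if constexpr (OP == 0) return a + b;
        if constexpr (OP == 1) return a - b;
        if constexpr (OP == 2) return a * b;
        return a / b; // OP == 3
    }
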
@@ -1057,7 +1073,7 @@ kernel void kernel_add_id(
    const size_t nb1 = args.ne0 * sizeof(float);
    const size_t nb2 = args.ne1 * nb1;

    device float * dst_row = (device float *)((device char *)dst + i1*nb1 + i2*nb2);
    device float * dst_row = (device float *)((device char *)dst + i1*nb1 + i2*nb2);
    device const float * src0_row = (device const float *)((device char *)src0 + i1*args.nb01 + i2*args.nb02);
    device const float * src1_row = (device const float *)((device char *)src1 + i11*args.nb11);

@@ -1098,141 +1114,6 @@ template [[host_name("kernel_repeat_f16")]] kernel kernel_repeat_t kernel_repeat
template [[host_name("kernel_repeat_i32")]] kernel kernel_repeat_t kernel_repeat<int>;
template [[host_name("kernel_repeat_i16")]] kernel kernel_repeat_t kernel_repeat<short>;

// assumption: src1 is a row
// broadcast src1 into src0
template <short F>
kernel void kernel_add_row_c4_fuse_impl(
        constant ggml_metal_kargs_bin & args,
        device const char * src0,
        device const char * src1,
        device       char * dst,
        uint tpig[[thread_position_in_grid]]) {
    const uint nb = args.ne00/4;
    const uint i  = tpig % nb;

    device const float4 * src0_row = (device const float4 *) (src0);
    device       float4 * dst_row  = (device       float4 *) (dst);

    float4 res = src0_row[tpig];

#pragma unroll(F)
    for (short j = 0; j < F; ++j) {
        res += ((device const float4 *) (src1 + args.o1[j]))[i];
    }

    dst_row[tpig] = res;
}

typedef decltype(kernel_add_row_c4_fuse_impl<1>) kernel_add_row_c4_fuse_t;

template [[host_name("kernel_add_row_c4_fuse_1")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<1>;
template [[host_name("kernel_add_row_c4_fuse_2")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<2>;
template [[host_name("kernel_add_row_c4_fuse_3")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<3>;
template [[host_name("kernel_add_row_c4_fuse_4")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<4>;
template [[host_name("kernel_add_row_c4_fuse_5")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<5>;
template [[host_name("kernel_add_row_c4_fuse_6")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<6>;
template [[host_name("kernel_add_row_c4_fuse_7")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<7>;
template [[host_name("kernel_add_row_c4_fuse_8")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<8>;

template <short F>
kernel void kernel_sub_row_c4_fuse_impl(
        constant ggml_metal_kargs_bin & args,
        device const char * src0,
        device const char * src1,
        device       char * dst,
        uint tpig[[thread_position_in_grid]]) {

    const uint nb = args.ne00/4;
    const uint i  = tpig % nb;

    device const float4 * src0_row = (device const float4 *) (src0);
    device       float4 * dst_row  = (device       float4 *) (dst);

    device const float4 * src1_row[F];
    for (short j = 0; j < F; ++j) {
        src1_row[j] = (device const float4 *) (src1 + args.o1[j]);
    }

    float4 res = src0_row[tpig];

#pragma unroll(F)
    for (short j = 0; j < F; ++j) {
        res -= src1_row[j][i];
    }

    dst_row[tpig] = res;
}

typedef decltype(kernel_sub_row_c4_fuse_impl<1>) kernel_sub_row_c4_fuse_t;

template [[host_name("kernel_sub_row_c4_fuse_1")]] kernel kernel_sub_row_c4_fuse_t kernel_sub_row_c4_fuse_impl<1>;

template <short F>
kernel void kernel_mul_row_c4_fuse_impl(
        constant ggml_metal_kargs_bin & args,
        device const char * src0,
        device const char * src1,
        device       char * dst,
        uint tpig[[thread_position_in_grid]]) {

    const uint nb = args.ne00/4;
    const uint i  = tpig % nb;

    device const float4 * src0_row = (device const float4 *) (src0);
    device       float4 * dst_row  = (device       float4 *) (dst);

    device const float4 * src1_row[F];
    for (short j = 0; j < F; ++j) {
        src1_row[j] = (device const float4 *) (src1 + args.o1[j]);
    }

    float4 res = src0_row[tpig];

#pragma unroll(F)
    for (short j = 0; j < F; ++j) {
        res *= src1_row[j][i];
    }

    dst_row[tpig] = res;
}

typedef decltype(kernel_mul_row_c4_fuse_impl<1>) kernel_mul_row_c4_fuse_t;

template [[host_name("kernel_mul_row_c4_fuse_1")]] kernel kernel_mul_row_c4_fuse_t kernel_mul_row_c4_fuse_impl<1>;

template <short F>
kernel void kernel_div_row_c4_fuse_impl(
        constant ggml_metal_kargs_bin & args,
        device const char * src0,
        device const char * src1,
        device       char * dst,
        uint tpig[[thread_position_in_grid]]) {

    const uint nb = args.ne00/4;
    const uint i  = tpig % nb;

    device const float4 * src0_row = (device const float4 *) (src0);
    device       float4 * dst_row  = (device       float4 *) (dst);

    device const float4 * src1_row[F];
    for (short j = 0; j < F; ++j) {
        src1_row[j] = (device const float4 *) (src1 + args.o1[j]);
    }

    float4 res = src0_row[tpig];

#pragma unroll(F)
    for (short j = 0; j < F; ++j) {
        res /= src1_row[j][i];
    }

    dst_row[tpig] = res;
}

typedef decltype(kernel_div_row_c4_fuse_impl<1>) kernel_div_row_c4_fuse_t;

template [[host_name("kernel_div_row_c4_fuse_1")]] kernel kernel_div_row_c4_fuse_t kernel_div_row_c4_fuse_impl<1>;

kernel void kernel_scale_f32(
        constant ggml_metal_kargs_scale & args,
        device const float * src0,

@@ -11,7 +11,9 @@ function(llama_build source)

    add_executable(${TEST_TARGET} ${TEST_SOURCES})
    target_link_libraries(${TEST_TARGET} PRIVATE common)
    install(TARGETS ${TEST_TARGET} RUNTIME)
    if (LLAMA_TESTS_INSTALL)
        install(TARGETS ${TEST_TARGET} RUNTIME)
    endif()
endfunction()

function(llama_test target)

@@ -100,7 +102,9 @@ function(llama_build_and_test source)
    endif()

    add_executable(${TEST_TARGET} ${TEST_SOURCES})
    install(TARGETS ${TEST_TARGET} RUNTIME)
    if (LLAMA_TESTS_INSTALL)
        install(TARGETS ${TEST_TARGET} RUNTIME)
    endif()
    target_link_libraries(${TEST_TARGET} PRIVATE common)

    add_test(

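Wrapping the install() calls in an if (LLAMA_TESTS_INSTALL) guard makes installing test binaries opt-in. A plausible configure sequence, assuming the option defaults to OFF and combining it with the existing test build flag:

    cmake -B build -DLLAMA_BUILD_TESTS=ON -DLLAMA_TESTS_INSTALL=ON
    cmake --build build -j
    cmake --install build
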
@@ -119,27 +119,48 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
[[noreturn]]
static void usage(const char * executable) {
    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights]\n", executable);
    printf("       [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--tensor-type-file] [--prune-layers] [--keep-split] [--override-kv]\n");
    printf("       [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--tensor-type-file]\n");
    printf("       [--prune-layers] [--keep-split] [--override-kv]\n");
    printf("       model-f32.gguf [model-quant.gguf] type [nthreads]\n\n");
    printf("  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
    printf("  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
    printf("  --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
    printf("  --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n");
    printf("  --include-weights tensor_name: use importance matrix for this/these tensor(s)\n");
    printf("  --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
    printf("  --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
    printf("  --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
    printf("  --tensor-type TENSOR=TYPE: quantize this tensor to this ggml_type. example: --tensor-type attn_q=q8_0\n");
    printf("      Advanced option to selectively quantize tensors. May be specified multiple times.\n");
    printf("  --tensor-type-file tensor_type.txt: list of tensors to quantize to specific ggml_type. example: --tensor-type-file tensor_type_list.txt\n");
    printf("      Advanced option to selectively quantize a long list of tensors. Format to be tensor_name=ggml_type, separated by spaces/newline.\n");
    printf("  --prune-layers L0,L1,L2...comma-separated list of layer numbers to prune from the model\n");
    printf("      Advanced option to remove all tensors from the given layers\n");
    printf("  --keep-split: will generate quantized model in the same shards as input\n");
    printf("  --allow-requantize\n");
    printf("      allow requantizing tensors that have already been quantized\n");
    printf("      WARNING: this can severely reduce quality compared to quantizing\n");
    printf("               from 16bit or 32bit!\n");
    printf("  --leave-output-tensor\n");
    printf("      leave output.weight un(re)quantized\n");
    printf("      increases model size but may also increase quality, especially when requantizing\n");
    printf("  --pure\n");
    printf("      disable k-quant mixtures and quantize all tensors to the same type\n");
    printf("  --imatrix file_name\n");
    printf("      use data in file_name as importance matrix for quant optimizations\n");
    printf("  --include-weights tensor_name\n");
    printf("      use importance matrix for this/these tensor(s)\n");
    printf("  --exclude-weights tensor_name\n");
    printf("      do not use importance matrix for this/these tensor(s)\n");
    printf("  --output-tensor-type ggml_type\n");
    printf("      use this ggml_type for the output.weight tensor\n");
    printf("  --token-embedding-type ggml_type\n");
    printf("      use this ggml_type for the token embeddings tensor\n");
    printf("  --tensor-type tensor_name=ggml_type\n");
    printf("      quantize this tensor to this ggml_type\n");
    printf("      this is an advanced option to selectively quantize tensors. may be specified multiple times.\n");
    printf("      example: --tensor-type attn_q=q8_0\n");
    printf("  --tensor-type-file tensor_types.txt\n");
    printf("      list of tensors to quantize to a specific ggml_type\n");
    printf("      this is an advanced option to selectively quantize a long list of tensors.\n");
    printf("      the file should use the same format as above, separated by spaces or newlines.\n");
    printf("  --prune-layers L0,L1,L2...\n");
    printf("      comma-separated list of layer numbers to prune from the model\n");
    printf("      WARNING: this is an advanced option, use with care.\n");
    printf("  --keep-split\n");
    printf("      generate quantized model in the same shards as input\n");
    printf("  --override-kv KEY=TYPE:VALUE\n");
    printf("      Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
    printf("Note: --include-weights and --exclude-weights cannot be used together\n");
    printf("\nAllowed quantization types:\n");
    printf("      override model metadata by key in the quantized model. may be specified multiple times.\n");
    printf("      WARNING: this is an advanced option, use with care.\n\n");
    printf("note: --include-weights and --exclude-weights cannot be used together\n\n");
    printf("-----------------------------------------------------------------------------\n");
    printf("  allowed quantization types\n");
    printf("-----------------------------------------------------------------------------\n\n");
    for (const auto & it : QUANT_OPTIONS) {
        if (it.name != "COPY") {
            printf("  %2d or ", it.ftype);

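The reorganized help text puts each flag on its own line with an indented description, and fixes the old --exclude-weights text, which wrongly said "use importance matrix" instead of "do not use". An invocation consistent with the usage string above (file names and the quant type are placeholders):

    ./llama-quantize --imatrix imatrix.dat --leave-output-tensor model-f32.gguf model-Q4_K_M.gguf Q4_K_M 8
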
@@ -1,12 +1,7 @@
#if defined(_MSC_VER)
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
#endif

#include "ggml-rpc.h"
#ifdef _WIN32
#  define NOMINMAX
#  define DIRECTORY_SEPARATOR '\\'
#  include <locale>
#  include <windows.h>
#  include <fcntl.h>
#  include <io.h>
@@ -15,23 +10,43 @@
#  include <unistd.h>
#  include <sys/stat.h>
#endif
#include <codecvt>
#include <string>
#include <stdio.h>
#include <vector>
#include <filesystem>
#include <algorithm>
#include <thread>
#include <regex>

namespace fs = std::filesystem;
#if defined(__linux__)
#include <sys/types.h>
#include <pwd.h>
#endif

// NOTE: this is copied from common.cpp to avoid linking with libcommon
#ifdef _WIN32
static std::wstring utf8_to_wstring(const std::string & str) {
    if (str.empty()) {
        return std::wstring();
    }

    int size = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), (int)str.size(), NULL, 0);

    if (size <= 0) {
        return std::wstring();
    }

    std::wstring wstr(size, 0);
    MultiByteToWideChar(CP_UTF8, 0, str.c_str(), (int)str.size(), &wstr[0], size);

    return wstr;
}
#endif

// NOTE: this is copied from common.cpp to avoid linking with libcommon
// returns true if successful, false otherwise
static bool fs_create_directory_with_parents(const std::string & path) {
#ifdef _WIN32
    std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
    std::wstring wpath = converter.from_bytes(path);
    std::wstring wpath = utf8_to_wstring(path);

    // if the path already exists, check whether it's a directory
    const DWORD attributes = GetFileAttributesW(wpath.c_str());
@@ -44,9 +59,16 @@ static bool fs_create_directory_with_parents(const std::string & path) {
    // process path from front to back, procedurally creating directories
    while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
        const std::wstring subpath = wpath.substr(0, pos_slash);
        const wchar_t * test = subpath.c_str();

        const bool success = CreateDirectoryW(test, NULL);
        pos_slash += 1;

        // skip the drive letter, in some systems it can return an access denied error
        if (subpath.length() == 2 && subpath[1] == ':') {
            continue;
        }

        const bool success = CreateDirectoryW(subpath.c_str(), NULL);

        if (!success) {
            const DWORD error = GetLastError();

@@ -60,8 +82,6 @@ static bool fs_create_directory_with_parents(const std::string & path) {
                return false;
            }
        }

        pos_slash += 1;
    }

    return true;
@@ -115,13 +135,27 @@ static std::string fs_get_cache_directory() {
#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) || defined(__OpenBSD__)
    if (std::getenv("XDG_CACHE_HOME")) {
        cache_directory = std::getenv("XDG_CACHE_HOME");
    } else {
    } else if (std::getenv("HOME")) {
        cache_directory = std::getenv("HOME") + std::string("/.cache/");
    } else {
#if defined(__linux__)
        /* no $HOME is defined, fallback to getpwuid */
        struct passwd *pw = getpwuid(getuid());
        if ((!pw) || (!pw->pw_dir)) {
            throw std::runtime_error("Failed to find $HOME directory");
        }

        cache_directory = std::string(pw->pw_dir) + std::string("/.cache/");
#else  /* defined(__linux__) */
        throw std::runtime_error("Failed to find $HOME directory");
#endif /* defined(__linux__) */
    }
#elif defined(__APPLE__)
    cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
#elif defined(_WIN32)
    cache_directory = std::getenv("LOCALAPPDATA");
#elif defined(__EMSCRIPTEN__)
    GGML_ABORT("not implemented on this platform");
#else
#  error Unknown architecture
#endif

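Two independent fixes land here: the deprecated <codecvt>/std::wstring_convert path is replaced by a direct MultiByteToWideChar wrapper on Windows, and on Linux the cache-directory lookup gains a getpwuid fallback for processes that run without $HOME (daemons, minimal containers). The POSIX fallback chain, condensed into a self-contained sketch:

    #include <cstdlib>
    #include <pwd.h>
    #include <stdexcept>
    #include <string>
    #include <unistd.h>

    static std::string cache_dir_posix() {
        if (const char * xdg = std::getenv("XDG_CACHE_HOME")) return xdg;
        if (const char * home = std::getenv("HOME")) return std::string(home) + "/.cache/";
        struct passwd * pw = getpwuid(getuid()); // last resort: the user database
        if (!pw || !pw->pw_dir) throw std::runtime_error("Failed to find $HOME directory");
        return std::string(pw->pw_dir) + "/.cache/";
    }
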
@@ -2507,7 +2507,8 @@ private:
                    slot.n_prompt_tokens_processed++;

                    // process the last few tokens of the prompt separately in order to allow for a checkpoint to be created.
                    if (do_checkpoint && slot.task->n_tokens() - slot.prompt.n_tokens() == 64) {
                    const int n_last = std::min(n_batch, 512);
                    if (do_checkpoint && slot.task->n_tokens() == slot.prompt.n_tokens() + n_last) {
                        break;
                    }
                }

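The checkpoint trigger is no longer hard-coded to fire exactly 64 tokens before the end of the prompt; it now fires n_last = min(n_batch, 512) tokens early, so the remaining tail always fits in a single batch. With a typical n_batch of 2048 the loop breaks once 512 prompt tokens remain; with n_batch = 256 it breaks 256 tokens early. The condition, isolated into a sketch:

    // break out of prompt processing when exactly n_last tokens are left
    static bool should_checkpoint_break(bool do_checkpoint, int n_task, int n_prompt, int n_last) {
        return do_checkpoint && n_task == n_prompt + n_last;
    }
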