Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2026-02-05 13:53:23 +02:00)
Compare commits
20 Commits
| Author | SHA1 | Date |
|---|---|---|
|  | 8afbd96818 |  |
|  | 8ae5ebcf85 |  |
|  | 3e959f0976 |  |
|  | 36667c8edc |  |
|  | 3bf785f3ef |  |
|  | 1d36b3670b |  |
|  | b34443923c |  |
|  | a75cb30dc9 |  |
|  | 3f3769ba76 |  |
|  | 2f567611c0 |  |
|  | 7d2123484e |  |
|  | 074e42ab31 |  |
|  | c642bc014c |  |
|  | cb06a3c363 |  |
|  | 626083faf7 |  |
|  | 2af6880178 |  |
|  | e84773ab60 |  |
|  | fab647e884 |  |
|  | dcf886007d |  |
|  | d24d592808 |  |
@@ -21,15 +21,15 @@ indent_style = tab
[prompts/*.txt]
insert_final_newline = unset

[examples/server/public/*]
[tools/server/public/*]
indent_size = 2

[examples/server/public/deps_*]
[tools/server/public/deps_*]
trim_trailing_whitespace = unset
indent_style = unset
indent_size = unset

[examples/server/deps_*]
[tools/server/deps_*]
trim_trailing_whitespace = unset
indent_style = unset
indent_size = unset
@@ -37,7 +37,7 @@ indent_size = unset
[examples/llama.swiftui/llama.swiftui.xcodeproj/*]
indent_style = tab

[examples/cvector-generator/*.txt]
[tools/cvector-generator/*.txt]
trim_trailing_whitespace = unset
insert_final_newline = unset
3 .flake8
@@ -2,8 +2,9 @@
max-line-length = 125
ignore = E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503
exclude =
# Do not traverse examples
# Do not traverse examples and tools
examples,
tools,
# Do not include package initializers
__init__.py,
# No need to traverse our git directory
6 .github/labeler.yml vendored
@@ -45,7 +45,9 @@ build:
- CMakePresets.json
examples:
- changed-files:
- any-glob-to-any-file: examples/**
- any-glob-to-any-file:
- examples/**
- tools/**
devops:
- changed-files:
- any-glob-to-any-file:
@@ -70,7 +72,7 @@ android:
server:
- changed-files:
- any-glob-to-any-file:
- examples/server/**
- tools/server/**
ggml:
- changed-files:
- any-glob-to-any-file:
30 .github/workflows/bench.yml.disabled vendored
@@ -27,10 +27,10 @@ on:
push:
branches:
- master
paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'tools/server/*.h*', 'tools/server/*.cpp']
pull_request_target:
types: [opened, synchronize, reopened]
paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'tools/server/*.h*', 'tools/server/*.cpp']
schedule:
- cron: '04 2 * * *'
@@ -69,7 +69,7 @@ jobs:
- name: Install python env
id: pipenv
run: |
cd examples/server/bench
cd tools/server/bench
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
@@ -79,7 +79,7 @@ jobs:
run: |
wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
tar xzf prometheus*.tar.gz --strip-components=1
./prometheus --config.file=examples/server/bench/prometheus.yml &
./prometheus --config.file=tools/server/bench/prometheus.yml &
while ! nc -z localhost 9090; do
sleep 0.1
done
@@ -92,7 +92,7 @@ jobs:
- name: Install k6 and xk6-sse
id: k6_installation
run: |
cd examples/server/bench
cd tools/server/bench
go install go.k6.io/xk6/cmd/xk6@latest
xk6 build master \
--with github.com/phymbert/xk6-sse
@@ -116,7 +116,7 @@ jobs:
- name: Download the dataset
id: download_dataset
run: |
cd examples/server/bench
cd tools/server/bench
wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

- name: Server bench
@@ -126,7 +126,7 @@ jobs:
run: |
set -eux

cd examples/server/bench
cd tools/server/bench
source venv/bin/activate
python bench.py \
--runner-label ${{ env.RUNNER_LABEL }} \
@@ -157,9 +157,9 @@ jobs:
name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
compression-level: 9
path: |
examples/server/bench/*.jpg
examples/server/bench/*.json
examples/server/bench/*.log
tools/server/bench/*.jpg
tools/server/bench/*.json
tools/server/bench/*.log

- name: Commit status
uses: Sibz/github-status-action@v1
@@ -178,17 +178,17 @@ jobs:
with:
client_id: ${{secrets.IMGUR_CLIENT_ID}}
path: |
examples/server/bench/prompt_tokens_seconds.jpg
examples/server/bench/predicted_tokens_seconds.jpg
examples/server/bench/kv_cache_usage_ratio.jpg
examples/server/bench/requests_processing.jpg
tools/server/bench/prompt_tokens_seconds.jpg
tools/server/bench/predicted_tokens_seconds.jpg
tools/server/bench/kv_cache_usage_ratio.jpg
tools/server/bench/requests_processing.jpg

- name: Extract mermaid
id: set_mermaid
run: |
set -eux

cd examples/server/bench
cd tools/server/bench
PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
66 .github/workflows/build-linux-cross.yml vendored
@@ -4,18 +4,25 @@ on:
workflow_call:

jobs:
ubuntu-latest-riscv64-cpu-cross:
runs-on: ubuntu-latest
ubuntu-24-riscv64-cpu-cross:
runs-on: ubuntu-24.04

steps:
- uses: actions/checkout@v4
- name: Setup Riscv
run: |
sudo dpkg --add-architecture riscv64
sudo sed -i 's|http://azure.archive.ubuntu.com/ubuntu|http://ports.ubuntu.com/ubuntu-ports|g' \
/etc/apt/sources.list /etc/apt/apt-mirrors.txt
sudo apt-get clean
sudo apt-get update

# Add arch-specific repositories for non-amd64 architectures
cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list
deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
EOF

sudo apt-get update || true ;# Prevent failure due to missing URLs.

sudo apt-get install -y --no-install-recommends \
build-essential \
gcc-14-riscv64-linux-gnu \
@@ -27,6 +34,7 @@ jobs:
cmake -B build -DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TOOLS=ON \
-DLLAMA_BUILD_TESTS=OFF \
-DCMAKE_SYSTEM_NAME=Linux \
-DCMAKE_SYSTEM_PROCESSOR=riscv64 \
@@ -40,21 +48,25 @@ jobs:

cmake --build build --config Release -j $(nproc)

ubuntu-latest-riscv64-vulkan-cross:
runs-on: ubuntu-latest
ubuntu-24-riscv64-vulkan-cross:
runs-on: ubuntu-24.04

steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0

- name: Setup Riscv
run: |
sudo dpkg --add-architecture riscv64
sudo sed -i 's|http://azure.archive.ubuntu.com/ubuntu|http://ports.ubuntu.com/ubuntu-ports|g' \
/etc/apt/sources.list /etc/apt/apt-mirrors.txt
sudo apt-get clean
sudo apt-get update

# Add arch-specific repositories for non-amd64 architectures
cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list
deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
EOF

sudo apt-get update || true ;# Prevent failure due to missing URLs.

sudo apt-get install -y --no-install-recommends \
build-essential \
glslc \
@@ -69,6 +81,7 @@ jobs:
-DGGML_VULKAN=ON \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TOOLS=ON \
-DLLAMA_BUILD_TESTS=OFF \
-DCMAKE_SYSTEM_NAME=Linux \
-DCMAKE_SYSTEM_PROCESSOR=riscv64 \
@@ -82,21 +95,25 @@ jobs:

cmake --build build --config Release -j $(nproc)

ubuntu-latest-arm64-vulkan-cross:
runs-on: ubuntu-latest
ubuntu-24-arm64-vulkan-cross:
runs-on: ubuntu-24.04

steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0

- name: Setup Arm64
run: |
sudo dpkg --add-architecture arm64
sudo sed -i 's|http://azure.archive.ubuntu.com/ubuntu|http://ports.ubuntu.com/ubuntu-ports|g' \
/etc/apt/sources.list /etc/apt/apt-mirrors.txt
sudo apt-get clean
sudo apt-get update

# Add arch-specific repositories for non-amd64 architectures
cat << EOF | sudo tee /etc/apt/sources.list.d/arm64-ports.list
deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
EOF

sudo apt-get update || true ;# Prevent failure due to missing URLs.

sudo apt-get install -y --no-install-recommends \
build-essential \
glslc \
@@ -110,6 +127,7 @@ jobs:
-DGGML_VULKAN=ON \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TOOLS=ON \
-DLLAMA_BUILD_TESTS=OFF \
-DCMAKE_SYSTEM_NAME=Linux \
-DCMAKE_SYSTEM_PROCESSOR=aarch64 \
10 .github/workflows/build.yml vendored
@@ -601,9 +601,8 @@ jobs:
-DGGML_SYCL_F16=ON
cmake --build build --config Release -j $(nproc)

# Disabled for now due to sporadic issue syncing.
# build-linux-cross:
# uses: ./.github/workflows/build-linux-cross.yml
build-linux-cross:
uses: ./.github/workflows/build-linux-cross.yml

macOS-latest-cmake-ios:
runs-on: macos-latest
@@ -634,6 +633,7 @@ jobs:
-DGGML_METAL_EMBED_LIBRARY=ON \
-DLLAMA_BUILD_COMMON=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TOOLS=OFF \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-DCMAKE_SYSTEM_NAME=iOS \
@@ -670,6 +670,7 @@ jobs:
-DGGML_METAL_EMBED_LIBRARY=ON \
-DLLAMA_BUILD_COMMON=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TOOLS=OFF \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-DCMAKE_SYSTEM_NAME=tvOS \
@@ -700,6 +701,7 @@ jobs:
-DGGML_METAL_EMBED_LIBRARY=ON \
-DLLAMA_BUILD_COMMON=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TOOLS=OFF \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-DCMAKE_SYSTEM_NAME=visionOS \
@@ -740,6 +742,7 @@ jobs:
-DGGML_METAL_EMBED_LIBRARY=ON \
-DLLAMA_CURL=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TOOLS=OFF \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"
@@ -1418,6 +1421,7 @@ jobs:
-DGGML_METAL_EMBED_LIBRARY=ON \
-DLLAMA_CURL=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TOOLS=OFF \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-DCMAKE_SYSTEM_NAME=iOS \
24 .github/workflows/server.yml vendored
@@ -15,10 +15,10 @@ on:
push:
branches:
- master
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']
pull_request:
types: [opened, synchronize, reopened]
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']

env:
LLAMA_LOG_COLORS: 1
@@ -74,7 +74,7 @@ jobs:
- name: Tests dependencies
id: test_dependencies
run: |
pip install -r examples/server/tests/requirements.txt
pip install -r tools/server/tests/requirements.txt

# Setup nodejs (to be used for verifying bundled index.html)
- uses: actions/setup-node@v4
@@ -84,14 +84,14 @@ jobs:
- name: WebUI - Install dependencies
id: webui_lint
run: |
cd examples/server/webui
cd tools/server/webui
npm ci

- name: WebUI - Check code format
id: webui_format
run: |
git config --global --add safe.directory $(realpath .)
cd examples/server/webui
cd tools/server/webui
git status

npm run format
@@ -108,7 +108,7 @@ jobs:
id: verify_server_index_html
run: |
git config --global --add safe.directory $(realpath .)
cd examples/server/webui
cd tools/server/webui
git status

npm run build
@@ -161,21 +161,21 @@ jobs:
env:
GITHUB_ACTIONS: "true"
run: |
cd examples/server/tests
cd tools/server/tests
./tests.sh

- name: Tests (sanitizers)
id: server_integration_tests_sanitizers
if: ${{ matrix.sanitizer != '' }}
run: |
cd examples/server/tests
cd tools/server/tests
LLAMA_SANITIZE=1 ./tests.sh

- name: Slow tests
id: server_integration_tests_slow
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
run: |
cd examples/server/tests
cd tools/server/tests
SLOW_TESTS=1 ./tests.sh

@@ -211,7 +211,7 @@ jobs:
- name: Tests dependencies
id: test_dependencies
run: |
pip install -r examples/server/tests/requirements.txt
pip install -r tools/server/tests/requirements.txt

- name: Copy Libcurl
id: prepare_libcurl
@@ -224,7 +224,7 @@ jobs:
id: server_integration_tests
if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
run: |
cd examples/server/tests
cd tools/server/tests
$env:PYTHONIOENCODING = ":replace"
pytest -v -x -m "not slow"

@@ -232,6 +232,6 @@ jobs:
id: server_integration_tests_slow
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
run: |
cd examples/server/tests
cd tools/server/tests
$env:SLOW_TESTS = "1"
pytest -v -x
12 .gitignore vendored
@@ -96,11 +96,11 @@ perf-*.txt
# Examples

examples/jeopardy/results.txt
examples/server/*.css.hpp
examples/server/*.html.hpp
examples/server/*.js.hpp
examples/server/*.mjs.hpp
examples/server/*.gz.hpp
tools/server/*.css.hpp
tools/server/*.html.hpp
tools/server/*.js.hpp
tools/server/*.mjs.hpp
tools/server/*.gz.hpp
!build_64.sh
!examples/*.bat
!examples/*/*.kts
@@ -110,7 +110,7 @@ examples/server/*.gz.hpp

# Server Web UI temporary files
node_modules
examples/server/webui/dist
tools/server/webui/dist

# Python
@@ -77,6 +77,7 @@ option(LLAMA_BUILD_COMMON "llama: build common utils library" ${LLAMA_STANDALONE

# extra artifacts
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})

@@ -187,6 +188,10 @@ if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
add_subdirectory(pocs)
endif()

if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TOOLS)
add_subdirectory(tools)
endif()

#
# install
#
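The new LLAMA_BUILD_TOOLS option gates the tools/ subdirectory the same way LLAMA_BUILD_EXAMPLES gates examples/. A minimal local configure in the spirit of the CI workflows above might look like this (a sketch only; the build directory and job count are arbitrary choices, while the flag names are taken from the hunks above):

```sh
# Configure a release build with the relocated tools (llama-server, llama-bench, ...)
# enabled, while skipping examples and tests.
cmake -B build -DCMAKE_BUILD_TYPE=Release \
    -DLLAMA_BUILD_TOOLS=ON \
    -DLLAMA_BUILD_EXAMPLES=OFF \
    -DLLAMA_BUILD_TESTS=OFF
cmake --build build --config Release -j $(nproc)
```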
@@ -2,7 +2,7 @@

/ci/ @ggerganov
/.devops/*.Dockerfile @ngxson
/examples/server/ @ngxson
/tools/server/ @ngxson
/ggml/src/ggml-cuda/fattn* @JohannesGaessler
/ggml/src/ggml-cuda/mmq.* @JohannesGaessler
/ggml/src/ggml-cuda/mmv.* @JohannesGaessler
92 Makefile
@@ -1156,10 +1156,10 @@ $(LIB_COMMON_S): $(OBJ_COMMON)

# Clean generated server assets
clean-server-assets:
find examples/server -type f -name "*.js.hpp" -delete
find examples/server -type f -name "*.mjs.hpp" -delete
find examples/server -type f -name "*.css.hpp" -delete
find examples/server -type f -name "*.html.hpp" -delete
find tools/server -type f -name "*.js.hpp" -delete
find tools/server -type f -name "*.mjs.hpp" -delete
find tools/server -type f -name "*.css.hpp" -delete
find tools/server -type f -name "*.html.hpp" -delete

# Clean rule
clean: clean-server-assets
@@ -1179,7 +1179,7 @@ clean: clean-server-assets

# Helper function that replaces .c, .cpp, and .cu file endings with .o:
GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1))))

llama-cli: examples/main/main.cpp \
llama-cli: tools/main/main.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -1192,7 +1192,7 @@ llama-infill: examples/infill/infill.cpp \
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-run: examples/run/run.cpp \
llama-run: tools/run/run.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -1207,7 +1207,7 @@ llama-simple-chat: examples/simple-chat/simple-chat.cpp \
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-tokenize: examples/tokenize/tokenize.cpp \
llama-tokenize: tools/tokenize/tokenize.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -1217,27 +1217,27 @@ llama-batched: examples/batched/batched.cpp \
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-batched-bench: examples/batched-bench/batched-bench.cpp \
llama-batched-bench: tools/batched-bench/batched-bench.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-quantize: examples/quantize/quantize.cpp \
llama-quantize: tools/quantize/quantize.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-quantize-stats: examples/quantize-stats/quantize-stats.cpp \
llama-quantize-stats: tools/quantize-stats/quantize-stats.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-perplexity: examples/perplexity/perplexity.cpp \
llama-perplexity: tools/perplexity/perplexity.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-imatrix: examples/imatrix/imatrix.cpp \
llama-imatrix: tools/imatrix/imatrix.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -1279,7 +1279,7 @@ llama-gguf-hash: examples/gguf-hash/gguf-hash.cpp examples/gguf-hash/deps/sha1/s
$(CXX) $(CXXFLAGS) -Iexamples/gguf-hash/deps -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-gguf-split: examples/gguf-split/gguf-split.cpp \
llama-gguf-split: tools/gguf-split/gguf-split.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -1289,7 +1289,7 @@ llama-eval-callback: examples/eval-callback/eval-callback.cpp \
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \
llama-cvector-generator: tools/cvector-generator/cvector-generator.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -1299,12 +1299,12 @@ llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-bench: examples/llama-bench/llama-bench.cpp \
llama-bench: tools/llama-bench/llama-bench.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-export-lora: examples/export-lora/export-lora.cpp \
llama-export-lora: tools/export-lora/export-lora.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -1360,17 +1360,17 @@ llama-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp \
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

ifdef GGML_RPC
rpc-server: examples/rpc/rpc-server.cpp \
rpc-server: tools/rpc/rpc-server.cpp \
$(OBJ_GGML)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
endif # GGML_RPC

llama-server: \
examples/server/server.cpp \
examples/server/utils.hpp \
examples/server/httplib.h \
examples/server/index.html.hpp \
examples/server/loading.html.hpp \
tools/server/server.cpp \
tools/server/utils.hpp \
tools/server/httplib.h \
tools/server/index.html.hpp \
tools/server/loading.html.hpp \
common/chat.cpp \
common/chat.h \
common/chat-template.hpp \
@@ -1378,10 +1378,10 @@ llama-server: \
common/minja.hpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Itools/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)

# Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
examples/server/%.hpp: examples/server/public/% FORCE Makefile
# Portable equivalent of `cd tools/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
tools/server/%.hpp: tools/server/public/% FORCE Makefile
@( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \
echo "unsigned char $${NAME}[] = {" && \
cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
libllava.a: examples/llava/llava.cpp \
|
||||
examples/llava/llava.h \
|
||||
examples/llava/clip.cpp \
|
||||
examples/llava/clip.h \
|
||||
libllava.a: tools/llava/llava.cpp \
|
||||
tools/llava/llava.h \
|
||||
tools/llava/clip.cpp \
|
||||
tools/llava/clip.h \
|
||||
common/stb_image.h \
|
||||
common/base64.hpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual
|
||||
|
||||
llama-llava-cli: examples/llava/llava-cli.cpp \
|
||||
examples/llava/llava.cpp \
|
||||
examples/llava/llava.h \
|
||||
examples/llava/clip.cpp \
|
||||
examples/llava/clip.h \
|
||||
llama-llava-cli: tools/llava/llava-cli.cpp \
|
||||
tools/llava/llava.cpp \
|
||||
tools/llava/llava.h \
|
||||
tools/llava/clip.cpp \
|
||||
tools/llava/clip.h \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
|
||||
|
||||
llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \
|
||||
examples/llava/llava.cpp \
|
||||
examples/llava/llava.h \
|
||||
examples/llava/clip.cpp \
|
||||
examples/llava/clip.h \
|
||||
llama-minicpmv-cli: tools/llava/minicpmv-cli.cpp \
|
||||
tools/llava/llava.cpp \
|
||||
tools/llava/llava.h \
|
||||
tools/llava/clip.cpp \
|
||||
tools/llava/clip.h \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
|
||||
|
||||
llama-qwen2vl-cli: examples/llava/qwen2vl-cli.cpp \
|
||||
examples/llava/llava.cpp \
|
||||
examples/llava/llava.h \
|
||||
examples/llava/clip.cpp \
|
||||
examples/llava/clip.h \
|
||||
llama-qwen2vl-cli: tools/llava/qwen2vl-cli.cpp \
|
||||
tools/llava/llava.cpp \
|
||||
tools/llava/llava.h \
|
||||
tools/llava/clip.cpp \
|
||||
tools/llava/clip.h \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
|
||||
|
||||
@@ -1480,12 +1480,12 @@ tests/test-double-float: tests/test-double-float.cpp
|
||||
|
||||
tests/test-json-schema-to-grammar: tests/test-json-schema-to-grammar.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) -Itools/server -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-chat: tests/test-chat.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) -Itools/server -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-opt: tests/test-opt.cpp \
|
||||
|
||||
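Since only the prerequisite paths moved under tools/, the legacy Makefile target names are unchanged, so a typical invocation still looks like this (a sketch; the target selection and job count are arbitrary):

```sh
# Build the two most commonly used binaries from the legacy Makefile.
make llama-cli llama-server -j 8
```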
20 README.md
@@ -242,7 +242,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
| [Vulkan](docs/build.md#vulkan) | GPU |
| [CANN](docs/build.md#cann) | Ascend NPU |
| [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/examples/rpc) | All |
| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |

## Building the project

@@ -276,9 +276,9 @@ The Hugging Face platform provides a variety of online tools for converting, qua
- Use the [GGUF-editor space](https://huggingface.co/spaces/CISCai/gguf-editor) to edit GGUF meta data in the browser (more info: https://github.com/ggml-org/llama.cpp/discussions/9268)
- Use the [Inference Endpoints](https://ui.endpoints.huggingface.co/) to directly host `llama.cpp` in the cloud (more info: https://github.com/ggml-org/llama.cpp/discussions/9669)

To learn more about model quantization, [read this documentation](examples/quantize/README.md)
To learn more about model quantization, [read this documentation](tools/quantize/README.md)

## [`llama-cli`](examples/main)
## [`llama-cli`](tools/main)

#### A CLI tool for accessing and experimenting with most of `llama.cpp`'s functionality.

@@ -341,7 +341,7 @@ To learn more about model quantization, [read this documentation](examples/quant
</details>

## [`llama-server`](examples/server)
## [`llama-server`](tools/server)

#### A lightweight, [OpenAI API](https://github.com/openai/openai-openapi) compatible, HTTP server for serving LLMs.

@@ -411,7 +411,7 @@ To learn more about model quantization, [read this documentation](examples/quant
</details>

## [`llama-perplexity`](examples/perplexity)
## [`llama-perplexity`](tools/perplexity)

#### A tool for measuring the perplexity [^1][^2] (and other quality metrics) of a model over a given text.

@@ -436,10 +436,10 @@ To learn more about model quantization, [read this documentation](examples/quant

</details>

[^1]: [examples/perplexity/README.md](./examples/perplexity/README.md)
[^1]: [tools/perplexity/README.md](./tools/perplexity/README.md)
[^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)

## [`llama-bench`](examples/llama-bench)
## [`llama-bench`](tools/llama-bench)

#### Benchmark the performance of the inference for various parameters.

@@ -460,7 +460,7 @@ To learn more about model quantization, [read this documentation](examples/quant

</details>

## [`llama-run`](examples/run)
## [`llama-run`](tools/run)

#### A comprehensive example for running `llama.cpp` models. Useful for inferencing. Used with RamaLama [^3].

@@ -504,8 +504,8 @@ To learn more about model quantization, [read this documentation](examples/quant

## Other documentation

- [main (cli)](examples/main/README.md)
- [server](examples/server/README.md)
- [main (cli)](tools/main/README.md)
- [server](tools/server/README.md)
- [GBNF grammars](grammars/README.md)

#### Development documentation
@@ -40,7 +40,7 @@ To protect sensitive data from potential leaks or unauthorized access, it is cru
### Untrusted environments or networks

If you can't run your models in a secure and isolated environment or if it must be exposed to an untrusted network, make sure to take the following security precautions:
* Do not use the RPC backend, [rpc-server](https://github.com/ggml-org/llama.cpp/tree/master/examples/rpc) and [llama-server](https://github.com/ggml-org/llama.cpp/tree/master/examples/server) functionality (see https://github.com/ggml-org/llama.cpp/pull/13061).
* Do not use the RPC backend, [rpc-server](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) and [llama-server](https://github.com/ggml-org/llama.cpp/tree/master/tools/server) functionality (see https://github.com/ggml-org/llama.cpp/pull/13061).
* Confirm the hash of any downloaded artifact (e.g. pre-trained model weights) matches a known-good value.
* Encrypt your data if sending it over the network.
@@ -8,6 +8,7 @@ TVOS_MIN_OS_VERSION=16.4

BUILD_SHARED_LIBS=OFF
LLAMA_BUILD_EXAMPLES=OFF
LLAMA_BUILD_TOOLS=OFF
LLAMA_BUILD_TESTS=OFF
LLAMA_BUILD_SERVER=OFF
GGML_METAL=ON
@@ -31,6 +32,7 @@ COMMON_CMAKE_ARGS=(
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
-DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS}
-DLLAMA_BUILD_EXAMPLES=${LLAMA_BUILD_EXAMPLES}
-DLLAMA_BUILD_TOOLS=${LLAMA_BUILD_TOOLS}
-DLLAMA_BUILD_TESTS=${LLAMA_BUILD_TESTS}
-DLLAMA_BUILD_SERVER=${LLAMA_BUILD_SERVER}
-DGGML_METAL_EMBED_LIBRARY=${GGML_METAL_EMBED_LIBRARY}
@@ -187,8 +187,8 @@ function gg_run_test_scripts_debug {

set -e

(cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
(cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
(cd ./tools/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
(cd ./tools/quantize && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log

set +e
}
@@ -211,8 +211,8 @@ function gg_run_test_scripts_release {

set -e

(cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
(cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
(cd ./tools/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
(cd ./tools/quantize && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log

set +e
}
@@ -2211,14 +2211,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
add_opt(common_arg(
{"--mmproj"}, "FILE",
"path to a multimodal projector file. see examples/llava/README.md",
"path to a multimodal projector file. see tools/llava/README.md",
[](common_params & params, const std::string & value) {
params.mmproj.path = value;
}
).set_examples(mmproj_examples));
add_opt(common_arg(
{"--mmproj-url"}, "URL",
"URL to a multimodal projector file. see examples/llava/README.md",
"URL to a multimodal projector file. see tools/llava/README.md",
[](common_params & params, const std::string & value) {
params.mmproj.url = value;
}
@@ -2783,7 +2783,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
add_opt(common_arg(
{"--cache-reuse"}, "N",
string_format("min chunk size to attempt reusing from the cache via KV shifting (default: %d)", params.n_cache_reuse),
string_format(
"min chunk size to attempt reusing from the cache via KV shifting (default: %d)\n"
"[(card)](https://ggml.ai/f0.png)", params.n_cache_reuse
),
[](common_params & params, int value) {
params.n_cache_reuse = value;
}
@@ -340,7 +340,7 @@ struct common_params {

common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;

// multimodal models (see examples/llava)
// multimodal models (see tools/llava)
struct common_params_model mmproj;
bool mmproj_use_gpu = true; // use GPU for multimodal model
bool no_mmproj = false; // explicitly disable multimodal model
@@ -414,8 +414,8 @@ struct common_params {
int n_pca_batch = 100;
int n_pca_iterations = 1000;
dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
std::string cvector_positive_file = "tools/cvector-generator/positive.txt";
std::string cvector_negative_file = "tools/cvector-generator/negative.txt";

bool spm_infill = false; // suffix/prefix/middle pattern for infill
@@ -419,7 +419,9 @@ class ModelBase:
@staticmethod
def load_hparams(dir_model: Path):
try:
return AutoConfig.from_pretrained(dir_model).to_dict()
# for security reason, we don't allow loading remote code by default
# if a model need remote code, we will fallback to config.json
return AutoConfig.from_pretrained(dir_model, trust_remote_code=False).to_dict()
except Exception as e:
logger.warning(f"Failed to load model config from {dir_model}: {e}")
logger.warning("Trying to load config.json instead")
@@ -453,8 +455,12 @@ class ModelBase:


class TextModel(ModelBase):
model_type = ModelType.TEXT
hf_arch: str

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.hf_arch = get_model_architecture(self.hparams, self.model_type)

if "text_config" in self.hparams:
# move the text_config to the root level
@@ -504,7 +510,7 @@ class TextModel(ModelBase):
def set_gguf_parameters(self):
self.gguf_writer.add_block_count(self.block_count)

if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions"], optional=True)) is not None:
self.gguf_writer.add_context_length(n_ctx)
logger.info(f"gguf: context length = {n_ctx}")

@@ -1073,10 +1079,36 @@ class TextModel(ModelBase):
if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_EOS)) is not None:
self.gguf_writer.add_add_eos_token(field.parts[-1].tolist()[0])

def _try_set_pooling_type(self) -> None:
# get pooling path
pooling_path = None
module_path = self.dir_model / "modules.json"
if module_path.is_file():
with open(module_path, encoding="utf-8") as f:
modules = json.load(f)
for mod in modules:
if mod["type"] == "sentence_transformers.models.Pooling":
pooling_path = mod["path"]
break

# get pooling type
if pooling_path is not None:
with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f:
pooling = json.load(f)
if pooling["pooling_mode_mean_tokens"]:
pooling_type = gguf.PoolingType.MEAN
elif pooling["pooling_mode_cls_token"]:
pooling_type = gguf.PoolingType.CLS
elif pooling["pooling_mode_lasttoken"]:
pooling_type = gguf.PoolingType.LAST
else:
raise NotImplementedError("Only MEAN, CLS, and LAST pooling types supported")
self.gguf_writer.add_pooling_type(pooling_type)


class VisionModel(ModelBase):
model_type = ModelType.VISION
model_arch = gguf.MODEL_ARCH.CLIP_VISION
n_text_embd = 0
preprocessor_config: dict[str, Any]
global_config: dict[str, Any]

@@ -1087,6 +1119,8 @@ class VisionModel(ModelBase):
raise TypeError("VisionModel must be subclassed with model_arch = gguf.MODEL_ARCH.CLIP_VISION")

# get n_embd of the text model
if "text_config" not in self.hparams:
self.hparams["text_config"] = {}
text_config = {**self.hparams, **self.hparams["text_config"]}
self.n_embd_text = text_config.get("hidden_size", text_config.get("n_embd", 0))
assert self.n_embd_text > 0, "n_embd not found in hparams"
@@ -2089,6 +2123,9 @@ class DeciModel(TextModel):
# if n_heads_in_group is not None, then
# _num_kv_heads[il] is num_attention_head // n_heads_in_group and
# _num_heads[il] is num_attention_head
# ***dummy layer*** for nemotron 253B
# if n_heads_in_group is None and ffn_mult is None
# then _num_kv_heads[il] is 0 and _num_heads[il] is 0 and _ffn_dims is 0
for il in range(len(_block_configs)):
if _block_configs[il]["attention"]["n_heads_in_group"] is None:
if _block_configs[il]["attention"]["replace_with_linear"] is True:
@@ -2100,7 +2137,10 @@ class DeciModel(TextModel):
else:
self._num_kv_heads.append(self.hparams["num_attention_heads"] // _block_configs[il]["attention"]["n_heads_in_group"])
self._num_heads.append(self.hparams["num_attention_heads"])
_ffn_multipliers.append(_block_configs[il]["ffn"]["ffn_mult"])
if _block_configs[il]["ffn"]["ffn_mult"] is None: # dummy layer
_ffn_multipliers.append(0.0)
else:
_ffn_multipliers.append(_block_configs[il]["ffn"]["ffn_mult"])
assert self.block_count == len(self._num_kv_heads)
assert self.block_count == len(self._num_heads)
assert self.block_count == len(_ffn_multipliers)
@@ -2538,7 +2578,7 @@ class QwenModel(TextModel):
self.gguf_writer.add_file_type(self.ftype)


@ModelBase.register("Qwen2ForCausalLM")
@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM")
class Qwen2Model(TextModel):
model_arch = gguf.MODEL_ARCH.QWEN2

@@ -2550,12 +2590,18 @@ class Qwen2Model(TextModel):

def set_gguf_parameters(self):
super().set_gguf_parameters()
self._try_set_pooling_type()
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
if self.hparams["rope_scaling"].get("type") == "yarn":
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if self.hf_arch == "Qwen2Model":
name = f"model.{name}" # map to Qwen2ForCausalLM tensors
yield from super().modify_tensors(data_torch, name, bid)


@ModelBase.register("Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
class Qwen2VLModel(TextModel):
@@ -2581,6 +2627,82 @@ class Qwen2VLModel(TextModel):
return [(self.map_tensor_name(name), data_torch)]


@ModelBase.register("Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
class Qwen2VLVisionModel(VisionModel):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.hparams["image_size"] = self.hparams.get("image_size", 560)
# rename config.json values
self.hparams["num_attention_heads"] = self.hparams.get("num_heads")
self.hparams["num_hidden_layers"] = self.hparams.get("depth")
if "embed_dim" in self.hparams: # qwen2vl
self.hparams["intermediate_size"] = self.hparams.get("hidden_size")
self.hparams["hidden_size"] = self.hparams.get("embed_dim")

def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
if self.global_config['model_type'] == 'qwen2_vl':
self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.QWEN2VL)
elif self.global_config['model_type'] == 'qwen2_5_vl':
self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.QWEN25VL)
self.gguf_writer.add_vision_use_silu(True)
# find n_wa_pattern (window attention pattern)
fullatt_block_indexes = hparams.get("fullatt_block_indexes")
assert fullatt_block_indexes is not None, "fullatt_block_indexes is required for qwen2_5_vl"
n_wa_pattern = fullatt_block_indexes[0] + 1
# validate n_wa_pattern
for i in range(1, len(fullatt_block_indexes)):
if fullatt_block_indexes[i] - fullatt_block_indexes[i - 1] != n_wa_pattern:
raise ValueError(f"Invalid fullatt_block_indexes: {fullatt_block_indexes}")
self.gguf_writer.add_vision_n_wa_pattern(n_wa_pattern)
else:
raise ValueError(f"Unknown QwenVL model type: {self.global_config['model_type']}")
# default values below are taken from HF tranformers code
self.gguf_writer.add_vision_attention_layernorm_eps(self.global_config.get("rms_norm_eps", 1e-6))

def tensor_force_quant(self, name, new_name, bid, n_dims):
del bid, name, n_dims # unused
if ".patch_embd." in new_name:
return gguf.GGMLQuantizationType.F16
if ".position_embd." in new_name:
return gguf.GGMLQuantizationType.F32
return False

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused
if name.startswith("visual."):
# process visual tensors
# split QKV tensors if needed
if ".qkv." in name:
if data_torch.ndim == 2: # weight
c3, _ = data_torch.shape
else: # bias
c3 = data_torch.shape[0]
assert c3 % 3 == 0
c = c3 // 3
wq = data_torch[:c]
wk = data_torch[c: c * 2]
wv = data_torch[c * 2:]
return [
(self.map_tensor_name(name.replace("qkv", "q")), wq),
(self.map_tensor_name(name.replace("qkv", "k")), wk),
(self.map_tensor_name(name.replace("qkv", "v")), wv),
]
elif 'patch_embed.proj.weight' in name:
# split Conv3D into Conv2Ds
c1, c2, kt, kh, kw = data_torch.shape
del c1, c2, kh, kw # unused
assert kt == 2, "Current implmentation only support temporal_patch_size of 2"
return [
(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight" , data_torch[:, :, 0, ...]),
(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight.1", data_torch[:, :, 1, ...]),
]
else:
return [(self.map_tensor_name(name), data_torch)]
return [] # skip other tensors


@ModelBase.register("WavTokenizerDec")
class WavTokenizerDecModel(TextModel):
model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC
@@ -3316,29 +3438,7 @@ class BertModel(TextModel):
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_causal_attention(False)

# get pooling path
pooling_path = None
module_path = self.dir_model / "modules.json"
if module_path.is_file():
with open(module_path, encoding="utf-8") as f:
modules = json.load(f)
for mod in modules:
if mod["type"] == "sentence_transformers.models.Pooling":
pooling_path = mod["path"]
break

# get pooling type
if pooling_path is not None:
with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f:
pooling = json.load(f)
if pooling["pooling_mode_mean_tokens"]:
pooling_type = gguf.PoolingType.MEAN
elif pooling["pooling_mode_cls_token"]:
pooling_type = gguf.PoolingType.CLS
else:
raise NotImplementedError("Only MEAN and CLS pooling types supported")
self.gguf_writer.add_pooling_type(pooling_type)
self._try_set_pooling_type()

def set_vocab(self):
tokens, toktypes, tokpre = self.get_vocab_base()
@@ -3547,8 +3647,13 @@ class NomicBertModel(BertModel):
if self._tokenizer_is_xlmroberta:
self._xlmroberta_tokenizer_init()

# the HF config claims n_ctx=8192, but it uses RoPE scaling
self.hparams["n_ctx"] = 2048
npos, mtp = self.hparams["n_positions"], self.hparams.get("max_trained_positions", 2048)
if npos == 8192 and mtp == 2048:
self.hparams["n_positions"] = 2048  # nomic-embed-text v1 and v1.5 are trained for 2048 tokens.
elif npos == 2048 and mtp == 2048:
self.hparams["n_positions"] = 512  # nomic-embed-text-v2-moe is trained for 512 tokens.
else:
raise ValueError(f"unrecognized parameters: n_positions={npos}, max_trained_positions={mtp}")

assert self.hparams["activation_function"] == "gelu" if self.is_moe else "swiglu"

@@ -5877,8 +5982,7 @@ def split_str_to_n_bytes(split_str: str) -> int:
return n


def get_model_architecture(dir_model: Path, model_type: ModelType, hparams: Any = None) -> str:
hparams = ModelBase.load_hparams(dir_model) if hparams is None else hparams
def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> str:
text_config = hparams.get("text_config", {})
vision_config = hparams.get("vision_config", {})
arch = hparams["architectures"][0]
@@ -5949,7 +6053,8 @@ def main() -> None:
with torch.inference_mode():
output_type = ftype_map[args.outtype]
model_type = ModelType.VISION if args.mmproj else ModelType.TEXT
model_architecture = get_model_architecture(dir_model, model_type)
hparams = ModelBase.load_hparams(dir_model)
model_architecture = get_model_architecture(hparams, model_type)
logger.info(f"Model architecture: {model_architecture}")
try:
model_class = ModelBase.from_model_architecture(model_architecture, model_type=model_type)
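With the Qwen2VLVisionModel registration above, the converter can be pointed at a Qwen2-VL / Qwen2.5-VL checkpoint in vision (mmproj) mode. A hedged sketch of such an invocation, using only the --mmproj and --outtype options that appear in the hunks above (the model directory is a placeholder):

```sh
# Convert the vision tower / projector of a Qwen2.5-VL checkpoint to GGUF.
python ./convert_hf_to_gguf.py ../Qwen2.5-VL-3B-Instruct --mmproj --outtype f16
```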
@@ -9,10 +9,10 @@ Adding a model requires few steps:
After following these steps, you can open PR.

Also, it is important to check that the examples and main ggml backends (CUDA, METAL, CPU) are working with the new architecture, especially:
- [main](/examples/main/)
- [imatrix](/examples/imatrix/)
- [quantize](/examples/quantize/)
- [server](/examples/server/)
- [main](/tools/main/)
- [imatrix](/tools/imatrix/)
- [quantize](/tools/quantize/)
- [server](/tools/server/)

### 1. Convert the model to GGUF
@@ -33,13 +33,13 @@ git clone https://huggingface.co/openai/clip-vit-large-patch14-336
2. Use `llava_surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:

```sh
python ./examples/llava/llava_surgery.py -m path/to/MobileVLM-1.7B
python ./tools/llava/llava_surgery.py -m path/to/MobileVLM-1.7B
```

3. Use `convert_image_encoder_to_gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:

```sh
python ./examples/llava/convert_image_encoder_to_gguf.py \
python ./tools/llava/convert_image_encoder_to_gguf.py \
-m path/to/clip-vit-large-patch14-336 \
--llava-projector path/to/MobileVLM-1.7B/llava.projector \
--output-dir path/to/MobileVLM-1.7B \
@@ -47,7 +47,7 @@ python ./examples/llava/convert_image_encoder_to_gguf.py \
```

```sh
python ./examples/llava/convert_image_encoder_to_gguf.py \
python ./tools/llava/convert_image_encoder_to_gguf.py \
-m path/to/clip-vit-large-patch14-336 \
--llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \
--output-dir path/to/MobileVLM-1.7B_V2 \
@@ -69,10 +69,10 @@ Now both the LLaMA part and the image encoder is in the `MobileVLM-1.7B` directo

## Android compile and run
### compile
refer to `examples/llava/android/build_64.sh`
refer to `tools/llava/android/build_64.sh`
```sh
mkdir examples/llava/android/build_64
cd examples/llava/android/build_64
mkdir tools/llava/android/build_64
cd tools/llava/android/build_64
../build_64.sh
```
### run on Android
@@ -25,13 +25,13 @@ git clone https://huggingface.co/THUDM/glm-edge-v-5b or https://huggingface.co/T
2. Use `glmedge-surgery.py` to split the GLMV-EDGE model to LLM and multimodel projector constituents:

```sh
python ./examples/llava/glmedge-surgery.py -m ../model_path
python ./tools/llava/glmedge-surgery.py -m ../model_path
```

4. Use `glmedge-convert-image-encoder-to-gguf.py` to convert the GLMV-EDGE image encoder to GGUF:

```sh
python ./examples/llava/glmedge-convert-image-encoder-to-gguf.py -m ../model_path --llava-projector ../model_path/glm.projector --output-dir ../model_path
python ./tools/llava/glmedge-convert-image-encoder-to-gguf.py -m ../model_path --llava-projector ../model_path/glm.projector --output-dir ../model_path
```

5. Use `examples/convert_hf_to_gguf.py` to convert the LLM part of GLMV-EDGE to GGUF:
@@ -37,19 +37,19 @@ git clone https://huggingface.co/openai/clip-vit-large-patch14-336
2. Install the required Python packages:

```sh
pip install -r examples/llava/requirements.txt
pip install -r tools/llava/requirements.txt
```

3. Use `llava_surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:

```sh
python ./examples/llava/llava_surgery.py -m ../llava-v1.5-7b
python ./tools/llava/llava_surgery.py -m ../llava-v1.5-7b
```

4. Use `convert_image_encoder_to_gguf.py` to convert the LLaVA image encoder to GGUF:

```sh
python ./examples/llava/convert_image_encoder_to_gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
python ./tools/llava/convert_image_encoder_to_gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
```

5. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF:
@@ -69,12 +69,12 @@ git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b
2) Install the required Python packages:

```sh
pip install -r examples/llava/requirements.txt
pip install -r tools/llava/requirements.txt
```

3) Use `llava_surgery_v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models:
```console
python examples/llava/llava_surgery_v2.py -C -m ../llava-v1.6-vicuna-7b/
python tools/llava/llava_surgery_v2.py -C -m ../llava-v1.6-vicuna-7b/
```
- you will find a llava.projector and a llava.clip file in your model directory

@@ -88,7 +88,7 @@ curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.jso

5) Create the visual gguf model:
```console
python ./examples/llava/convert_image_encoder_to_gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
python ./tools/llava/convert_image_encoder_to_gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
```
- This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP
@@ -29,8 +29,8 @@ cmake --build build --config Release
Convert the PyTorch model to gguf files (you can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf) provided by us)

```bash
python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-o-2_6
python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-2_6 --minicpmv-projector ../MiniCPM-o-2_6/minicpmv.projector --output-dir ../MiniCPM-o-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 4
python ./tools/llava/minicpmv-surgery.py -m ../MiniCPM-o-2_6
python ./tools/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-2_6 --minicpmv-projector ../MiniCPM-o-2_6/minicpmv.projector --output-dir ../MiniCPM-o-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 4
python ./convert_hf_to_gguf.py ../MiniCPM-o-2_6/model

# quantize int4 version

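# A typical int4 quantization step for reference (a sketch, not part of this diff:
# the binary path and the Q4_K_M quant type are assumptions based on a default CMake build)
./build/bin/llama-quantize ../MiniCPM-o-2_6/model/ggml-model-f16.gguf ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M
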
@@ -28,8 +28,8 @@ cmake --build build --config Release
Convert the PyTorch model to gguf files (you can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) provided by us)

```bash
python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
python ./tools/llava/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
python ./tools/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model

# quantize int4 version

@@ -28,8 +28,8 @@ cmake --build build --config Release
Convert the PyTorch model to gguf files (you can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) provided by us)

```bash
python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-V-2_6
python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 3
python ./tools/llava/minicpmv-surgery.py -m ../MiniCPM-V-2_6
python ./tools/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 3
python ./convert_hf_to_gguf.py ../MiniCPM-V-2_6/model

# quantize int4 version

@@ -12,51 +12,30 @@ llama_add_compile_flags()

# examples

include_directories(${CMAKE_CURRENT_SOURCE_DIR})

if (EMSCRIPTEN)
else()
add_subdirectory(batched-bench)
add_subdirectory(batched)
add_subdirectory(embedding)
add_subdirectory(eval-callback)

add_subdirectory(gguf-hash)
add_subdirectory(gguf-split)
add_subdirectory(gguf)
add_subdirectory(gritlm)
add_subdirectory(imatrix)
add_subdirectory(infill)
add_subdirectory(llama-bench)
add_subdirectory(lookahead)
add_subdirectory(lookup)
add_subdirectory(main)
add_subdirectory(parallel)
add_subdirectory(passkey)
add_subdirectory(perplexity)
add_subdirectory(quantize)
add_subdirectory(retrieval)
if (LLAMA_BUILD_SERVER)
add_subdirectory(server)
endif()
add_subdirectory(save-load-state)
add_subdirectory(run)
add_subdirectory(simple)
add_subdirectory(simple-chat)
add_subdirectory(speculative)
add_subdirectory(speculative-simple)
add_subdirectory(tokenize)
add_subdirectory(tts)
add_subdirectory(gen-docs)
if (NOT GGML_BACKEND_DL)
# these examples use the backends directly and cannot be built with dynamic loading
add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(cvector-generator)
add_subdirectory(export-lora)
add_subdirectory(llava)
if (GGML_RPC)
add_subdirectory(rpc)
endif()
# these examples use the backends directly and cannot be built with dynamic loading
if (GGML_SYCL)
add_subdirectory(sycl)
endif()

@@ -1,217 +0,0 @@
|
||||
import argparse
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
import torch
|
||||
import numpy as np
|
||||
from gguf import *
|
||||
from transformers import (
|
||||
AutoProcessor,
|
||||
Qwen2VLConfig,
|
||||
Qwen2VLProcessor,
|
||||
Qwen2VLForConditionalGeneration,
|
||||
Qwen2_5_VLConfig, # type: ignore[reportAttributeAccessIssue]
|
||||
Qwen2_5_VLForConditionalGeneration, # type: ignore[reportAttributeAccessIssue]
|
||||
)
|
||||
|
||||
|
||||
VISION = "clip.vision"
|
||||
|
||||
|
||||
def k(raw_key: str, arch: str) -> str:
|
||||
return raw_key.format(arch=arch)
|
||||
|
||||
|
||||
def get_n_wa_pattern(fullatt_block_indexes: Optional[List[int]]):
|
||||
if fullatt_block_indexes is None:
|
||||
return 0
|
||||
n_wa = fullatt_block_indexes[0]
|
||||
for a, b in zip(fullatt_block_indexes, fullatt_block_indexes[1:]):
|
||||
if b - a - 1 != n_wa:
|
||||
raise ValueError(
f"window/full attention layers should have a fixed pattern of "
f"one full-attention layer followed by {n_wa} window-attention layers"
)
|
||||
return n_wa + 1
|
||||
|
||||
|
||||
class VL2:
|
||||
|
||||
@staticmethod
|
||||
def to_gguf_name(name: str) -> str:
|
||||
og = name
|
||||
name = name.replace("text_model", "t").replace("vision_model", "v")
|
||||
name = name.replace("blocks", "blk").replace("embeddings.", "")
|
||||
name = name.replace("attn.", "attn_")
|
||||
name = name.replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("proj.", "out.")
|
||||
# name = name.replace("layrnorm", "ln").replace("layer_norm", "ln").replace("layernorm", "ln")
|
||||
name = name.replace("norm1", "ln1").replace("norm2", "ln2")
|
||||
name = name.replace("merger.mlp", 'mm')
|
||||
print(f"[to_gguf_name] {og} --> {name}")
|
||||
return name
|
||||
|
||||
@classmethod
|
||||
def find_vision_tensors(cls, qwen2vl, dtype) -> Dict[str, np.ndarray]:
|
||||
vision_model = qwen2vl.visual
|
||||
tensor_map = {}
|
||||
for name, ten in vision_model.state_dict().items():
|
||||
ten = ten.numpy()
|
||||
if 'qkv' in name:
|
||||
if ten.ndim == 2: # weight
|
||||
c3, _ = ten.shape
|
||||
else: # bias
|
||||
c3 = ten.shape[0]
|
||||
assert c3 % 3 == 0
|
||||
c = c3 // 3
|
||||
wq = ten[:c]
|
||||
wk = ten[c: c * 2]
|
||||
wv = ten[c * 2:]
|
||||
tensor_map[cls.to_gguf_name(f"vision_model.{name}").replace("qkv", "q")] = wq
|
||||
tensor_map[cls.to_gguf_name(f"vision_model.{name}").replace("qkv", "k")] = wk
|
||||
tensor_map[cls.to_gguf_name(f"vision_model.{name}").replace("qkv", "v")] = wv
|
||||
elif 'merger' in name:
|
||||
if name.endswith("ln_q.weight"):
|
||||
tensor_map['v.post_ln.weight'] = ten
|
||||
elif name.endswith("ln_q.bias"):
|
||||
tensor_map['v.post_ln.bias'] = ten
|
||||
else:
|
||||
# "merger.mlp.%d.weight/bias" --> "mm.%d.weight/bias"
|
||||
tensor_map[cls.to_gguf_name(name)] = ten
|
||||
elif 'patch_embed.proj.weight' in name:
|
||||
# NOTE: split Conv3D into Conv2Ds
|
||||
c1, c2, kt, kh, kw = ten.shape
|
||||
assert kt == 2, "Current implementation only supports temporal_patch_size of 2"
|
||||
tensor_map["v.patch_embd.weight"] = ten[:, :, 0, ...]
|
||||
tensor_map["v.patch_embd.weight.1"] = ten[:, :, 1, ...]
|
||||
else:
|
||||
tensor_map[cls.to_gguf_name(f"vision_model.{name}")] = ten
|
||||
|
||||
for new_name, ten in tensor_map.items():
|
||||
if ten.ndim <= 1 or new_name.endswith("_norm.weight"):
|
||||
tensor_map[new_name] = ten.astype(np.float32)
|
||||
else:
|
||||
tensor_map[new_name] = ten.astype(dtype)
|
||||
tensor_map["v.position_embd.weight"] = np.zeros([10, 10], dtype=np.float32) # dummy tensor, just here as a placeholder
|
||||
return tensor_map
|
||||
|
||||
|
||||
class VL25(VL2):
|
||||
|
||||
@staticmethod
|
||||
def to_gguf_name(name: str) -> str:
|
||||
og = name
|
||||
name = name.replace("text_model", "t").replace("vision_model", "v")
|
||||
name = name.replace("blocks", "blk").replace("embeddings.", "")
|
||||
name = name.replace("attn.", "attn_")
|
||||
name = name.replace("mlp.down_proj", "ffn_down").replace("mlp.up_proj", "ffn_up")
|
||||
name = name.replace("mlp.gate_proj", "ffn_gate").replace("proj.", "out.")
|
||||
name = name.replace("norm1", "ln1").replace("norm2", "ln2")
|
||||
name = name.replace("merger.mlp", 'mm')
|
||||
print(f"[vl25][to_gguf_name] {og} --> {name}")
|
||||
return name
|
||||
|
||||
|
||||
def main(args):
|
||||
if args.data_type == 'fp32':
|
||||
dtype = torch.float32
|
||||
np_dtype = np.float32
|
||||
ftype = 0
|
||||
elif args.data_type == 'fp16':
|
||||
dtype = torch.float16
|
||||
np_dtype = np.float16
|
||||
ftype = 1
|
||||
else:
|
||||
raise ValueError()
|
||||
|
||||
local_model = False
|
||||
model_path = ""
|
||||
model_name = args.model_name
|
||||
print("model_name: ", model_name)
|
||||
if args.model_type == "qwen2vl":
|
||||
qwen2vl = Qwen2VLForConditionalGeneration.from_pretrained(
|
||||
model_name, torch_dtype=dtype, device_map="cpu"
|
||||
)
|
||||
cfg: Qwen2VLConfig = qwen2vl.config # type: ignore[reportAssignmentType]
|
||||
vcfg = cfg.vision_config
|
||||
else:
|
||||
qwen2vl = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
||||
model_name, torch_dtype=dtype, device_map="cpu"
|
||||
)
|
||||
cfg: Qwen2_5_VLConfig = qwen2vl.config # type: ignore[reportAssignmentType]
|
||||
vcfg = cfg.vision_config
|
||||
|
||||
if os.path.isdir(model_name):
|
||||
local_model = True
|
||||
if model_name.endswith(os.sep):
|
||||
model_name = model_name[:-1]
|
||||
model_path = model_name
|
||||
model_name = os.path.basename(model_name)
|
||||
fname_out = f"{model_name.replace('/', '-').lower()}-vision.gguf"
|
||||
|
||||
fout = GGUFWriter(path=fname_out, arch="clip")
|
||||
fout.add_description("image encoder for Qwen2VL")
|
||||
|
||||
fout.add_file_type(ftype)
|
||||
fout.add_bool("clip.has_text_encoder", False)
|
||||
fout.add_bool("clip.has_vision_encoder", True)
|
||||
fout.add_bool("clip.has_qwen2vl_merger", True)
|
||||
|
||||
print(cfg.vision_config)
|
||||
if 'silu' in cfg.vision_config.hidden_act.lower():
|
||||
fout.add_bool("clip.use_silu", True)
|
||||
fout.add_bool("clip.use_gelu", False)
|
||||
elif 'gelu' in cfg.vision_config.hidden_act.lower():
|
||||
fout.add_bool("clip.use_silu", False)
|
||||
fout.add_bool("clip.use_gelu", 'quick' not in cfg.vision_config.hidden_act.lower())
|
||||
else:
|
||||
raise ValueError()
|
||||
|
||||
if args.model_type == "qwen2.5vl":
|
||||
fout.add_uint32("clip.vision.n_wa_pattern", get_n_wa_pattern(vcfg.fullatt_block_indexes))
|
||||
fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.hidden_size)
|
||||
fout.add_uint32("clip.vision.projection_dim", vcfg.out_hidden_size)
|
||||
fout.add_string("clip.projector_type", "qwen2.5vl_merger")
|
||||
else:
|
||||
fout.add_string("clip.projector_type", "qwen2vl_merger")
|
||||
fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.embed_dim)
|
||||
fout.add_uint32("clip.vision.projection_dim", vcfg.hidden_size)
|
||||
|
||||
if args.model_type == "qwen2.5vl":
|
||||
tensor_map = VL25.find_vision_tensors(qwen2vl, np_dtype)
|
||||
else:
|
||||
tensor_map = VL2.find_vision_tensors(qwen2vl, np_dtype)
|
||||
for name, data in tensor_map.items():
|
||||
fout.add_tensor(name, data)
|
||||
|
||||
fout.add_uint32("clip.vision.patch_size", vcfg.patch_size)
|
||||
fout.add_uint32("clip.vision.image_size", 14 * 40)  # some reasonable size that is divisible by (14*2)
|
||||
fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), vcfg.num_heads)
|
||||
fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
|
||||
fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), vcfg.depth)
|
||||
fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), 0) # not sure what this does, put 0 here as a placeholder
|
||||
fout.add_name(model_name)
|
||||
"""
|
||||
HACK: Since the vision rope-related parameters aren't stored in the `Qwen2VLConfig`,
they are hardcoded in `clip_image_build_graph` in `clip.cpp`.
|
||||
"""
|
||||
|
||||
if local_model:
|
||||
processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_path)
|
||||
else:
|
||||
processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_name)
|
||||
fout.add_array("clip.vision.image_mean", processor.image_processor.image_mean) # type: ignore[reportAttributeAccessIssue]
|
||||
fout.add_array("clip.vision.image_std", processor.image_processor.image_std) # type: ignore[reportAttributeAccessIssue]
|
||||
|
||||
fout.write_header_to_file()
|
||||
fout.write_kv_data_to_file()
|
||||
fout.write_tensors_to_file()
|
||||
fout.close()
|
||||
print("save model as: ", fname_out)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("model_name", nargs='?', default="Qwen/Qwen2-VL-2B-Instruct")
|
||||
parser.add_argument("--model_type", nargs='?', choices=['qwen2vl', 'qwen2.5vl'], default="qwen2vl")
|
||||
parser.add_argument("--data_type", nargs='?', choices=['fp32', 'fp16'], default="fp32")
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
@@ -23,7 +23,7 @@ def create_completion(host, prompt, gbnf_grammar):
    """Calls the /completion API on llama-server.

    See
    https://github.com/ggml-org/llama.cpp/tree/HEAD/examples/server#api-endpoints
    https://github.com/ggml-org/llama.cpp/tree/HEAD/tools/server#api-endpoints
    """
    print(f" Request:\n Grammar:\n{textwrap.indent(gbnf_grammar, ' ')}\n Prompt:\n{textwrap.indent(prompt.rstrip(), ' ')}")
    headers = {"Content-Type": "application/json"}

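# Illustrative sketch (not part of the file above): a minimal /completion request that sends a
# GBNF grammar alongside the prompt using only the standard library; the n_predict value and
# the host-includes-port convention are assumptions.
import json
import urllib.request

def completion_sketch(host, prompt, gbnf_grammar):
    # POST the prompt and grammar to llama-server's /completion endpoint and
    # return the generated text from the "content" field of the JSON response.
    payload = {"prompt": prompt, "grammar": gbnf_grammar, "n_predict": 128}
    req = urllib.request.Request(
        f"http://{host}/completion",
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read())["content"]
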
@@ -1054,6 +1054,493 @@ class tinyBLAS_Q0_AVX {
|
||||
} \
|
||||
} \
|
||||
|
||||
template <typename TA, typename TB, typename TC>
|
||||
class tinyBLAS_BF16_PPC {
|
||||
public:
|
||||
tinyBLAS_BF16_PPC(int64_t k,
|
||||
const TA *A, int64_t lda,
|
||||
const TB *B, int64_t ldb,
|
||||
TC *C, int64_t ldc,
|
||||
int ith, int nth)
|
||||
: A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
|
||||
}
|
||||
|
||||
void matmul(int64_t m, int64_t n) {
|
||||
mnpack(0, m, 0, n);
|
||||
}
|
||||
|
||||
private:
|
||||
void vector_permute_store(vec_t *c, int numVec, unsigned char *vecOffset) {
|
||||
vec_t t[8], s[8];
|
||||
vec_t swiz1 = {0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23};
|
||||
vec_t swiz2 = {8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31};
|
||||
vec_t swiz3 = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
|
||||
vec_t swiz4 = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
|
||||
|
||||
if (numVec == 2) {
|
||||
t[0] = vec_perm(c[0], c[1], swiz1);
|
||||
t[1] = vec_perm(c[2], c[3], swiz1);
|
||||
s[0] = vec_perm(t[0], t[1], swiz3);
|
||||
s[1] = vec_perm(t[0], t[1], swiz4);
|
||||
vec_xst(s[0], 0, (vec_t*)vecOffset);
|
||||
vec_xst(s[1], 0, (vec_t*)(vecOffset + 16));
|
||||
} else if (numVec == 4) {
|
||||
t[0] = vec_perm(c[0], c[1], swiz1);
|
||||
t[1] = vec_perm(c[0], c[1], swiz2);
|
||||
t[2] = vec_perm(c[2], c[3], swiz1);
|
||||
t[3] = vec_perm(c[2], c[3], swiz2);
|
||||
s[0] = vec_perm(t[0], t[2], swiz3);
|
||||
s[1] = vec_perm(t[0], t[2], swiz4);
|
||||
s[2] = vec_perm(t[1], t[3], swiz3);
|
||||
s[3] = vec_perm(t[1], t[3], swiz4);
|
||||
for (int i = 0; i < 4; ++i)
|
||||
vec_xst(s[i], 0, (vec_t*)(vecOffset + i * 16));
|
||||
} else if (numVec == 8) {
|
||||
for (int i = 0; i < 4; i += 2) {
|
||||
t[i+0] = vec_perm(c[i+0], c[i+1], swiz1);
|
||||
t[i+1] = vec_perm(c[i+0], c[i+1], swiz2);
|
||||
}
|
||||
for (int i = 4; i < 8; i += 2) {
|
||||
t[i+0] = vec_perm(c[i+0], c[i+1], swiz1);
|
||||
t[i+1] = vec_perm(c[i+0], c[i+1], swiz2);
|
||||
}
|
||||
s[0] = vec_perm(t[0], t[2], swiz3);
|
||||
s[1] = vec_perm(t[0], t[2], swiz4);
|
||||
s[2] = vec_perm(t[1], t[3], swiz3);
|
||||
s[3] = vec_perm(t[1], t[3], swiz4);
|
||||
s[4] = vec_perm(t[4], t[6], swiz3);
|
||||
s[5] = vec_perm(t[4], t[6], swiz4);
|
||||
s[6] = vec_perm(t[5], t[7], swiz3);
|
||||
s[7] = vec_perm(t[5], t[7], swiz4);
|
||||
for (int i = 0; i < 8; ++i)
|
||||
vec_xst(s[i], 0, (vec_t*)(vecOffset + i * 16));
|
||||
}
|
||||
}
|
||||
|
||||
void packNormal(const TA* a, int64_t lda, int rows, int cols, unsigned char* vec) {
|
||||
int64_t i, j;
|
||||
TA *aoffset = NULL;
|
||||
unsigned char *vecOffset = NULL;
|
||||
TA * aoffsets[8];
|
||||
vector unsigned char c_arr[8];
|
||||
aoffset = const_cast<TA*>(a);
|
||||
vecOffset = vec;
|
||||
j = (rows >> 3);
|
||||
if (j > 0) {
|
||||
do {
|
||||
if (cols == 4) {
|
||||
aoffsets[0] = aoffset;
|
||||
for (int it = 1; it < 4; ++it)
|
||||
aoffsets[it] = aoffsets[it-1] + lda;
|
||||
aoffset += 4 * lda;
|
||||
for (int i = 0; i < 4; ++i)
|
||||
c_arr[i] = vec_xl(0, (vector unsigned char*)aoffsets[i]);
|
||||
vector_permute_store(c_arr, 4, vecOffset);
|
||||
for (int i = 0; i<4; i++)
|
||||
aoffsets[i] = aoffsets[i]+lda;
|
||||
vecOffset +=64;
|
||||
}
|
||||
i = (cols >> 3);
|
||||
if (i > 0) {
|
||||
aoffsets[0] = aoffset;
|
||||
for (int it = 1; it < 8; ++it) {
|
||||
aoffsets[it] = aoffsets[it-1] + lda;
|
||||
}
|
||||
aoffset += 8 * lda;
|
||||
do {
|
||||
for (int it = 0; it < 8; ++it)
|
||||
c_arr[it] = vec_xl(0, (vector unsigned char*)aoffsets[it]);
|
||||
vector_permute_store(c_arr, 8, vecOffset);
|
||||
for (int it = 0; it < 8; ++it)
|
||||
aoffsets[it] = aoffsets[it] + 8*lda;
|
||||
vecOffset += 128;
|
||||
i--;
|
||||
} while(i > 0);
|
||||
}
|
||||
j--;
|
||||
} while(j > 0);
|
||||
}
|
||||
if (rows & 4) {
|
||||
aoffsets[0] = aoffset;
|
||||
for (int it = 1; it < 4; ++it)
|
||||
aoffsets[it] = aoffsets[it-1] + lda;
|
||||
aoffset += 4 * lda;
|
||||
if (cols == 4) {
|
||||
for (int it = 0; it < 4; ++it)
|
||||
c_arr[it] = vec_xl(0, (vector unsigned char*)aoffsets[it]);
|
||||
vector_permute_store(c_arr, 2, vecOffset);
|
||||
for (int it = 0; it< 4; it++)
|
||||
aoffsets[it] = aoffsets[it] + lda;
|
||||
vecOffset += 32;
|
||||
}
|
||||
i = (cols >> 3);
|
||||
if (i > 0) {
|
||||
do {
|
||||
for (int it = 0; it < 4; ++it)
|
||||
c_arr[it] = vec_xl(0, (vector unsigned char*)aoffsets[it]);
|
||||
vector_permute_store(c_arr, 4, vecOffset);
|
||||
for (int it = 0; it< 4; it++)
|
||||
aoffsets[it] = aoffsets[it] + 8*lda;
|
||||
vecOffset += 64;
|
||||
i--;
|
||||
} while(i > 0);
|
||||
}
|
||||
}
|
||||
if (rows & 3) {
|
||||
aoffsets[0] = aoffset;
|
||||
for (int it = 1; it < 4; ++it)
|
||||
aoffsets[it] = aoffsets[it-1] + lda;
|
||||
if (cols == 4) {
|
||||
switch(rows) {
|
||||
case 3: c_arr[2] = vec_xl(0, (vector unsigned char*)aoffsets[2]);
|
||||
case 2: c_arr[1] = vec_xl(0, (vector unsigned char*)aoffsets[1]);
|
||||
case 1: c_arr[0] = vec_xl(0, (vector unsigned char*)aoffsets[0]);
|
||||
break;
|
||||
}
|
||||
vector_permute_store(c_arr, 2, vecOffset);
|
||||
for (int it = 0; it< 4; it++)
|
||||
aoffsets[it] = aoffsets[it] + lda;
|
||||
vecOffset += 32;
|
||||
}
|
||||
i = (cols >> 3);
|
||||
if (i > 0) {
|
||||
do {
|
||||
switch(rows) {
|
||||
case 3: c_arr[2] = vec_xl(0, (vector unsigned char*)aoffsets[2]);
|
||||
case 2: c_arr[1] = vec_xl(0, (vector unsigned char*)aoffsets[1]);
|
||||
case 1: c_arr[0] = vec_xl(0, (vector unsigned char*)aoffsets[0]);
|
||||
break;
|
||||
}
|
||||
vector_permute_store(c_arr, 4, vecOffset);
|
||||
for (int it = 0; it <4; it++)
|
||||
aoffsets[it] = aoffsets[it] + 8* lda;
|
||||
vecOffset += 64;
|
||||
i--;
|
||||
} while(i > 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
|
||||
int64_t mc, nc, mp, np;
|
||||
int m_rem = MIN(m - m0, 8);
|
||||
int n_rem = MIN(n - n0, 8);
|
||||
|
||||
if (m_rem >= 8 && n_rem >= 8) {
|
||||
mc = 8;
|
||||
nc = 8;
|
||||
gemm<8,8>(m0, m, n0, n);
|
||||
} else if (m_rem >= 4 && n_rem >= 8) {
|
||||
mc = 4;
|
||||
nc = 8;
|
||||
gemm<4,8>(m0, m, n0, n);
|
||||
} else if (m_rem >=8 && n_rem >=4){
|
||||
mc = 8;
|
||||
nc = 4;
|
||||
gemm<8,4>(m0, m, n0, n);
|
||||
} else if ((m_rem < 4) && (n_rem >= 8)) {
|
||||
nc = 8;
|
||||
switch(m_rem) {
|
||||
case 1:
|
||||
mc = 1;
|
||||
gemm_Mx8<1>(m0, m, n0, n);
|
||||
break;
|
||||
case 2:
|
||||
mc = 2;
|
||||
gemm_Mx8<2>(m0, m, n0, n);
|
||||
break;
|
||||
case 3:
|
||||
mc = 3;
|
||||
gemm_Mx8<3>(m0, m, n0, n);
|
||||
break;
|
||||
default:
|
||||
return;
|
||||
}
|
||||
} else if (m_rem >= 4 && n_rem >= 4) {
|
||||
mc = 4;
|
||||
nc = 4;
|
||||
gemm_small<4, 4>(m0, m, n0, n);
|
||||
} else if ((m_rem > 4) && (n_rem < 4)) {
|
||||
mc = 4;
|
||||
switch(n_rem) {
|
||||
case 1:
|
||||
nc = 1;
|
||||
gemm_small<4, 1>(m0, m, n0, n);
|
||||
break;
|
||||
case 2:
|
||||
nc = 2;
|
||||
gemm_small<4, 2>(m0, m, n0, n);
|
||||
break;
|
||||
case 3:
|
||||
nc = 3;
|
||||
gemm_small<4, 3>(m0, m, n0, n);
|
||||
break;
|
||||
|
||||
default:
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
switch((m_rem << 4) | n_rem) {
|
||||
case 0x43:
|
||||
mc = 4;
|
||||
nc = 3;
|
||||
gemm_small<4, 3>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x42:
|
||||
mc = 4;
|
||||
nc = 2;
|
||||
gemm_small<4, 2>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x41:
|
||||
mc = 4;
|
||||
nc = 1;
|
||||
gemm_small<4, 1>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x34:
|
||||
mc = 3;
|
||||
nc = 4;
|
||||
gemm_small<3, 4>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x33:
|
||||
mc = 3;
|
||||
nc = 3;
|
||||
gemm_small<3, 3>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x32:
|
||||
mc = 3;
|
||||
nc = 2;
|
||||
gemm_small<3, 2>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x31:
|
||||
mc = 3;
|
||||
nc = 1;
|
||||
gemm_small<3, 1>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x24:
|
||||
mc = 2;
|
||||
nc = 4;
|
||||
gemm_small<2,4>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x23:
|
||||
mc = 2;
|
||||
nc = 3;
|
||||
gemm_small<2, 3>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x22:
|
||||
mc = 2;
|
||||
nc = 2;
|
||||
gemm_small<2, 2>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x21:
|
||||
mc = 2;
|
||||
nc = 1;
|
||||
gemm_small<2, 1>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x14:
|
||||
mc = 1;
|
||||
nc = 4;
|
||||
gemm_small<1, 4>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x13:
|
||||
mc = 1;
|
||||
nc = 3;
|
||||
gemm_small<1, 3>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x12:
|
||||
mc = 1;
|
||||
nc = 2;
|
||||
gemm_small<1, 2>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x11:
|
||||
mc = 1;
|
||||
nc = 1;
|
||||
gemm_small<1, 1>(m0, m, n0, n);
|
||||
break;
|
||||
default:
|
||||
return;
|
||||
}
|
||||
}
|
||||
mp = m0 + (m - m0) / mc * mc;
|
||||
np = n0 + (n - n0) / nc * nc;
|
||||
mnpack(mp, m, n0, np);
|
||||
mnpack(m0, m, np, n);
|
||||
}
|
||||
|
||||
void KERNEL_4x8(int64_t ii, int64_t jj) {
|
||||
vec_t vec_A[4], vec_B[8] , vec_C[4];
|
||||
acc_t acc_0, acc_1;
|
||||
__builtin_mma_xxsetaccz(&acc_0);
|
||||
__builtin_mma_xxsetaccz(&acc_1);
|
||||
for (int l = 0; l < k; l+=8) {
|
||||
packNormal((A+(ii*lda)+l), lda, 4, 8, (uint8_t*)vec_A);
|
||||
packNormal((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B);
|
||||
for (int x = 0; x < 4; x++) {
|
||||
__builtin_mma_xvbf16ger2pp(&acc_0, vec_A[x], vec_B[x]);
|
||||
__builtin_mma_xvbf16ger2pp(&acc_1, vec_A[x], vec_B[x+4]);
|
||||
}
|
||||
}
|
||||
SAVE_ACC(&acc_0, ii, jj);
|
||||
SAVE_ACC(&acc_1, ii, jj+4);
|
||||
}
|
||||
|
||||
void KERNEL_8x4(int64_t ii, int64_t jj) {
|
||||
vec_t vec_A[8], vec_B[4] , vec_C[4];
|
||||
acc_t acc_0, acc_1;
|
||||
__builtin_mma_xxsetaccz(&acc_0);
|
||||
__builtin_mma_xxsetaccz(&acc_1);
|
||||
for (int l = 0; l < k; l+=8) {
|
||||
packNormal((A+(ii*lda)+l), lda, 8, 8, (uint8_t*)vec_A);
|
||||
packNormal((B+(jj*ldb)+l), ldb, 8, 4, (uint8_t*)vec_B);
|
||||
for (int x = 0; x < 4; x++) {
|
||||
__builtin_mma_xvbf16ger2pp(&acc_0, vec_A[x], vec_B[x]);
|
||||
__builtin_mma_xvbf16ger2pp(&acc_1, vec_A[x+4], vec_B[x]);
|
||||
}
|
||||
}
|
||||
SAVE_ACC(&acc_0, ii, jj);
|
||||
SAVE_ACC(&acc_1, ii+4, jj);
|
||||
}
|
||||
|
||||
|
||||
void KERNEL_8x8(int64_t ii, int64_t jj) {
|
||||
vec_t vec_A[8], vec_B[8], vec_C[4];
|
||||
acc_t acc_0, acc_1, acc_2, acc_3;
|
||||
__builtin_mma_xxsetaccz(&acc_0);
|
||||
__builtin_mma_xxsetaccz(&acc_1);
|
||||
__builtin_mma_xxsetaccz(&acc_2);
|
||||
__builtin_mma_xxsetaccz(&acc_3);
|
||||
for (int l = 0; l < k; l+=8) {
|
||||
packNormal(A+(ii*lda)+l, lda, 8, 8, (uint8_t*)vec_A);
|
||||
packNormal(B+(jj*ldb)+l, ldb, 8, 8, (uint8_t*)vec_B);
|
||||
for (int x = 0; x < 4; x++) {
|
||||
__builtin_mma_xvbf16ger2pp(&acc_0, vec_A[x], vec_B[x]);
|
||||
__builtin_mma_xvbf16ger2pp(&acc_1, (vec_t)vec_A[x], (vec_t)vec_B[x+4]);
|
||||
__builtin_mma_xvbf16ger2pp(&acc_2, (vec_t)vec_A[x+4], (vec_t)vec_B[x]);
|
||||
__builtin_mma_xvbf16ger2pp(&acc_3, (vec_t)vec_A[x+4], (vec_t)vec_B[x+4]);
|
||||
}
|
||||
}
|
||||
|
||||
SAVE_ACC(&acc_0, ii, jj);
|
||||
SAVE_ACC(&acc_1, ii, jj+4);
|
||||
SAVE_ACC(&acc_2, ii+4, jj);
|
||||
SAVE_ACC(&acc_3, ii+4, jj+4);
|
||||
}
|
||||
|
||||
template<int RM, int RN>
|
||||
void gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n) {
|
||||
int64_t ytiles = (m - m0) / RM;
|
||||
int64_t xtiles = (n - n0) / RN;
|
||||
int64_t tiles = xtiles * ytiles;
|
||||
int64_t duty = (tiles + nth - 1) / nth;
|
||||
int64_t start = duty * ith;
|
||||
int64_t end = start + duty;
|
||||
if (end > tiles)
|
||||
end = tiles;
|
||||
for (int64_t job = start; job < end; ++job) {
|
||||
int64_t ii = m0 + job / xtiles * RM;
|
||||
int64_t jj = n0 + job % xtiles * RN;
|
||||
vec_t vec_C[4];
|
||||
acc_t acc_0;
|
||||
__builtin_mma_xxsetaccz(&acc_0);
|
||||
vec_t vec_A[2], vec_B[2];
|
||||
for (int l=0; l<k; l+=4) {
|
||||
packNormal(A+(ii*lda)+l, lda, RM, 4, (uint8_t*)vec_A);
|
||||
packNormal(B+(jj*ldb)+l, ldb, RN, 4, (uint8_t*)vec_B);
|
||||
for (int x = 0; x<2; x++) {
|
||||
__builtin_mma_xvbf16ger2pp(&acc_0, vec_A[x], vec_B[x]);
|
||||
}
|
||||
}
|
||||
__builtin_mma_disassemble_acc(vec_C, &acc_0);
|
||||
for (int I = 0; I < RM; I++) {
|
||||
for (int J = 0; J < RN; J++) {
|
||||
*((TC*)(C+ii+((jj+J)*ldc)+I)) = *((TC*)&vec_C[I]+J);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<int RM>
|
||||
void gemm_Mx8(int64_t m0, int64_t m, int64_t n0, int64_t n) {
|
||||
int RN = 8;
|
||||
int64_t ytiles = (m - m0) / RM;
|
||||
int64_t xtiles = (n - n0) / RN;
|
||||
int64_t tiles = xtiles * ytiles;
|
||||
int64_t duty = (tiles + nth - 1) / nth;
|
||||
int64_t start = duty * ith;
|
||||
int64_t end = start + duty;
|
||||
if (end > tiles)
|
||||
end = tiles;
|
||||
for (int64_t job = start; job < end; ++job) {
|
||||
int64_t ii = m0 + job / xtiles * RM;
|
||||
int64_t jj = n0 + job % xtiles * RN;
|
||||
vec_t vec_C[4];
|
||||
acc_t acc_0, acc_1;
|
||||
__builtin_mma_xxsetaccz(&acc_0);
|
||||
__builtin_mma_xxsetaccz(&acc_1);
|
||||
vec_t vec_A[4], vec_B[8];
|
||||
for (int l=0; l<k; l+=8) {
|
||||
packNormal(A+(ii*lda)+l, lda, RM, 8, (uint8_t*)vec_A);
|
||||
packNormal(B+(jj*ldb)+l, ldb, RN, 8, (uint8_t*)vec_B);
|
||||
for (int x = 0; x<4; x++) {
|
||||
__builtin_mma_xvbf16ger2pp(&acc_0, vec_A[x], vec_B[x]);
|
||||
__builtin_mma_xvbf16ger2pp(&acc_1, vec_A[x], vec_B[x+4]);
|
||||
}
|
||||
}
|
||||
__builtin_mma_disassemble_acc(vec_C, &acc_0);
|
||||
for (int I = 0; I < RM; I++) {
|
||||
for (int J = 0; J < 4; J++) {
|
||||
*((TC*)(C+ii+((jj+J)*ldc)+I)) = *((TC*)&vec_C[I]+J);
|
||||
}
|
||||
}
|
||||
__builtin_mma_disassemble_acc(vec_C, &acc_1);
|
||||
for (int I = 0; I < RM; I++) {
|
||||
for (int J = 0; J < 4; J++) {
|
||||
*((TC*)(C+ii+((jj+4+J)*ldc)+I)) = *((TC*)&vec_C[I]+J);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<int RM, int RN>
|
||||
inline void kernel(int64_t ii, int64_t jj) {
|
||||
if constexpr(RM == 4 && RN == 8) {
|
||||
KERNEL_4x8(ii,jj);
|
||||
} else if constexpr(RM == 8 && RN == 8) {
|
||||
KERNEL_8x8(ii,jj);
|
||||
} else if constexpr(RM == 8 && RN == 4) {
|
||||
KERNEL_8x4(ii,jj);
|
||||
} else {
|
||||
static_assert(false, "RN/RM values not supported");
|
||||
}
|
||||
}
|
||||
|
||||
template <int RM, int RN>
|
||||
NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
|
||||
int64_t ytiles = (m - m0) / RM;
|
||||
int64_t xtiles = (n - n0) / RN;
|
||||
int64_t tiles = xtiles * ytiles;
|
||||
int64_t duty = (tiles + nth - 1) / nth;
|
||||
int64_t start = duty * ith;
|
||||
int64_t end = start + duty;
|
||||
if (end > tiles)
|
||||
end = tiles;
|
||||
for (int64_t job = start; job < end; ++job) {
|
||||
int64_t ii = m0 + job / xtiles * RM;
|
||||
int64_t jj = n0 + job % xtiles * RN;
|
||||
kernel<RM, RN>(ii, jj);
|
||||
}
|
||||
}
|
||||
|
||||
const TA *const A;
|
||||
const TB *const B;
|
||||
TC *C;
|
||||
const int64_t k;
|
||||
const int64_t lda;
|
||||
const int64_t ldb;
|
||||
const int64_t ldc;
|
||||
const int ith;
|
||||
const int nth;
|
||||
};
|
||||
|
||||
template <typename TA, typename TB, typename TC>
|
||||
class tinyBLAS_Q0_PPC {
|
||||
public:
|
||||
@@ -2202,6 +2689,7 @@ class tinyBLAS_PPC {
|
||||
boffset = vec;
|
||||
j = (rows >> 3);
|
||||
if (j > 0) {
|
||||
|
||||
do {
|
||||
aoffset1 = aoffset;
|
||||
aoffset2 = aoffset1 + lda;
|
||||
@@ -2875,9 +3363,22 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
|
||||
(float *)C, ldc};
|
||||
return tb.matmul(m, n);
|
||||
}
|
||||
#elif defined(__MMA__)
|
||||
if ((k % 8))
|
||||
return false;
|
||||
if(Btype == GGML_TYPE_BF16) {
|
||||
tinyBLAS_BF16_PPC<ggml_bf16_t, ggml_bf16_t, float> tb{ k,
|
||||
(const ggml_bf16_t *)A, lda,
|
||||
(const ggml_bf16_t *)B, ldb,
|
||||
(float *)C, ldc,
|
||||
params->ith, params->nth};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
|
||||
case GGML_TYPE_F16: {
|
||||
#if defined(__AVX512F__)
|
||||
if (Btype == GGML_TYPE_F16) {
|
||||
|
||||
@@ -2636,6 +2636,7 @@ static __global__ void mul_mat_q(
|
||||
|
||||
ids_dst_shared[j] = j;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
|
||||
#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
|
||||
@@ -2664,6 +2665,7 @@ static __global__ void mul_mat_q(
|
||||
return;
|
||||
}
|
||||
|
||||
// __syncthreads(); // There is no previous tile that could cause a race condition.
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) {
|
||||
const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x;
|
||||
@@ -2674,6 +2676,7 @@ static __global__ void mul_mat_q(
|
||||
|
||||
ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j];
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
offset_y += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int));
|
||||
@@ -2740,6 +2743,7 @@ static __global__ void mul_mat_q(
|
||||
continue;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) {
|
||||
const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x;
|
||||
@@ -2750,6 +2754,7 @@ static __global__ void mul_mat_q(
|
||||
|
||||
ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j];
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
offset_y += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int));
|
||||
@@ -2805,6 +2810,7 @@ static __global__ void mul_mat_q(
|
||||
}
|
||||
|
||||
// The memory layout for the fixup buffer is always contiguous, therefore reset ids:
|
||||
__syncthreads();
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) {
|
||||
const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x;
|
||||
@@ -2815,6 +2821,7 @@ static __global__ void mul_mat_q(
|
||||
|
||||
ids_dst_shared[j] = j;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
offset_y += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int));
|
||||
|
||||
@@ -340,11 +340,17 @@ struct vk_device_struct {
|
||||
vk_pipeline pipeline_get_rows[GGML_TYPE_COUNT];
|
||||
vk_pipeline pipeline_get_rows_f32[GGML_TYPE_COUNT];
|
||||
vk_pipeline pipeline_acc_f32;
|
||||
vk_pipeline pipeline_add_f32, pipeline_add_f32_norepeat;
|
||||
vk_pipeline pipeline_add_f16_f32_f16, pipeline_add_f16_f32_f16_norepeat;
|
||||
vk_pipeline pipeline_sub_f32, pipeline_sub_f32_norepeat;
|
||||
vk_pipeline pipeline_mul_f32, pipeline_mul_f32_norepeat;
|
||||
vk_pipeline pipeline_div_f32, pipeline_div_f32_norepeat;
|
||||
|
||||
// [src0 0=fp32,1=fp16][src1 0=fp32,1=fp16][dst 0=fp32,1=fp16]
|
||||
vk_pipeline pipeline_add[2][2][2];
|
||||
vk_pipeline pipeline_add_norepeat[2][2][2];
|
||||
vk_pipeline pipeline_sub[2][2][2];
|
||||
vk_pipeline pipeline_sub_norepeat[2][2][2];
|
||||
vk_pipeline pipeline_mul[2][2][2];
|
||||
vk_pipeline pipeline_mul_norepeat[2][2][2];
|
||||
vk_pipeline pipeline_div[2][2][2];
|
||||
vk_pipeline pipeline_div_norepeat[2][2][2];
|
||||
|
||||
vk_pipeline pipeline_concat_f32, pipeline_concat_f16, pipeline_concat_i32;
|
||||
vk_pipeline pipeline_upscale_f32;
|
||||
vk_pipeline pipeline_scale_f32;
|
||||
@@ -354,8 +360,8 @@ struct vk_device_struct {
|
||||
vk_pipeline pipeline_clamp_f32;
|
||||
vk_pipeline pipeline_pad_f32;
|
||||
vk_pipeline pipeline_repeat_f32, pipeline_repeat_back_f32;
|
||||
vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16, pipeline_cpy_f32_bf16;
|
||||
vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16, pipeline_contig_cpy_f32_bf16;
|
||||
vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16, pipeline_cpy_f16_f32, pipeline_cpy_f32_bf16;
|
||||
vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16, pipeline_contig_cpy_f16_f32, pipeline_contig_cpy_f32_bf16;
|
||||
vk_pipeline pipeline_cpy_f32_quant[GGML_TYPE_COUNT];
|
||||
vk_pipeline pipeline_cpy_quant_f32[GGML_TYPE_COUNT];
|
||||
vk_pipeline pipeline_norm_f32;
|
||||
@@ -363,14 +369,17 @@ struct vk_device_struct {
|
||||
vk_pipeline pipeline_rms_norm_f32;
|
||||
vk_pipeline pipeline_rms_norm_back_f32;
|
||||
vk_pipeline pipeline_l2_norm_f32;
|
||||
vk_pipeline pipeline_gelu_f32;
|
||||
vk_pipeline pipeline_gelu_quick_f32;
|
||||
vk_pipeline pipeline_silu_f32;
|
||||
vk_pipeline pipeline_silu_back_f32;
|
||||
vk_pipeline pipeline_relu_f32;
|
||||
|
||||
// [src/dst 0=fp32,1=fp16]
|
||||
vk_pipeline pipeline_gelu[2];
|
||||
vk_pipeline pipeline_gelu_quick[2];
|
||||
vk_pipeline pipeline_silu[2];
|
||||
vk_pipeline pipeline_relu[2];
|
||||
vk_pipeline pipeline_tanh[2];
|
||||
vk_pipeline pipeline_sigmoid[2];
|
||||
|
||||
vk_pipeline pipeline_leaky_relu_f32;
|
||||
vk_pipeline pipeline_tanh_f32;
|
||||
vk_pipeline pipeline_sigmoid_f32;
|
||||
vk_pipeline pipeline_silu_back_f32;
|
||||
vk_pipeline pipeline_diag_mask_inf_f32;
|
||||
vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16;
|
||||
vk_pipeline pipeline_soft_max_f32_wg512, pipeline_soft_max_f32_f16_wg512;
|
||||
@@ -389,6 +398,8 @@ struct vk_device_struct {
|
||||
vk_pipeline pipeline_rwkv_wkv6_f32;
|
||||
vk_pipeline pipeline_rwkv_wkv7_f32;
|
||||
vk_pipeline pipeline_opt_step_adamw_f32;
|
||||
vk_pipeline pipeline_conv2d_dw_whcn_f32;
|
||||
vk_pipeline pipeline_conv2d_dw_cwhn_f32;
|
||||
|
||||
// [2][2][2] is for {f16acc,f32acc}x{large,small_rows}x{unaligned, aligned}
|
||||
vk_pipeline pipeline_flash_attn_f32_f16_D64[GGML_TYPE_COUNT][2][2][2];
|
||||
@@ -701,6 +712,24 @@ struct vk_op_rwkv_wkv7_push_constants {
|
||||
uint32_t H;
|
||||
};
|
||||
|
||||
struct vk_op_conv2d_dw_push_constants {
|
||||
uint32_t ne;
|
||||
uint32_t batches;
|
||||
uint32_t channels;
|
||||
uint32_t dst_w;
|
||||
uint32_t dst_h;
|
||||
uint32_t src_w;
|
||||
uint32_t src_h;
|
||||
uint32_t knl_w;
|
||||
uint32_t knl_h;
|
||||
int32_t stride_x;
|
||||
int32_t stride_y;
|
||||
int32_t pad_x;
|
||||
int32_t pad_y;
|
||||
int32_t dilation_x;
|
||||
int32_t dilation_y;
|
||||
};
|
||||
|
||||
struct vk_op_upscale_push_constants {
|
||||
uint32_t ne; uint32_t a_offset; uint32_t d_offset;
|
||||
uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
|
||||
@@ -2488,11 +2517,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_f32, "cpy_f32_f32", cpy_f32_f32_len, cpy_f32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_f16, "cpy_f32_f16", cpy_f32_f16_len, cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f16_f16, "cpy_f16_f16", cpy_f16_f16_len, cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f16_f32, "cpy_f16_f32", cpy_f16_f32_len, cpy_f16_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_bf16,"cpy_f32_bf16",cpy_f32_bf16_len,cpy_f32_bf16_data,"main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f32, "contig_cpy_f32_f32", contig_cpy_f32_f32_len, contig_cpy_f32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f16, "contig_cpy_f32_f16", contig_cpy_f32_f16_len, contig_cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f16_f16, "contig_cpy_f16_f16", contig_cpy_f16_f16_len, contig_cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f16_f32, "contig_cpy_f16_f32", contig_cpy_f16_f32_len, contig_cpy_f16_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_bf16,"contig_cpy_f32_bf16",contig_cpy_f32_bf16_len,contig_cpy_f32_bf16_data,"main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
if (device->float_controls_rte_fp16) {
|
||||
@@ -2518,20 +2549,32 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q8_0], "cpy_q8_0_f32", cpy_q8_0_f32_len, cpy_q8_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q8_0), 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_IQ4_NL], "cpy_iq4_nl_f32", cpy_iq4_nl_f32_len, cpy_iq4_nl_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_IQ4_NL), 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_add_f32, "add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {0}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_add_f32_norepeat, "add_f32_norepeat", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {1}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_add_f16_f32_f16, "add_f16_f32_f16", add_f16_f32_f16_len, add_f16_f32_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {0}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_add_f16_f32_f16_norepeat, "add_f16_f32_f16_norepeat", add_f16_f32_f16_len, add_f16_f32_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {1}, 1);
|
||||
auto get_suffix = [](bool src0_f16, bool src1_f16, bool dst_f16) {
|
||||
std::string s;
|
||||
s += std::string(src0_f16 ? "_f16" : "_f32");
|
||||
s += std::string(src1_f16 ? "_f16" : "_f32");
|
||||
s += std::string(dst_f16 ? "_f16" : "_f32");
|
||||
return s;
|
||||
};
|
||||
|
||||
#define CREATE_BINARY(name, namemod, spec) \
|
||||
for (int s0 : {0,1}) for (int s1 : {0,1}) for (int d : {0,1}) \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ ## name ## namemod[s0][s1][d], \
|
||||
#name + get_suffix(s0, s1, d) + #namemod, name ## _len[s0][s1][d], name ## _data[s0][s1][d], \
|
||||
"main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, spec, 1);
|
||||
|
||||
CREATE_BINARY(add, , {0})
|
||||
CREATE_BINARY(add, _norepeat, {1})
|
||||
CREATE_BINARY(sub, , {0})
|
||||
CREATE_BINARY(sub, _norepeat, {1})
|
||||
CREATE_BINARY(mul, , {0})
|
||||
CREATE_BINARY(mul, _norepeat, {1})
|
||||
CREATE_BINARY(div, , {0})
|
||||
CREATE_BINARY(div, _norepeat, {1})
|
||||
#undef CREATE_BINARY
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_acc_f32, "acc_f32", acc_f32_len, acc_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_sub_f32, "sub_f32", sub_f32_len, sub_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {0}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_sub_f32_norepeat, "sub_f32_norepeat", sub_f32_len, sub_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {1}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_mul_f32, "mul_f32", mul_f32_len, mul_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {0}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_mul_f32_norepeat, "mul_f32_norepeat", mul_f32_len, mul_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {1}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_div_f32, "div_f32", div_f32_len, div_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {0}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_div_f32_norepeat, "div_f32_norepeat", div_f32_len, div_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {1}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_concat_f32, "concat_f32", concat_f32_len, concat_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_concat_f16, "concat_f16", concat_f16_len, concat_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_concat_i32, "concat_i32", concat_i32_len, concat_i32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
||||
@@ -2551,14 +2594,20 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_repeat_f32, "repeat_f32", repeat_f32_len, repeat_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_repeat_back_f32, "repeat_back_f32", repeat_back_f32_len, repeat_back_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_gelu_f32, "gelu_f32", gelu_f32_len, gelu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_gelu_quick_f32, "gelu_quick_f32", gelu_quick_f32_len, gelu_quick_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_silu_f32, "silu_f32", silu_f32_len, silu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_silu_back_f32, "silu_back_f32", silu_back_f32_len, silu_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_relu_f32, "relu_f32", relu_f32_len, relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
#define CREATE_UNARY(name) \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
CREATE_UNARY(gelu)
|
||||
CREATE_UNARY(gelu_quick)
|
||||
CREATE_UNARY(silu)
|
||||
CREATE_UNARY(relu)
|
||||
CREATE_UNARY(tanh)
|
||||
CREATE_UNARY(sigmoid)
|
||||
#undef CREATE_UNARY
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_leaky_relu_f32, "leaky_relu_f32", leaky_relu_f32_len, leaky_relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_tanh_f32, "tanh_f32", tanh_f32_len, tanh_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_sigmoid_f32, "sigmoid_f32", sigmoid_f32_len, sigmoid_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_silu_back_f32, "silu_back_f32", silu_back_f32_len, silu_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_diag_mask_inf_f32, "diag_mask_inf_f32", diag_mask_inf_f32_len, diag_mask_inf_f32_data, "main", 2, sizeof(vk_op_diag_mask_push_constants), {1, 512, 1}, {}, 1, true);
|
||||
|
||||
@@ -2610,6 +2659,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_opt_step_adamw_f32, "opt_step_adamw_f32", opt_step_adamw_f32_len, opt_step_adamw_f32_data, "main", 5, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f32, "conv2d_dw_whcn_f32", conv2d_dw_whcn_f32_len, conv2d_dw_whcn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_cwhn_f32, "conv2d_dw_cwhn_f32", conv2d_dw_cwhn_f32_len, conv2d_dw_cwhn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
for (auto &c : compiles) {
|
||||
c.wait();
|
||||
}
|
||||
@@ -4481,6 +4533,13 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const
|
||||
return ctx->device->pipeline_cpy_f16_f16;
|
||||
}
|
||||
}
|
||||
if (src->type == GGML_TYPE_F16 && to == GGML_TYPE_F32) {
|
||||
if (contig) {
|
||||
return ctx->device->pipeline_contig_cpy_f16_f32;
|
||||
} else {
|
||||
return ctx->device->pipeline_cpy_f16_f32;
|
||||
}
|
||||
}
|
||||
if (src->type == GGML_TYPE_F32 && to == GGML_TYPE_BF16) {
|
||||
if (contig) {
|
||||
return ctx->device->pipeline_contig_cpy_f32_bf16;
|
||||
@@ -5871,26 +5930,37 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_ADD:
|
||||
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ggml_are_same_shape(src0, src1) ? ctx->device->pipeline_add_f32_norepeat : ctx->device->pipeline_add_f32;
|
||||
}
|
||||
if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) {
|
||||
return ggml_are_same_shape(src0, src1) ? ctx->device->pipeline_add_f16_f32_f16_norepeat : ctx->device->pipeline_add_f16_f32_f16;
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_SUB:
|
||||
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ggml_are_same_shape(src0, src1) ? ctx->device->pipeline_sub_f32_norepeat : ctx->device->pipeline_sub_f32;
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_MUL:
|
||||
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ggml_are_same_shape(src0, src1) ? ctx->device->pipeline_mul_f32_norepeat : ctx->device->pipeline_mul_f32;
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_DIV:
|
||||
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ggml_are_same_shape(src0, src1) ? ctx->device->pipeline_div_f32_norepeat : ctx->device->pipeline_div_f32;
|
||||
if ((src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16) ||
|
||||
(src1->type != GGML_TYPE_F32 && src1->type != GGML_TYPE_F16) ||
|
||||
(dst->type != GGML_TYPE_F32 && dst->type != GGML_TYPE_F16)) {
|
||||
return nullptr;
|
||||
}
|
||||
switch (op) {
|
||||
case GGML_OP_ADD:
|
||||
{
|
||||
auto pipelines = ggml_are_same_shape(src0, src1) ? ctx->device->pipeline_add_norepeat : ctx->device->pipeline_add;
|
||||
return pipelines[src0->type == GGML_TYPE_F16][src1->type == GGML_TYPE_F16][dst->type == GGML_TYPE_F16];
|
||||
}
|
||||
case GGML_OP_SUB:
|
||||
{
|
||||
auto pipelines = ggml_are_same_shape(src0, src1) ? ctx->device->pipeline_sub_norepeat : ctx->device->pipeline_sub;
|
||||
return pipelines[src0->type == GGML_TYPE_F16][src1->type == GGML_TYPE_F16][dst->type == GGML_TYPE_F16];
|
||||
}
|
||||
case GGML_OP_MUL:
|
||||
{
|
||||
auto pipelines = ggml_are_same_shape(src0, src1) ? ctx->device->pipeline_mul_norepeat : ctx->device->pipeline_mul;
|
||||
return pipelines[src0->type == GGML_TYPE_F16][src1->type == GGML_TYPE_F16][dst->type == GGML_TYPE_F16];
|
||||
}
|
||||
case GGML_OP_DIV:
|
||||
{
|
||||
auto pipelines = ggml_are_same_shape(src0, src1) ? ctx->device->pipeline_div_norepeat : ctx->device->pipeline_div;
|
||||
return pipelines[src0->type == GGML_TYPE_F16][src1->type == GGML_TYPE_F16][dst->type == GGML_TYPE_F16];
|
||||
}
|
||||
default:
|
||||
break;
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_CONCAT:
|
||||
@@ -5984,37 +6054,25 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_UNARY:
|
||||
if ((src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16) ||
|
||||
(dst->type != GGML_TYPE_F32 && dst->type != GGML_TYPE_F16) ||
|
||||
(src0->type != dst->type)) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
switch (ggml_get_unary_op(dst)) {
|
||||
case GGML_UNARY_OP_SILU:
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_silu_f32;
|
||||
}
|
||||
break;
|
||||
return ctx->device->pipeline_silu[dst->type == GGML_TYPE_F16];
|
||||
case GGML_UNARY_OP_GELU:
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_gelu_f32;
|
||||
}
|
||||
break;
|
||||
return ctx->device->pipeline_gelu[dst->type == GGML_TYPE_F16];
|
||||
case GGML_UNARY_OP_GELU_QUICK:
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_gelu_quick_f32;
|
||||
}
|
||||
break;
|
||||
return ctx->device->pipeline_gelu_quick[dst->type == GGML_TYPE_F16];
|
||||
case GGML_UNARY_OP_RELU:
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_relu_f32;
|
||||
}
|
||||
break;
|
||||
return ctx->device->pipeline_relu[dst->type == GGML_TYPE_F16];
|
||||
case GGML_UNARY_OP_TANH:
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_tanh_f32;
|
||||
}
|
||||
break;
|
||||
return ctx->device->pipeline_tanh[dst->type == GGML_TYPE_F16];
|
||||
case GGML_UNARY_OP_SIGMOID:
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_sigmoid_f32;
|
||||
}
|
||||
break;
|
||||
return ctx->device->pipeline_sigmoid[dst->type == GGML_TYPE_F16];
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@@ -6137,6 +6195,15 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
||||
return ctx->device->pipeline_leaky_relu_f32;
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_CONV_2D_DW:
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
if (ggml_is_contiguous(src1)) {
|
||||
return ctx->device->pipeline_conv2d_dw_whcn_f32;
|
||||
} else if (ggml_is_contiguous_channels(src1)) {
|
||||
return ctx->device->pipeline_conv2d_dw_cwhn_f32;
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
@@ -6163,6 +6230,7 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
|
||||
case GGML_OP_REPEAT_BACK:
|
||||
case GGML_OP_ROPE:
|
||||
case GGML_OP_RMS_NORM:
|
||||
case GGML_OP_CONV_2D_DW:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
@@ -6459,6 +6527,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
||||
case GGML_OP_CONCAT:
|
||||
case GGML_OP_UPSCALE:
|
||||
case GGML_OP_UNARY:
|
||||
case GGML_OP_CONV_2D_DW:
|
||||
{
|
||||
const uint32_t ne = ggml_nelements(dst);
|
||||
if (ne > 262144) {
|
||||
@@ -7245,6 +7314,30 @@ static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, c
|
||||
}, dryrun);
|
||||
}
|
||||
|
||||
static void ggml_vk_conv_2d_dw(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
|
||||
vk_op_conv2d_dw_push_constants p{};
|
||||
p.ne = ggml_nelements(dst);
|
||||
p.channels = dst->ne[2];
|
||||
p.batches = dst->ne[3];
|
||||
p.dst_w = dst->ne[0];
|
||||
p.dst_h = dst->ne[1];
|
||||
p.src_w = src1->ne[0];
|
||||
p.src_h = src1->ne[1];
|
||||
p.knl_w = src0->ne[0];
|
||||
p.knl_h = src0->ne[1];
|
||||
p.stride_x = dst->op_params[0];
|
||||
p.stride_y = dst->op_params[1];
|
||||
p.pad_x = dst->op_params[2];
|
||||
p.pad_y = dst->op_params[3];
|
||||
p.dilation_x = dst->op_params[4];
|
||||
p.dilation_y = dst->op_params[5];
|
||||
|
||||
GGML_ASSERT(src0->ne[3] == p.channels);
|
||||
GGML_ASSERT(src1->ne[3] == p.batches);
|
||||
|
||||
ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_CONV_2D_DW, std::move(p), dryrun);
|
||||
}
|
||||
|
||||
static void ggml_vk_leaky_relu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
||||
const float * op_params = (const float *)dst->op_params;
|
||||
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_LEAKY_RELU, { (uint32_t)ggml_nelements(src0), 0, op_params[0], 0.0f }, dryrun);
|
||||
@@ -8265,6 +8358,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
||||
case GGML_OP_IM2COL:
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
case GGML_OP_POOL_2D:
|
||||
case GGML_OP_CONV_2D_DW:
|
||||
case GGML_OP_RWKV_WKV6:
|
||||
case GGML_OP_RWKV_WKV7:
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
@@ -8328,6 +8422,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
||||
case GGML_OP_IM2COL:
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
case GGML_OP_POOL_2D:
|
||||
case GGML_OP_CONV_2D_DW:
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
{
|
||||
// These operations all go through ggml_vk_op_f32, so short-circuit and
|
||||
@@ -8501,6 +8596,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
||||
case GGML_OP_POOL_2D:
|
||||
ggml_vk_pool_2d(ctx, compute_ctx, src0, node, dryrun);
|
||||
|
||||
break;
|
||||
case GGML_OP_CONV_2D_DW:
|
||||
ggml_vk_conv_2d_dw(ctx, compute_ctx, src0, src1, node, dryrun);
|
||||
|
||||
break;
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
ggml_vk_leaky_relu(ctx, compute_ctx, src0, node, dryrun);
|
||||
@@ -8622,6 +8721,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
|
||||
case GGML_OP_IM2COL:
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
case GGML_OP_POOL_2D:
|
||||
case GGML_OP_CONV_2D_DW:
|
||||
case GGML_OP_RWKV_WKV6:
|
||||
case GGML_OP_RWKV_WKV7:
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
@@ -9358,7 +9458,10 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
case GGML_UNARY_OP_RELU:
|
||||
case GGML_UNARY_OP_TANH:
|
||||
case GGML_UNARY_OP_SIGMOID:
|
||||
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
|
||||
return ggml_is_contiguous(op->src[0]) &&
|
||||
(op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
|
||||
(op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) &&
|
||||
(op->src[0]->type == op->type);
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
@@ -9538,6 +9641,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
}
|
||||
if (src1_type == GGML_TYPE_F32) {
|
||||
switch (src0_type) {
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_Q5_0:
|
||||
@@ -9576,6 +9680,9 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
case GGML_OP_SUB:
|
||||
case GGML_OP_MUL:
|
||||
case GGML_OP_DIV:
|
||||
return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
|
||||
(op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_F16) &&
|
||||
(op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16);
|
||||
case GGML_OP_SILU_BACK:
|
||||
case GGML_OP_RMS_NORM_BACK:
|
||||
case GGML_OP_SQR:
|
||||
@@ -9599,6 +9706,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
case GGML_OP_COUNT_EQUAL:
|
||||
case GGML_OP_IM2COL:
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
case GGML_OP_CONV_2D_DW:
|
||||
case GGML_OP_POOL_2D:
|
||||
case GGML_OP_RWKV_WKV6:
|
||||
case GGML_OP_RWKV_WKV7:
|
||||
|
||||
105
ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp
Normal file
@@ -0,0 +1,105 @@
#version 450

#include "types.comp"

layout (push_constant) uniform parameter
{
    uint ne;
    uint batches;
    uint channels;
    uint dst_w;
    uint dst_h;
    uint src_w;
    uint src_h;
    uint knl_w;
    uint knl_h;
    int stride_x;
    int stride_y;
    int pad_x;
    int pad_y;
    int dilation_x;
    int dilation_y;
} p;

layout (binding = 0) readonly buffer A {A_TYPE knl_data[];};
layout (binding = 1) readonly buffer B {B_TYPE src_data[];};
layout (binding = 2) writeonly buffer D {D_TYPE dst_data[];};

layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;

FLOAT_TYPE conv_2d_dw_whcn(uint idx) {
    uint i0 = idx / p.dst_w;
    uint dst_x = idx - i0 * p.dst_w;
    uint i1 = i0 / p.dst_h;
    uint dst_y = i0 - i1 * p.dst_h;
    uint n = i1 / p.channels;
    uint c = i1 - n * p.channels;

    uint src_i = n * p.channels * p.src_h * p.src_w + c * p.src_h * p.src_w;
    uint knl_i = c * p.knl_h * p.knl_w;

    FLOAT_TYPE sum = 0.0;
    for (uint knl_y = 0; knl_y < p.knl_h; ++knl_y) {
        uint src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y;
        if (src_y >= p.src_h) { // src_y < 0 will wrap to a large unsigned int
            continue;
        }
        for (uint knl_x = 0; knl_x < p.knl_w; ++knl_x) {
            uint src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x;
            if (src_x >= p.src_w) { // src_x < 0 will wrap to a large unsigned int
                continue;
            }
            FLOAT_TYPE v = FLOAT_TYPE(src_data[src_i + src_y * p.src_w + src_x]);
            FLOAT_TYPE k = FLOAT_TYPE(knl_data[knl_i + knl_y * p.knl_w + knl_x]);
            sum = fma(v, k, sum);
        }
    }
    return sum;
}

FLOAT_TYPE conv_2d_dw_cwhn(uint idx) {
    uint i0 = idx / p.channels;
    uint c = idx - i0 * p.channels;
    uint i1 = i0 / p.dst_w;
    uint dst_x = i0 - i1 * p.dst_w;
    uint n = i1 / p.dst_h;
    uint dst_y = i1 - n * p.dst_h;

    uint src_i = n * p.channels * p.src_h * p.src_w;
    uint src_row = p.src_w * p.channels;
    uint knl_row = p.knl_w * p.channels;

    FLOAT_TYPE sum = 0.0;
    for (uint knl_y = 0; knl_y < p.knl_h; ++knl_y) {
        uint src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y;
        if (src_y >= p.src_h) { // src_y < 0 will wrap to a large unsigned int
            continue;
        }
        for (uint knl_x = 0; knl_x < p.knl_w; ++knl_x) {
            uint src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x;
            if (src_x >= p.src_w) { // src_x < 0 will wrap to a large unsigned int
                continue;
            }
            FLOAT_TYPE v = FLOAT_TYPE(src_data[src_i + src_y * src_row + src_x * p.channels + c]);
            FLOAT_TYPE k = FLOAT_TYPE(knl_data[ knl_y * knl_row + knl_x * p.channels + c]);
            sum = fma(v, k, sum);
        }
    }
    return sum;
}

void main() {
    uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
    if (idx >= p.ne) {
        return;
    }

    FLOAT_TYPE result =
#ifdef WHCN
        conv_2d_dw_whcn(idx);
#else
        conv_2d_dw_cwhn(idx);
#endif
    dst_data[idx] = D_TYPE(result);
}
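For reference, the WHCN variant flattens the destination index as idx = ((n * channels + c) * dst_h + dst_y) * dst_w + dst_x and relies on unsigned wrap-around to fold the `src < 0` and `src >= size` bounds checks into a single comparison. A plain per-element sketch of the same computation (illustrative only, not part of the patch) looks like this:

```python
# Illustrative sketch of what conv_2d_dw_whcn computes for one output element.
# knl: [knl_h][knl_w] kernel for channel c, src: [src_h][src_w] plane for (n, c).
def conv_2d_dw_element(knl, src, dst_x, dst_y, stride, pad, dilation):
    knl_h, knl_w = len(knl), len(knl[0])
    src_h, src_w = len(src), len(src[0])
    s = 0.0
    for ky in range(knl_h):
        sy = dst_y * stride[1] + ky * dilation[1] - pad[1]
        if sy < 0 or sy >= src_h:   # the shader folds both checks into one unsigned compare
            continue
        for kx in range(knl_w):
            sx = dst_x * stride[0] + kx * dilation[0] - pad[0]
            if sx < 0 or sx >= src_w:
                continue
            s += src[sy][sx] * knl[ky][kx]
    return s
```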
@@ -17,5 +17,5 @@ void main() {
        return;
    }

    data_d[i] = max(float(data_a[i]), 0);
    data_d[i] = D_TYPE(max(float(data_a[i]), 0));
}

@@ -16,5 +16,5 @@ void main() {
    if (i >= p.KX) {
        return;
    }
    data_d[i] = D_TYPE(1. / (1 + exp(-1. *data_a[i])));
    data_d[i] = D_TYPE(1. / (1 + exp(-1. * float(data_a[i]))));
}

@@ -16,5 +16,5 @@ void main() {
    if (i >= p.KX) {
        return;
    }
    data_d[i] = D_TYPE(1. - 2. / (exp(2.*data_a[i]) + 1.));
    data_d[i] = D_TYPE(1. - 2. / (exp(2.*float(data_a[i])) + 1.));
}

@@ -485,10 +485,12 @@ void process_shaders() {
|
||||
string_to_spv("cpy_f32_f32", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("cpy_f32_f16", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}});
|
||||
string_to_spv("cpy_f16_f16", "copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
|
||||
string_to_spv("cpy_f16_f32", "copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
|
||||
string_to_spv("cpy_f32_bf16","copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "uint16_t"}, {"DATA_D_BF16", "1"}});
|
||||
string_to_spv("contig_cpy_f32_f32", "contig_copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("contig_cpy_f32_f16", "contig_copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}});
|
||||
string_to_spv("contig_cpy_f16_f16", "contig_copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
|
||||
string_to_spv("contig_cpy_f16_f32", "contig_copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
|
||||
string_to_spv("contig_cpy_f32_bf16","contig_copy.comp",{{"A_TYPE", "float"}, {"D_TYPE", "uint16_t"}, {"DATA_D_BF16", "1"}});
|
||||
|
||||
for (std::string t : {"q4_0", "q4_1", "q5_0", "q5_1", "q8_0", "iq4_nl"}) {
|
||||
@@ -497,8 +499,26 @@ void process_shaders() {
|
||||
string_to_spv("cpy_" + t + "_f32", "copy_from_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
}
|
||||

    string_to_spv("add_f32", "add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
    string_to_spv("add_f16_f32_f16", "add.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}});
    auto get_type_str = [](bool f16) {
        return f16 ? "float16_t" : "float";
    };
    auto get_suffix = [](bool src0_f16, bool src1_f16, bool dst_f16) {
        std::string s;
        s += std::string(src0_f16 ? "_f16" : "_f32");
        s += std::string(src1_f16 ? "_f16" : "_f32");
        s += std::string(dst_f16 ? "_f16" : "_f32");
        return s;
    };
    for (std::string op : {"add", "sub", "mul", "div"}) {
        for (auto src0_f16 : {false, true}) {
            for (auto src1_f16 : {false, true}) {
                for (auto dst_f16 : {false, true}) {
                    auto name = op + get_suffix(src0_f16, src1_f16, dst_f16);
                    string_to_spv(name.c_str(), op + ".comp", {{"A_TYPE", get_type_str(src0_f16)}, {"B_TYPE", get_type_str(src1_f16)}, {"D_TYPE", get_type_str(dst_f16)}, {"FLOAT_TYPE", "float"}});
                }
            }
        }
    }
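The loop replaces the two hand-written add variants with a full 2×2×2 matrix of F32/F16 combinations per binary op. As a quick illustration (sketch only), the generated pipeline names follow the `get_suffix` order of src0, src1, dst:

```python
# Sketch: enumerate the shader variant names produced by the loop above
# ("add_f32_f32_f32", "add_f32_f32_f16", ..., "div_f16_f16_f16" - 8 per op).
types = ["f32", "f16"]
names = [f"{op}_{a}_{b}_{d}"
         for op in ("add", "sub", "mul", "div")
         for a in types for b in types for d in types]
print(len(names))  # 32
```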
|
||||
|
||||
string_to_spv("sub_f32", "sub.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
|
||||
@@ -533,14 +553,21 @@ void process_shaders() {
|
||||
|
||||
string_to_spv("upscale_f32", "upscale.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
|
||||
string_to_spv("gelu_f32", "gelu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("gelu_quick_f32", "gelu_quick.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("silu_f32", "silu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("silu_back_f32", "silu_back.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("relu_f32", "relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("leaky_relu_f32", "leaky_relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("tanh_f32", "tanh.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("sigmoid_f32", "sigmoid.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("gelu_f16", "gelu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
string_to_spv("gelu_f32", "gelu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("gelu_quick_f16", "gelu_quick.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
string_to_spv("gelu_quick_f32", "gelu_quick.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("silu_f16", "silu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
string_to_spv("silu_f32", "silu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("relu_f16", "relu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
string_to_spv("relu_f32", "relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("tanh_f16", "tanh.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
string_to_spv("tanh_f32", "tanh.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("sigmoid_f16", "sigmoid.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
string_to_spv("sigmoid_f32", "sigmoid.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
|
||||
string_to_spv("leaky_relu_f32", "leaky_relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("silu_back_f32", "silu_back.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
|
||||
string_to_spv("diag_mask_inf_f32", "diag_mask_inf.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
|
||||
@@ -584,6 +611,9 @@ void process_shaders() {
|
||||
|
||||
string_to_spv("opt_step_adamw_f32", "opt_step_adamw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));
|
||||
|
||||
string_to_spv("conv2d_dw_whcn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"WHCN", "1"}}));
|
||||
string_to_spv("conv2d_dw_cwhn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"CWHN", "1"}}));
|
||||
|
||||
for (auto &c : compiles) {
|
||||
c.wait();
|
||||
}
|
||||
@@ -638,7 +668,12 @@ void write_output_files() {
|
||||
std::remove(path.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
for (const char *op : {"add", "sub", "mul", "div"}) {
|
||||
fprintf(hdr, "extern unsigned char *%s_data[2][2][2];\n", op);
|
||||
fprintf(hdr, "extern uint64_t %s_len[2][2][2];\n", op);
|
||||
fprintf(src, "unsigned char *%s_data[2][2][2] = {{{%s_f32_f32_f32_data, %s_f32_f32_f16_data}, {%s_f32_f16_f32_data, %s_f32_f16_f16_data}}, {{%s_f16_f32_f32_data, %s_f16_f32_f16_data}, {%s_f16_f16_f32_data, %s_f16_f16_f16_data}}};\n", op, op, op, op, op, op, op, op, op);
|
||||
fprintf(src, "uint64_t %s_len[2][2][2] = {{{%s_f32_f32_f32_len, %s_f32_f32_f16_len}, {%s_f32_f16_f32_len, %s_f32_f16_f16_len}}, {{%s_f16_f32_f32_len, %s_f16_f32_f16_len}, {%s_f16_f16_f32_len, %s_f16_f16_f16_len}}};\n", op, op, op, op, op, op, op, op, op);
|
||||
}
|
||||
fclose(hdr);
|
||||
fclose(src);
|
||||
}
|
||||
|
||||
@@ -234,6 +234,7 @@ class Keys:
|
||||
SPATIAL_MERGE_SIZE = "clip.vision.spatial_merge_size"
|
||||
USE_GELU = "clip.use_gelu"
|
||||
USE_SILU = "clip.use_silu"
|
||||
N_WA_PATTERN = "clip.vision.n_wa_pattern" # used by qwen2.5vl
|
||||
|
||||
class Attention:
|
||||
HEAD_COUNT = "clip.vision.attention.head_count"
|
||||
@@ -2032,6 +2033,8 @@ class PoolingType(IntEnum):
|
||||
NONE = 0
|
||||
MEAN = 1
|
||||
CLS = 2
|
||||
LAST = 3
|
||||
RANK = 4
|
||||
|
||||
|
||||
class GGMLQuantizationType(IntEnum):
|
||||
@@ -2162,6 +2165,8 @@ class VisionProjectorType:
|
||||
GEMMA3 = "gemma3"
|
||||
IDEFICS3 = "idefics3"
|
||||
PIXTRAL = "pixtral"
|
||||
QWEN2VL = "qwen2vl_merger"
|
||||
QWEN25VL = "qwen2.5vl_merger"
|
||||
|
||||
|
||||
# Items here are (block size, type size)
|
||||
|
||||
@@ -984,6 +984,9 @@ class GGUFWriter:
|
||||
def add_vision_projector_scale_factor(self, value: int) -> None:
|
||||
self.add_uint32(Keys.ClipVision.Projector.SCALE_FACTOR, value)
|
||||
|
||||
def add_vision_n_wa_pattern(self, value: int) -> None:
|
||||
self.add_uint32(Keys.ClipVision.N_WA_PATTERN, value)
|
||||
|
||||
def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
|
||||
pack_prefix = ''
|
||||
if not skip_pack_prefix:
|
||||
|
||||
@@ -896,6 +896,7 @@ class TensorNameMap:
|
||||
|
||||
MODEL_TENSOR.V_MMPROJ: (
|
||||
"multi_modal_projector.linear_{bid}",
|
||||
"visual.merger.mlp.{bid}", # qwen2vl
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_MMPROJ_FC: (
|
||||
@@ -919,6 +920,7 @@ class TensorNameMap:
|
||||
"vpm.embeddings.patch_embedding",
|
||||
"model.vision_model.embeddings.patch_embedding", # SmolVLM
|
||||
"vision_tower.patch_conv", # pixtral
|
||||
"visual.patch_embed.proj", # qwen2vl
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_EMBD_POS: (
|
||||
@@ -932,6 +934,7 @@ class TensorNameMap:
|
||||
"vpm.encoder.layers.{bid}.self_attn.q_proj",
|
||||
"model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM
|
||||
"vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral
|
||||
"visual.blocks.{bid}.attn.q", # qwen2vl, generated
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_ATTN_K: (
|
||||
@@ -939,6 +942,7 @@ class TensorNameMap:
|
||||
"vpm.encoder.layers.{bid}.self_attn.k_proj",
|
||||
"model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM
|
||||
"vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral
|
||||
"visual.blocks.{bid}.attn.k", # qwen2vl, generated
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_ATTN_V: (
|
||||
@@ -946,6 +950,7 @@ class TensorNameMap:
|
||||
"vpm.encoder.layers.{bid}.self_attn.v_proj",
|
||||
"model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM
|
||||
"vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral
|
||||
"visual.blocks.{bid}.attn.v", # qwen2vl, generated
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_INPUT_NORM: (
|
||||
@@ -953,6 +958,7 @@ class TensorNameMap:
|
||||
"vpm.encoder.layers.{bid}.layer_norm1",
|
||||
"model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM
|
||||
"vision_tower.transformer.layers.{bid}.attention_norm", # pixtral
|
||||
"visual.blocks.{bid}.norm1", # qwen2vl
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_OUTPUT: (
|
||||
@@ -960,6 +966,7 @@ class TensorNameMap:
|
||||
"vpm.encoder.layers.{bid}.self_attn.out_proj",
|
||||
"model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM
|
||||
"vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral
|
||||
"visual.blocks.{bid}.attn.proj", # qwen2vl
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_OUTPUT_NORM: (
|
||||
@@ -967,17 +974,24 @@ class TensorNameMap:
|
||||
"vpm.encoder.layers.{bid}.layer_norm2",
|
||||
"model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM
|
||||
"vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral
|
||||
"visual.blocks.{bid}.norm2", # qwen2vl
|
||||
),
|
||||
|
||||
# some namings are messed up because the original llava code swapped fc1 and fc2
|
||||
# we have no better way to fix it, just be careful
|
||||
# new models like pixtral use the correct naming
|
||||
MODEL_TENSOR.V_ENC_FFN_UP: (
|
||||
"vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1",
|
||||
"vpm.encoder.layers.{bid}.mlp.fc1",
|
||||
"model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3 (note: name is swapped)
|
||||
"vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral
|
||||
"visual.blocks.{bid}.mlp.fc2", # qwen2vl
|
||||
"visual.blocks.{bid}.mlp.up_proj", # qwen2.5vl
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_FFN_GATE: (
|
||||
"vision_tower.transformer.layers.{bid}.feed_forward.gate_proj", # pixtral
|
||||
"visual.blocks.{bid}.mlp.gate_proj", # qwen2.5vl
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_FFN_DOWN: (
|
||||
@@ -985,6 +999,8 @@ class TensorNameMap:
|
||||
"vpm.encoder.layers.{bid}.mlp.fc2",
|
||||
"model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3 (note: name is swapped)
|
||||
"vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral
|
||||
"visual.blocks.{bid}.mlp.fc1", # qwen2vl
|
||||
"visual.blocks.{bid}.mlp.down_proj", # qwen2.5vl
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_PRE_NORM: (
|
||||
@@ -995,6 +1011,7 @@ class TensorNameMap:
|
||||
MODEL_TENSOR.V_POST_NORM: (
|
||||
"vision_tower.vision_model.post_layernorm",
|
||||
"model.vision_model.post_layernorm", # SmolVLM
|
||||
"visual.merger.ln_q", # qwen2vl
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_MM_INP_PROJ: (
|
||||
|
||||
@@ -1,6 +1,6 @@
# GBNF Guide

GBNF (GGML BNF) is a format for defining [formal grammars](https://en.wikipedia.org/wiki/Formal_grammar) to constrain model outputs in `llama.cpp`. For example, you can use it to force the model to generate valid JSON, or speak only in emojis. GBNF grammars are supported in various ways in `examples/main` and `examples/server`.
GBNF (GGML BNF) is a format for defining [formal grammars](https://en.wikipedia.org/wiki/Formal_grammar) to constrain model outputs in `llama.cpp`. For example, you can use it to force the model to generate valid JSON, or speak only in emojis. GBNF grammars are supported in various ways in `tools/main` and `tools/server`.

## Background

@@ -110,21 +110,21 @@ While semantically correct, the syntax `x? x? x?.... x?` (with N repetitions) ma

You can use GBNF grammars:

- In [llama-server](../examples/server)'s completion endpoints, passed as the `grammar` body field
- In [llama-cli](../examples/main), passed as the `--grammar` & `--grammar-file` flags
- In [llama-server](../tools/server)'s completion endpoints, passed as the `grammar` body field
- In [llama-cli](../tools/main), passed as the `--grammar` & `--grammar-file` flags
- With [test-gbnf-validator](../tests/test-gbnf-validator.cpp), to test them against strings.

## JSON Schemas → GBNF

`llama.cpp` supports converting a subset of https://json-schema.org/ to GBNF grammars:

- In [llama-server](../examples/server):
- In [llama-server](../tools/server):
  - For any completion endpoints, passed as the `json_schema` body field
  - For the `/chat/completions` endpoint, passed inside the `response_format` body field (e.g. `{"type", "json_object", "schema": {"items": {}}}` or `{ type: "json_schema", json_schema: {"schema": ...} }`)
- In [llama-cli](../examples/main), passed as the `--json` / `-j` flag
- In [llama-cli](../tools/main), passed as the `--json` / `-j` flag
- To convert to a grammar ahead of time:
  - in CLI, with [examples/json_schema_to_grammar.py](../examples/json_schema_to_grammar.py)
  - in JavaScript with [json-schema-to-grammar.mjs](../examples/server/public_legacy/json-schema-to-grammar.mjs) (this is used by the [server](../examples/server)'s Web UI)
  - in JavaScript with [json-schema-to-grammar.mjs](../tools/server/public_legacy/json-schema-to-grammar.mjs) (this is used by the [server](../tools/server)'s Web UI)

Take a look at [tests](../tests/test-json-schema-to-grammar.cpp) to see which features are likely supported (you'll also find usage examples in https://github.com/ggml-org/llama.cpp/pull/5978, https://github.com/ggml-org/llama.cpp/pull/6659 & https://github.com/ggml-org/llama.cpp/pull/6555).
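For instance, a schema-constrained completion request can be sent to a locally running llama-server roughly like this (a minimal sketch; the host, port, prompt and schema are placeholders, not from this patch):

```python
# Sketch: constrain a llama-server completion with a JSON schema
# (assumes a server is already running on localhost:8080).
import json, urllib.request

payload = {
    "prompt": "List three colors as JSON:",
    "json_schema": {"type": "array", "items": {"type": "string"}},
}
req = urllib.request.Request(
    "http://localhost:8080/completion",
    data=json.dumps(payload).encode(),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read())["content"])
```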
|
||||
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
},
|
||||
{
|
||||
// uses match expressions in steps.py
|
||||
"root": "examples/server/tests",
|
||||
"root": "tools/server/tests",
|
||||
"pythonVersion": "3.10",
|
||||
},
|
||||
],
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
-r ../examples/llava/requirements.txt
|
||||
-r ../examples/server/bench/requirements.txt
|
||||
-r ../examples/server/tests/requirements.txt
|
||||
-r ../tools/llava/requirements.txt
|
||||
-r ../tools/server/bench/requirements.txt
|
||||
-r ../tools/server/tests/requirements.txt
|
||||
|
||||
-r ./requirements-compare-llama-bench.txt
|
||||
-r ./requirements-pydantic.txt
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
Example:
|
||||
python scripts/fetch_server_test_models.py
|
||||
( cd examples/server/tests && ./tests.sh -v -x -m slow )
|
||||
( cd tools/server/tests && ./tests.sh -v -x -m slow )
|
||||
'''
|
||||
import ast
|
||||
import glob
|
||||
@@ -66,7 +66,7 @@ if __name__ == '__main__':
|
||||
|
||||
models = sorted(list(set([
|
||||
model
|
||||
for test_file in glob.glob('examples/server/tests/unit/test_*.py')
|
||||
for test_file in glob.glob('tools/server/tests/unit/test_*.py')
|
||||
for model in collect_hf_model_test_parameters(test_file)
|
||||
])), key=lambda m: (m.hf_repo, m.hf_file))
|
||||
|
||||
|
||||
@@ -1 +1 @@
|
||||
f3a375f20bf56860b30e7c511d03593a1e393345
|
||||
0482de9c63b9134eb462c7732888c0ee0dbc2755
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
'''
|
||||
Simplistic tool call benchmarks for llama-server and ollama.
|
||||
|
||||
Essentially runs the tests at server/examples/server/tests/unit/test_tool_call.py N times, at different temperatures and on different backends (current llama-server, baseline llama-server and ollama),
|
||||
Essentially runs the tests at server/tools/server/tests/unit/test_tool_call.py N times, at different temperatures and on different backends (current llama-server, baseline llama-server and ollama),
|
||||
and plots the results of multiple runs (from same .jsonl file or multiple ones) as a success rate heatmap.
|
||||
|
||||
Simple usage example:
|
||||
@@ -51,8 +51,8 @@ import typer
|
||||
|
||||
sys.path.insert(0, Path(__file__).parent.parent.as_posix())
|
||||
if True:
|
||||
from examples.server.tests.utils import ServerProcess
|
||||
from examples.server.tests.unit.test_tool_call import TIMEOUT_SERVER_START, do_test_calc_result, do_test_hello_world, do_test_weather
|
||||
from tools.server.tests.utils import ServerProcess
|
||||
from tools.server.tests.unit.test_tool_call import TIMEOUT_SERVER_START, do_test_calc_result, do_test_hello_world, do_test_weather
|
||||
|
||||
|
||||
@contextmanager
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
# CMake equivalent of `xxd -i ${INPUT} ${OUTPUT}`
|
||||
# Usage: cmake -DINPUT=examples/server/public/index.html -DOUTPUT=examples/server/index.html.hpp -P scripts/xxd.cmake
|
||||
# Usage: cmake -DINPUT=tools/server/public/index.html -DOUTPUT=tools/server/index.html.hpp -P scripts/xxd.cmake
|
||||
|
||||
SET(INPUT "" CACHE STRING "Input File")
|
||||
SET(OUTPUT "" CACHE STRING "Output File")
|
||||
|
||||
@@ -189,7 +189,7 @@ llama_ubatch llama_sbatch::split_seq(size_t n_ubatch) {
|
||||
return ubatch;
|
||||
}
|
||||
|
||||
void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
|
||||
llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
|
||||
GGML_ASSERT(batch.n_tokens >= 0);
|
||||
this->batch = &batch;
|
||||
this->n_embd = n_embd;
|
||||
@@ -203,6 +203,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
|
||||
for (size_t i = 0; i < n_tokens; ++i) {
|
||||
ids[i] = i;
|
||||
}
|
||||
|
||||
if (simple_split) {
|
||||
seq.resize(1);
|
||||
llama_sbatch_seq & s = seq[0];
|
||||
@@ -212,6 +213,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
|
||||
s.length = n_tokens;
|
||||
return;
|
||||
}
|
||||
|
||||
std::sort(ids.begin(), ids.end(),
|
||||
[&batch](size_t a, size_t b) {
|
||||
int32_t n_seq_a = batch.n_seq_id ? batch.n_seq_id[a] : 1;
|
||||
@@ -239,6 +241,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
|
||||
return n_seq_a > n_seq_b;
|
||||
}
|
||||
);
|
||||
|
||||
// init seq
|
||||
llama_sbatch_seq * last_seq = nullptr;
|
||||
|
||||
@@ -262,6 +265,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
|
||||
seq.push_back(new_seq);
|
||||
last_seq = &seq.back();
|
||||
}
|
||||
|
||||
// keep shared prompts first at the end, then sort by length descending.
|
||||
std::sort(seq.begin(), seq.end(),
|
||||
[](llama_sbatch_seq & a, llama_sbatch_seq & b) {
|
||||
|
||||
@@ -70,7 +70,8 @@ struct llama_sbatch {
|
||||
// sequence-wise split
|
||||
llama_ubatch split_seq(size_t n_ubatch);
|
||||
|
||||
void from_batch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
|
||||
llama_sbatch() = default;
|
||||
llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
|
||||
};
|
||||
|
||||
// temporary allocate memory for the input batch if needed
|
||||
|
||||
@@ -447,7 +447,7 @@ int32_t llm_chat_apply_template(
|
||||
if (add_ass) {
|
||||
ss << "<|assistant|>";
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4 || tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4) {
|
||||
ss << "[gMASK]" << "<sop>";
|
||||
for (auto message : chat) {
|
||||
std::string role(message->role);
|
||||
@@ -456,6 +456,14 @@ int32_t llm_chat_apply_template(
|
||||
if (add_ass) {
|
||||
ss << "<|assistant|>\n";
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
|
||||
for (auto message : chat) {
|
||||
std::string role(message->role);
|
||||
ss << "<|" << role << "|>" << "\n" << message->content;
|
||||
}
|
||||
if (add_ass) {
|
||||
ss << "<|assistant|>";
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) {
|
||||
// MiniCPM-3B-OpenHermes-2.5-v2-GGUF
|
||||
for (auto message : chat) {
|
||||
|
||||
@@ -6,11 +6,9 @@
|
||||
#include "llama-model.h"
|
||||
#include "llama-kv-cache.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cstring>
|
||||
#include <stdexcept>
|
||||
#include <cinttypes>
|
||||
#include <cmath>
|
||||
|
||||
//
|
||||
// llama_context
|
||||
@@ -177,44 +175,13 @@ llama_context::llama_context(
|
||||
}
|
||||
|
||||
// init the memory module
|
||||
// TODO: for now, always create a unified KV cache
|
||||
if (!hparams.vocab_only) {
|
||||
kv_self.reset(static_cast<llama_kv_cache_unified *>(model.create_memory()));
|
||||
llama_memory_params params_mem = {
|
||||
/*.type_k =*/ params.type_k,
|
||||
/*.type_v =*/ params.type_v,
|
||||
};
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
|
||||
|
||||
cparams.n_ctx = GGML_PAD(cparams.n_ctx, kv_self->get_padding(cparams));
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
|
||||
|
||||
uint32_t kv_size = cparams.n_ctx;
|
||||
ggml_type type_k = params.type_k;
|
||||
ggml_type type_v = params.type_v;
|
||||
|
||||
if (llama_model_is_recurrent(&model)) {
|
||||
// Mamba needs at least as many KV cells as there are sequences kept at any time
|
||||
kv_size = std::max((uint32_t) 1, params.n_seq_max);
|
||||
// it's probably best to keep as much precision as possible for the states
|
||||
type_k = GGML_TYPE_F32; // required by ggml_ssm_conv for Mamba's conv_states
|
||||
type_v = GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_states
|
||||
}
|
||||
|
||||
GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0);
|
||||
GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0);
|
||||
|
||||
if (!kv_self->init(model, cparams, type_k, type_v, kv_size, cparams.offload_kqv)) {
|
||||
throw std::runtime_error("failed to initialize self-attention cache");
|
||||
}
|
||||
|
||||
{
|
||||
const size_t memory_size_k = kv_self->size_k_bytes();
|
||||
const size_t memory_size_v = kv_self->size_v_bytes();
|
||||
|
||||
LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
|
||||
(float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
|
||||
ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
|
||||
ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
|
||||
}
|
||||
memory.reset(model.create_memory(params_mem, cparams));
|
||||
}
|
||||
|
||||
// init backends
|
||||
@@ -305,7 +272,9 @@ llama_context::llama_context(
|
||||
int n_nodes_tg = -1;
|
||||
|
||||
// simulate full KV cache
|
||||
kv_self->n = kv_self->size;
|
||||
llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
|
||||
|
||||
kv_self->set_full();
|
||||
|
||||
cross.v_embd.clear();
|
||||
|
||||
@@ -427,6 +396,18 @@ const llama_model & llama_context::get_model() const {
|
||||
return model;
|
||||
}
|
||||
|
||||
const llama_cparams & llama_context::get_cparams() const {
|
||||
return cparams;
|
||||
}
|
||||
|
||||
ggml_backend_sched_t llama_context::get_sched() const {
|
||||
return sched.get();
|
||||
}
|
||||
|
||||
ggml_context * llama_context::get_ctx_compute() const {
|
||||
return ctx_compute.get();
|
||||
}
|
||||
|
||||
uint32_t llama_context::n_ctx() const {
|
||||
return cparams.n_ctx;
|
||||
}
|
||||
@@ -456,337 +437,21 @@ uint32_t llama_context::n_threads_batch() const {
|
||||
}
|
||||
|
||||
llama_kv_cache * llama_context::get_kv_self() {
|
||||
return kv_self.get();
|
||||
llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
|
||||
return kv_self;
|
||||
}
|
||||
|
||||
const llama_kv_cache * llama_context::get_kv_self() const {
|
||||
return kv_self.get();
|
||||
}
|
||||
|
||||
ggml_tensor * llama_context::build_rope_shift(
|
||||
ggml_context * ctx0,
|
||||
ggml_tensor * cur,
|
||||
ggml_tensor * shift,
|
||||
ggml_tensor * factors,
|
||||
float freq_base,
|
||||
float freq_scale) const {
|
||||
const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;
|
||||
|
||||
const auto & yarn_ext_factor = cparams.yarn_ext_factor;
|
||||
const auto & yarn_beta_fast = cparams.yarn_beta_fast;
|
||||
const auto & yarn_beta_slow = cparams.yarn_beta_slow;
|
||||
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const auto & n_rot = hparams.n_rot;
|
||||
const auto & rope_type = hparams.rope_type;
|
||||
|
||||
// See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly.
|
||||
// See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
|
||||
const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2 ? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)) : cparams.yarn_attn_factor;
|
||||
|
||||
ggml_tensor * tmp;
|
||||
|
||||
if (ggml_is_quantized(cur->type)) {
|
||||
// dequantize to f32 -> RoPE -> quantize back
|
||||
tmp = ggml_cast(ctx0, cur, GGML_TYPE_F32);
|
||||
|
||||
tmp = ggml_rope_ext(ctx0, tmp,
|
||||
shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
|
||||
|
||||
tmp = ggml_cpy(ctx0, tmp, cur);
|
||||
} else {
|
||||
// we rotate only the first n_rot dimensions
|
||||
tmp = ggml_rope_ext_inplace(ctx0, cur,
|
||||
shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
|
||||
}
|
||||
|
||||
return tmp;
|
||||
}
|
||||
|
||||
class llm_graph_input_k_shift : public llm_graph_input_i {
|
||||
public:
|
||||
llm_graph_input_k_shift(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {}
|
||||
virtual ~llm_graph_input_k_shift() = default;
|
||||
|
||||
void set_input(const llama_ubatch * ubatch) override;
|
||||
|
||||
ggml_tensor * k_shift; // I32 [kv_size]
|
||||
|
||||
const llama_kv_cache_unified * kv_self;
|
||||
};
|
||||
|
||||
void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) {
|
||||
GGML_UNUSED(ubatch);
|
||||
|
||||
if (k_shift) {
|
||||
assert(ggml_backend_buffer_is_host(k_shift->buffer));
|
||||
|
||||
int32_t * data = (int32_t *) k_shift->data;
|
||||
|
||||
for (uint32_t i = 0; i < kv_self->size; ++i) {
|
||||
data[i] = kv_self->cells[i].delta;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
llm_graph_result_ptr llama_context::build_kv_self_shift(
|
||||
ggml_context * ctx0,
|
||||
ggml_cgraph * gf) const {
|
||||
auto res = std::make_unique<llm_graph_result>();
|
||||
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const auto & n_layer = hparams.n_layer;
|
||||
|
||||
const auto & n_embd_head_k = hparams.n_embd_head_k;
|
||||
//const auto & n_embd_head_v = hparams.n_embd_head_v;
|
||||
|
||||
//GGML_ASSERT(kv_self->size == n_ctx);
|
||||
|
||||
auto inp = std::make_unique<llm_graph_input_k_shift>(kv_self.get());
|
||||
|
||||
inp->k_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, cparams.n_ctx);
|
||||
ggml_set_input(inp->k_shift);
|
||||
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
const int64_t n_head_kv = hparams.n_head_kv(il);
|
||||
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
|
||||
|
||||
const bool is_swa = hparams.is_swa(il);
|
||||
|
||||
// note: the swa rope params could become part of the cparams in the future
|
||||
// if we decide to make them configurable, like the non-sliding ones
|
||||
const float freq_base_l = is_swa ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
|
||||
const float freq_scale_l = is_swa ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
|
||||
|
||||
ggml_tensor * rope_factors = kv_self->cbs.get_rope_factors(n_ctx_per_seq(), il);
|
||||
|
||||
ggml_tensor * k =
|
||||
ggml_view_3d(ctx0, kv_self->k_l[il],
|
||||
n_embd_head_k, n_head_kv, kv_self->size,
|
||||
ggml_row_size(kv_self->k_l[il]->type, n_embd_head_k),
|
||||
ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
|
||||
0);
|
||||
|
||||
ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
}
|
||||
|
||||
res->add_input(std::move(inp));
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
llm_graph_result_ptr llama_context::build_kv_self_defrag(
|
||||
ggml_context * ctx0,
|
||||
ggml_cgraph * gf) const {
|
||||
auto res = std::make_unique<llm_graph_result>();
|
||||
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const auto & ids = kv_self->defrag_info.ids;
|
||||
|
||||
#if 0
|
||||
// CPU defrag
|
||||
//
|
||||
// TODO: optimizations are possible:
|
||||
// - multiple threads
|
||||
// - avoid copying to the host memory when already there
|
||||
//
|
||||
// likely not worth the effort, as we have ggml_graph based defrag
|
||||
//
|
||||
|
||||
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
||||
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
const uint32_t kv_size = size;
|
||||
|
||||
std::vector<uint8_t> buf_k;
|
||||
std::vector<uint8_t> buf_v;
|
||||
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa);
|
||||
const size_t k_size = ggml_row_size(k_l[il]->type, n_embd_k_gqa*kv_size);
|
||||
|
||||
const size_t v_size_el = ggml_type_size(v_l[il]->type);
|
||||
const size_t v_size = ggml_row_size (v_l[il]->type, n_embd_v_gqa*kv_size);
|
||||
|
||||
buf_k.resize(k_size);
|
||||
buf_v.resize(v_size);
|
||||
|
||||
ggml_backend_tensor_get(k_l[il], buf_k.data(), 0, buf_k.size());
|
||||
ggml_backend_tensor_get(v_l[il], buf_v.data(), 0, buf_v.size());
|
||||
|
||||
// batch move [i, i+nm) to [id, id+nm)
|
||||
// note: cells can move only to a lower index
|
||||
for (uint32_t i = 0; i < n_kv; ++i) {
|
||||
const uint32_t id = ids[i];
|
||||
|
||||
if (i == id || id == n_kv) {
|
||||
continue;
|
||||
}
|
||||
|
||||
uint32_t nm = 1;
|
||||
|
||||
while (i + nm < n_kv && ids[i + nm] == id + nm) {
|
||||
nm++;
|
||||
}
|
||||
|
||||
// move keys
|
||||
{
|
||||
const int64_t os = i*k_size_row;
|
||||
const int64_t od = id*k_size_row;
|
||||
|
||||
memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row);
|
||||
}
|
||||
|
||||
// move values (note: they are transposed)
|
||||
{
|
||||
const int64_t os = i;
|
||||
const int64_t od = id;
|
||||
|
||||
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
|
||||
memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el);
|
||||
}
|
||||
}
|
||||
|
||||
i += nm - 1;
|
||||
}
|
||||
|
||||
ggml_backend_tensor_set(k_l[il], buf_k.data(), 0, buf_k.size());
|
||||
ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size());
|
||||
}
|
||||
#else
|
||||
for (uint32_t i = 0; i < ids.size(); ++i) {
|
||||
const uint32_t id = ids[i];
|
||||
|
||||
if (i == id || id == ids.size()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
uint32_t nm = 1;
|
||||
|
||||
while (i + nm < ids.size() && ids[i + nm] == id + nm) {
|
||||
nm++;
|
||||
}
|
||||
|
||||
for (uint32_t il = 0; il < hparams.n_layer; ++il) { // NOLINT
|
||||
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
|
||||
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
|
||||
|
||||
ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self->k_l[il],
|
||||
n_embd_k_gqa, nm,
|
||||
ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
|
||||
ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*i));
|
||||
|
||||
ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self->k_l[il],
|
||||
n_embd_k_gqa, nm,
|
||||
ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
|
||||
ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*id));
|
||||
|
||||
ggml_tensor * view_v_src;
|
||||
ggml_tensor * view_v_dst;
|
||||
|
||||
if (cparams.flash_attn) {
|
||||
// NOTE: the V cache is not transposed when using flash attention
|
||||
view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il],
|
||||
n_embd_v_gqa, nm,
|
||||
ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa),
|
||||
ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*i));
|
||||
|
||||
view_v_dst = ggml_view_2d(ctx0, kv_self->v_l[il],
|
||||
n_embd_v_gqa, nm,
|
||||
ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa),
|
||||
ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*id));
|
||||
} else {
|
||||
view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il],
|
||||
nm, n_embd_v_gqa,
|
||||
ggml_row_size(kv_self->v_l[il]->type, kv_self->size),
|
||||
ggml_row_size(kv_self->v_l[il]->type, i));
|
||||
|
||||
view_v_dst = ggml_view_2d(ctx0, kv_self->v_l[il],
|
||||
nm, n_embd_v_gqa,
|
||||
ggml_row_size(kv_self->v_l[il]->type, kv_self->size),
|
||||
ggml_row_size(kv_self->v_l[il]->type, id));
|
||||
}
|
||||
|
||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
|
||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
|
||||
}
|
||||
|
||||
i += nm - 1;
|
||||
}
|
||||
|
||||
//LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
|
||||
#endif
|
||||
|
||||
return res;
|
||||
llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
|
||||
return kv_self;
|
||||
}
|
||||
|
||||
void llama_context::kv_self_update() {
|
||||
auto & kv = kv_self;
|
||||
|
||||
bool need_reserve = false;
|
||||
|
||||
if (kv->has_shift) {
|
||||
if (!kv->get_can_shift()) {
|
||||
GGML_ABORT("The current context does not support K-shift");
|
||||
}
|
||||
llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: applying K-shift\n", __func__);
|
||||
|
||||
// apply K-shift if needed
|
||||
if (model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) {
|
||||
ggml_backend_sched_reset(sched.get());
|
||||
|
||||
auto * gf = graph_init();
|
||||
|
||||
auto res = build_kv_self_shift(ctx_compute.get(), gf);
|
||||
|
||||
ggml_backend_sched_alloc_graph(sched.get(), gf);
|
||||
|
||||
res->set_inputs(nullptr);
|
||||
|
||||
graph_compute(gf, false);
|
||||
|
||||
need_reserve = true;
|
||||
}
|
||||
|
||||
{
|
||||
kv->has_shift = false;
|
||||
|
||||
for (uint32_t i = 0; i < kv->size; ++i) {
|
||||
kv->cells[i].delta = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// defragment the KV cache if needed
|
||||
if (kv->do_defrag) {
|
||||
LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
|
||||
|
||||
if (kv->defrag_prepare(graph_max_nodes())) {
|
||||
ggml_backend_sched_reset(sched.get());
|
||||
|
||||
auto * gf = graph_init();
|
||||
|
||||
auto res = build_kv_self_defrag(ctx_compute.get(), gf);
|
||||
|
||||
ggml_backend_sched_alloc_graph(sched.get(), gf);
|
||||
|
||||
res->set_inputs(nullptr);
|
||||
|
||||
graph_compute(gf, false);
|
||||
|
||||
need_reserve = true;
|
||||
}
|
||||
|
||||
kv->do_defrag = false;
|
||||
}
|
||||
need_reserve = kv_self->update(*this);
|
||||
|
||||
// reserve a worst case graph if needed
|
||||
if (need_reserve) {
|
||||
@@ -797,7 +462,7 @@ void llama_context::kv_self_update() {
|
||||
uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
|
||||
|
||||
// simulate full KV cache
|
||||
kv_self->n = kv_self->size;
|
||||
kv_self->set_full();
|
||||
|
||||
llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
|
||||
llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
|
||||
@@ -818,9 +483,6 @@ enum llama_pooling_type llama_context::pooling_type() const {
|
||||
}
|
||||
|
||||
float * llama_context::get_logits() {
|
||||
// reorder logits for backward compatibility
|
||||
output_reorder();
|
||||
|
||||
return logits;
|
||||
}
|
||||
|
||||
@@ -863,9 +525,6 @@ float * llama_context::get_logits_ith(int32_t i) {
|
||||
}
|
||||
|
||||
float * llama_context::get_embeddings() {
|
||||
// reorder embeddings for backward compatibility
|
||||
output_reorder();
|
||||
|
||||
return embd;
|
||||
}
|
||||
|
||||
@@ -1017,8 +676,8 @@ int llama_context::encode(llama_batch & inp_batch) {
|
||||
}
|
||||
|
||||
// temporary allocate memory for the input batch if needed
|
||||
// TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences
|
||||
llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : kv_self->pos_max() + 1);
|
||||
// note: during encode, we always pass the full sequence starting from pos = 0
|
||||
llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : 0);
|
||||
|
||||
const llama_batch & batch = batch_allocr.batch;
|
||||
const int32_t n_tokens = batch.n_tokens;
|
||||
@@ -1047,7 +706,7 @@ int llama_context::encode(llama_batch & inp_batch) {
|
||||
|
||||
const int64_t n_embd = hparams.n_embd;
|
||||
|
||||
sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true);
|
||||
llama_sbatch sbatch = llama_sbatch(batch, n_embd, /* simple_split */ true, /* logits_all */ true);
|
||||
|
||||
const llama_ubatch ubatch = sbatch.split_simple(n_tokens);
|
||||
|
||||
@@ -1181,9 +840,11 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
|
||||
|
||||
// temporary allocate memory for the input batch if needed
|
||||
// TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences
|
||||
llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : kv_self->pos_max() + 1);
|
||||
// TODO: this is incorrect for multiple sequences because get_pos_max() is the maximum across all sequences
|
||||
llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : kv_self->get_pos_max() + 1);
|
||||
|
||||
const llama_batch & batch = batch_allocr.batch;
|
||||
|
||||
@@ -1195,7 +856,7 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
const int64_t n_tokens_all = batch.n_tokens;
|
||||
const int64_t n_embd = hparams.n_embd;
|
||||
|
||||
llama_kv_cache_guard kv_guard(kv_self.get());
|
||||
llama_kv_cache_guard kv_guard(kv_self);
|
||||
|
||||
GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
|
||||
|
||||
@@ -1236,11 +897,7 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
n_outputs_all = 1;
|
||||
}
|
||||
|
||||
const bool logits_all = n_outputs_all == n_tokens_all;
|
||||
|
||||
sbatch.from_batch(batch, n_embd,
|
||||
/* simple_split */ !kv_self->recurrent,
|
||||
/* logits_all */ logits_all);
|
||||
llama_sbatch sbatch = kv_self->sbatch_init(batch, /* logits_all */ n_outputs_all == n_tokens_all);
|
||||
|
||||
// reserve output buffer
|
||||
if (output_reserve(n_outputs_all) < n_outputs_all) {
|
||||
@@ -1254,22 +911,7 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
int64_t n_outputs_prev = 0;
|
||||
|
||||
while (sbatch.n_tokens > 0) {
|
||||
llama_ubatch ubatch = llama_ubatch();
|
||||
|
||||
const auto & n_ubatch = cparams.n_ubatch;
|
||||
|
||||
if (kv_self->recurrent) {
|
||||
if (embd_pooled) {
|
||||
// Pooled embeddings cannot be split across ubatches (yet)
|
||||
ubatch = sbatch.split_seq(cparams.n_ubatch);
|
||||
} else {
|
||||
// recurrent model architectures are easier to implement
|
||||
// with equal-length sequences
|
||||
ubatch = sbatch.split_equal(cparams.n_ubatch);
|
||||
}
|
||||
} else {
|
||||
ubatch = sbatch.split_simple(n_ubatch);
|
||||
}
|
||||
llama_ubatch ubatch = kv_self->ubatch_next(sbatch, cparams.n_ubatch, embd_pooled);
|
||||
|
||||
// count the outputs in this u_batch
|
||||
{
|
||||
@@ -1289,24 +931,12 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
}
|
||||
|
||||
// find KV slot
|
||||
{
|
||||
if (!kv_self->find_slot(ubatch)) {
|
||||
LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
|
||||
if (!kv_self->find_slot(ubatch)) {
|
||||
LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (!kv_self->recurrent) {
|
||||
// a heuristic, to avoid attending the full cache if it is not yet utilized
|
||||
// after enough generations, the benefit from this heuristic disappears
|
||||
// if we start defragmenting the cache, the benefit from this will be more important
|
||||
const uint32_t pad = kv_self->get_padding(cparams);
|
||||
kv_self->n = std::min(kv_self->size, std::max(pad, GGML_PAD(kv_self->cell_max(), pad)));
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
//printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self->n, kv_self->used, kv_self->head);
|
||||
|
||||
ggml_backend_sched_reset(sched.get());
|
||||
ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);
|
||||
|
||||
@@ -1420,43 +1050,68 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
// finalize the batch processing
|
||||
kv_guard.commit();
|
||||
|
||||
// set to total number of outputs in the batch, for use in llama_get_logits_ith
|
||||
n_outputs = n_outputs_all;
|
||||
|
||||
// set output mappings
|
||||
{
|
||||
bool sorted_output = true;
|
||||
|
||||
GGML_ASSERT(sbatch.out_ids.size() == (size_t) n_outputs_all);
|
||||
auto & out_ids = sbatch.out_ids;
|
||||
|
||||
GGML_ASSERT(out_ids.size() == (size_t) n_outputs_all);
|
||||
|
||||
for (int64_t i = 0; i < n_outputs_all; ++i) {
|
||||
int64_t out_id = sbatch.out_ids[i];
|
||||
int64_t out_id = out_ids[i];
|
||||
output_ids[out_id] = i;
|
||||
if (out_id != i) {
|
||||
sorted_output = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (sorted_output) {
|
||||
sbatch.out_ids.clear();
|
||||
// make the outputs have the same order they had in the user-provided batch
|
||||
// note: this is mostly relevant for recurrent models atm
|
||||
if (!sorted_output) {
|
||||
const uint32_t n_vocab = model.vocab.n_tokens();
|
||||
const uint32_t n_embd = model.hparams.n_embd;
|
||||
|
||||
GGML_ASSERT((size_t) n_outputs == out_ids.size());
|
||||
|
||||
// TODO: is there something more efficient which also minimizes swaps?
|
||||
// selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort)
|
||||
for (int32_t i = 0; i < n_outputs - 1; ++i) {
|
||||
int32_t j_min = i;
|
||||
for (int32_t j = i + 1; j < n_outputs; ++j) {
|
||||
if (out_ids[j] < out_ids[j_min]) {
|
||||
j_min = j;
|
||||
}
|
||||
}
|
||||
if (j_min == i) { continue; }
|
||||
std::swap(out_ids[i], out_ids[j_min]);
|
||||
if (logits_size > 0) {
|
||||
for (uint32_t k = 0; k < n_vocab; k++) {
|
||||
std::swap(logits[i*n_vocab + k], logits[j_min*n_vocab + k]);
|
||||
}
|
||||
}
|
||||
if (embd_size > 0) {
|
||||
for (uint32_t k = 0; k < n_embd; k++) {
|
||||
std::swap(embd[i*n_embd + k], embd[j_min*n_embd + k]);
|
||||
}
|
||||
}
|
||||
}
|
||||
std::fill(output_ids.begin(), output_ids.end(), -1);
|
||||
for (int32_t i = 0; i < n_outputs; ++i) {
|
||||
output_ids[out_ids[i]] = i;
|
||||
}
|
||||
}
|
||||
}
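The reordering moved here from output_reorder() applies the inverse of the out_ids permutation in place, using selection sort to keep the number of row swaps small. Conceptually (sketch only, not the actual implementation) it is equivalent to:

```python
# Conceptual sketch of the in-place output reordering: out_ids[i] is the
# position in the user batch whose logits/embeddings currently live in row i.
def reorder_outputs(out_ids, rows):
    n = len(out_ids)
    for i in range(n - 1):
        j_min = min(range(i, n), key=lambda j: out_ids[j])   # selection sort step
        if j_min != i:
            out_ids[i], out_ids[j_min] = out_ids[j_min], out_ids[i]
            rows[i], rows[j_min] = rows[j_min], rows[i]       # swap the data rows too
    # output_ids maps user-batch index -> row index after sorting
    return {out_id: i for i, out_id in enumerate(out_ids)}
```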
|
||||
|
||||
// set to total number of outputs in the batch, for use in llama_get_logits_ith
|
||||
n_outputs = n_outputs_all;
|
||||
|
||||
// wait for the computation to finish (automatically done when obtaining the model output)
|
||||
//synchronize();
|
||||
|
||||
// decide if we need to defrag the kv cache
|
||||
if (cparams.causal_attn && cparams.defrag_thold > 0.0f) {
|
||||
// - do not defrag small contexts (i.e. < 2048 tokens)
|
||||
// - count the padding towards the number of used tokens
|
||||
const float fragmentation = kv_self->n >= 2048 ? std::max(0.0f, 1.0f - float(kv_self->used + kv_self->get_padding(cparams))/float(kv_self->n)) : 0.0f;
|
||||
|
||||
// queue defragmentation for next llama_kv_cache_update
|
||||
if (fragmentation > cparams.defrag_thold) {
|
||||
LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation);
|
||||
|
||||
kv_self->defrag();
|
||||
}
|
||||
if (cparams.defrag_thold > 0.0f) {
|
||||
kv_self->defrag_sched(cparams.defrag_thold);
|
||||
}
|
||||
|
||||
// Reset state for the next token before backend sync, to allow the CPU activities in the reset to
|
||||
@@ -1542,44 +1197,6 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
return n_outputs_max;
|
||||
}
|
||||
|
||||
void llama_context::output_reorder() {
|
||||
auto & out_ids = sbatch.out_ids;
|
||||
if (!out_ids.empty()) {
|
||||
const uint32_t n_vocab = model.vocab.n_tokens();
|
||||
const uint32_t n_embd = model.hparams.n_embd;
|
||||
|
||||
GGML_ASSERT((size_t) n_outputs == out_ids.size());
|
||||
|
||||
// TODO: is there something more efficient which also minimizes swaps?
|
||||
// selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort)
|
||||
for (int32_t i = 0; i < n_outputs - 1; ++i) {
|
||||
int32_t j_min = i;
|
||||
for (int32_t j = i + 1; j < n_outputs; ++j) {
|
||||
if (out_ids[j] < out_ids[j_min]) {
|
||||
j_min = j;
|
||||
}
|
||||
}
|
||||
if (j_min == i) { continue; }
|
||||
std::swap(out_ids[i], out_ids[j_min]);
|
||||
if (logits_size > 0) {
|
||||
for (uint32_t k = 0; k < n_vocab; k++) {
|
||||
std::swap(logits[i*n_vocab + k], logits[j_min*n_vocab + k]);
|
||||
}
|
||||
}
|
||||
if (embd_size > 0) {
|
||||
for (uint32_t k = 0; k < n_embd; k++) {
|
||||
std::swap(embd[i*n_embd + k], embd[j_min*n_embd + k]);
|
||||
}
|
||||
}
|
||||
}
|
||||
std::fill(output_ids.begin(), output_ids.end(), -1);
|
||||
for (int32_t i = 0; i < n_outputs; ++i) {
|
||||
output_ids[out_ids[i]] = i;
|
||||
}
|
||||
out_ids.clear();
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// graph
|
||||
//
|
||||
@@ -1616,7 +1233,7 @@ llm_graph_result_ptr llama_context::graph_build(
|
||||
/*.backend_cpu =*/ backend_cpu,
|
||||
/*.cvec =*/ &cvec,
|
||||
/*.loras =*/ &loras,
|
||||
/*.memory =*/ kv_self.get(),
|
||||
/*.memory =*/ memory.get(),
|
||||
/*.cross =*/ &cross,
|
||||
/*.n_outputs =*/ n_outputs,
|
||||
/*.cb =*/ graph_get_cb(),
|
||||
@@ -2020,8 +1637,6 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
|
||||
{
|
||||
LLAMA_LOG_DEBUG("%s: - writing output ids\n", __func__);
|
||||
|
||||
output_reorder();
|
||||
|
||||
const auto n_outputs = this->n_outputs;
|
||||
const auto & output_ids = this->output_ids;
|
||||
|
||||
@@ -2075,6 +1690,8 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
|
||||
}
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: - writing KV self\n", __func__);
|
||||
llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
|
||||
|
||||
kv_self->state_write(io);
|
||||
|
||||
return io.n_bytes();
|
||||
@@ -2159,6 +1776,8 @@ size_t llama_context::state_read_data(llama_io_read_i & io) {
|
||||
}
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: - reading KV self\n", __func__);
|
||||
llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
|
||||
|
||||
kv_self->state_read(io);
|
||||
|
||||
return io.n_bytes();
|
||||
@@ -2167,6 +1786,8 @@ size_t llama_context::state_read_data(llama_io_read_i & io) {
|
||||
size_t llama_context::state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id) {
|
||||
GGML_UNUSED(seq_id);
|
||||
|
||||
llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
|
||||
|
||||
kv_self->state_write(io, seq_id);
|
||||
|
||||
return io.n_bytes();
|
||||
@@ -2175,6 +1796,8 @@ size_t llama_context::state_seq_write_data(llama_io_write_i & io, llama_seq_id s
|
||||
size_t llama_context::state_seq_read_data(llama_io_read_i & io, llama_seq_id seq_id) {
|
||||
GGML_UNUSED(seq_id);
|
||||
|
||||
llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
|
||||
|
||||
kv_self->state_read(io, seq_id);
|
||||
|
||||
return io.n_bytes();
|
||||
@@ -2530,7 +2153,7 @@ void llama_kv_cache_seq_cp(
|
||||
llama_seq_id seq_id_dst,
|
||||
llama_pos p0,
|
||||
llama_pos p1) {
|
||||
return llama_kv_self_seq_cp(ctx, seq_id_src, seq_id_dst, p0, p1);
|
||||
llama_kv_self_seq_cp(ctx, seq_id_src, seq_id_dst, p0, p1);
|
||||
}
|
||||
|
||||
void llama_kv_self_seq_cp(
|
||||
@@ -2544,14 +2167,14 @@ void llama_kv_self_seq_cp(
|
||||
return;
|
||||
}
|
||||
|
||||
return kv->seq_cp(seq_id_src, seq_id_dst, p0, p1);
|
||||
kv->seq_cp(seq_id_src, seq_id_dst, p0, p1);
|
||||
}
|
||||
|
||||
// deprecated
|
||||
void llama_kv_cache_seq_keep(
|
||||
llama_context * ctx,
|
||||
llama_seq_id seq_id) {
|
||||
return llama_kv_self_seq_keep(ctx, seq_id);
|
||||
llama_kv_self_seq_keep(ctx, seq_id);
|
||||
}
|
||||
|
||||
void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) {
|
||||
@@ -2560,7 +2183,7 @@ void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) {
|
||||
return;
|
||||
}
|
||||
|
||||
return kv->seq_keep(seq_id);
|
||||
kv->seq_keep(seq_id);
|
||||
}
|
||||
|
||||
// deprecated
|
||||
@@ -2570,7 +2193,7 @@ void llama_kv_cache_seq_add(
|
||||
llama_pos p0,
|
||||
llama_pos p1,
|
||||
llama_pos delta) {
|
||||
return llama_kv_self_seq_add(ctx, seq_id, p0, p1, delta);
|
||||
llama_kv_self_seq_add(ctx, seq_id, p0, p1, delta);
|
||||
}
|
||||
|
||||
void llama_kv_self_seq_add(
|
||||
@@ -2584,7 +2207,7 @@ void llama_kv_self_seq_add(
|
||||
return;
|
||||
}
|
||||
|
||||
return kv->seq_add(seq_id, p0, p1, delta);
|
||||
kv->seq_add(seq_id, p0, p1, delta);
|
||||
}
|
||||
|
||||
// deprecated
|
||||
@@ -2594,7 +2217,7 @@ void llama_kv_cache_seq_div(
|
||||
llama_pos p0,
|
||||
llama_pos p1,
|
||||
int d) {
|
||||
return llama_kv_self_seq_div(ctx, seq_id, p0, p1, d);
|
||||
llama_kv_self_seq_div(ctx, seq_id, p0, p1, d);
|
||||
}
|
||||
|
||||
void llama_kv_self_seq_div(
|
||||
@@ -2608,7 +2231,7 @@ void llama_kv_self_seq_div(
|
||||
return;
|
||||
}
|
||||
|
||||
return kv->seq_div(seq_id, p0, p1, d);
|
||||
kv->seq_div(seq_id, p0, p1, d);
|
||||
}
|
||||
|
||||
// deprecated
|
||||
@@ -2627,7 +2250,7 @@ llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) {
|
||||
|
||||
// deprecated
|
||||
void llama_kv_cache_defrag(llama_context * ctx) {
|
||||
return llama_kv_self_defrag(ctx);
|
||||
llama_kv_self_defrag(ctx);
|
||||
}
|
||||
|
||||
void llama_kv_self_defrag(llama_context * ctx) {
|
||||
@@ -2636,7 +2259,8 @@ void llama_kv_self_defrag(llama_context * ctx) {
|
||||
return;
|
||||
}
|
||||
|
||||
return kv->defrag();
|
||||
// force defrag
|
||||
kv->defrag_sched(-1.0f);
|
||||
}
|
||||
|
||||
// deprecated
|
||||
|
||||
@@ -27,7 +27,12 @@ struct llama_context {
|
||||
|
||||
void synchronize();
|
||||
|
||||
const llama_model & get_model() const;
|
||||
const llama_model & get_model() const;
|
||||
const llama_cparams & get_cparams() const;
|
||||
|
||||
ggml_backend_sched_t get_sched() const;
|
||||
|
||||
ggml_context * get_ctx_compute() const;
|
||||
|
||||
uint32_t n_ctx() const;
|
||||
uint32_t n_ctx_per_seq() const;
|
||||
@@ -137,49 +142,30 @@ private:
|
||||
// Returns max number of outputs for which space was reserved.
|
||||
int32_t output_reserve(int32_t n_outputs);
|
||||
|
||||
// make the outputs have the same order they had in the user-provided batch
|
||||
// TODO: maybe remove this
|
||||
void output_reorder();
|
||||
|
||||
//
|
||||
// graph
|
||||
//
|
||||
|
||||
public:
|
||||
int32_t graph_max_nodes() const;
|
||||
|
||||
// zero-out inputs and create the ctx_compute for the compute graph
|
||||
ggml_cgraph * graph_init();
|
||||
|
||||
llm_graph_result_ptr graph_build(
|
||||
ggml_context * ctx,
|
||||
ggml_cgraph * gf,
|
||||
const llama_ubatch & ubatch,
|
||||
llm_graph_type gtype);
|
||||
|
||||
// returns the result of ggml_backend_sched_graph_compute_async execution
|
||||
ggml_status graph_compute(
|
||||
ggml_cgraph * gf,
|
||||
bool batched);
|
||||
|
||||
private:
|
||||
llm_graph_result_ptr graph_build(
|
||||
ggml_context * ctx,
|
||||
ggml_cgraph * gf,
|
||||
const llama_ubatch & ubatch,
|
||||
llm_graph_type gtype);
|
||||
|
||||
llm_graph_cb graph_get_cb() const;
|
||||
|
||||
// used by kv_self_update()
|
||||
ggml_tensor * build_rope_shift(
|
||||
ggml_context * ctx0,
|
||||
ggml_tensor * cur,
|
||||
ggml_tensor * shift,
|
||||
ggml_tensor * factors,
|
||||
float freq_base,
|
||||
float freq_scale) const;
|
||||
|
||||
llm_graph_result_ptr build_kv_self_shift(
|
||||
ggml_context * ctx0,
|
||||
ggml_cgraph * gf) const;
|
||||
|
||||
llm_graph_result_ptr build_kv_self_defrag(
|
||||
ggml_context * ctx0,
|
||||
ggml_cgraph * gf) const;
|
||||
|
||||
// TODO: read/write lora adapters and cvec
|
||||
size_t state_write_data(llama_io_write_i & io);
|
||||
size_t state_read_data (llama_io_read_i & io);
|
||||
@@ -196,11 +182,10 @@ private:
|
||||
llama_cparams cparams;
|
||||
llama_adapter_cvec cvec;
|
||||
llama_adapter_loras loras;
|
||||
llama_sbatch sbatch;
|
||||
|
||||
llama_cross cross; // TODO: tmp for handling cross-attention - need something better probably
|
||||
|
||||
std::unique_ptr<llama_kv_cache_unified> kv_self;
|
||||
std::unique_ptr<llama_memory_i> memory;
|
||||
|
||||
// TODO: remove
|
||||
bool logits_all = false;
|
||||
|
||||
@@ -284,24 +284,7 @@ void llm_graph_input_s_copy::set_input(const llama_ubatch * ubatch) {
|
||||
|
||||
// assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
|
||||
for (uint32_t i = 0; i < n_kv; ++i) {
|
||||
const uint32_t cell_id = i + kv_self->head;
|
||||
|
||||
//////////////////////////////////////////////
|
||||
// TODO: this should not mutate the KV cache !
|
||||
llama_kv_cell & kv_cell = const_cast<class llama_kv_cache_unified *>(kv_self)->cells[i];
|
||||
|
||||
// prevent out-of-bound sources
|
||||
if (kv_cell.src < 0 || (uint32_t) kv_cell.src >= kv_self->size) {
|
||||
kv_cell.src = cell_id;
|
||||
}
|
||||
|
||||
data[i] = kv_cell.src;
|
||||
|
||||
// TODO: do not mutate the KV cache
|
||||
// ensure copy only happens once
|
||||
if (kv_cell.src != (int32_t) cell_id) {
|
||||
kv_cell.src = cell_id;
|
||||
}
|
||||
data[i] = kv_self->s_copy(i);
|
||||
}
|
||||
}
|
||||
}
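The out-of-bound guard and the clear-once bookkeeping that were previously done inline here now have to live behind the recurrent cache's `s_copy()` accessor. The sketch below is an approximate reconstruction based on the removed lines above, not the authoritative implementation; the `kv_cell`, `cells`, `head` and `size` members are the ones declared later in the cache header.

```cpp
// Approximate reconstruction of llama_kv_cache_recurrent::s_copy(), based on
// the inline logic removed above. Not the authoritative implementation.
int32_t llama_kv_cache_recurrent::s_copy(int i) const {
    const uint32_t cell_id = i + head;

    // per the header's TODO, these methods are "not really const" because of this cast
    kv_cell & cell = const_cast<llama_kv_cache_recurrent *>(this)->cells[i];

    // prevent out-of-bound sources
    if (cell.src < 0 || (uint32_t) cell.src >= size) {
        cell.src = cell_id;
    }

    const int32_t res = cell.src;

    // ensure the copy only happens once
    if (cell.src != (int32_t) cell_id) {
        cell.src = cell_id;
    }

    return res;
}
```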
|
||||
@@ -317,18 +300,7 @@ void llm_graph_input_s_mask::set_input(const llama_ubatch * ubatch) {
|
||||
|
||||
// clear unused states
|
||||
for (int i = 0; i < n_kv; ++i) {
|
||||
const uint32_t cell_id = i + kv_self->head;
|
||||
|
||||
//////////////////////////////////////////////
|
||||
// TODO: this should not mutate the KV cache !
|
||||
llama_kv_cell & kv_cell = const_cast<class llama_kv_cache_unified *>(kv_self)->cells[i];
|
||||
|
||||
data[i] = (float) (kv_cell.src >= 0);
|
||||
|
||||
// only clear once
|
||||
if (kv_cell.src < 0) {
|
||||
kv_cell.src = cell_id;
|
||||
}
|
||||
data[i] = kv_self->s_mask(i);
|
||||
}
|
||||
}
|
||||
}
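Likewise, the clear-once logic for the state mask moves behind `s_mask()`. A sketch along the same lines, again reconstructed from the removed code rather than taken from the actual source:

```cpp
// Approximate reconstruction of llama_kv_cache_recurrent::s_mask().
float llama_kv_cache_recurrent::s_mask(int i) const {
    const uint32_t cell_id = i + head;

    kv_cell & cell = const_cast<llama_kv_cache_recurrent *>(this)->cells[i];

    const float res = (float) (cell.src >= 0);

    // only clear once
    if (cell.src < 0) {
        cell.src = cell_id;
    }

    return res;
}
```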
|
||||
@@ -1105,7 +1077,7 @@ ggml_tensor * llm_graph_context::build_inp_cls() const {
|
||||
}
|
||||
|
||||
ggml_tensor * llm_graph_context::build_inp_s_copy() const {
|
||||
const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
|
||||
const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
|
||||
|
||||
auto inp = std::make_unique<llm_graph_input_s_copy>(kv_self);
|
||||
|
||||
@@ -1122,7 +1094,7 @@ ggml_tensor * llm_graph_context::build_inp_s_copy() const {
|
||||
}
|
||||
|
||||
ggml_tensor * llm_graph_context::build_inp_s_mask() const {
|
||||
const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
|
||||
const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
|
||||
|
||||
auto inp = std::make_unique<llm_graph_input_s_mask>(kv_self);
|
||||
|
||||
@@ -1436,8 +1408,6 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||
|
||||
// store to KV cache
|
||||
{
|
||||
GGML_ASSERT(!kv_self->recurrent);
|
||||
|
||||
const auto kv_head = kv_self->head;
|
||||
|
||||
GGML_ASSERT(kv_self->size == n_ctx);
|
||||
@@ -1587,7 +1557,7 @@ ggml_tensor * llm_graph_context::build_copy_mask_state(
|
||||
ggml_tensor * state_mask,
|
||||
int32_t n_state,
|
||||
int32_t n_seqs) const {
|
||||
const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
|
||||
const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
|
||||
|
||||
const auto n_kv = kv_self->n;
|
||||
const auto kv_head = kv_self->head;
|
||||
@@ -1619,7 +1589,7 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_load(
|
||||
ggml_tensor * state_mask,
|
||||
const llama_ubatch & ubatch,
|
||||
int il) const {
|
||||
const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
|
||||
const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
|
||||
|
||||
const auto token_shift_count = hparams.token_shift_count;
|
||||
|
||||
@@ -1640,7 +1610,7 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_store(
|
||||
ggml_tensor * token_shift,
|
||||
const llama_ubatch & ubatch,
|
||||
int il) const {
|
||||
const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
|
||||
const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
|
||||
|
||||
const auto token_shift_count = hparams.token_shift_count;
|
||||
const auto n_embd = hparams.n_embd;
|
||||
|
||||
@@ -19,6 +19,7 @@ struct llama_cparams;
|
||||
|
||||
class llama_memory_i;
|
||||
class llama_kv_cache_unified;
|
||||
class llama_kv_cache_recurrent;
|
||||
|
||||
// certain models (typically multi-modal) can produce different types of graphs
|
||||
enum llm_graph_type {
|
||||
@@ -186,26 +187,26 @@ public:
|
||||
|
||||
class llm_graph_input_s_copy : public llm_graph_input_i {
|
||||
public:
|
||||
llm_graph_input_s_copy(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {}
|
||||
llm_graph_input_s_copy(const llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {}
|
||||
virtual ~llm_graph_input_s_copy() = default;
|
||||
|
||||
void set_input(const llama_ubatch * ubatch) override;
|
||||
|
||||
ggml_tensor * s_copy; // I32 [kv_size]
|
||||
|
||||
const llama_kv_cache_unified * kv_self;
|
||||
const llama_kv_cache_recurrent * kv_self;
|
||||
};
|
||||
|
||||
class llm_graph_input_s_mask : public llm_graph_input_i {
|
||||
public:
|
||||
llm_graph_input_s_mask(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {}
|
||||
llm_graph_input_s_mask(const llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {}
|
||||
virtual ~llm_graph_input_s_mask() = default;
|
||||
|
||||
void set_input(const llama_ubatch * ubatch) override;
|
||||
|
||||
ggml_tensor * s_mask; // F32 [1, n_kv]
|
||||
|
||||
const llama_kv_cache_unified * kv_self;
|
||||
const llama_kv_cache_recurrent * kv_self;
|
||||
};
|
||||
|
||||
class llm_graph_input_cross_embd : public llm_graph_input_i {
|
||||
@@ -350,8 +351,8 @@ struct llm_graph_params {
|
||||
const llama_cparams & cparams;
|
||||
const llama_ubatch & ubatch;
|
||||
|
||||
ggml_backend_sched * sched;
|
||||
ggml_backend * backend_cpu;
|
||||
ggml_backend_sched_t sched;
|
||||
ggml_backend_t backend_cpu;
|
||||
|
||||
const llama_adapter_cvec * cvec;
|
||||
const llama_adapter_loras * loras;
|
||||
@@ -402,9 +403,9 @@ struct llm_graph_context {
|
||||
|
||||
ggml_context * ctx0 = nullptr;
|
||||
|
||||
ggml_backend_sched * sched;
|
||||
ggml_backend_sched_t sched;
|
||||
|
||||
ggml_backend * backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?
|
||||
ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?
|
||||
|
||||
const llama_adapter_cvec * cvec;
|
||||
const llama_adapter_loras * loras;
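The `sched`/`backend_cpu` changes above swap raw struct pointers for ggml's public opaque handle typedefs. As far as ggml-backend.h is concerned, those typedefs are plain pointer aliases along these lines (paraphrased, not copied verbatim from the header), so the swap is type-equivalent:

```cpp
// Opaque handle typedefs as exposed by ggml-backend.h (paraphrased).
typedef struct ggml_backend       * ggml_backend_t;
typedef struct ggml_backend_sched * ggml_backend_sched_t;
```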
|
||||
|
||||
(file diff suppressed because it is too large)
@@ -2,32 +2,72 @@
|
||||
|
||||
#include "llama.h"
|
||||
#include "llama-io.h"
|
||||
#include "llama-graph.h"
|
||||
#include "llama-memory.h"
|
||||
|
||||
#include "ggml-cpp.h"
|
||||
|
||||
#include <functional>
|
||||
#include <set>
|
||||
#include <vector>
|
||||
|
||||
struct llama_cparams;
|
||||
struct llama_hparams;
|
||||
struct llama_ubatch;
|
||||
struct llama_sbatch;
|
||||
struct llama_model;
|
||||
struct llama_context;
|
||||
|
||||
struct llama_kv_cache : public llama_memory_i {
|
||||
using llama_memory_i::llama_memory_i;
|
||||
virtual ~llama_kv_cache() = default;
|
||||
|
||||
virtual void restore() = 0; // call if batch processing fails - restores the cache state
|
||||
virtual void commit() = 0; // call after successful batch processing - clears any pending state
|
||||
// call if batch processing fails - restores the cache state
|
||||
virtual void restore() = 0;
|
||||
|
||||
virtual int32_t get_n_tokens() const = 0;
|
||||
virtual int32_t get_used_cells() const = 0; // TODO: remove, this is too-specific to the unified cache
|
||||
// call after successful batch processing - clears any pending state
|
||||
virtual void commit() = 0;
|
||||
|
||||
virtual bool get_can_shift() const = 0;
|
||||
// process any pending defrag/shift/etc. operations
|
||||
// optionally call once before processing a new batch
|
||||
virtual bool update(llama_context & lctx) = 0;
|
||||
|
||||
// schedule a defrag if the fragmentation threshold is exceeded. otherwise, do nothing
|
||||
virtual void defrag_sched(float thold) = 0;
|
||||
|
||||
// simulate full cache, used for allocating worst-case compute buffers
|
||||
virtual void set_full() = 0;
|
||||
|
||||
//
|
||||
// batch processing
|
||||
//
|
||||
|
||||
virtual llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) = 0;
|
||||
|
||||
// different KV caches require different batch splitting strategies
|
||||
virtual llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const = 0;
|
||||
|
||||
// find an empty slot of size "n_tokens" in the cache
|
||||
virtual bool find_slot(const llama_ubatch & batch) = 0;
|
||||
|
||||
// getters
|
||||
virtual int32_t get_n_tokens() const = 0;
|
||||
virtual int32_t get_used_cells() const = 0; // TODO: remove, this is too-specific to the unified cache
|
||||
virtual llama_pos get_pos_max() const = 0;
|
||||
virtual bool get_can_shift() const = 0;
|
||||
|
||||
bool get_can_edit() const override { return get_can_shift(); }
|
||||
|
||||
//
|
||||
// state write/read
|
||||
//
|
||||
|
||||
virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const = 0;
|
||||
virtual void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) = 0;
|
||||
};
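Taken together, the new pure-virtual methods imply a specific calling sequence around batch processing. The sketch below is a hypothetical driver, not code from the repository: it only uses the methods declared in the interface above and elides graph building entirely.

```cpp
// Hypothetical usage of the llama_kv_cache interface above (types come from
// llama.cpp's internal headers); error handling and graph compute are elided.
static bool decode_one(llama_kv_cache & kv, llama_context & lctx, const llama_batch & batch) {
    kv.update(lctx);  // apply any pending shift/defrag before the new batch

    llama_sbatch sbatch = kv.sbatch_init(batch, /*logits_all=*/false);
    llama_ubatch ubatch = kv.ubatch_next(sbatch, /*n_ubatch=*/512, /*embd_pooled=*/false);

    if (!kv.find_slot(ubatch)) {
        kv.restore();          // failure: roll back pending cell updates
        return false;
    }

    // ... build and compute the graph for `ubatch` here ...

    kv.commit();               // success: keep the cells written by this ubatch
    kv.defrag_sched(0.1f);     // request a defrag once fragmentation exceeds ~10%
    return true;
}
```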
|
||||
|
||||
//
|
||||
// llama_kv_cache_guard
|
||||
//
|
||||
|
||||
struct llama_kv_cache_guard {
|
||||
llama_kv_cache_guard(llama_kv_cache * kv) : kv(kv) {}
|
||||
|
||||
@@ -43,65 +83,50 @@ private:
|
||||
llama_kv_cache * kv;
|
||||
};
|
||||
|
||||
struct llama_kv_cell {
|
||||
llama_pos pos = -1;
|
||||
llama_pos delta = 0;
|
||||
int32_t src = -1; // used by recurrent state models to copy states
|
||||
int32_t tail = -1;
|
||||
//
|
||||
// llama_kv_cache_unified
|
||||
//
|
||||
|
||||
std::set<llama_seq_id> seq_id;
|
||||
|
||||
bool has_seq_id(const llama_seq_id & id) const {
|
||||
return seq_id.find(id) != seq_id.end();
|
||||
}
|
||||
|
||||
bool is_empty() const {
|
||||
return seq_id.empty();
|
||||
}
|
||||
|
||||
bool is_same_seq(const llama_kv_cell & other) const {
|
||||
return seq_id == other.seq_id;
|
||||
}
|
||||
};
|
||||
|
||||
// ring-buffer of cached KV data
|
||||
// TODO: pimpl
|
||||
// TODO: add notion of max sequences
|
||||
class llama_kv_cache_unified : public llama_kv_cache {
|
||||
public:
|
||||
// can be used to query data from the model if needed
|
||||
struct callbacks {
|
||||
std::function<ggml_tensor * (uint32_t n_ctx_per_seq, int il)> get_rope_factors;
|
||||
struct kv_cell {
|
||||
llama_pos pos = -1;
|
||||
llama_pos delta = 0;
|
||||
|
||||
std::set<llama_seq_id> seq_id;
|
||||
|
||||
bool has_seq_id(const llama_seq_id & id) const {
|
||||
return seq_id.find(id) != seq_id.end();
|
||||
}
|
||||
|
||||
bool is_empty() const {
|
||||
return seq_id.empty();
|
||||
}
|
||||
|
||||
bool is_same_seq(const kv_cell & other) const {
|
||||
return seq_id == other.seq_id;
|
||||
}
|
||||
};
|
||||
|
||||
static uint32_t get_padding(const llama_cparams & cparams);
|
||||
|
||||
llama_kv_cache_unified(
|
||||
const llama_hparams & hparams,
|
||||
callbacks cbs);
|
||||
|
||||
virtual ~llama_kv_cache_unified() = default;
|
||||
|
||||
// TODO: become constructor
|
||||
bool init(
|
||||
const llama_model & model, // TODO: do not reference the model
|
||||
const llama_cparams & cparams,
|
||||
const llama_model & model,
|
||||
ggml_type type_k,
|
||||
ggml_type type_v,
|
||||
bool v_trans,
|
||||
bool offload,
|
||||
uint32_t kv_size,
|
||||
bool offload);
|
||||
uint32_t padding);
|
||||
|
||||
int32_t get_n_tokens() const override;
|
||||
int32_t get_used_cells() const override;
|
||||
~llama_kv_cache_unified() = default;
|
||||
|
||||
size_t total_size() const;
|
||||
|
||||
// TODO: better data structures to reduce the cost of this operation
|
||||
llama_pos pos_max() const;
|
||||
//
|
||||
// llama_memory_i
|
||||
//
|
||||
|
||||
void clear() override;
|
||||
void defrag() override;
|
||||
|
||||
virtual void restore() override;
|
||||
virtual void commit() override;
|
||||
|
||||
bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
|
||||
void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
|
||||
@@ -111,63 +136,40 @@ public:
|
||||
|
||||
llama_pos seq_pos_max(llama_seq_id seq_id) const override;
|
||||
|
||||
bool get_can_shift() const override;
|
||||
//
|
||||
// llama_kv_cache
|
||||
//
|
||||
|
||||
void restore() override;
|
||||
void commit() override;
|
||||
|
||||
bool update(llama_context & ctx) override;
|
||||
|
||||
void defrag_sched(float thold) override;
|
||||
|
||||
void set_full() override;
|
||||
|
||||
llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) override;
|
||||
|
||||
llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override;
|
||||
|
||||
// find an empty slot of size "n_tokens" in the cache
|
||||
// updates the cache head
|
||||
// Note: On success, it's important that cache.head points
|
||||
// to the first cell of the slot.
|
||||
bool find_slot(const llama_ubatch & batch);
|
||||
bool find_slot(const llama_ubatch & batch) override;
|
||||
|
||||
// TODO: maybe not needed
|
||||
uint32_t get_padding(const llama_cparams & cparams) const;
|
||||
int32_t get_n_tokens() const override;
|
||||
int32_t get_used_cells() const override;
|
||||
|
||||
// find how many cells are currently in use
|
||||
uint32_t cell_max() const;
|
||||
// TODO: better data structures to reduce the cost of this operation
|
||||
llama_pos get_pos_max() const override;
|
||||
|
||||
size_t size_k_bytes() const;
|
||||
size_t size_v_bytes() const;
|
||||
|
||||
// defrag
|
||||
|
||||
struct {
|
||||
std::vector<uint32_t> ids;
|
||||
} defrag_info;
|
||||
|
||||
// return true if cells have been moved
|
||||
bool defrag_prepare(int32_t n_max_nodes);
|
||||
|
||||
// commit/restore cache
|
||||
|
||||
struct slot_range {
|
||||
uint32_t c0 = 0; // note: these are cell indices, not sequence positions
|
||||
uint32_t c1 = 0;
|
||||
};
|
||||
|
||||
// pending cell updates that are not yet committed
|
||||
struct {
|
||||
std::vector<slot_range> ranges;
|
||||
} pending;
|
||||
bool get_can_shift() const override;
|
||||
|
||||
// state write/load
|
||||
|
||||
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const;
|
||||
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1);
|
||||
|
||||
// members
|
||||
|
||||
const llama_hparams & hparams;
|
||||
|
||||
callbacks cbs;
|
||||
|
||||
bool has_shift = false;
|
||||
bool do_defrag = false;
|
||||
|
||||
// TODO: remove this and implement llama_kv_cache_recurrent instead
|
||||
bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token
|
||||
|
||||
bool v_trans = true; // the value tensor is transposed
|
||||
bool can_shift = false;
|
||||
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
|
||||
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
|
||||
|
||||
// Note: The value of head isn't only used to optimize searching
|
||||
// for a free KV slot. llama_decode_impl also uses it, so it
|
||||
@@ -179,18 +181,213 @@ public:
|
||||
// computed before each graph build
|
||||
uint32_t n = 0;
|
||||
|
||||
std::vector<llama_kv_cell> cells;
|
||||
std::vector<kv_cell> cells;
|
||||
|
||||
std::vector<ggml_tensor *> k_l; // per layer
|
||||
std::vector<ggml_tensor *> v_l;
|
||||
|
||||
private:
|
||||
const llama_model & model;
|
||||
const llama_hparams & hparams;
|
||||
|
||||
bool has_shift = false;
|
||||
bool do_defrag = false;
|
||||
|
||||
bool v_trans = true; // the value tensor is transposed
|
||||
bool can_shift = false;
|
||||
|
||||
// required padding
|
||||
uint32_t padding = 1;
|
||||
|
||||
ggml_type type_k = GGML_TYPE_F16;
|
||||
ggml_type type_v = GGML_TYPE_F16;
|
||||
|
||||
std::vector<ggml_context_ptr> ctxs;
|
||||
std::vector<ggml_backend_buffer_ptr> bufs;
|
||||
|
||||
// defrag
|
||||
struct {
|
||||
std::vector<uint32_t> ids;
|
||||
} defrag_info;
|
||||
|
||||
// return true if cells have been moved
|
||||
bool defrag_prepare(int32_t n_max_nodes);
|
||||
|
||||
// commit/restore cache
|
||||
struct slot_range {
|
||||
uint32_t c0 = 0; // note: these are cell indices, not sequence positions
|
||||
uint32_t c1 = 0;
|
||||
};
|
||||
|
||||
// pending cell updates that are not yet committed
|
||||
struct {
|
||||
std::vector<slot_range> ranges;
|
||||
} pending;
|
||||
|
||||
// find how many cells are currently in use
|
||||
uint32_t cell_max() const;
|
||||
|
||||
size_t total_size() const;
|
||||
|
||||
size_t size_k_bytes() const;
|
||||
size_t size_v_bytes() const;
|
||||
|
||||
ggml_tensor * build_rope_shift(
|
||||
const llama_cparams & cparams,
|
||||
ggml_context * ctx,
|
||||
ggml_tensor * cur,
|
||||
ggml_tensor * shift,
|
||||
ggml_tensor * factors,
|
||||
float freq_base,
|
||||
float freq_scale) const;
|
||||
|
||||
llm_graph_result_ptr build_graph_shift(
|
||||
const llama_cparams & cparams,
|
||||
ggml_context * ctx,
|
||||
ggml_cgraph * gf) const;
|
||||
|
||||
llm_graph_result_ptr build_graph_defrag(
|
||||
const llama_cparams & cparams,
|
||||
ggml_context * ctx,
|
||||
ggml_cgraph * gf) const;
|
||||
|
||||
void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
|
||||
void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;
|
||||
|
||||
bool state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1);
|
||||
bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
|
||||
};
|
||||
|
||||
//
|
||||
// llama_kv_cache_recurrent
|
||||
//
|
||||
|
||||
class llama_kv_cache_recurrent : public llama_kv_cache {
|
||||
public:
|
||||
struct kv_cell {
|
||||
llama_pos pos = -1;
|
||||
int32_t src = -1; // used to copy states
|
||||
int32_t tail = -1;
|
||||
|
||||
std::set<llama_seq_id> seq_id;
|
||||
|
||||
bool has_seq_id(const llama_seq_id & id) const {
|
||||
return seq_id.find(id) != seq_id.end();
|
||||
}
|
||||
|
||||
bool is_empty() const {
|
||||
return seq_id.empty();
|
||||
}
|
||||
|
||||
bool is_same_seq(const kv_cell & other) const {
|
||||
return seq_id == other.seq_id;
|
||||
}
|
||||
};
|
||||
|
||||
llama_kv_cache_recurrent(
|
||||
const llama_model & model,
|
||||
ggml_type type_k,
|
||||
ggml_type type_v,
|
||||
bool offload,
|
||||
uint32_t kv_size);
|
||||
|
||||
~llama_kv_cache_recurrent() = default;
|
||||
|
||||
//
|
||||
// llama_memory_i
|
||||
//
|
||||
|
||||
void clear() override;
|
||||
|
||||
bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
|
||||
void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
|
||||
void seq_keep(llama_seq_id seq_id) override;
|
||||
void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) override;
|
||||
void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override;
|
||||
|
||||
llama_pos seq_pos_max(llama_seq_id seq_id) const override;
|
||||
|
||||
//
|
||||
// llama_kv_cache
|
||||
//
|
||||
|
||||
void restore() override;
|
||||
void commit() override;
|
||||
|
||||
bool update(llama_context & lctx) override;
|
||||
|
||||
void defrag_sched(float thold) override;
|
||||
|
||||
void set_full() override;
|
||||
|
||||
llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) override;
|
||||
|
||||
llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override;
|
||||
|
||||
bool find_slot(const llama_ubatch & batch) override;
|
||||
|
||||
int32_t get_n_tokens() const override;
|
||||
int32_t get_used_cells() const override;
|
||||
|
||||
// TODO: better data structures to reduce the cost of this operation
|
||||
llama_pos get_pos_max() const override;
|
||||
|
||||
bool get_can_shift() const override;
|
||||
|
||||
// TODO: temporary methods - they are not really const as they do const_cast<>, fix this
|
||||
int32_t s_copy(int i) const;
|
||||
float s_mask(int i) const;
|
||||
|
||||
// state write/load
|
||||
|
||||
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
|
||||
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
|
||||
|
||||
// Note: The value of head isn't only used to optimize searching
|
||||
// for a free KV slot. llama_decode_impl also uses it, so it
|
||||
// cannot be freely changed after a slot has been allocated.
|
||||
uint32_t head = 0;
|
||||
uint32_t size = 0;
|
||||
uint32_t used = 0; // used cells (i.e. at least one seq_id)
|
||||
|
||||
// computed before each graph build
|
||||
uint32_t n = 0;
|
||||
|
||||
std::vector<kv_cell> cells;
|
||||
|
||||
std::vector<ggml_tensor *> k_l; // per layer
|
||||
std::vector<ggml_tensor *> v_l;
|
||||
|
||||
private:
|
||||
//const llama_model & model;
|
||||
const llama_hparams & hparams;
|
||||
|
||||
// commit/restore cache
|
||||
// TODO: rework for recurrent cache
|
||||
struct slot_range {
|
||||
uint32_t c0 = 0; // note: these are cell indices, not sequence positions
|
||||
uint32_t c1 = 0;
|
||||
};
|
||||
|
||||
// pending cell updates that are not yet committed
|
||||
struct {
|
||||
std::vector<slot_range> ranges;
|
||||
} pending;
|
||||
|
||||
ggml_type type_k = GGML_TYPE_F16;
|
||||
ggml_type type_v = GGML_TYPE_F16;
|
||||
|
||||
std::vector<ggml_context_ptr> ctxs;
|
||||
std::vector<ggml_backend_buffer_ptr> bufs;
|
||||
|
||||
// find how many cells are currently in use
|
||||
uint32_t cell_max() const;
|
||||
|
||||
size_t total_size() const;
|
||||
|
||||
size_t size_k_bytes() const;
|
||||
size_t size_v_bytes() const;
|
||||
|
||||
void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
|
||||
void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;
|
||||
|
||||
@@ -198,11 +395,6 @@ private:
|
||||
bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
|
||||
};
|
||||
|
||||
// TODO: temporary reusing llama_kv_cache_unified -- implement recurrent cache and simplify llama_kv_cache_unified
|
||||
//class llama_kv_cache_recurrent : public llama_kv_cache_unified {
|
||||
//public:
|
||||
// using llama_kv_cache_unified::llama_kv_cache_unified;
|
||||
//};
|
||||
|
||||
//
|
||||
// kv cache view
|
||||
|
||||
@@ -2,12 +2,22 @@
|
||||
|
||||
#include "llama.h"
|
||||
|
||||
struct llama_memory_params {
|
||||
// kv cache
|
||||
ggml_type type_k;
|
||||
ggml_type type_v;
|
||||
|
||||
// parameters for other types of memory
|
||||
// ...
|
||||
};
|
||||
|
||||
// general concept of LLM memory
|
||||
// the KV cache is a type of LLM memory, but there can be other types
|
||||
class llama_memory_i {
|
||||
public:
|
||||
virtual ~llama_memory_i() = default;
|
||||
|
||||
virtual void clear() = 0;
|
||||
virtual void defrag() = 0;
|
||||
|
||||
virtual bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) = 0;
|
||||
virtual void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) = 0;
|
||||
|
||||
@@ -80,6 +80,7 @@ const char * llm_type_name(llm_type type) {
|
||||
case LLM_TYPE_236B: return "236B";
|
||||
case LLM_TYPE_290B: return "290B";
|
||||
case LLM_TYPE_314B: return "314B";
|
||||
case LLM_TYPE_405B: return "405B";
|
||||
case LLM_TYPE_671B: return "671B";
|
||||
case LLM_TYPE_SMALL: return "0.1B";
|
||||
case LLM_TYPE_MEDIUM: return "0.4B";
|
||||
@@ -582,6 +583,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
switch (hparams.n_layer) {
|
||||
case 32: type = LLM_TYPE_7B; break;
|
||||
case 80: type = LLM_TYPE_70B; break;
|
||||
case 162: type = LLM_TYPE_405B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
@@ -773,6 +775,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
// fall through
|
||||
case LLM_ARCH_QWEN2:
|
||||
{
|
||||
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
switch (hparams.n_layer) {
|
||||
case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break;
|
||||
@@ -1847,7 +1850,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
|
||||
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
||||
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
if (n_ff > 0) {
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
}
|
||||
|
||||
if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
|
||||
layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
|
||||
@@ -1857,9 +1862,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
|
||||
}
|
||||
|
||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
if (n_ff > 0) {
|
||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
}
|
||||
|
||||
// optional MLP bias
|
||||
layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
|
||||
@@ -4445,6 +4452,19 @@ const ggml_tensor * llama_model::get_tensor(const char * name) const {
|
||||
return it->second;
|
||||
}
|
||||
|
||||
ggml_tensor * llama_model::get_rope_factors(uint32_t n_ctx_per_seq, int il) const {
|
||||
// choose long/short freq factors based on the context size
|
||||
if (layers[il].rope_freqs != nullptr) {
|
||||
return layers[il].rope_freqs;
|
||||
}
|
||||
|
||||
if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) {
|
||||
return layers[il].rope_long;
|
||||
}
|
||||
|
||||
return layers[il].rope_short;
|
||||
}
|
||||
|
||||
struct llm_build_llama : public llm_graph_context {
|
||||
llm_build_llama(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
@@ -4485,7 +4505,7 @@ struct llm_build_llama : public llm_graph_context {
|
||||
// self-attention
|
||||
{
|
||||
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
||||
ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
|
||||
ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
|
||||
|
||||
// compute Q and K and RoPE them
|
||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||
@@ -4691,6 +4711,7 @@ struct llm_build_deci : public llm_graph_context {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
const int64_t n_head_kv = hparams.n_head_kv(il);
|
||||
const int64_t n_head = hparams.n_head(il);
|
||||
const int64_t n_ff = hparams.n_ff(il);
|
||||
|
||||
if (n_head == 0) {
|
||||
// attention-free layer of Llama-3_1-Nemotron-51B
|
||||
@@ -4710,7 +4731,7 @@ struct llm_build_deci : public llm_graph_context {
|
||||
} else if (n_head > 0) {
|
||||
// self-attention
|
||||
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
||||
ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
|
||||
ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
|
||||
|
||||
// compute Q and K and RoPE them
|
||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||
@@ -4766,6 +4787,11 @@ struct llm_build_deci : public llm_graph_context {
|
||||
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||
}
|
||||
|
||||
// FFN-free layer of Llama-3_1-Nemotron-Ultra-253B
|
||||
if (n_head == 0 && n_ff == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// For Granite architecture
|
||||
if (hparams.f_residual_scale) {
|
||||
cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
|
||||
@@ -7192,7 +7218,7 @@ struct llm_build_phi3 : public llm_graph_context {
|
||||
// self-attention
|
||||
{
|
||||
// rope freq factors for 128k context
|
||||
ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
|
||||
ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
|
||||
|
||||
ggml_tensor* attn_norm_output = build_norm(inpL,
|
||||
model.layers[il].attn_norm,
|
||||
@@ -7944,7 +7970,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
|
||||
ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
|
||||
ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
|
||||
|
||||
// norm
|
||||
cur = build_norm(inpL,
|
||||
@@ -8711,7 +8737,7 @@ struct llm_build_mamba : public llm_graph_context {
|
||||
ggml_tensor * state_mask,
|
||||
const llama_ubatch & ubatch,
|
||||
int il) const {
|
||||
const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
|
||||
const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
|
||||
|
||||
const auto kv_head = kv_self->head;
|
||||
|
||||
@@ -9012,7 +9038,7 @@ struct llm_build_cohere2 : public llm_graph_context {
|
||||
// self-attention
|
||||
{
|
||||
// rope freq factors for 128k context
|
||||
ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
|
||||
ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
|
||||
|
||||
// compute Q and K and RoPE them
|
||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||
@@ -9950,7 +9976,7 @@ struct llm_build_deepseek : public llm_graph_context {
|
||||
// self-attention
|
||||
{
|
||||
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
||||
ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
|
||||
ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
|
||||
|
||||
// compute Q and K and RoPE them
|
||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||
@@ -11314,7 +11340,7 @@ struct llm_build_exaone : public llm_graph_context {
|
||||
// self-attention
|
||||
{
|
||||
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
||||
ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
|
||||
ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
|
||||
|
||||
// compute Q and K and RoPE them
|
||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||
@@ -11459,7 +11485,7 @@ struct llm_build_rwkv6_base : public llm_graph_context {
|
||||
ggml_tensor * state_mask,
|
||||
const llama_ubatch & ubatch,
|
||||
int il) const {
|
||||
const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
|
||||
const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
|
||||
|
||||
const auto n_tokens = ubatch.n_tokens;
|
||||
const auto n_seqs = ubatch.n_seqs;
|
||||
@@ -11855,7 +11881,7 @@ struct llm_build_rwkv7_base : public llm_graph_context {
|
||||
ggml_tensor *& first_layer_value,
|
||||
const llama_ubatch & ubatch,
|
||||
int il) const {
|
||||
const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
|
||||
const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
|
||||
|
||||
const auto n_tokens = ubatch.n_tokens;
|
||||
const auto n_seqs = ubatch.n_seqs;
|
||||
@@ -12695,7 +12721,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
|
||||
// self-attention
|
||||
{
|
||||
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
||||
ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
|
||||
ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
|
||||
|
||||
// compute Q and K and RoPE them
|
||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||
@@ -12815,7 +12841,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
|
||||
}
|
||||
};
|
||||
|
||||
llama_memory_i * llama_model::create_memory() const {
|
||||
llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
|
||||
llama_memory_i * res;
|
||||
|
||||
switch (arch) {
|
||||
@@ -12825,26 +12851,29 @@ llama_memory_i * llama_model::create_memory() const {
|
||||
case LLM_ARCH_RWKV7:
|
||||
case LLM_ARCH_ARWKV7:
|
||||
{
|
||||
res = new llama_kv_cache_unified(hparams, {
|
||||
/*.get_rope_factors =*/ nullptr
|
||||
});
|
||||
res = new llama_kv_cache_recurrent(
|
||||
*this,
|
||||
GGML_TYPE_F32,
|
||||
GGML_TYPE_F32,
|
||||
cparams.offload_kqv,
|
||||
std::max((uint32_t) 1, cparams.n_seq_max));
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
res = new llama_kv_cache_unified(hparams, {
|
||||
/*.get_rope_factors =*/ [this](uint32_t n_ctx_per_seq, int il) {
|
||||
// choose long/short freq factors based on the context size
|
||||
if (layers[il].rope_freqs != nullptr) {
|
||||
return layers[il].rope_freqs;
|
||||
}
|
||||
const auto padding = llama_kv_cache_unified::get_padding(cparams);
|
||||
|
||||
if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) {
|
||||
return layers[il].rope_long;
|
||||
}
|
||||
cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
|
||||
|
||||
return layers[il].rope_short;
|
||||
}
|
||||
});
|
||||
LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
|
||||
|
||||
res = new llama_kv_cache_unified(
|
||||
*this,
|
||||
params.type_k,
|
||||
params.type_v,
|
||||
!cparams.flash_attn,
|
||||
cparams.offload_kqv,
|
||||
cparams.n_ctx,
|
||||
padding);
|
||||
}
|
||||
}
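Two sizing decisions are visible in this hunk: the recurrent cache is sized by `n_seq_max` rather than by `n_ctx`, while the unified cache pads `n_ctx` up to the cache's required alignment before construction. The padding rule is the usual round-up-to-multiple; a small self-contained illustration follows (the value 256 is only an assumption, not taken from the diff):

```cpp
#include <cstdint>

// Same rounding rule as ggml's GGML_PAD macro: round x up to a multiple of n.
constexpr uint32_t pad_to(uint32_t x, uint32_t n) {
    return ((x + n - 1) / n) * n;
}

static_assert(pad_to(1000, 256) == 1024, "n_ctx is rounded up to the cache padding");
static_assert(pad_to(4096, 256) == 4096, "already-aligned sizes are unchanged");
```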
|
||||
|
||||
@@ -13226,8 +13255,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
case LLM_ARCH_DECI:
|
||||
case LLM_ARCH_BAICHUAN:
|
||||
case LLM_ARCH_STARCODER:
|
||||
case LLM_ARCH_PLAMO:
|
||||
case LLM_ARCH_ORION:
|
||||
case LLM_ARCH_INTERNLM2:
|
||||
case LLM_ARCH_MINICPM:
|
||||
case LLM_ARCH_XVERSE:
|
||||
@@ -13265,6 +13292,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
case LLM_ARCH_PHI2:
|
||||
case LLM_ARCH_PHI3:
|
||||
case LLM_ARCH_PHIMOE:
|
||||
case LLM_ARCH_PLAMO:
|
||||
case LLM_ARCH_GEMMA:
|
||||
case LLM_ARCH_GEMMA2:
|
||||
case LLM_ARCH_GEMMA3:
|
||||
@@ -13272,6 +13300,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
case LLM_ARCH_OPENELM:
|
||||
case LLM_ARCH_GPTNEOX:
|
||||
case LLM_ARCH_CODESHELL:
|
||||
case LLM_ARCH_ORION:
|
||||
case LLM_ARCH_NEMOTRON:
|
||||
case LLM_ARCH_EXAONE:
|
||||
case LLM_ARCH_MINICPM3:
|
||||
|
||||
@@ -76,6 +76,7 @@ enum llm_type {
|
||||
LLM_TYPE_236B,
|
||||
LLM_TYPE_290B,
|
||||
LLM_TYPE_314B,
|
||||
LLM_TYPE_405B,
|
||||
LLM_TYPE_671B,
|
||||
LLM_TYPE_SMALL,
|
||||
LLM_TYPE_MEDIUM,
|
||||
@@ -395,8 +396,11 @@ struct llama_model {
|
||||
|
||||
const struct ggml_tensor * get_tensor(const char * name) const;
|
||||
|
||||
ggml_tensor * get_rope_factors(uint32_t n_ctx_per_seq, int il) const;
|
||||
|
||||
// note: can mutate `cparams`
|
||||
// TODO: move this to new llm_arch_model_i interface
|
||||
llama_memory_i * create_memory() const; // TODO: params
|
||||
llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const;
|
||||
|
||||
// TODO: move this to new llm_arch_model_i interface
|
||||
llm_graph_result_ptr build_graph(
|
||||
|
||||
@@ -111,7 +111,7 @@ if (NOT WIN32)
|
||||
# TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
|
||||
if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
|
||||
llama_build_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
|
||||
target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../examples/server)
|
||||
target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../tools/server)
|
||||
endif()
|
||||
|
||||
llama_build(test-quantize-stats.cpp)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
import { readFileSync } from "fs"
|
||||
import { SchemaConverter } from "../examples/server/public_legacy/json-schema-to-grammar.mjs"
|
||||
import { SchemaConverter } from "../tools/server/public_legacy/json-schema-to-grammar.mjs"
|
||||
|
||||
const [, , file] = process.argv
|
||||
const url = `file://${file}`
|
||||
|
||||
@@ -2765,6 +2765,48 @@ struct test_im2col : public test_case {
|
||||
}
|
||||
};
|
||||
|
||||
// GGML_OP_CONV_2D_DW
|
||||
struct test_conv_2d_dw : public test_case {
|
||||
const std::array<int64_t, 4> ne_input;
|
||||
const std::array<int64_t, 4> ne_kernel;
|
||||
const int stride;
|
||||
const int padding;
|
||||
const int dilation;
|
||||
const bool cwhn;
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR6(ne_input, ne_kernel, stride, padding, dilation, cwhn);
|
||||
}
|
||||
|
||||
test_conv_2d_dw(std::array<int64_t, 4> ne_input = {64, 64, 16, 1},
|
||||
std::array<int64_t, 4> ne_kernel = {3, 3, 1, 16},
|
||||
int stride = 1, int padding = 0, int dilation = 1, bool cwhn = false)
|
||||
: ne_input(ne_input), ne_kernel(ne_kernel), stride(stride), padding(padding), dilation(dilation), cwhn(cwhn) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data());
|
||||
ggml_set_name(input, "input");
|
||||
|
||||
ggml_tensor * kernel = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_kernel.data());
|
||||
ggml_set_name(kernel, "kernel");
|
||||
|
||||
if (cwhn) {
|
||||
// change memory layout to channel-most-contiguous (CWHN),
|
||||
// then permute it back so NE matches the original input
|
||||
input = ggml_cont(ctx, ggml_permute(ctx, input, 1, 2, 0, 3));
|
||||
input = ggml_permute(ctx, input, 2, 0, 1, 3);
|
||||
kernel = ggml_cont(ctx, ggml_permute(ctx, kernel, 2, 3, 1, 0));
|
||||
kernel = ggml_permute(ctx, kernel, 3, 2, 0, 1);
|
||||
}
|
||||
|
||||
ggml_tensor * out = ggml_conv_2d_dw_direct(
|
||||
ctx, kernel, input,
|
||||
stride, stride, padding, padding, dilation, dilation);
|
||||
ggml_set_name(out, "out");
|
||||
return out;
|
||||
}
|
||||
};
|
||||
|
||||
// GGML_OP_CONCAT
|
||||
struct test_concat : public test_case {
|
||||
const ggml_type type;
|
||||
@@ -3975,6 +4017,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
// test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {1024, 1024, 256, 1}, {3, 3, 256, 1}, 1, 1, 1, 1, 1, 1, true));
|
||||
// test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {1024, 1024, 256, 1}, {3, 3, 256, 1}, 1, 1, 1, 1, 1, 1, true));
|
||||
|
||||
test_cases.emplace_back(new test_conv_2d_dw({17, 34, 9, 1}, {3, 3, 1, 9}, 1, 0, 1, false));
|
||||
test_cases.emplace_back(new test_conv_2d_dw({17, 34, 9, 1}, {3, 3, 1, 9}, 1, 0, 1, true));
|
||||
test_cases.emplace_back(new test_conv_2d_dw({32, 8, 64, 1}, {3, 3, 1, 64}, 2, 1, 1, false));
|
||||
test_cases.emplace_back(new test_conv_2d_dw({32, 8, 64, 1}, {3, 3, 1, 64}, 2, 1, 1, true));
|
||||
|
||||
test_cases.emplace_back(new test_conv_transpose_1d());
|
||||
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 3, 0, 1));
|
||||
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 2, 0, 1));
|
||||
@@ -4549,6 +4596,9 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
|
||||
}
|
||||
}
|
||||
|
||||
test_cases.emplace_back(new test_conv_2d_dw({512, 512, 256, 1}, {3, 3, 1, 256}, 1, 1, 1, false));
|
||||
test_cases.emplace_back(new test_conv_2d_dw({512, 512, 256, 1}, {3, 3, 1, 256}, 1, 1, 1, true));
|
||||
|
||||
return test_cases;
|
||||
}
|
||||
|
||||
|
||||
@@ -187,15 +187,14 @@ int main(void) {
|
||||
/* .bos_token= */ "",
|
||||
/* .eos_token= */ "",
|
||||
},
|
||||
// TODO @ngxson : GLMEdge produces poor result without `[gMASK]<sop>`, so we're temporarily using GLM4 template for it. We should fix this in the future.
|
||||
// {
|
||||
// /* .name= */ "GLMEdge",
|
||||
// /* .template_str= */ "{% for item in messages %}{% if item['role'] == 'system' %}<|system|>\n{{ item['content'] }}{% elif item['role'] == 'user' %}<|user|>\n{{ item['content'] }}{% elif item['role'] == 'assistant' %}<|assistant|>\n{{ item['content'] }}{% endif %}{% endfor %}<|assistant|>",
|
||||
// /* .expected_output= */ "<|system|>\nYou are a helpful assistant<|user|>\nHello<|assistant|>\nHi there<|user|>\nWho are you<|assistant|>\n I am an assistant <|user|>\nAnother question<|assistant|>",
|
||||
// /* .expected_output_jinja= */ "<|system|>\nYou are a helpful assistant<|user|>\nHello<|assistant|>\nHi there<|user|>\nWho are you<|assistant|>\n I am an assistant <|user|>\nAnother question<|assistant|>",
|
||||
// /* .bos_token= */ "",
|
||||
// /* .eos_token= */ "",
|
||||
// },
|
||||
{
|
||||
/* .name= */ "GLMEdge",
|
||||
/* .template_str= */ "{% for item in messages %}{% if item['role'] == 'system' %}<|system|>\n{{ item['content'] }}{% elif item['role'] == 'user' %}<|user|>\n{{ item['content'] }}{% elif item['role'] == 'assistant' %}<|assistant|>\n{{ item['content'] }}{% endif %}{% endfor %}<|assistant|>",
|
||||
/* .expected_output= */ "<|system|>\nYou are a helpful assistant<|user|>\nHello<|assistant|>\nHi there<|user|>\nWho are you<|assistant|>\n I am an assistant <|user|>\nAnother question<|assistant|>",
|
||||
/* .expected_output_jinja= */ "<|system|>\nYou are a helpful assistant<|user|>\nHello<|assistant|>\nHi there<|user|>\nWho are you<|assistant|>\n I am an assistant <|user|>\nAnother question<|assistant|>",
|
||||
/* .bos_token= */ "",
|
||||
/* .eos_token= */ "",
|
||||
},
|
||||
{
|
||||
/* .name= */ "MiniCPM-3B-OpenHermes-2.5-v2-GGUF",
|
||||
/* .template_str= */ U8C("{% for message in messages %}{% if message['role'] == 'user' %}{{'<用户>' + message['content'].strip() + '<AI>'}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}"),
|
||||
|
||||
tools/CMakeLists.txt (new file, 39 lines)
@@ -0,0 +1,39 @@
|
||||
# dependencies
|
||||
|
||||
find_package(Threads REQUIRED)
|
||||
|
||||
# third-party
|
||||
|
||||
# ...
|
||||
|
||||
# flags
|
||||
|
||||
llama_add_compile_flags()
|
||||
|
||||
# tools
|
||||
|
||||
if (EMSCRIPTEN)
|
||||
else()
|
||||
add_subdirectory(batched-bench)
|
||||
add_subdirectory(gguf-split)
|
||||
add_subdirectory(imatrix)
|
||||
add_subdirectory(llama-bench)
|
||||
add_subdirectory(main)
|
||||
add_subdirectory(perplexity)
|
||||
add_subdirectory(quantize)
|
||||
if (LLAMA_BUILD_SERVER)
|
||||
add_subdirectory(server)
|
||||
endif()
|
||||
add_subdirectory(run)
|
||||
add_subdirectory(tokenize)
|
||||
add_subdirectory(tts)
|
||||
if (NOT GGML_BACKEND_DL)
|
||||
# these examples use the backends directly and cannot be built with dynamic loading
|
||||
add_subdirectory(cvector-generator)
|
||||
add_subdirectory(export-lora)
|
||||
add_subdirectory(llava)
|
||||
if (GGML_RPC)
|
||||
add_subdirectory(rpc)
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
@@ -1,4 +1,4 @@
|
||||
# llama.cpp/examples/imatrix
|
||||
# llama.cpp/tools/imatrix
|
||||
|
||||
Compute an importance matrix for a model and given text dataset. Can be used during quantization to enhance the quality of the quantized models.
|
||||
More information is available here: https://github.com/ggml-org/llama.cpp/pull/4861
|
||||
@@ -46,7 +46,7 @@ private:
|
||||
common_params m_params;
|
||||
std::mutex m_mutex;
|
||||
int m_last_call = 0;
|
||||
std::vector<float> m_src1_data;
|
||||
std::vector<char> m_src1_data;
|
||||
std::vector<char> m_ids; // the expert ids from ggml_mul_mat_id
|
||||
};
|
||||
|
||||
@@ -93,11 +93,13 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
|
||||
const bool is_host = ggml_backend_buffer_is_host(src1->buffer);
|
||||
|
||||
if (!is_host) {
|
||||
m_src1_data.resize(ggml_nelements(src1));
|
||||
ggml_backend_tensor_get(src1, m_src1_data.data(), 0, ggml_nbytes(src1));
|
||||
const size_t src1_nbytes = ggml_nbytes(src1);
|
||||
m_src1_data.resize(src1_nbytes);
|
||||
ggml_backend_tensor_get(src1, m_src1_data.data(), 0, src1_nbytes);
|
||||
}
|
||||
|
||||
const float * data = is_host ? (const float *) src1->data : m_src1_data.data();
|
||||
const char * data = is_host ? (const char *) src1->data : m_src1_data.data();
|
||||
GGML_ASSERT(src1->nb[0] == ggml_element_size(src1));
|
||||
|
||||
// this has been adapted to the new format of storing merged experts in a single 3d tensor
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/6387
|
||||
@@ -144,7 +146,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
|
||||
|
||||
const int64_t i11 = idx % src1->ne[1];
|
||||
const int64_t i12 = row;
|
||||
const float * x = (const float *)((const char *)data + i11*src1->nb[1] + i12*src1->nb[2]);
|
||||
const float * x = (const float *)(data + i11*src1->nb[1] + i12*src1->nb[2]);
|
||||
|
||||
for (int j = 0; j < (int)src1->ne[0]; ++j) {
|
||||
e.values[e_start + j] += x[j]*x[j];
|
||||
@@ -180,7 +182,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
|
||||
++e.ncall;
|
||||
LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
|
||||
for (int row = 0; row < (int)src1->ne[1]; ++row) {
|
||||
const float * x = data + row * src1->ne[0];
|
||||
const float * x = (const float *) (data + row * src1->nb[1]);
|
||||
for (int j = 0; j < (int)src1->ne[0]; ++j) {
|
||||
e.values[j] += x[j]*x[j];
|
||||
e.counts[j]++;
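The collector now keeps `src1` as raw bytes, so row addressing must go through the tensor's byte strides (`nb`) instead of assuming densely packed floats. A minimal sketch of the addressing rule, using a hypothetical trimmed-down tensor view rather than the real `ggml_tensor`:

```cpp
#include <cstddef>
#include <cstdint>

// Hypothetical 2D view mirroring ggml_tensor's fields: ne[] = elements per
// dimension, nb[] = byte stride per dimension.
struct tensor_view {
    const char * data;
    int64_t      ne[2];
    size_t       nb[2];
};

// Row i1 as floats, addressed the same way the collector does above:
// base pointer + i1 * nb[1], then reinterpret as float.
static const float * row_f32(const tensor_view & t, int64_t i1) {
    return reinterpret_cast<const float *>(t.data + i1 * t.nb[1]);
}
```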
|
||||
@@ -1,4 +1,4 @@
|
||||
# llama.cpp/examples/llama-bench
|
||||
# llama.cpp/tools/llama-bench
|
||||
|
||||
Performance testing tool for llama.cpp.
|
||||
|
||||
@@ -35,6 +35,16 @@ llama-mtmd-cli -hf ggml-org/SmolVLM2-500M-Video-Instruct-GGUF
|
||||
# Pixtral 12B
|
||||
llama-mtmd-cli -hf ggml-org/pixtral-12b-GGUF
|
||||
|
||||
# Qwen 2 VL
|
||||
llama-mtmd-cli -hf ggml-org/Qwen2-VL-2B-Instruct-GGUF
|
||||
llama-mtmd-cli -hf ggml-org/Qwen2-VL-7B-Instruct-GGUF
|
||||
|
||||
# Qwen 2.5 VL
|
||||
llama-mtmd-cli -hf ggml-org/Qwen2.5-VL-3B-Instruct-GGUF
|
||||
llama-mtmd-cli -hf ggml-org/Qwen2.5-VL-7B-Instruct-GGUF
|
||||
llama-mtmd-cli -hf ggml-org/Qwen2.5-VL-32B-Instruct-GGUF
|
||||
llama-mtmd-cli -hf ggml-org/Qwen2.5-VL-72B-Instruct-GGUF
|
||||
|
||||
# Mistral Small 3.1 24B (IQ2_M quantization)
|
||||
llama-mtmd-cli -hf ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF --chat-template mistral-v7
|
||||
```
|
||||
@@ -60,7 +70,17 @@ Built upon `clip.cpp` (similar to `llava.cpp`), `libmtmd` offers several advanta
|
||||
|
||||
## How to obtain `mmproj`
|
||||
|
||||
Multimodal projector (`mmproj`) files are specific to each model architecture. Please refer to the relevant guide for instructions on how to obtain or create them:
|
||||
Multimodal projector (`mmproj`) files are specific to each model architecture.
|
||||
|
||||
For the following models, you can use `convert_hf_to_gguf.py` with the `--mmproj` flag to get the `mmproj` file:
|
||||
- [Gemma 3](https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d) - Note: 1B variant does not have vision support
|
||||
- SmolVLM (from [HuggingFaceTB](https://huggingface.co/HuggingFaceTB))
|
||||
- SmolVLM2 (from [HuggingFaceTB](https://huggingface.co/HuggingFaceTB))
|
||||
- [Pixtral 12B](https://huggingface.co/mistral-community/pixtral-12b) - only works with `transformers`-compatible checkpoint
|
||||
- Qwen 2 VL and Qwen 2.5 VL (from [Qwen](https://huggingface.co/Qwen))
|
||||
- [Mistral Small 3.1 24B](https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503)
|
||||
|
||||
For older models, please refer to the relevant guide for instructions on how to obtain or create them:
|
||||
|
||||
- [LLaVA](../../docs/multimodal/llava.md)
|
||||
- [MobileVLM](../../docs/multimodal/MobileVLM.md)
|
||||
@@ -70,10 +90,3 @@ Multimodal projector (`mmproj`) files are specific to each model architecture. P
|
||||
- [MiniCPM-o 2.6](../../docs/multimodal/minicpmo2.6.md)
|
||||
- [IBM Granite Vision](../../docs/multimodal/granitevision.md)
|
||||
- [Google Gemma 3](../../docs/multimodal/gemma3.md)
|
||||
|
||||
For the following models, you can use `convert_hf_to_gguf.py`with `--mmproj` flag to get the `mmproj` file:
|
||||
- [Gemma 3](https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d) - Note: 1B variant does not have vision support
|
||||
- SmolVLM (from [HuggingFaceTB](https://huggingface.co/HuggingFaceTB))
|
||||
- SmolVLM2 (from [HuggingFaceTB](https://huggingface.co/HuggingFaceTB))
|
||||
- [Pixtral 12B](https://huggingface.co/mistral-community/pixtral-12b) - only works with `transformers`-compatible checkpoint
|
||||
- [Mistral Small 3.1 24B](https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503)
|
||||
@@ -75,6 +75,8 @@
|
||||
#define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3
|
||||
#define TN_MM_PATCH_MERGER "mm.patch_merger.weight" // mistral small 3.1
|
||||
#define TN_TOK_IMG_BREAK "v.token_embd.img_break" // pixtral
|
||||
#define TN_TOK_GLM_BOI "adapter.boi" // glm-edge (these embeddings are not in text model)
|
||||
#define TN_TOK_GLM_EOI "adapter.eoi" // glm-edge (these embeddings are not in text model)
|
||||
|
||||
// mimicpmv
|
||||
#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
|
||||
@@ -249,9 +249,11 @@ struct clip_vision_model {
|
||||
struct ggml_tensor * mm_4_w = nullptr;
|
||||
struct ggml_tensor * mm_4_b = nullptr;
|
||||
|
||||
//GLMV-Edge projection
|
||||
// GLMV-Edge projection
|
||||
struct ggml_tensor * mm_model_adapter_conv_w = nullptr;
|
||||
struct ggml_tensor * mm_model_adapter_conv_b = nullptr;
|
||||
struct ggml_tensor * mm_glm_tok_boi = nullptr;
|
||||
struct ggml_tensor * mm_glm_tok_eoi = nullptr;
|
||||
|
||||
// MobileVLM projection
|
||||
struct ggml_tensor * mm_model_mlp_1_w = nullptr;
|
||||
@@ -1559,6 +1561,13 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
|
||||
embeddings = ggml_mul(ctx0, embeddings,x);
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings);
|
||||
}
|
||||
// arrangement of BOI/EOI token embeddings
|
||||
// note: these embeddings are not present in text model, hence we cannot process them as text tokens
|
||||
// see: https://huggingface.co/THUDM/glm-edge-v-2b/blob/main/siglip.py#L53
|
||||
{
|
||||
embeddings = ggml_concat(ctx0, model.mm_glm_tok_boi, embeddings, 1); // BOI
|
||||
embeddings = ggml_concat(ctx0, embeddings, model.mm_glm_tok_eoi, 1); // EOI
|
||||
}
|
||||
}
|
||||
|
||||
else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) {
|
||||
@@ -1972,12 +1981,14 @@ struct clip_model_loader {
|
||||
{
|
||||
vision_model.mm_model_adapter_conv_w = get_tensor(string_format(TN_GLM_ADAPER_CONV, "weight"));
|
||||
vision_model.mm_model_adapter_conv_b = get_tensor(string_format(TN_GLM_ADAPER_CONV, "bias"));
|
||||
vision_model.mm_model_mlp_0_w = get_tensor(string_format(TN_GLM_ADAPTER_LINEAR,"weight"));
|
||||
vision_model.mm_model_ln_q_w = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1,"weight"));
|
||||
vision_model.mm_model_ln_q_b = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1,"bias"));
|
||||
vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H,"weight"));
|
||||
vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE,"weight"));
|
||||
vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H,"weight"));
|
||||
vision_model.mm_model_mlp_0_w = get_tensor(string_format(TN_GLM_ADAPTER_LINEAR, "weight"));
|
||||
vision_model.mm_model_ln_q_w = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1, "weight"));
|
||||
vision_model.mm_model_ln_q_b = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1, "bias"));
|
||||
vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H, "weight"));
|
||||
vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE, "weight"));
|
||||
vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H, "weight"));
|
||||
vision_model.mm_glm_tok_boi = get_tensor(string_format(TN_TOK_GLM_BOI, "weight"));
|
||||
vision_model.mm_glm_tok_eoi = get_tensor(string_format(TN_TOK_GLM_EOI, "weight"));
|
||||
} break;
|
||||
case PROJECTOR_TYPE_QWEN2VL:
|
||||
case PROJECTOR_TYPE_QWEN25VL:
|
||||
@@ -2948,6 +2959,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
||||
|
||||
if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2 || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
|
||||
n_patches /= 4;
|
||||
n_patches += 2; // for BOI and EOI token embeddings
|
||||
} else if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) {
|
||||
if (ctx->minicpmv_version == 2) {
|
||||
n_patches = 96;
|
||||
Some files were not shown because too many files have changed in this diff.