Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2026-04-23 16:37:33 +03:00)

Compare commits

21 Commits

| SHA1 |
|---|
| 056b50c319 |
| f2c72b8f1f |
| ec54ac13a8 |
| 80322ebdaf |
| 44c51e526b |
| 1922f87c2f |
| 345de3cd87 |
| 9c600bcd4b |
| b2704f9028 |
| 3fab96cd04 |
| 914eb5ff0c |
| 8fc17493c3 |
| 36dafba5c4 |
| 69e0ecef06 |
| 062cca58fc |
| 406f4e3f61 |
| 53dc8b59bf |
| 403c9c9cef |
| 8fc85db9d2 |
| 3a60d06ad9 |
| abd86ef175 |
83 .github/workflows/build-android.yml (vendored)
@@ -40,13 +40,9 @@ jobs:
steps:
- name: Clone
uses: actions/checkout@v6

# Disabled due to size (400MB) and always 0 cache hits
# - name: ccache
# uses: ggml-org/ccache-action@v1.2.16
# with:
# key: android-build
# evict-old-files: 1d
with:
fetch-depth: 0
lfs: false

- name: Set up JDK
uses: actions/setup-java@v5
@@ -66,10 +62,11 @@ jobs:

android-ndk:
runs-on: ubuntu-latest

env:
OPENCL_VERSION: 2025.07.22

container:
image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.3'
defaults:
run:
shell: bash
strategy:
matrix:
include:
@@ -82,59 +79,23 @@ jobs:
- name: Clone
id: checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
lfs: false

- name: Install OpenCL Headers and Libs
id: install_opencl
if: ${{ matrix.build == 'arm64-snapdragon' }}
run: |
mkdir opencl
curl -L -o opencl/clhpp.tar.gz https://github.com/KhronosGroup/OpenCL-CLHPP/archive/refs/tags/v${OPENCL_VERSION}.tar.gz
curl -L -o opencl/headers.tar.gz https://github.com/KhronosGroup/OpenCL-Headers/archive/refs/tags/v${OPENCL_VERSION}.tar.gz
curl -L -o opencl/icd-loader.tar.gz https://github.com/KhronosGroup/OpenCL-ICD-Loader/archive/refs/tags/v${OPENCL_VERSION}.tar.gz
tar -xaf opencl/headers.tar.gz -C opencl
tar -xaf opencl/clhpp.tar.gz -C opencl
tar -xaf opencl/icd-loader.tar.gz -C opencl
sudo cp -r opencl/OpenCL-Headers-${OPENCL_VERSION}/CL ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
sudo cp -r opencl/OpenCL-CLHPP-${OPENCL_VERSION}/include/CL/* ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include/CL
cd opencl/OpenCL-ICD-Loader-${OPENCL_VERSION}
cmake -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -DOPENCL_ICD_LOADER_HEADERS_DIR=${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=31 -DANDROID_STL=c++_shared
cmake --build build
sudo cp build/libOpenCL.so ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
rm -rf opencl

- name: Install Hexagon SDK
id: install_hexsdk
if: ${{ matrix.build == 'arm64-snapdragon' }}
env:
HEXSDK_VER: 6.4.0.2
HEXTLS_VER: 19.0.04
run: |
curl -L -o hex-sdk.tar.gz https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v$HEXSDK_VER/hexagon-sdk-v$HEXSDK_VER-amd64-lnx.tar.xz
mkdir hex-sdk
tar -xaf hex-sdk.tar.gz -C hex-sdk
ls -l hex-sdk
sudo mv hex-sdk /opt/hexagon
echo "HEXAGON_SDK_ROOT=/opt/hexagon/$HEXSDK_VER" >> "$GITHUB_ENV"
echo "HEXAGON_TOOLS_ROOT=/opt/hexagon/$HEXSDK_VER/tools/HEXAGON_Tools/$HEXTLS_VER" >> "$GITHUB_ENV"
echo "DEFAULT_HLOS_ARCH=64" >> "$GITHUB_ENV"
echo "DEFAULT_TOOLS_VARIANT=toolv19" >> "$GITHUB_ENV"
echo "DEFAULT_NO_QURT_INC=0" >> "$GITHUB_ENV"
echo "DEFAULT_DSP_ARCH=v73" >> "$GITHUB_ENV"

- name: Update CMake presets
id: update_presets
if: ${{ matrix.build == 'arm64-snapdragon' }}
run: |
cp docs/backend/snapdragon/CMakeUserPresets.json .

- name: Build
id: ndk_build
- name: Build Llama.CPP for Hexagon Android
id: build_llama_cpp_hexagon_android
run: |
if [[ "${{ matrix.build }}" == "arm64-snapdragon" ]]; then
cp docs/backend/snapdragon/CMakeUserPresets.json .
fi
cmake ${{ matrix.defines }} -B build
cmake --build build
cmake --install build --prefix pkg-adb/llama.cpp

- name: Test
id: cmake_test
run: |
echo "FIXME: test on devices"
- name: Upload Llama.CPP Hexagon Android Build Artifact
if: ${{ always() && steps.build_llama_cpp_hexagon_android.outcome == 'success' }}
uses: actions/upload-artifact@v6
with:
name: llama-cpp-android-${{ matrix.build }}
path: pkg-adb/llama.cpp

109 .github/workflows/build-self-hosted.yml (vendored)
@@ -141,60 +141,61 @@ jobs:
# amd-smi static
# GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

ggml-ci-mac-metal:
runs-on: [self-hosted, macOS, ARM64]

steps:
- name: Clone
id: checkout
uses: actions/checkout@v6

- name: Test
id: ggml-ci
run: |
GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

ggml-ci-mac-webgpu:
runs-on: [self-hosted, macOS, ARM64]

steps:
- name: Clone
id: checkout
uses: actions/checkout@v6

- name: Dawn Dependency
id: dawn-depends
run: |
DAWN_VERSION="v2.0.0"
DAWN_OWNER="reeselevine"
DAWN_REPO="dawn"
DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release"
echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
curl -L -o artifact.zip \
"https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
mkdir dawn
unzip artifact.zip
tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1

- name: Test
id: ggml-ci
run: |
GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

ggml-ci-mac-vulkan:
runs-on: [self-hosted, macOS, ARM64]

steps:
- name: Clone
id: checkout
uses: actions/checkout@v6

- name: Test
id: ggml-ci
run: |
vulkaninfo --summary
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
# TODO: sandbox Mac runners
# ggml-ci-mac-metal:
# runs-on: [self-hosted, macOS, ARM64]
#
# steps:
# - name: Clone
# id: checkout
# uses: actions/checkout@v6
#
# - name: Test
# id: ggml-ci
# run: |
# GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
#
# ggml-ci-mac-webgpu:
# runs-on: [self-hosted, macOS, ARM64]
#
# steps:
# - name: Clone
# id: checkout
# uses: actions/checkout@v6
#
# - name: Dawn Dependency
# id: dawn-depends
# run: |
# DAWN_VERSION="v2.0.0"
# DAWN_OWNER="reeselevine"
# DAWN_REPO="dawn"
# DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release"
# echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
# curl -L -o artifact.zip \
# "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
# mkdir dawn
# unzip artifact.zip
# tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1
#
# - name: Test
# id: ggml-ci
# run: |
# GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
# bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
#
# ggml-ci-mac-vulkan:
# runs-on: [self-hosted, macOS, ARM64]
#
# steps:
# - name: Clone
# id: checkout
# uses: actions/checkout@v6
#
# - name: Test
# id: ggml-ci
# run: |
# vulkaninfo --summary
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

ggml-ci-linux-intel-vulkan:
runs-on: [self-hosted, Linux, Intel]

44 .github/workflows/build.yml (vendored)
@@ -87,7 +87,7 @@ jobs:
-DGGML_METAL_EMBED_LIBRARY=OFF \
-DGGML_METAL_SHADER_DEBUG=ON \
-DGGML_RPC=ON
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
leaks -atExit -- ./build/bin/test-thread-safety -hf ggml-org/gemma-3-270m-qat-GGUF -ngl 99 -p "$(printf 'hello %.0s' {1..128})" -n 16 -c 512 -ub 32 -np 2 -t 2 -lv 1

- name: Test
@@ -124,7 +124,7 @@ jobs:
-DGGML_METAL=OFF \
-DGGML_RPC=ON \
-DCMAKE_OSX_DEPLOYMENT_TARGET=13.3
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

- name: Test
id: cmake_test
@@ -165,8 +165,8 @@ jobs:
id: cmake_build
run: |
export CMAKE_PREFIX_PATH=dawn
cmake -B build -DGGML_WEBGPU=ON -DGGML_METAL=OFF -DGGML_BLAS=OFF
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
cmake -B build -G "Ninja" -DCMAKE_BUILD_TYPE=Release -DGGML_WEBGPU=ON -DGGML_METAL=OFF -DGGML_BLAS=OFF
time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

- name: Test
id: cmake_test
@@ -231,7 +231,7 @@ jobs:
cmake -B build \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_RPC=ON
cmake --build build --config Release -j $(nproc)
time cmake --build build --config Release -j $(nproc)

- name: Test
id: cmake_test
@@ -274,14 +274,16 @@ jobs:
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential libssl-dev
sudo apt-get install build-essential libssl-dev ninja-build

- name: Build
id: cmake_build
run: |
cmake -B build \
-G "Ninja" \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_RPC=ON
cmake --build build --config Release -j $(nproc)
time cmake --build build --config Release -j $(nproc)

- name: Test
id: cmake_test
@@ -300,12 +302,13 @@ jobs:
- name: Dependencies
id: depends
run: |
sudo apt-get install -y glslc libvulkan-dev libssl-dev
sudo apt-get install -y glslc libvulkan-dev libssl-dev ninja-build

- name: Configure
id: cmake_configure
run: |
cmake -B build \
-G "Ninja" \
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
-DGGML_BACKEND_DL=ON \
-DGGML_CPU_ALL_VARIANTS=ON \
@@ -314,7 +317,7 @@ jobs:
- name: Build
id: cmake_build
run: |
cmake --build build -j $(nproc)
time cmake --build build -j $(nproc)

ubuntu-24-webgpu:
runs-on: ubuntu-24.04
@@ -336,7 +339,8 @@ jobs:
run: |
sudo add-apt-repository -y ppa:kisak/kisak-mesa
sudo apt-get update -y
sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev
sudo apt-get install -y build-essential mesa-vulkan-drivers \
libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev

- name: Get latest Vulkan SDK version
id: vulkan_sdk_version
@@ -378,7 +382,7 @@ jobs:
export Dawn_DIR=dawn/lib64/cmake/Dawn
cmake -B build \
-DGGML_WEBGPU=ON
cmake --build build --config Release -j $(nproc)
time cmake --build build --config Release -j $(nproc)

- name: Test
id: cmake_test
@@ -415,11 +419,13 @@ jobs:
run: |
source emsdk/emsdk_env.sh
emcmake cmake -B build-wasm \
-G "Ninja" \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_WEBGPU=ON \
-DLLAMA_OPENSSL=OFF \
-DEMDAWNWEBGPU_DIR=emdawnwebgpu_pkg

cmake --build build-wasm --target test-backend-ops -j $(nproc)
time cmake --build build-wasm --config Release --target test-backend-ops -j $(nproc)

ubuntu-22-hip:
runs-on: ubuntu-22.04
@@ -479,7 +485,7 @@ jobs:
run: |
cmake -B build -S . \
-DGGML_MUSA=ON
cmake --build build --config Release -j $(nproc)
time cmake --build build --config Release -j $(nproc)

ubuntu-22-sycl:
runs-on: ubuntu-22.04
@@ -528,7 +534,7 @@ jobs:
-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx
cmake --build build --config Release -j $(nproc)
time cmake --build build --config Release -j $(nproc)

ubuntu-22-sycl-fp16:
runs-on: ubuntu-22.04
@@ -551,7 +557,7 @@ jobs:
shell: bash
run: |
sudo apt update
sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev
sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev ninja-build

- name: install oneAPI MKL library
shell: bash
@@ -574,11 +580,13 @@ jobs:
run: |
source /opt/intel/oneapi/setvars.sh
cmake -B build \
-G "Ninja" \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DGGML_SYCL_F16=ON
cmake --build build --config Release -j $(nproc)
time cmake --build build --config Release -j $(nproc)

ubuntu-24-openvino:
name: ubuntu-24-openvino-${{ matrix.openvino_device }}
@@ -648,7 +656,7 @@ jobs:
cmake -B build/ReleaseOV -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENVINO=ON
cmake --build build/ReleaseOV --config Release -j $(nproc)
time cmake --build build/ReleaseOV --config Release -j $(nproc)

- name: Test
id: cmake_test
@@ -1039,7 +1047,7 @@ jobs:
-DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14

cmake --build build --config Release -j $(nproc)
time cmake --build build --config Release -j $(nproc)

- name: Test
id: cmake_test

1 .github/workflows/copilot-setup-steps.yml (vendored)
@@ -54,4 +54,3 @@ jobs:
python3 -m venv .venv
source .venv/bin/activate
pip install -r requirements/requirements-all.txt -r tools/server/tests/requirements.txt
pip install flake8 pyright pre-commit

4 .github/workflows/gguf-publish.yml (vendored)
@@ -28,11 +28,11 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v6
with:
python-version: '3.9.x'
python-version: '3.11'
- name: Install dependencies
run: |
cd gguf-py
python -m pip install poetry
python -m pip install poetry==2.3.2
poetry install

- name: Build package

6 .github/workflows/hip-quality-check.yml (vendored)
@@ -8,7 +8,8 @@ on:
paths: [
'.github/workflows/hip-quality-check.yml',
'**/*.cu',
'**/*.cuh'
'**/*.cuh',
'scripts/hip/gcn-cdna-vgpr-check.py'
]

pull_request:
@@ -16,7 +17,8 @@ on:
paths: [
'.github/workflows/hip-quality-check.yml',
'**/*.cu',
'**/*.cuh'
'**/*.cuh',
'scripts/hip/gcn-cdna-vgpr-check.py'
]

concurrency:

45 ci/run.sh
@@ -57,6 +57,13 @@ SRC=`pwd`
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=${LLAMA_FATAL_WARNINGS:-ON} -DLLAMA_OPENSSL=OFF -DGGML_SCHED_NO_REALLOC=ON"
CTEST_EXTRA=""

# Default to use make unless specified for compatibility
CMAKE_GENERATOR="Unix Makefiles"

if [ ! -z "${GG_BUILD_NINJA}" ]; then
CMAKE_GENERATOR="Ninja"
fi

if [ ! -z ${GG_BUILD_METAL} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
fi
@@ -242,13 +249,13 @@ function gg_run_ctest_debug {

set -e

# Check cmake, make and ctest are installed
# Check cmake and ctest are installed
gg_check_build_requirements

(time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
(cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time cmake --build . --config Debug -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log

(time ctest --output-on-failure -L main -E "test-opt|test-backend-ops" ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
(time ctest -C Debug --output-on-failure -L main -E "test-opt|test-backend-ops" ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log

set +e
}
@@ -273,16 +280,16 @@ function gg_run_ctest_release {

set -e

# Check cmake, make and ctest are installed
# Check cmake and ctest are installed
gg_check_build_requirements

(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
(cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time cmake --build . --config Release -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log

if [ -z ${GG_BUILD_LOW_PERF} ]; then
(time ctest --output-on-failure -L 'main|python' ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
(time ctest -C Release --output-on-failure -L 'main|python' ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
else
(time ctest --output-on-failure -L main -E test-opt ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
(time ctest -C Release --output-on-failure -L main -E test-opt ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
fi

set +e
@@ -340,7 +347,7 @@ function gg_run_ctest_with_model_debug {
cd build-ci-debug
set -e

(LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
(LLAMACPP_TEST_MODELFILE="$model" time ctest -C Debug --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log

set +e
cd ..
@@ -353,7 +360,7 @@ function gg_run_ctest_with_model_release {
cd build-ci-release
set -e

(LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
(LLAMACPP_TEST_MODELFILE="$model" time ctest -C Release --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log

# test memory leaks
#if [[ ! -z ${GG_BUILD_METAL} ]]; then
@@ -407,8 +414,8 @@ function gg_run_qwen3_0_6b {

set -e

(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
(cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time cmake --build . --config Release -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log

python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf --outtype f16
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-bf16.gguf --outtype bf16
@@ -556,8 +563,8 @@ function gg_run_embd_bge_small {

set -e

(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
(cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time cmake --build . --config Release -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log

python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

@@ -601,8 +608,8 @@ function gg_run_rerank_tiny {

set -e

(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
(cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time cmake --build . --config Release -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log

python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

@@ -652,10 +659,6 @@ function gg_check_build_requirements {
gg_printf 'cmake not found, please install'
fi

if ! command -v make &> /dev/null; then
gg_printf 'make not found, please install'
fi

if ! command -v ctest &> /dev/null; then
gg_printf 'ctest not found, please install'
fi

@@ -423,6 +423,9 @@ static bool parse_bool_value(const std::string & value) {
static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
common_params & params = ctx_arg.params;

// setup log directly from params.verbosity: see tools/cli/cli.cpp
common_log_set_verbosity_thold(params.verbosity);

std::unordered_map<std::string, std::pair<common_arg *, bool>> arg_to_options;
for (auto & opt : ctx_arg.options) {
for (const auto & arg : opt.args) {
@@ -631,8 +634,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
));
}

common_log_set_verbosity_thold(params.verbosity);

return true;
}

@@ -3244,6 +3245,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
[](common_params & params) {
params.verbosity = INT_MAX;
common_log_set_verbosity_thold(INT_MAX);
}
));
add_opt(common_arg(
@@ -3264,6 +3266,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"(default: %d)\n", params.verbosity),
[](common_params & params, int value) {
params.verbosity = value;
common_log_set_verbosity_thold(value);
}
).set_env("LLAMA_LOG_VERBOSITY"));
add_opt(common_arg(

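These hunks move the verbosity threshold setup to the very start of `common_params_parse_ex` and repeat it inside the `-v` and `--verbosity` callbacks, so log filtering already reflects the requested level while the remaining arguments are still being parsed. A self-contained toy model of that ordering concern (not the real `common/log` API, just an illustration):

```cpp
#include <climits>
#include <cstdio>

// Toy stand-in for the log threshold; messages below it are dropped.
static int g_verbosity_thold = 0;

static void set_verbosity_thold(int v) { g_verbosity_thold = v; }

static void log_dbg(const char * msg) {
    if (g_verbosity_thold >= 1) {
        std::printf("%s\n", msg);
    }
}

int main() {
    // Raise the threshold as soon as "-v" is parsed...
    set_verbosity_thold(INT_MAX);
    // ...so diagnostics emitted while parsing later options are visible.
    // If the threshold were only applied after parsing finished, this
    // message would be dropped.
    log_dbg("parsing --model ...");
    return 0;
}
```
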
@@ -454,7 +454,9 @@ static gguf_split_info get_gguf_split_info(const std::string & path) {
std::smatch m;

std::string prefix = path;
string_remove_suffix(prefix, ".gguf");
if (!string_remove_suffix(prefix, ".gguf")) {
return {};
}

int index = 1;
int count = 1;

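The early return relies on `string_remove_suffix` reporting whether the suffix was actually stripped, so paths that do not end in `.gguf` no longer fall through to the index/count parsing. A minimal sketch of what such a helper's contract looks like (an assumption based on the call site, not the actual implementation in `common`):

```cpp
#include <string>

// Strip `suffix` from `s` in place; report whether it was present.
static bool string_remove_suffix(std::string & s, const std::string & suffix) {
    if (s.size() < suffix.size()) {
        return false;
    }
    if (s.compare(s.size() - suffix.size(), suffix.size(), suffix) != 0) {
        return false;
    }
    s.resize(s.size() - suffix.size());
    return true;
}
```
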
@@ -667,8 +667,9 @@ value macro_statement::execute_impl(context & ctx) {
if (is_stmt<identifier>(this->args[i])) {
// normal parameter
std::string param_name = cast_stmt<identifier>(this->args[i])->val;
JJ_DEBUG(" Binding parameter '%s' to argument of type %s", param_name.c_str(), args.get_pos(i)->type().c_str());
macro_ctx.set_val(param_name, args.get_pos(i));
value param_value = args.get_kwarg_or_pos(param_name, i);
JJ_DEBUG(" Binding parameter '%s' to argument of type %s", param_name.c_str(), param_value->type().c_str());
macro_ctx.set_val(param_name, param_value);
} else if (is_stmt<keyword_argument_expression>(this->args[i])) {
// default argument used as normal parameter
auto kwarg = cast_stmt<keyword_argument_expression>(this->args[i]);
@@ -676,8 +677,9 @@ value macro_statement::execute_impl(context & ctx) {
throw std::runtime_error("Keyword argument key must be an identifier in macro '" + name + "'");
}
std::string param_name = cast_stmt<identifier>(kwarg->key)->val;
JJ_DEBUG(" Binding parameter '%s' to argument of type %s", param_name.c_str(), args.get_pos(i)->type().c_str());
macro_ctx.set_val(param_name, args.get_pos(i));
value param_value = args.get_kwarg_or_pos(param_name, i);
JJ_DEBUG(" Binding parameter '%s' to argument of type %s", param_name.c_str(), param_value->type().c_str());
macro_ctx.set_val(param_name, param_value);
} else {
throw std::runtime_error("Invalid parameter type in macro '" + name + "'");
}

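Switching from `args.get_pos(i)` to `args.get_kwarg_or_pos(param_name, i)` lets a macro parameter be bound either by a keyword argument with a matching name or, failing that, by the positional argument at the same index. A hypothetical sketch of that lookup order (the container names are illustrative, not the engine's actual argument type):

```cpp
#include <map>
#include <string>
#include <vector>

// Hypothetical argument store for a macro call.
template <typename value_t>
struct call_args {
    std::vector<value_t>           positional;
    std::map<std::string, value_t> kwargs;

    // Prefer a keyword argument named `name`; otherwise fall back
    // to the positional argument at index `i`.
    value_t get_kwarg_or_pos(const std::string & name, size_t i) const {
        auto it = kwargs.find(name);
        if (it != kwargs.end()) {
            return it->second;
        }
        return positional.at(i);
    }
};
```
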
@@ -1503,6 +1503,9 @@ class TextModel(ModelBase):
if chkhsh == "e4d54df1ebc1f2b91acd986c5b51aa50837d5faf7c7398e73c1f9e9ee5d19869":
# ref: https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601
res = "kanana2"
if chkhsh == "862f827721df956049dff5ca81a57f29e575280bc622e290d3bf4e35eca29015":
# ref: https://huggingface.co/codefuse-ai/F2LLM-v2-4B
res = "f2llmv2"

if res is None:
logger.warning("\n")
@@ -4572,7 +4575,7 @@ class Qwen2MoeModel(TextModel):
raise ValueError(f"Unprocessed experts: {experts}")


@ModelBase.register("Qwen3ForCausalLM")
@ModelBase.register("Qwen3ForCausalLM", "Qwen3Model")
class Qwen3Model(Qwen2Model):
model_arch = gguf.MODEL_ARCH.QWEN3


@@ -154,6 +154,7 @@ models = [
{"name": "qwen35", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3.5-9B-Instruct", },
{"name": "joyai-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jdopensource/JoyAI-LLM-Flash", },
{"name": "kanana2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601", },
{"name": "f2llmv2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/codefuse-ai/F2LLM-v2-4B", },
]

# some models are known to be broken upstream, so we will skip them as exceptions

@@ -1,6 +1,9 @@
# OpenVINO Backend for llama.cpp
[OpenVINO](https://docs.openvino.ai/) is an open-source toolkit for optimizing and deploying high-performance AI inference, specifically designed for Intel hardware, including CPUs, GPUs, and NPUs, in the cloud, on-premises, and on the edge.
This document describes the [OpenVINO backend for llama.cpp](../../src/ggml-openvino), which enables hardware-accelerated inference on **Intel® CPUs, GPUs, and NPUs** while remaining compatible with the existing **GGUF model ecosystem**. The backend translates GGML compute graphs into OpenVINO graphs and leverages graph compilation, kernel fusion, and device-specific optimizations to improve inference performance on supported Intel hardware.

> [!NOTE]
> Performance and memory optimizations, accuracy validation, broader quantization coverage, and broader operator and model support are work in progress.

[OpenVINO](https://docs.openvino.ai/) is an open-source toolkit for optimizing and deploying high-performance AI inference, specifically designed for Intel hardware, including CPUs, GPUs, and NPUs, in the cloud, on-premises, and on the edge. [OpenVINO backend for llama.cpp](../../src/ggml-openvino) enables hardware-accelerated inference on **Intel® CPUs, GPUs, and NPUs** while remaining compatible with the existing **GGUF model ecosystem**. The backend translates GGML compute graphs into OpenVINO graphs and leverages graph compilation, kernel fusion, and device-specific optimizations to improve inference performance on supported Intel hardware.

The OpenVINO backend is implemented in `ggml/src/ggml-openvino` and provides a translation layer for core GGML operations, replacing the standard GGML graph execution path with Intel's OpenVINO inference engine. This approach allows the same GGUF model file to run on Intel CPUs, Intel GPUs (integrated and discrete), and Intel NPUs without changes to the model or the rest of the llama.cpp stack. When a `ggml_cgraph` is dispatched to the OpenVINO backend, it:

@@ -179,31 +182,73 @@ curl -L https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/resolve/main/L

When using the OpenVINO backend, the first inference token may have slightly higher latency due to on-the-fly conversion to the OpenVINO graph. Subsequent tokens and runs will be faster.

> [!NOTE]
> The default context size is set to the model training context, which may be very large (for example, 131072 for Llama 3.2 1B) and may result in lower performance, especially on edge/laptop devices. Use `-c` to limit the context size in supported llama.cpp tools for better performance, for example `-c 512`.

```bash
# If device is unset or unavailable, defaults to CPU.
# If the system has multiple GPUs, use GPU.0 or GPU.1 to explicitly target a specific GPU.

# Linux
export GGML_OPENVINO_DEVICE=GPU
# Enable stateful execution with GPU device to avoid known stateless execution failures.
export GGML_OPENVINO_STATEFUL_EXECUTION=1
# To run llama-simple:
./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -n 50 "The story of AI is "
# To run in chat mode:
./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf
./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -c 1024
# To run llama-bench, -fa 1 is needed
GGML_OPENVINO_STATEFUL_EXECUTION=1 GGML_OPENVINO_DEVICE=GPU ./build/ReleaseOV/bin/llama-bench -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -fa 1

# NPU: keep context small to avoid failures from very large model context windows.
export GGML_OPENVINO_DEVICE=NPU
./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -c 512

# Windows Command Line
set GGML_OPENVINO_DEVICE=GPU
# Enable stateful execution with GPU device to avoid known stateless execution failures.
set GGML_OPENVINO_STATEFUL_EXECUTION=1
# Windows PowerShell
$env:GGML_OPENVINO_DEVICE = "GPU"
$env:GGML_OPENVINO_STATEFUL_EXECUTION = "1"

# To run llama-simple
build\ReleaseOV\bin\llama-simple.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -n 50 "The story of AI is "
# To run in chat mode:
build\ReleaseOV\bin\llama-cli.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf"
build\ReleaseOV\bin\llama-cli.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -c 1024
# To run llama-bench, -fa 1 is needed
build\ReleaseOV\bin\llama-bench.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -fa 1

# NPU: keep context small to avoid failures from very large model context windows.
# Windows Command Line
set GGML_OPENVINO_DEVICE=NPU
# Windows PowerShell
$env:GGML_OPENVINO_DEVICE = "NPU"
build\ReleaseOV\bin\llama-cli.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -c 512
```
> [!NOTE]
> On systems with multiple GPUs, use `GPU.0` or `GPU.1` to explicitly target a specific GPU. See [OpenVINO GPU Device](https://docs.openvino.ai/2026/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device.html) for more details.

### Known Issues and Current Workarounds

- GPU stateless execution is currently affected by a known issue.
  - Workaround: set `GGML_OPENVINO_STATEFUL_EXECUTION=1` when using the GPU device.
- NPU failures can happen when the context size is too large. Recent llama.cpp behavior may resolve the context size to the model training context (for example, 131072 for Llama 3.2 1B), which is too large for current NPU usage and can also stress laptop CPU/GPU on larger models. To inspect the selected context size, run `llama-cli` or `llama-server` with `-lv 3`.
  - Workaround: explicitly set the context size, for example `-c 1024`, for NPU runs. Performance will be better with a lower context size.
- Additional NPU limitations:
  - Model caching is not yet supported.
  - `llama-server -np > 1` (multiple parallel sequences) is not supported.
  - `llama-perplexity` is only supported with `-b 512` or smaller.
- `--context-shift` with `llama-cli` is currently not supported with the OpenVINO backend across CPU, GPU, and NPU devices.
- Encoder models (embedding, reranking) are not supported with the current OpenVINO backend implementation.
- `-fa 1` is required when running llama-bench with the OpenVINO backend.
  - `GGML_OPENVINO_STATEFUL_EXECUTION=1 GGML_OPENVINO_DEVICE=GPU ./llama-bench -fa 1`
- `llama-server` with the OpenVINO backend supports only one chat session/thread when `GGML_OPENVINO_STATEFUL_EXECUTION=1` is enabled.
- For Intel GPU/NPU detection in containers, the GPU and NPU user-space drivers/libraries must be present inside the image. This will be included in a future PR. Until then, you can use this reference Dockerfile: [openvino.Dockerfile](https://github.com/ravi9/llama.cpp/blob/ov-docker-update/.devops/openvino.Dockerfile)

> [!NOTE]
> The OpenVINO backend is actively under development. Fixes are underway, and this document will continue to be updated as issues are resolved.


### Docker Build

@@ -229,31 +274,42 @@ docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_p
Run llama.cpp with the OpenVINO backend Docker container.
Save sample models in `~/models` as [shown above](#3-download-sample-model). This directory will be mounted into the container in the examples below.

> [!NOTE]
> Intel GPU/NPU detection in containers will be included in a future PR. Until then, you can use this reference Dockerfile: [openvino.Dockerfile](https://github.com/ravi9/llama.cpp/blob/ov-docker-update/.devops/openvino.Dockerfile).

```bash
# Run Docker container
docker run --rm -it -v ~/models:/models llama-openvino:light --no-warmup -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf
docker run --rm -it -v ~/models:/models llama-openvino:light --no-warmup -c 1024 -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf

# With Intel GPU access (iGPU or dGPU)
docker run --rm -it -v ~/models:/models \
--device=/dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
llama-openvino:light --no-warmup -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf
--env=GGML_OPENVINO_DEVICE=GPU --env=GGML_OPENVINO_STATEFUL_EXECUTION=1 \
llama-openvino:light --no-warmup -c 1024 -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf

# With Intel NPU access
docker run --rm -it --env GGML_OPENVINO_DEVICE=NPU -v ~/models:/models \
docker run --rm -it -v ~/models:/models \
--device=/dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
llama-openvino:light --no-warmup -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf
--env=GGML_OPENVINO_DEVICE=NPU \
llama-openvino:light --no-warmup -c 1024 -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf
```

Run Llama.cpp Server with OpenVINO Backend:
Run Llama.cpp Server with OpenVINO Backend.
> [!NOTE]
> `llama-server` with the OpenVINO backend supports only one chat session/thread when `GGML_OPENVINO_STATEFUL_EXECUTION=1` is enabled.

```bash
# Run the Server Docker container
docker run --rm -it -p 8080:8080 -v ~/models:/models llama-openvino:server --no-warmup -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf

# In a NEW terminal, test the server with curl
docker run --rm -it -p 8080:8080 -v ~/models:/models llama-openvino:server --no-warmup -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf -c 1024
# Or using the llama-server executable
./build/ReleaseOV/bin/llama-server -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf --port 8080 -c 1024

# If you are behind a proxy, make sure to set NO_PROXY to bypass the proxy for localhost
export NO_PROXY=localhost,127.0.0.1

# Option 1: Open your browser to http://localhost:8080 to access the web UI for the llama.cpp server.
# Option 2: In a NEW terminal, test the server with curl

# Test health endpoint
curl -f http://localhost:8080/health

@@ -295,6 +351,7 @@ The OpenVINO backend can be configured using the following environment variables
export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache
export GGML_OPENVINO_PROFILING=1
export GGML_OPENVINO_DEVICE=GPU
export GGML_OPENVINO_STATEFUL_EXECUTION=1

./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -n 50 "The story of AI is "

@@ -302,38 +359,27 @@ export GGML_OPENVINO_DEVICE=GPU
set GGML_OPENVINO_CACHE_DIR=C:\tmp\ov_cache
set GGML_OPENVINO_PROFILING=1
set GGML_OPENVINO_DEVICE=GPU
set GGML_OPENVINO_STATEFUL_EXECUTION=1

# Windows PowerShell
$env:GGML_OPENVINO_CACHE_DIR = "C:\tmp\ov_cache"
$env:GGML_OPENVINO_PROFILING = "1"
$env:GGML_OPENVINO_DEVICE = "GPU"
$env:GGML_OPENVINO_STATEFUL_EXECUTION = "1"

build\ReleaseOV\bin\llama-simple.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -n 50 "The story of AI is "

```

#### llama-bench

```bash
# -fa 1 is required when running llama-bench with the OpenVINO backend.
GGML_OPENVINO_DEVICE=GPU ./llama-bench -fa 1
```

### NPU Notes

- Model caching is not yet supported
- Does not support llama-server -np > 1 (multiple parallel sequences)
- Only supports llama-perplexity -b 512 or smaller

## Llama.cpp Tools

The following tools work with the OpenVINO backend on CPU, GPU, and NPU:
- llama-simple
- llama-run
- llama-cli
- llama-server
- llama-bench
- llama-cli
- llama-completion
- llama-perplexity
- llama-server
- llama-simple

## Work in Progress


@@ -365,13 +365,13 @@ Java_com_arm_aichat_internal_InferenceEngineImpl_processSystemPrompt(
const auto *system_prompt = env->GetStringUTFChars(jsystem_prompt, nullptr);
LOGd("%s: System prompt received: \n%s", __func__, system_prompt);
std::string formatted_system_prompt(system_prompt);
env->ReleaseStringUTFChars(jsystem_prompt, system_prompt);

// Format system prompt if applicable
const bool has_chat_template = common_chat_templates_was_explicit(g_chat_templates.get());
if (has_chat_template) {
formatted_system_prompt = chat_add_and_format(ROLE_SYSTEM, system_prompt);
}
env->ReleaseStringUTFChars(jsystem_prompt, system_prompt);

// Tokenize system prompt
const auto system_tokens = common_tokenize(g_context, formatted_system_prompt,
@@ -414,13 +414,13 @@ Java_com_arm_aichat_internal_InferenceEngineImpl_processUserPrompt(
const auto *const user_prompt = env->GetStringUTFChars(juser_prompt, nullptr);
LOGd("%s: User prompt received: \n%s", __func__, user_prompt);
std::string formatted_user_prompt(user_prompt);
env->ReleaseStringUTFChars(juser_prompt, user_prompt);

// Format user prompt if applicable
const bool has_chat_template = common_chat_templates_was_explicit(g_chat_templates.get());
if (has_chat_template) {
formatted_user_prompt = chat_add_and_format(ROLE_USER, user_prompt);
}
env->ReleaseStringUTFChars(juser_prompt, user_prompt);

// Decode formatted user prompts
auto user_tokens = common_tokenize(g_context, formatted_user_prompt, has_chat_template, has_chat_template);

@@ -77,6 +77,7 @@ extern "C" {
};

GGML_API struct gguf_context * gguf_init_empty(void);
GGML_API struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_params params);
GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
//GGML_API struct gguf_context * gguf_init_from_buffer(..);

@@ -189,6 +190,7 @@ extern "C" {
//

// write the entire context to a binary file
GGML_API bool gguf_write_to_file_ptr(const struct gguf_context * ctx, FILE * file, bool only_meta);
GGML_API bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta);

// get the size in bytes of the meta data (header, kv pairs, tensor info) including padding

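The new `_ptr` variants take an already-open `FILE *` instead of a path, which helps when the caller owns the stream (for example, one created from a file descriptor). A minimal usage sketch, assuming the `gguf.h` declarations above and the two-field `gguf_init_params` layout from ggml (error handling trimmed):

```cpp
#include <cstdio>
#include "gguf.h"

// Read GGUF metadata from one open stream and write it to another.
// Both FILE handles stay owned by the caller.
static int copy_gguf_meta(FILE * in, FILE * out) {
    struct gguf_init_params params = { /*no_alloc =*/ true, /*ctx =*/ NULL };

    struct gguf_context * ctx = gguf_init_from_file_ptr(in, params);
    if (!ctx) {
        return 1; // not a valid GGUF stream
    }

    const bool ok = gguf_write_to_file_ptr(ctx, out, /*only_meta =*/ true);
    gguf_free(ctx);
    return ok ? 0 : 1;
}
```
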
@@ -773,6 +773,5 @@ inline bool ggml_check_edges(const struct ggml_cgraph * cgraph,

// expose GGUF internals for test code
GGML_API size_t gguf_type_size(enum gguf_type type);
GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params);
GGML_API void gguf_write_to_buf(const struct gguf_context * ctx, std::vector<int8_t> & buf, bool only_meta);
#endif // __cplusplus

@@ -56,7 +56,7 @@ void ggml_sycl_add_id(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
float* dst_d = (float*)dst->data;

const unsigned int max_work_group_size = ggml_sycl_info().max_work_group_sizes[ctx.device];
assert(work_group_size % (WARP_SIZE * WARP_SIZE) == 0);
GGML_ASSERT(max_work_group_size % (WARP_SIZE * WARP_SIZE) == 0);

int threads = std::min((unsigned int)ne00, max_work_group_size); // cols


@@ -394,7 +394,11 @@ bool gguf_read_emplace_helper(const struct gguf_reader & gr, std::vector<struct
return true;
}

struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params) {
struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_params params) {
if (!file) {
return nullptr;
}

const struct gguf_reader gr(file);
struct gguf_context * ctx = new gguf_context;

@@ -848,7 +852,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
return nullptr;
}

struct gguf_context * result = gguf_init_from_file_impl(file, params);
struct gguf_context * result = gguf_init_from_file_ptr(file, params);
fclose(file);
return result;
}
@@ -1508,6 +1512,19 @@ void gguf_write_to_buf(const struct gguf_context * ctx, std::vector<int8_t> & bu
gguf_write_out(ctx, gw, only_meta);
}

bool gguf_write_to_file_ptr(const struct gguf_context * ctx, FILE * file, bool only_meta) {
GGML_ASSERT(file);

try {
gguf_writer_file gw(file);
gguf_write_out(ctx, gw, only_meta);
} catch (const std::runtime_error& ex) {
GGML_LOG_ERROR("%s: failed to write GGUF data: %s\n", __func__, ex.what());
return false;
}
return true;
}

bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) {
FILE * file = ggml_fopen(fname, "wb");

@@ -1516,17 +1533,13 @@ bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, boo
return false;
}

try {
gguf_writer_file gw(file);
gguf_write_out(ctx, gw, only_meta);
} catch (const std::runtime_error& ex) {
GGML_LOG_ERROR("%s: failed to write GGUF data into '%s': %s\n", __func__, fname, ex.what());
fclose(file);
return false;
const bool success = gguf_write_to_file_ptr(ctx, file, only_meta);
if (!success) {
GGML_LOG_ERROR("%s: failed to write GGUF data into '%s'\n", __func__, fname);
}

fclose(file);
return true;
return success;
}

size_t gguf_get_meta_size(const struct gguf_context * ctx) {

@@ -465,6 +465,11 @@ extern "C" {
const char * path_model,
struct llama_model_params params);

// Load a model from an open FILE pointer
LLAMA_API struct llama_model * llama_model_load_from_file_ptr(
FILE * file,
struct llama_model_params params);

// Load a model from multiple splits (support custom naming scheme)
// The paths must be in the correct order
LLAMA_API struct llama_model * llama_model_load_from_splits(

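A minimal sketch of the new loader entry point, assuming default model parameters. The stream stays owned by the caller; the `llama_file` change further below wraps caller-provided handles in a non-owning way, so this sketch closes the handle only after the model is freed:

```cpp
#include <cstdio>
#include "llama.h"

int main() {
    FILE * f = std::fopen("model.gguf", "rb"); // caller owns this handle
    if (!f) {
        return 1;
    }

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file_ptr(f, mparams);

    if (model) {
        // ... create a context, run inference, etc. ...
        llama_model_free(model);
    }

    std::fclose(f); // the library does not close a FILE it did not open
    return model ? 0 : 1;
}
```
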
@@ -1,3 +1,3 @@
docstring_parser~=0.15
pydantic~=2.11.7
requests
requests~=2.32.3

@@ -2,37 +2,51 @@

import sys
from collections import defaultdict
import re


def parse_log_file(filepath):
"""Parse log file and extract function VGPR usage."""
import re

functions = defaultdict(lambda: {'vgprs': 0, 'spill': 0, 'location': ''})
func_stack = []

try:
with open(filepath, 'r') as f:
content = f.read()
# Find all function entries with VGPR usage including location
pattern = r'([^:]+:\d+):.*?Function Name: (\S+).*?VGPRs: (\d+).*?VGPRs Spill: (\d+)'
matches = re.findall(pattern, content, re.DOTALL)
for line in f:
# Match function name lines
func_match = re.search(r'remark: ([^:]+):(\d+):\d+: Function Name: (\S+)', line)
if func_match:
location = func_match.group(1) + ':' + func_match.group(2)
func_name = func_match.group(3)
# Extract just the filename and line number
parts = location.split('/')
short_location = parts[-1] if len(parts) > 0 else location
functions[func_name]['location'] = short_location
# Push function onto stack with its location
func_stack.append({'name': func_name, 'location': location})
continue

for location, func_name, vgprs, spill in matches:
functions[func_name]['vgprs'] = int(vgprs)
functions[func_name]['spill'] = int(spill)
# Extract just the filename and line number
parts = location.split('/')
if len(parts) > 0:
short_location = parts[-1] # Get last part (filename)
# Check if there's a line number after filename
if ':' in short_location:
functions[func_name]['location'] = short_location
else:
functions[func_name]['location'] = location
else:
functions[func_name]['location'] = location
# Match VGPR usage lines (only if we have functions in stack)
vgpr_match = re.search(r'remark: ([^:]+):(\d+):\d+:\s+VGPRs: (\d+)', line)
if vgpr_match:
location = vgpr_match.group(1) + ':' + vgpr_match.group(2)
# Find the most recent function with matching location
for i in range(len(func_stack) - 1, -1, -1):
if func_stack[i]['location'] == location:
functions[func_stack[i]['name']]['vgprs'] = int(vgpr_match.group(3))
break
continue

spill_match = re.search(r'remark: ([^:]+):(\d+):\d+:\s+VGPRs Spill: (\d+)', line)
if spill_match:
location = spill_match.group(1) + ':' + spill_match.group(2)
# Find the most recent function with matching location
for i in range(len(func_stack) - 1, -1, -1):
if func_stack[i]['location'] == location:
functions[func_stack[i]['name']]['spill'] = int(spill_match.group(3))
break
continue
except FileNotFoundError:
print(f"Error: File {filepath} not found", file=sys.stderr) # noqa: NP100
print(f"Error: File {filepath} not found", file=sys.stderr)  # noqa: NP100
sys.exit(1)

return functions
@@ -40,7 +54,7 @@ def parse_log_file(filepath):

def main():
if len(sys.argv) < 2:
print("Usage: ./vgpr_check.py <log_file>", file=sys.stderr) # noqa: NP100
print("Usage: ./vgpr_check.py <log_file>", file=sys.stderr)  # noqa: NP100
sys.exit(1)

log_file = sys.argv[1]
@@ -123,6 +137,9 @@ def main():
'_ZL18flash_attn_ext_f16ILi128ELi128ELi32ELi2ELb1ELb0EEvPKcS1_S1_S1_S1_PKiPfP15HIP_vector_typeIfLj2EEffffjfiS5_IjLj3EEiiiiiiiiiiiliiliiiiil',
'_ZL18flash_attn_ext_f16ILi128ELi128ELi4ELi8ELb1ELb0EEvPKcS1_S1_S1_S1_PKiPfP15HIP_vector_typeIfLj2EEffffjfiS5_IjLj3EEiiiiiiiiiiiliiliiiiil',
'_ZL18flash_attn_ext_f16ILi96ELi96ELi4ELi8ELb0ELb0EEvPKcS1_S1_S1_S1_PKiPfP15HIP_vector_typeIfLj2EEffffjfiS5_IjLj3EEiiiiiiiiiiiliiliiiiil',
'_ZL18flash_attn_ext_vecILi128ELi2EL9ggml_type2ELS0_2ELb0EEvPKcS2_S2_S2_S2_PKiPfP15HIP_vector_typeIfLj2EEffffjfiS6_IjLj3EEiiiiiiiiiiiliiliiiiil',
'_ZL9mul_mat_qIL9ggml_type10ELi16ELb1EEvPKcPKiS4_S4_PfS5_iiiiiiiiiiiiiiiii',
'_ZL9mul_mat_qIL9ggml_type12ELi128ELb1EEvPKcPKiS4_S4_PfS5_iiiiiiiiiiiiiiiii'
}

functions = parse_log_file(log_file)
@@ -134,7 +151,7 @@ def main():
total_vgprs = int(data['vgprs']) + int(data['spill'])
if total_vgprs > 256 and func_name in ignored and func_name not in printed_ignored:
location = data.get('location', log_file)
print(f"{location}: {func_name} - Total VGPRs: {total_vgprs} ({data['vgprs']} + {data['spill']}) [IGNORED]") # noqa: NP100
print(f"{location}: {func_name} - Total VGPRs: {total_vgprs} ({data['vgprs']} + {data['spill']}) [IGNORED]")  # noqa: NP100
printed_ignored.add(func_name)

# Then print new functions with issues in red
@@ -146,7 +163,7 @@ def main():
# Print in red if not ignored
color_code = "\033[91m" if func_name not in ignored else ""
reset_code = "\033[0m" if func_name not in ignored else ""
print(f"{color_code}{location}: {func_name} - Total VGPRs: {total_vgprs} ({data['vgprs']} + {data['spill']}) {status}{reset_code}") # noqa: NP100
print(f"{color_code}{location}: {func_name} - Total VGPRs: {total_vgprs} ({data['vgprs']} + {data['spill']}) {status}{reset_code}")  # noqa: NP100
if func_name not in ignored:
found_issues = True


@@ -20,6 +20,14 @@ if ($null -ne $env:V) {
$env:GGML_HEXAGON_VERBOSE=$env:V
}

if ($null -ne $env:E) {
$env:GGML_HEXAGON_EXPERIMENTAL=$env:E
}

if ($null -ne $env:PROF) {
$env:GGML_HEXAGON_PROFILE=$env:PROF; $env:GGML_HEXAGON_OPSYNC=1
}

if ($null -ne $env:OPMASK) {
$env:GGML_HEXAGON_OPMASK=$env:OPMASK
}
@@ -32,6 +40,10 @@ if ($null -ne $env:NDEV) {
$env:GGML_HEXAGON_NDEV=$env:NDEV
}

if ($null -ne $env:HB) {
$env:GGML_HEXAGON_HOSTBUF=$env:HB
}

$env:ADSP_LIBRARY_PATH="$basedir\lib"

& "$basedir\bin\llama-bench.exe" `

@@ -44,10 +44,14 @@ if ($null -ne $env:NDEV) {
$env:GGML_HEXAGON_NDEV=$env:NDEV
}

if ($null -ne $env:HB) {
$env:GGML_HEXAGON_HOSTBUF=$env:HB
}

$env:ADSP_LIBRARY_PATH="$basedir\lib"

& "$basedir\bin\llama-cli.exe" `
--no-mmap -m $basedir\..\..\gguf\$model `
--poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 `
--ctx-size 8192 --ubatch-size 128 -fa on `
--ctx-size 8192 --ubatch-size 256 -fa on `
-ngl 99 --device $device $cli_opts

@@ -44,10 +44,14 @@ if ($null -ne $env:NDEV) {
$env:GGML_HEXAGON_NDEV=$env:NDEV
}

if ($null -ne $env:HB) {
$env:GGML_HEXAGON_HOSTBUF=$env:HB
}

$env:ADSP_LIBRARY_PATH="$basedir\lib"

& "$basedir\bin\llama-completion.exe" `
--no-mmap -m $basedir\..\..\gguf\$model `
--poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 `
--ctx-size 8192 --batch-size 128 -fa on `
--ctx-size 8192 --batch-size 256 -fa on `
-ngl 99 -no-cnv --device $device $cli_opts

74 scripts/snapdragon/windows/run-mtmd.ps1 (new file)
@@ -0,0 +1,74 @@
#!/usr/bin/env pwsh

# Basedir on device
$basedir=".\pkg-snapdragon"

$cli_opts=$args

$model="gemma-3-4b-it-Q4_0.gguf"
if ($null -ne $env:M) {
$model=$env:M
}

$mmproj="mmproj-F16.gguf"
if ($null -ne $env:MMPROJ) {
$mmproj=$env:MMPROJ
}

$image=""
if ($null -ne $env:IMG) {
$image=$env:IMG
}

$device="HTP0"
if ($null -ne $env:D) {
$device=$env:D
}

if ($null -ne $env:V) {
$env:GGML_HEXAGON_VERBOSE=$env:V
}

# Default experimental to 1
$env:GGML_HEXAGON_EXPERIMENTAL=1
if ($null -ne $env:E) {
$env:GGML_HEXAGON_EXPERIMENTAL=$env:E
}

if ($null -ne $env:SCHED) {
$env:GGML_SCHED_DEBUG=$env:SCHED; $cli_opts="$cli_opts -v"
}

if ($null -ne $env:PROF) {
$env:GGML_HEXAGON_PROFILE=$env:PROF; $env:GGML_HEXAGON_OPSYNC=1
}

if ($null -ne $env:OPMASK) {
$env:GGML_HEXAGON_OPMASK=$env:OPMASK
}

if ($null -ne $env:NHVX) {
$env:GGML_HEXAGON_NHVX=$env:NHVX
}

if ($null -ne $env:NDEV) {
$env:GGML_HEXAGON_NDEV=$env:NDEV
}

if ($null -ne $env:HB) {
$env:GGML_HEXAGON_HOSTBUF=$env:HB
}

if ($null -ne $env:MTMD_DEVICE) {
$env:MTMD_BACKEND_DEVICE=$env:MTMD_DEVICE
}

$env:ADSP_LIBRARY_PATH="$basedir\lib"

& "$basedir\bin\llama-mtmd-cli.exe" `
--no-mmap -m $basedir\..\..\gguf\$model `
--mmproj $basedir\..\..\gguf\$mmproj `
--image $basedir\..\..\gguf\$image `
--poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 `
--ctx-size 8192 --ubatch-size 256 -fa on `
-ngl 99 --device $device -v $cli_opts
@@ -50,6 +50,10 @@ if ($null -ne $env:NDEV) {
$env:GGML_HEXAGON_NDEV=$env:NDEV
}

if ($null -ne $env:HB) {
$env:GGML_HEXAGON_HOSTBUF=$env:HB
}

$env:ADSP_LIBRARY_PATH="$basedir\lib"

& "$basedir\bin\$tool" `

@@ -544,6 +544,10 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
case LLM_ARCH_CLIP:
return {};
case LLM_ARCH_LLAMA:
case LLM_ARCH_REFACT:
case LLM_ARCH_MINICPM:
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_DECI:
case LLM_ARCH_MISTRAL3:
case LLM_ARCH_LLAMA_EMBED:
@@ -744,11 +748,9 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_ATTN_Q_NORM,
LLM_TENSOR_ATTN_K_NORM,
};
case LLM_ARCH_REFACT:
case LLM_ARCH_QWEN2:
case LLM_ARCH_QWEN2VL:
case LLM_ARCH_INTERNLM2:
case LLM_ARCH_GRANITE:
case LLM_ARCH_ERNIE4_5:
case LLM_ARCH_PADDLEOCR:
case LLM_ARCH_SMOLLM3:
@@ -759,6 +761,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_OUTPUT,
LLM_TENSOR_ROPE_FREQS,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_Q,
LLM_TENSOR_ATTN_K,
@@ -1232,29 +1235,6 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP,
};
case LLM_ARCH_MINICPM:
return {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_OUTPUT,
LLM_TENSOR_ROPE_FREQS,
LLM_TENSOR_ROPE_FACTORS_LONG,
LLM_TENSOR_ROPE_FACTORS_SHORT,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_Q,
LLM_TENSOR_ATTN_K,
LLM_TENSOR_ATTN_V,
LLM_TENSOR_ATTN_OUT,
LLM_TENSOR_ATTN_ROT_EMBD,
LLM_TENSOR_FFN_GATE_INP,
LLM_TENSOR_FFN_NORM,
LLM_TENSOR_FFN_GATE,
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP,
LLM_TENSOR_FFN_GATE_EXP,
LLM_TENSOR_FFN_DOWN_EXP,
LLM_TENSOR_FFN_UP_EXP,
};
case LLM_ARCH_MINICPM3:
return {
LLM_TENSOR_TOKEN_EMBD,
@@ -1442,6 +1422,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_ROPE_FREQS,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_Q,
LLM_TENSOR_ATTN_K,
@@ -1657,7 +1638,9 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_ROPE_FREQS,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_OUTPUT,
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_QKV,
LLM_TENSOR_ATTN_Q,
LLM_TENSOR_ATTN_K,
LLM_TENSOR_ATTN_V,
@@ -2061,30 +2044,12 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP,
};
case LLM_ARCH_GRANITE_MOE:
return {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_OUTPUT,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_Q,
LLM_TENSOR_ATTN_K,
LLM_TENSOR_ATTN_V,
LLM_TENSOR_ATTN_OUT,
LLM_TENSOR_FFN_NORM,
LLM_TENSOR_FFN_GATE_INP,
LLM_TENSOR_FFN_GATE_EXPS,
LLM_TENSOR_FFN_DOWN_EXPS,
LLM_TENSOR_FFN_UP_EXPS,
LLM_TENSOR_FFN_GATE_SHEXP,
LLM_TENSOR_FFN_DOWN_SHEXP,
LLM_TENSOR_FFN_UP_SHEXP,
};
case LLM_ARCH_GRANITE_HYBRID:
return {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_OUTPUT,
LLM_TENSOR_ROPE_FREQS,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_SSM_IN,
LLM_TENSOR_SSM_CONV1D,
@@ -2412,6 +2377,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_OUTPUT,
LLM_TENSOR_ROPE_FREQS,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_QKV,
LLM_TENSOR_ATTN_OUT,
@@ -2789,7 +2755,12 @@ std::string LLM_TN_IMPL::str() const {
}

if (model_tensors.find(tensor) == model_tensors.end()) {
return LLM_TENSOR_NAMES.at(tensor);
const char * name = LLM_TENSOR_NAMES.at(tensor);
if (suffix != nullptr || bid != -1 || xid != -1) {
LLAMA_LOG_WARN("%s: cannot properly format tensor name %s with suffix=%s bid=%d xid=%d\n",
__func__, name, suffix, bid, xid);
}
return name;
}

std::string name = ::format(LLM_TENSOR_NAMES.at(tensor), bid, xid);

@@ -86,6 +86,14 @@ struct llama_file::impl {
        seek(0, SEEK_SET);
    }

    impl(FILE * file) : owns_fp(false) {
        fp = file;
        fp_win32 = (HANDLE) _get_osfhandle(_fileno(fp));
        seek(0, SEEK_END);
        size = tell();
        seek(0, SEEK_SET);
    }

    size_t tell() const {
        LARGE_INTEGER li;
        li.QuadPart = 0;
@@ -159,7 +167,7 @@ struct llama_file::impl {
    }

    ~impl() {
        if (fp) {
        if (fp && owns_fp) {
            std::fclose(fp);
        }
    }
@@ -209,6 +217,13 @@ struct llama_file::impl {
        seek(0, SEEK_SET);
    }

    impl(FILE * file) : fname("(file*)"), owns_fp(false) {
        fp = file;
        seek(0, SEEK_END);
        size = tell();
        seek(0, SEEK_SET);
    }

    size_t tell() const {
        if (fd == -1) {
            long ret = std::ftell(fp);
@@ -353,7 +368,7 @@ struct llama_file::impl {
    ~impl() {
        if (fd != -1) {
            close(fd);
        } else {
        } else if (owns_fp) {
            std::fclose(fp);
        }
    }
@@ -369,10 +384,14 @@ struct llama_file::impl {

    FILE * fp{};
    size_t size{};
    bool owns_fp = true;
};

llama_file::llama_file(const char * fname, const char * mode, const bool use_direct_io) :
    pimpl(std::make_unique<impl>(fname, mode, use_direct_io)) {}

llama_file::llama_file(FILE * file) : pimpl(std::make_unique<impl>(file)) {}

llama_file::~llama_file() = default;

size_t llama_file::tell() const { return pimpl->tell(); }

@@ -15,6 +15,7 @@ using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;

struct llama_file {
    llama_file(const char * fname, const char * mode, bool use_direct_io = false);
    llama_file(FILE * file);
    ~llama_file();

    size_t tell() const;
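
Taken together, these hunks give llama_file a non-owning mode: the FILE* constructor measures the stream size and sets owns_fp to false, so the destructor skips fclose(). A minimal sketch of the intended semantics, assuming the internal header that declares llama_file:

#include <cstdio>

#include "llama-mmap.h" // assumed location of the llama_file declaration

int main() {
    FILE * fp = std::fopen("model.gguf", "rb");
    if (fp == nullptr) {
        return 1;
    }
    {
        llama_file f(fp);  // non-owning: seeks to the end to measure size, then rewinds
        std::printf("position after construction: %zu\n", f.tell());
    }                      // destructor leaves fp open because owns_fp == false
    std::fclose(fp);       // the caller keeps ownership of the stream
    return 0;
}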
@@ -511,6 +511,7 @@ llama_model_loader::llama_model_loader(
        void * set_tensor_data_ud,
        const std::string & fname,
        std::vector<std::string> & splits,
        FILE * file,
        bool use_mmap,
        bool use_direct_io,
        bool check_tensors,
@@ -658,6 +659,36 @@ llama_model_loader::llama_model_loader(

            LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1);
        }
    } else if (file != nullptr) {
        struct ggml_context * ctx = NULL;
        struct gguf_init_params params = {
            /*.no_alloc = */ true,
            /*.ctx      = */ &ctx,
        };

        metadata_ptr.reset(gguf_init_from_file_ptr(file, params));
        metadata = metadata_ptr.get();
        if (metadata == nullptr) {
            throw std::runtime_error(format("%s: failed to load model from file pointer", __func__));
        }

        get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
        llm_kv = LLM_KV(llm_arch_from_string(arch_name));

        files.emplace_back(new llama_file(file));
        contexts.emplace_back(ctx);

        // Save tensors data offset info of the main file.
        for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
            std::string tensor_name = std::string(cur->name);
            // make sure there are no duplicated tensor names
            if (weights_map.find(tensor_name) != weights_map.end()) {
                throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
            }
            n_elements += ggml_nelements(cur);
            n_bytes    += ggml_nbytes(cur);
            weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, metadata, cur));
        }
    } else {
        get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
        llm_kv = LLM_KV(llm_arch_from_string(arch_name));
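
The new branch parses GGUF metadata straight from the supplied stream via gguf_init_from_file_ptr and then wraps the same FILE* in a non-owning llama_file for tensor reads. A standalone sketch of the metadata step, under the assumption that gguf_init_from_file_ptr mirrors gguf_init_from_file's semantics:

#include <cstdio>

#include "ggml.h"
#include "gguf.h"

// Reads only the metadata section of a GGUF stream (no tensor data is
// allocated because no_alloc is true) and prints the architecture key.
static bool print_arch_from_stream(FILE * file) {
    struct ggml_context * ctx = nullptr;
    struct gguf_init_params params = {
        /*.no_alloc = */ true,
        /*.ctx      = */ &ctx,
    };
    struct gguf_context * gguf = gguf_init_from_file_ptr(file, params); // added by this change (assumed)
    if (gguf == nullptr) {
        return false;
    }
    const int64_t kid = gguf_find_key(gguf, "general.architecture");
    if (kid >= 0) {
        std::printf("arch: %s\n", gguf_get_val_str(gguf, kid));
    }
    gguf_free(gguf);
    ggml_free(ctx);
    return true;
}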
@@ -669,7 +700,7 @@ llama_model_loader::llama_model_loader(
    fver = (enum llama_fver) gguf_get_version(metadata);

    LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
            __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver));
            __func__, n_kv, n_tensors, fname.empty() ? "(file*)" : fname.c_str(), llama_file_version_name(fver));

    // determine file type based on the number of tensors for each quantization and print meta data
    // TODO: make optional

@@ -125,6 +125,7 @@ struct llama_model_loader {
        void * set_tensor_data_ud,
        const std::string & fname,
        std::vector<std::string> & splits, // optional, only needed if the split does not follow the naming scheme
        FILE * file,
        bool use_mmap,
        bool use_direct_io,
        bool check_tensors,

@@ -1,7 +1,9 @@
#include "llama-model-saver.h"

#include "ggml.h"
#include "gguf.h"

#include "llama-arch.h"
#include "llama.h"
#include "llama-hparams.h"
#include "llama-model.h"
@@ -10,8 +12,33 @@
#include <cstdint>
#include <string>

bool llama_model_saver_supports_arch(llm_arch arch) {
    switch (arch) {
        case LLM_ARCH_QWEN3NEXT:
        case LLM_ARCH_QWEN35:
        case LLM_ARCH_QWEN35MOE:
        case LLM_ARCH_PLAMO3:
        case LLM_ARCH_GEMMA3:
        case LLM_ARCH_GEMMA3N:
        case LLM_ARCH_COHERE2:
        case LLM_ARCH_OLMO2:
        case LLM_ARCH_BITNET:
        case LLM_ARCH_T5:
        case LLM_ARCH_EXAONE_MOE:
        case LLM_ARCH_AFMOE:
        case LLM_ARCH_APERTUS:
        case LLM_ARCH_MIMO2:
        case LLM_ARCH_STEP35:
            return false;
        default:
            return true;
    }
}
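
A short caller-side sketch of how the helper can be used as a guard; only the call to llama_model_saver_supports_arch comes from this change, the rest is illustrative:

#include <cstdio>

#include "llama-model-saver.h" // internal header declaring the helper (assumed)

// Hypothetical guard before constructing a saver: failing early here avoids
// tripping the GGML_ASSERT added to the llama_model_saver constructor below.
static bool can_save(const llama_model * model) {
    if (!llama_model_saver_supports_arch(model->arch)) {
        std::fprintf(stderr, "saving this architecture is not supported yet\n");
        return false;
    }
    return true;
}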
llama_model_saver::llama_model_saver(const struct llama_model * model) :
    gguf_ctx(gguf_init_empty()), gguf_ctx_owned(true), model(model), llm_kv(model->arch) {}
    gguf_ctx(gguf_init_empty()), gguf_ctx_owned(true), model(model), llm_kv(model->arch) {
    GGML_ASSERT(llama_model_saver_supports_arch(model->arch));
}

llama_model_saver::llama_model_saver(enum llm_arch arch, struct gguf_context * gguf_ctx) :
    gguf_ctx(gguf_ctx == nullptr ? gguf_init_empty() : gguf_ctx), gguf_ctx_owned(gguf_ctx == nullptr), model(nullptr), llm_kv(arch) {}
@@ -105,7 +132,10 @@ void llama_model_saver::add_tensor(const struct ggml_tensor * tensor) {
        return;
    }
    if (gguf_find_tensor(gguf_ctx, tensor->name) >= 0) {
        GGML_ASSERT(std::string(tensor->name) == "rope_freqs.weight"); // FIXME
        const std::string tensor_name = tensor->name;
        GGML_ASSERT(
            tensor_name == "rope_freqs.weight" || tensor_name == "rope_factors_long.weight" ||
            tensor_name == "rope_factors_short.weight"); // FIXME
        return;
    }
    gguf_add_tensor(gguf_ctx, tensor);
@@ -127,6 +157,7 @@ void llama_model_saver::add_kv_from_model() {
        tokens[id] = token_data.text;
        scores[id] = token_data.score;

        // FIXME should this be treated as flags?
        switch(token_data.attr) {
            case LLAMA_TOKEN_ATTR_UNKNOWN:      token_types[id] = LLAMA_TOKEN_TYPE_UNKNOWN;      break;
            case LLAMA_TOKEN_ATTR_UNUSED:       token_types[id] = LLAMA_TOKEN_TYPE_UNUSED;       break;
@@ -134,6 +165,9 @@ void llama_model_saver::add_kv_from_model() {
            case LLAMA_TOKEN_ATTR_CONTROL:      token_types[id] = LLAMA_TOKEN_TYPE_CONTROL;      break;
            case LLAMA_TOKEN_ATTR_USER_DEFINED: token_types[id] = LLAMA_TOKEN_TYPE_USER_DEFINED; break;
            case LLAMA_TOKEN_ATTR_BYTE:         token_types[id] = LLAMA_TOKEN_TYPE_BYTE;         break;
            // case LLAMA_TOKEN_ATTR_NORMALIZED: ???
            // case LLAMA_TOKEN_ATTR_LSTRIP: ???
            // case LLAMA_TOKEN_ATTR_RSTRIP: ???
            case LLAMA_TOKEN_ATTR_UNDEFINED:
            default:                            token_types[id] = LLAMA_TOKEN_TYPE_UNDEFINED;    break;
        }
@@ -144,6 +178,19 @@ void llama_model_saver::add_kv_from_model() {
    add_kv(LLM_KV_GENERAL_ARCHITECTURE, model->arch_name());
    // add_kv(LLM_KV_GENERAL_QUANTIZATION_VERSION, ???);
    // add_kv(LLM_KV_GENERAL_ALIGNMENT, ???);
    // add_kv(LLM_KV_GENERAL_FILE_TYPE, ???);
    // add_kv(LLM_KV_GENERAL_SAMPLING_SEQUENCE, ???);
    // add_kv(LLM_KV_GENERAL_SAMPLING_TOP_K, ???);
    // add_kv(LLM_KV_GENERAL_SAMPLING_TOP_P, ???);
    // add_kv(LLM_KV_GENERAL_SAMPLING_MIN_P, ???);
    // add_kv(LLM_KV_GENERAL_SAMPLING_XTC_PROBABILITY, ???);
    // add_kv(LLM_KV_GENERAL_SAMPLING_XTC_THRESHOLD, ???);
    // add_kv(LLM_KV_GENERAL_SAMPLING_TEMP, ???);
    // add_kv(LLM_KV_GENERAL_SAMPLING_PENALTY_LAST_N, ???);
    // add_kv(LLM_KV_GENERAL_SAMPLING_PENALTY_REPEAT, ???);
    // add_kv(LLM_KV_GENERAL_SAMPLING_MIROSTAT, ???);
    // add_kv(LLM_KV_GENERAL_SAMPLING_MIROSTAT_TAU, ???);
    // add_kv(LLM_KV_GENERAL_SAMPLING_MIROSTAT_ETA, ???);
    add_kv(LLM_KV_GENERAL_NAME, model->name);
    // add_kv(LLM_KV_GENERAL_AUTHOR, ???);
    // add_kv(LLM_KV_GENERAL_VERSION, ???);
@@ -163,17 +210,31 @@ void llama_model_saver::add_kv_from_model() {
    add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
    add_kv(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, true);
    add_kv(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
    add_kv(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
    add_kv(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
    add_kv(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_chexp);
    add_kv(LLM_KV_SWIGLU_CLAMP_EXP, hparams.swiglu_clamp_exp);
    add_kv(LLM_KV_SWIGLU_CLAMP_SHEXP, hparams.swiglu_clamp_shexp);
    add_kv(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
    // add_kv(LLM_KV_TENSOR_DATA_LAYOUT, ???);
    add_kv(LLM_KV_EXPERT_COUNT, hparams.n_expert);
    add_kv(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
    add_kv(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
    add_kv(LLM_KV_EXPERT_GROUP_COUNT, hparams.n_expert_groups);
    add_kv(LLM_KV_EXPERT_GROUP_USED_COUNT, hparams.n_group_used);
    add_kv(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
    add_kv(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm);
    add_kv(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
    add_kv(LLM_KV_EXPERT_GROUP_SCALE, hparams.expert_group_scale);
    add_kv(LLM_KV_EXPERTS_PER_GROUP, hparams.n_group_experts);
    add_kv(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers);
    add_kv(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers);
    add_kv(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers);
    add_kv(LLM_KV_POOLING_TYPE, uint32_t(hparams.pooling_type));
    add_kv(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
    add_kv(LLM_KV_DECODER_START_TOKEN_ID, hparams.dec_start_token_id);
    add_kv(LLM_KV_DECODER_BLOCK_COUNT, hparams.dec_n_layer);
    add_kv(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping);
    add_kv(LLM_KV_ROUTER_LOGIT_SOFTCAPPING, hparams.f_router_logit_softcapping);
    add_kv(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping);
    add_kv(LLM_KV_SWIN_NORM, hparams.swin_norm);
    add_kv(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers);
@@ -181,6 +242,9 @@ void llama_model_saver::add_kv_from_model() {
    add_kv(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim);
    add_kv(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
    add_kv(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
    add_kv(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count);
    add_kv(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
    // add_kv(LLM_KV_FULL_ATTENTION_INTERVAL, ???);

    add_kv(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, true);
    add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, true);
@@ -188,22 +252,39 @@ void llama_model_saver::add_kv_from_model() {
    add_kv(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
    add_kv(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k_full);
    add_kv(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v_full);
    add_kv(LLM_KV_ATTENTION_KEY_LENGTH_SWA, hparams.n_embd_head_k_swa);
    add_kv(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa);
    add_kv(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
    add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
    add_kv(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps);
    add_kv(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
    add_kv(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
    add_kv(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
    add_kv(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
    add_kv(LLM_KV_ATTENTION_DECAY_LORA_RANK, hparams.n_lora_decay);
    add_kv(LLM_KV_ATTENTION_ICLR_LORA_RANK, hparams.n_lora_iclr);
    add_kv(LLM_KV_ATTENTION_VALUE_RESIDUAL_MIX_LORA_RANK, hparams.n_lora_value_res_mix);
    add_kv(LLM_KV_ATTENTION_GATE_LORA_RANK, hparams.n_lora_gate);
    add_kv(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
    add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
    // add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, ???);
    add_kv(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
    add_kv(LLM_KV_ATTENTION_OUTPUT_SCALE, hparams.f_attn_out_scale);
    add_kv(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.attn_temp_length);
    add_kv(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale);
    add_kv(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla_impl);
    add_kv(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla_impl);
    add_kv(LLM_KV_ATTENTION_KEY_LENGTH_SWA, hparams.n_embd_head_k_swa);
    add_kv(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa);
    add_kv(LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, hparams.indexer_n_head);
    add_kv(LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, hparams.indexer_head_size);
    add_kv(LLM_KV_ATTENTION_INDEXER_TOP_K, hparams.indexer_top_k);

    const float rope_scaling_factor = hparams.rope_freq_scale_train == 1.0f ? 0.0f : 1.0f/hparams.rope_freq_scale_train;

    add_kv(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot_full);
    add_kv(LLM_KV_ROPE_DIMENSION_COUNT_SWA, hparams.n_rot_swa);
    add_kv(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections);
    add_kv(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train);
    add_kv(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
    // add_kv(LLM_KV_ROPE_SCALE_LINEAR, rope_scaling_factor); // old name
    add_kv(LLM_KV_ROPE_SCALING_TYPE, llama_rope_scaling_type_name(hparams.rope_scaling_type_train));
    add_kv(LLM_KV_ROPE_SCALING_FACTOR, rope_scaling_factor);
@@ -211,6 +292,10 @@ void llama_model_saver::add_kv_from_model() {
    add_kv(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn);
    add_kv(LLM_KV_ROPE_SCALING_FINETUNED, hparams.rope_finetuned);
    add_kv(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
    add_kv(LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR, hparams.yarn_ext_factor);
    add_kv(LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, hparams.yarn_attn_factor);
    add_kv(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast);
    add_kv(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow);

    // TODO: implement split file support
    // add_kv(LLM_KV_SPLIT_NO, ???);
@@ -221,8 +306,11 @@ void llama_model_saver::add_kv_from_model() {
    add_kv(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
    add_kv(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
    add_kv(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
    add_kv(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
    add_kv(LLM_KV_SSM_DT_B_C_RMS, hparams.ssm_dt_b_c_rms);

    add_kv(LLM_KV_KDA_HEAD_DIM, hparams.n_embd_head_kda);

    add_kv(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);

    add_kv(LLM_KV_TOKENIZER_MODEL, vocab.get_tokenizer_model());
@@ -260,15 +348,39 @@ void llama_model_saver::add_kv_from_model() {
    // TODO: implement LoRA support
    // add_kv(LLM_KV_ADAPTER_TYPE, ???);
    // add_kv(LLM_KV_ADAPTER_LORA_ALPHA, ???);
    // add_kv(LLM_KV_ADAPTER_LORA_TASK_NAME, ???);
    // add_kv(LLM_KV_ADAPTER_LORA_PROMPT_PREFIX, ???);
    // add_kv(LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS, ???);

    add_kv(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd);
    add_kv(LLM_KV_POSNET_BLOCK_COUNT, hparams.posnet.n_layer);

    add_kv(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, hparams.convnext.n_embd);
    add_kv(LLM_KV_CONVNEXT_BLOCK_COUNT, hparams.convnext.n_layer);

    add_kv(LLM_KV_CLASSIFIER_OUTPUT_LABELS, model->classifier_labels);

    add_kv(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);

    add_kv(LLM_KV_XIELU_ALPHA_N, hparams.xielu_alpha_n);
    add_kv(LLM_KV_XIELU_ALPHA_P, hparams.xielu_alpha_p);
    add_kv(LLM_KV_XIELU_BETA, hparams.xielu_beta);
    add_kv(LLM_KV_XIELU_EPS, hparams.xielu_eps);

    // deprecated
    // add_kv(LLM_KV_TOKENIZER_PREFIX_ID, ???);
    // add_kv(LLM_KV_TOKENIZER_SUFFIX_ID, ???);
    // add_kv(LLM_KV_TOKENIZER_MIDDLE_ID, ???);

    add_kv(LLM_KV_DENSE_2_FEAT_IN, hparams.dense_2_feat_in);
    add_kv(LLM_KV_DENSE_2_FEAT_OUT, hparams.dense_2_feat_out);
    add_kv(LLM_KV_DENSE_3_FEAT_IN, hparams.dense_3_feat_in);
    add_kv(LLM_KV_DENSE_3_FEAT_OUT, hparams.dense_3_feat_out);
}

void llama_model_saver::add_tensors_from_model() {
    if (std::string(model->output->name) != std::string(model->tok_embd->name)) {
    if (model->output != nullptr &&
        std::string(model->output->name) != std::string(model->tok_embd->name)) {
        add_tensor(model->tok_embd); // some models use the same tensor for tok_embd and output
    }
    add_tensor(model->type_embd);
@@ -297,3 +409,6 @@ void llama_model_saver::save(const std::string & path_model) {
    gguf_write_to_file(gguf_ctx, path_model.c_str(), false);
}

void llama_model_saver::save(FILE * file) {
    gguf_write_to_file_ptr(gguf_ctx, file, false);
}
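
With the new save(FILE *) overload a model can be serialized to an open stream and read back without a named file. A sketch of the round trip, closely following the test code later in this change (note that tmpfile may return NULL on Windows without administrator privileges):

#include <cstdio>

#include "llama.h"
#include "llama-model-saver.h" // internal header (assumed include path)

// Round-trips a loaded model through a temporary stream using the new
// save(FILE *) overload and llama_model_load_from_file_ptr.
static llama_model * roundtrip(const llama_model * model) {
    FILE * file = tmpfile();
    if (file == nullptr) {
        return nullptr;
    }
    llama_model_saver saver(model); // asserts llama_model_saver_supports_arch(model->arch)
    saver.add_kv_from_model();
    saver.add_tensors_from_model();
    saver.save(file);
    rewind(file);

    // the loader does not take ownership of the stream: keep it open while
    // the model is in use and close it yourself afterwards
    return llama_model_load_from_file_ptr(file, llama_model_default_params());
}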
@@ -6,6 +6,9 @@

#include <vector>

// FIXME temporary function for better error messages
bool llama_model_saver_supports_arch(llm_arch arch);

struct llama_model_saver {
    struct gguf_context * gguf_ctx = nullptr;
    const bool gguf_ctx_owned;
@@ -37,4 +40,5 @@ struct llama_model_saver {
    void add_tensors_from_model();

    void save(const std::string & path_model);
    void save(FILE * file);
};
@@ -370,6 +370,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
    ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
    ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out_impl, false);
    ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
    ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
    ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
    ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
    ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
@@ -748,8 +750,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
        case LLM_ARCH_BERT:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);

                switch (hparams.n_layer) {
                    case 3:
@@ -781,8 +781,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
            }

                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);

                switch (hparams.n_layer) {
                    case 12:
@@ -797,8 +795,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
        case LLM_ARCH_JINA_BERT_V2:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
                hparams.f_max_alibi_bias = 8.0f;

                switch (hparams.n_layer) {
@@ -810,8 +806,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
        case LLM_ARCH_JINA_BERT_V3:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);

                switch (hparams.n_layer) {
                    case 24:
@@ -823,8 +817,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
        case LLM_ARCH_NOMIC_BERT_MOE:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
                ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0);

                if (hparams.n_layer == 12 && hparams.n_embd == 768) {
@@ -838,8 +830,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
        case LLM_ARCH_NEO_BERT:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);

                if (hparams.n_layer == 28) {
                    type = LLM_TYPE_250M;
@@ -848,8 +838,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
        case LLM_ARCH_EUROBERT:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);

                if (hparams.n_layer == 12) {
                    type = LLM_TYPE_SMALL; // 0.2B
@@ -913,7 +901,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
            // fall through
        case LLM_ARCH_QWEN2:
            {
                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break;
@@ -995,7 +982,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
            } break;
        case LLM_ARCH_QWEN3:
            {
                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
@@ -1287,7 +1273,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);

                //applied only if model converted with --sentence-transformers-dense-modules
                ml.get_key(LLM_KV_DENSE_2_FEAT_IN, hparams.dense_2_feat_in, false);
@@ -1624,7 +1609,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {

                // (optional) temperature tuning - used by mistral-large
                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.n_attn_temp_floor_scale, false);
                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.n_attn_temp_floor_scale, false); // FIXME why not use temperature_length?

                hparams.f_attn_temp_offset = 0.0f;

@@ -2084,7 +2069,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps);
                ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
            } break;
        case LLM_ARCH_BAILINGMOE:
            {
@@ -7607,14 +7591,15 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                buf_map.emplace(idx, buf);
            }
        }
        pimpl->ctxs_bufs.emplace_back(std::move(ctx_ptr), std::move(bufs));

        for (auto & buf : buf_map) {
        for (auto & buf : bufs) {
            // indicate that this buffer contains weights
            // this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
            ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
            ggml_backend_buffer_set_usage(buf.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
        }

        pimpl->ctxs_bufs.emplace_back(std::move(ctx_ptr), std::move(bufs));

        ctx_buf_maps.emplace_back(ctx, buf_map);
    }
@@ -859,7 +859,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out,

    std::vector<std::string> splits = {};
    llama_model_loader ml(/*metadata*/ nullptr, /*set_tensor_data*/ nullptr, /*set_tensor_data_ud*/ nullptr,
        fname_inp, splits, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
        fname_inp, splits, /*file*/ nullptr, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
    ml.init_mappings(false); // no prefetching

    llama_model model(llama_model_default_params());

@@ -1952,7 +1952,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
        } else if (
                tokenizer_pre == "qwen2" ||
                tokenizer_pre == "deepseek-r1-qwen" ||
                tokenizer_pre == "kormo") {
                tokenizer_pre == "kormo" ||
                tokenizer_pre == "f2llmv2") {
            pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
            clean_spaces = false;
        } else if (

@@ -828,7 +828,7 @@ int64_t llama_time_us(void) {

// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
static int llama_model_load(struct gguf_context * metadata, llama_model_set_tensor_data_t set_tensor_data, void * set_tensor_data_ud,
        const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
        const std::string & fname, std::vector<std::string> & splits, FILE * file, llama_model & model, llama_model_params & params) {
    // loading time will be recalculated after the first eval, so
    // we take page faults deferred by mmap() into consideration
    model.t_load_us = 0;
@@ -837,7 +837,7 @@ static int llama_model_load(struct gguf_context * metadata, llama_model_set_tensor_data_t set_tensor_data,
    model.t_start_us = tm.t_start_us;

    try {
        llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, params.use_mmap, params.use_direct_io,
        llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, file, params.use_mmap, params.use_direct_io,
            params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);

        ml.print_info();
@@ -889,8 +889,24 @@ static struct llama_model * llama_model_load_from_file_impl(
        void * set_tensor_data_ud,
        const std::string & path_model,
        std::vector<std::string> & splits,
        FILE * file,
        struct llama_model_params params) {
    GGML_ASSERT((metadata == nullptr) != path_model.empty() && "exactly one out of metadata and path_model needs to be defined");
    {
        int n_sources_defined = 0;
        if (metadata != nullptr) {
            n_sources_defined++;
        }
        if (!path_model.empty()) {
            n_sources_defined++;
        }
        if (file != nullptr) {
            n_sources_defined++;
        }
        if (n_sources_defined != 1) {
            LLAMA_LOG_ERROR("%s: exactly one of metadata, path_model, and file must be defined\n", __func__);
            return nullptr;
        }
    }
    ggml_time_init();

    if (!params.vocab_only && ggml_backend_reg_count() == 0) {
@@ -1011,7 +1027,7 @@ static struct llama_model * llama_model_load_from_file_impl(
            props.memory_free/1024/1024);
    }

    const int status = llama_model_load(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, *model, params);
    const int status = llama_model_load(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, file, *model, params);
    GGML_ASSERT(status <= 0);
    if (status < 0) {
        if (status == -1) {
@@ -1037,7 +1053,7 @@ struct llama_model * llama_model_init_from_user(
    std::vector<std::string> splits = {};
    params.use_mmap = false;
    params.use_extra_bufts = false;
    return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, params);
    return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, /*file*/ nullptr, params);
}
// deprecated
struct llama_model * llama_load_model_from_file(
@@ -1050,7 +1066,7 @@ struct llama_model * llama_model_load_from_file(
        const char * path_model,
        struct llama_model_params params) {
    std::vector<std::string> splits = {};
    return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, params);
    return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, /*file*/ nullptr, params);
}

struct llama_model * llama_model_load_from_splits(
@@ -1066,7 +1082,17 @@ struct llama_model * llama_model_load_from_splits(
    for (size_t i = 0; i < n_paths; ++i) {
        splits.push_back(paths[i]);
    }
    return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, splits.front(), splits, params);
    return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, splits.front(), splits, /*file*/ nullptr, params);
}

struct llama_model * llama_model_load_from_file_ptr(FILE * file, struct llama_model_params params) {
    if (!file) {
        LLAMA_LOG_ERROR("%s: file is NULL\n", __func__);
        return nullptr;
    }
    std::string path_model;
    std::vector<std::string> splits = {};
    return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, file, params);
}
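
A minimal caller sketch for the new entry point; the declaration is assumed to be exported from llama.h by this change:

#include <cstdio>

#include "llama.h"

int main() {
    FILE * file = std::fopen("model.gguf", "rb");
    if (file == nullptr) {
        return 1;
    }

    llama_model_params params = llama_model_default_params();
    llama_model * model = llama_model_load_from_file_ptr(file, params); // new in this change

    if (model == nullptr) {
        std::fclose(file);
        return 1;
    }

    // ... create a context and run inference as usual ...

    llama_model_free(model);
    std::fclose(file); // ownership of the stream stays with the caller
    return 0;
}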
void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {

@@ -742,7 +742,7 @@ static std::pair<int, int> test_handcrafted_file(const unsigned int seed) {
            /*ctx =*/ hft >= offset_has_data ? &ctx : nullptr,
        };

        struct gguf_context * gguf_ctx = gguf_init_from_file_impl(file, gguf_params);
        struct gguf_context * gguf_ctx = gguf_init_from_file_ptr(file, gguf_params);

        if (expect_context_not_null(hft)) {
            printf("%s: - context_not_null: ", __func__);
@@ -1125,19 +1125,15 @@ static std::pair<int, int> test_roundtrip(ggml_backend_dev_t dev, const unsigned int seed, ...) {
    GGML_ASSERT(file);
#endif // _WIN32

    {
        std::vector<int8_t> buf;
        gguf_write_to_buf(gguf_ctx_0, buf, only_meta);
        GGML_ASSERT(fwrite(buf.data(), 1, buf.size(), file) == buf.size());
        rewind(file);
    }
    gguf_write_to_file_ptr(gguf_ctx_0, file, only_meta);
    rewind(file);

    struct ggml_context * ctx_1 = nullptr;
    struct gguf_init_params gguf_params = {
        /*no_alloc =*/ false,
        /*ctx      =*/ only_meta ? nullptr : &ctx_1,
    };
    struct gguf_context * gguf_ctx_1 = gguf_init_from_file_impl(file, gguf_params);
    struct gguf_context * gguf_ctx_1 = gguf_init_from_file_ptr(file, gguf_params);

    printf("%s: same_version: ", __func__);
    if (gguf_get_version(gguf_ctx_0) == gguf_get_version(gguf_ctx_1)) {
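
The same pair of stream helpers also allows a compact metadata round trip outside the test harness; a sketch, assuming gguf_write_to_file_ptr and gguf_init_from_file_ptr behave like their path-based counterparts:

#include <cstdio>

#include "ggml.h"
#include "gguf.h"

int main() {
    struct gguf_context * ctx_out = gguf_init_empty();
    gguf_set_val_u32(ctx_out, "example.answer", 42); // illustrative key

    FILE * file = tmpfile(); // may be NULL on Windows without administrator privileges
    if (file == nullptr) {
        gguf_free(ctx_out);
        return 1;
    }
    gguf_write_to_file_ptr(ctx_out, file, /*only_meta =*/ true); // stream-based writer (assumed)
    rewind(file);

    struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ nullptr };
    struct gguf_context * ctx_in = gguf_init_from_file_ptr(file, params);
    if (ctx_in != nullptr) {
        const int64_t kid = gguf_find_key(ctx_in, "example.answer");
        std::printf("answer: %u\n", kid >= 0 ? gguf_get_val_u32(ctx_in, kid) : 0);
        gguf_free(ctx_in);
    }
    gguf_free(ctx_out);
    std::fclose(file);
    return 0;
}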
@@ -884,6 +884,24 @@ static void test_macros(testing & t) {
        json::object(),
        "Hi Guest"
    );

    test_template(t, "macro kwargs input",
        "{% macro my_func(a, b=False) %}{% if b %}{{ a }}{% else %}nope{% endif %}{% endmacro %}{{ my_func(1, b=True) }}",
        json::object(),
        "1"
    );

    test_template(t, "macro with multiple args",
        "{% macro add(a, b, c=0) %}{{ a + b + c }}{% endmacro %}{{ add(1, 2) }},{{ add(1, 2, 3) }},{{ add(1, b=10) }},{{ add(1, 2, c=5) }}",
        json::object(),
        "3,6,11,8"
    );

    test_template(t, "macro with kwarg out-of-order input",
        "{% macro greet(first, last, greeting='Hello') %}{{ greeting }}, {{ first }} {{ last }}{% endmacro %}{{ greet(last='Smith', first='John') }},{{ greet(last='Doe', greeting='Hi', first='Jane') }}",
        json::object(),
        "Hello, John Smith,Hi, Jane Doe"
    );
}

static void test_namespace(testing & t) {
@@ -90,6 +90,7 @@ static gguf_context_ptr get_gguf_ctx(const llm_arch arch, const bool moe) {
        n_embd  = 64;
        n_head  = 1;
        n_ff    = 96;
        n_layer = 22; // hparams.n_layer_kv_from_start = 20 is hardcoded
    } else if (arch == LLM_ARCH_DEEPSEEK2
            || arch == LLM_ARCH_GLM_DSA
            || arch == LLM_ARCH_KIMI_LINEAR
@@ -101,8 +102,6 @@ static gguf_context_ptr get_gguf_ctx(const llm_arch arch, const bool moe) {
        n_layer = 3;
    } else if (arch == LLM_ARCH_CHAMELEON) {
        n_vocab = 10240;
    } else if (arch == LLM_ARCH_GEMMA3N) {
        n_layer = 22; // hparams.n_layer_kv_from_start = 20 is hardcoded
    }

    const uint32_t n_embd_head = n_embd / n_head;
@@ -231,9 +230,15 @@ static gguf_context_ptr get_gguf_ctx(const llm_arch arch, const bool moe) {
    return ret;
}

static bool silent_model_load_progress(float /*progress*/, void * /*user_data*/) {
    return true;
}

static std::pair<llama_model_ptr, llama_context_ptr> get_model_and_ctx(
        struct gguf_context * gguf_ctx, const size_t seed, const std::vector<ggml_backend_dev_t> & devs) {
        struct gguf_context * gguf_ctx, FILE * file, const size_t seed, const std::vector<ggml_backend_dev_t> & devs) {
    GGML_ASSERT((gguf_ctx == nullptr) != (file == nullptr));
    llama_model_params model_params = llama_model_default_params();
    model_params.progress_callback = silent_model_load_progress;
    std::vector<ggml_backend_dev_t> devs_copy = devs;
    devs_copy.push_back(nullptr);
    model_params.devices = devs_copy.data();
@@ -244,7 +249,9 @@ static std::pair<llama_model_ptr, llama_context_ptr> get_model_and_ctx(
    ctx_params.n_threads_batch = 4;

    size_t tmp = seed;
    llama_model_ptr model(llama_model_init_from_user(gguf_ctx, set_tensor_data, &tmp, model_params));
    llama_model_ptr model(gguf_ctx != nullptr ?
        llama_model_init_from_user(gguf_ctx, set_tensor_data, &tmp, model_params) :
        llama_model_load_from_file_ptr(file, model_params));
    if (!model) {
        throw std::runtime_error("failed to create llama model");
    }
@@ -351,7 +358,6 @@ static bool moe_implemented(const llm_arch arch) {
}

static int save_models(const llm_arch target_arch, const size_t seed, const ggml_log_level log_level, const std::string & dir) {
    GGML_ABORT("llama_model_save_to_file is broken");
    struct user_data_t {
        struct {
            ggml_log_callback callback;
@@ -376,6 +382,19 @@ static int save_models(const llm_arch target_arch, const size_t seed, const ggml_log_level log_level, ...) {
        if (arch == LLM_ARCH_CLIP || arch == LLM_ARCH_GPTJ || arch == LLM_ARCH_UNKNOWN) {
            continue; // These models don't have usable implementations.
        }
        if (arch == LLM_ARCH_CHAMELEON) {
            continue; // Only half-implemented and to be removed in the future.
        }
        if (arch == LLM_ARCH_RWKV6 || arch == LLM_ARCH_RWKV6QWEN2 || arch == LLM_ARCH_RWKV7 || arch == LLM_ARCH_ARWKV7) {
            continue; // FIXME
        }
        if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_MODERN_BERT || arch == LLM_ARCH_NOMIC_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE ||
            arch == LLM_ARCH_NEO_BERT || arch == LLM_ARCH_JINA_BERT_V2 || arch == LLM_ARCH_JINA_BERT_V3 || arch == LLM_ARCH_EUROBERT) {
            continue; // TODO vocab
        }
        if (arch == LLM_ARCH_PLM) {
            continue; // TODO tensor shapes
        }
        for (bool moe : {false, true}) {
            if (moe && !moe_implemented(arch)) {
                continue;
@@ -383,8 +402,12 @@ static int save_models(const llm_arch target_arch, const size_t seed, const ggml_log_level log_level, ...) {
            if (!moe && moe_mandatory(arch)) {
                continue;
            }
            if (!llama_model_saver_supports_arch(arch)) {
                LOG_INF("%s: %s model (%s) is unsupported, skipping\n", __func__, llm_arch_name(arch), moe ? "MoE" : "dense");
                continue;
            }
            gguf_context_ptr gguf_ctx = get_gguf_ctx(arch, moe);
            auto model_and_ctx = get_model_and_ctx(gguf_ctx.get(), seed, {});
            auto model_and_ctx = get_model_and_ctx(gguf_ctx.get(), nullptr, seed, {});
            const std::string path = dir + "/" + llm_arch_name(arch) + (moe ? "-moe.gguf" : "-dense.gguf");
            LOG_INF("%s: Saving %s model (%s) to %s...\n", __func__, llm_arch_name(arch), moe ? "MoE" : "dense", path.c_str());
            llama_model_save_to_file(model_and_ctx.first.get(), path.c_str());
@@ -416,8 +439,8 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const ggml_log_level log_level) {

    bool all_ok = true;
    common_log_flush(common_log_main());
    printf("|%15s|%30s|%6s|%8s|%6s|\n", "Model arch.", "Device", "Config", "NMSE", "Status");
    printf("|---------------|------------------------------|------|--------|------|\n");
    printf("|%15s|%30s|%6s|%15s|%9s|\n", "Model arch.", "Device", "Config", "NMSE vs. CPU", "Roundtrip");
    printf("|---------------|------------------------------|------|---------------|---------|\n");
    for (const llm_arch & arch : llm_arch_all()) {
        if (target_arch != LLM_ARCH_UNKNOWN && arch != target_arch) {
            continue;
@@ -425,6 +448,9 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const ggml_log_level log_level) {
        if (arch == LLM_ARCH_CLIP || arch == LLM_ARCH_GPTJ || arch == LLM_ARCH_UNKNOWN) {
            continue; // These models don't have usable implementations.
        }
        if (arch == LLM_ARCH_CHAMELEON) {
            continue; // Only half-implemented and to be removed in the future.
        }
        if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
            continue; // FIXME CUDA backend crashes.
        }
@@ -458,22 +484,50 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const ggml_log_level log_level) {
                continue;
            }
            gguf_context_ptr gguf_ctx = get_gguf_ctx(arch, moe);
            auto model_and_ctx_cpu = get_model_and_ctx(gguf_ctx.get(), seed, {});
            auto model_and_ctx_cpu = get_model_and_ctx(gguf_ctx.get(), nullptr, seed, {});
            const std::vector<float> logits_cpu = get_logits(model_and_ctx_cpu.first.get(), model_and_ctx_cpu.second.get(), tokens, encode);
            for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
                ggml_backend_dev_t dev = ggml_backend_dev_get(i);
                if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
                    continue;
                }
                auto model_and_ctx_dev = get_model_and_ctx(gguf_ctx.get(), seed, {dev});
                auto model_and_ctx_dev = get_model_and_ctx(gguf_ctx.get(), nullptr, seed, {dev});
                std::string config_name = moe ? "MoE" : "Dense";
                const std::vector<float> logits_dev = get_logits(model_and_ctx_dev.first.get(), model_and_ctx_dev.second.get(), tokens, encode);
                const double nmse_val = nmse(logits_cpu, logits_dev);
                const bool ok = nmse_val <= 1e-4;
                all_ok = all_ok && ok;
                char nmse_str[10];
                snprintf(nmse_str, sizeof(nmse_str), "%.2e", nmse_val);
                printf("|%15s|%30s|%6s|%8s|%17s|\n", llm_arch_name(arch), ggml_backend_dev_description(dev),
                    moe ? "MoE" : "Dense", nmse_str, ok ? "\033[1;32mOK\033[0m" : "\033[1;31mFAIL\033[0m");
                std::string status_nmse = "\033[1;32mOK\033[0m";
                if (nmse_val > 1e-4) {
                    all_ok = false;
                    status_nmse = "\033[1;31mFAIL\033[0m";
                }

                std::string status_roundtrip = "\033[1;33mSKIP\033[0m";
                FILE * file = tmpfile(); // Can be null on Windows without administrator privileges.
                if (file != nullptr && llama_model_saver_supports_arch(arch)) {
                    llama_model_saver ms = llama_model_saver(model_and_ctx_dev.first.get());
                    ms.add_kv_from_model();
                    ms.add_tensors_from_model();
                    ms.save(file);
                    rewind(file);

                    auto model_and_ctx_roundtrip = get_model_and_ctx(nullptr, file, seed, {dev});
                    const std::vector<float> logits_roundtrip = get_logits(
                        model_and_ctx_roundtrip.first.get(), model_and_ctx_roundtrip.second.get(), tokens, encode);
                    status_roundtrip = "\033[1;32mOK\033[0m";
                    GGML_ASSERT(logits_roundtrip.size() == logits_dev.size());
                    for (size_t i = 0; i < logits_roundtrip.size(); i++) {
                        if (logits_roundtrip[i] != logits_dev[i]) {
                            all_ok = false;
                            status_roundtrip = "\033[1;31mFAIL\033[0m";
                            break;
                        }
                    }
                }

                printf("|%15s|%30s|%6s|%15s (%8s)|%20s|\n", llm_arch_name(arch), ggml_backend_dev_description(dev),
                    config_name.c_str(), status_nmse.c_str(), nmse_str, status_roundtrip.c_str());
            }
        }
    }
@@ -526,6 +580,7 @@ int main(int argc, char ** argv) {
            }
        }
    }
    printf("%s: using seed %zu\n", __func__, seed);

    try {
        if (!out.empty()) {
@@ -7,4 +7,4 @@ CLI to split / merge GGUF files.
- `--split`: split GGUF to multiple GGUF, default operation.
- `--split-max-size`: max size per split in `M` or `G`, f.ex. `500M` or `2G`.
- `--split-max-tensors`: maximum tensors in each split (default: 128)
- `--merge`: merge multiple GGUF to a single GGUF.
- `--merge`: merge multiple GGUF files into a single GGUF. You only need to specify the first split and the name of the merged GGUF; the CLI will find the other splits it needs in the same folder.
@@ -1807,7 +1807,7 @@ struct markdown_printer : public printer {
    if (!is_cpu_backend) {
        fields.emplace_back("n_gpu_layers");
    }
    if (params.n_cpu_moe.size() > 1) {
    if (params.n_cpu_moe.size() > 1 || params.n_cpu_moe != cmd_params_defaults.n_cpu_moe) {
        fields.emplace_back("n_cpu_moe");
    }
    if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {

Binary file not shown.
@@ -77,7 +77,7 @@
    let filteredOptions = $derived(filterModelOptions(options, searchTerm));

    let groupedFilteredOptions = $derived(
        groupModelOptions(filteredOptions, modelsStore.favouriteModelIds, (m) =>
        groupModelOptions(filteredOptions, modelsStore.favoriteModelIds, (m) =>
            modelsStore.isModelLoaded(m)
        )
    );
@@ -353,7 +353,7 @@
    {@const { option, flatIndex } = item}
    {@const isSelected = currentModel === option.model || activeId === option.id}
    {@const isHighlighted = flatIndex === highlightedIndex}
    {@const isFav = modelsStore.favouriteModelIds.has(option.model)}
    {@const isFav = modelsStore.favoriteModelIds.has(option.model)}

    <ModelsSelectorOption
        {option}

@@ -30,7 +30,7 @@
{#snippet defaultOption(item: ModelItem, showOrgName: boolean)}
    {@const { option } = item}
    {@const isSelected = currentModel === option.model || activeId === option.id}
    {@const isFav = modelsStore.favouriteModelIds.has(option.model)}
    {@const isFav = modelsStore.favoriteModelIds.has(option.model)}

    <ModelsSelectorOption
        {option}
@@ -52,9 +52,9 @@
    {/each}
{/if}

{#if groups.favourites.length > 0}
    <p class={sectionHeaderClass}>Favourite models</p>
    {#each groups.favourites as item (`fav-${item.option.id}`)}
{#if groups.favorites.length > 0}
    <p class={sectionHeaderClass}>Favorite models</p>
    {#each groups.favorites as item (`fav-${item.option.id}`)}
        {@render render(item, true)}
    {/each}
{/if}
@@ -46,7 +46,10 @@
    });
    let isOperationInProgress = $derived(modelsStore.isModelOperationInProgress(option.model));
    let isFailed = $derived(serverStatus === ServerModelStatus.FAILED);
    let isLoaded = $derived(serverStatus === ServerModelStatus.LOADED && !isOperationInProgress);
    let isSleeping = $derived(serverStatus === ServerModelStatus.SLEEPING);
    let isLoaded = $derived(
        (serverStatus === ServerModelStatus.LOADED || isSleeping) && !isOperationInProgress
    );
    let isLoading = $derived(serverStatus === ServerModelStatus.LOADING || isOperationInProgress);
</script>

@@ -85,17 +88,17 @@
    <ActionIcon
        iconSize="h-2.5 w-2.5"
        icon={HeartOff}
        tooltip="Remove from favourites"
        tooltip="Remove from favorites"
        class="h-3 w-3 hover:text-foreground"
        onclick={() => modelsStore.toggleFavourite(option.model)}
        onclick={() => modelsStore.toggleFavorite(option.model)}
    />
{:else}
    <ActionIcon
        iconSize="h-2.5 w-2.5"
        icon={Heart}
        tooltip="Add to favourites"
        tooltip="Add to favorites"
        class="h-3 w-3 hover:text-foreground"
        onclick={() => modelsStore.toggleFavourite(option.model)}
        onclick={() => modelsStore.toggleFavorite(option.model)}
    />
{/if}

@@ -129,6 +132,23 @@
            />
        </div>
    </div>
{:else if isSleeping}
    <div class="flex w-4 items-center justify-center">
        <span class="h-2 w-2 rounded-full bg-orange-400 group-hover:hidden"></span>

        <div class="hidden group-hover:flex">
            <ActionIcon
                iconSize="h-2.5 w-2.5"
                icon={PowerOff}
                tooltip="Unload model"
                class="h-3 w-3 text-red-500 hover:text-red-600"
                onclick={(e) => {
                    e?.stopPropagation();
                    modelsStore.unloadModel(option.model);
                }}
            />
        </div>
    </div>
{:else if isLoaded}
    <div class="flex w-4 items-center justify-center">
        <span class="h-2 w-2 rounded-full bg-green-500 group-hover:hidden"></span>
@@ -76,7 +76,7 @@
    let filteredOptions = $derived(filterModelOptions(options, searchTerm));

    let groupedFilteredOptions = $derived(
        groupModelOptions(filteredOptions, modelsStore.favouriteModelIds, (m) =>
        groupModelOptions(filteredOptions, modelsStore.favoriteModelIds, (m) =>
            modelsStore.isModelLoaded(m)
        )
    );

@@ -47,7 +47,7 @@ export { default as ModelsSelector } from './ModelsSelector.svelte';
/**
 * **ModelsSelectorList** - Grouped model options list
 *
 * Renders grouped model options (loaded, favourites, available) with section
 * Renders grouped model options (loaded, favorites, available) with section
 * headers and org subgroups. Shared between ModelsSelector and ModelsSelectorSheet
 * to avoid template duplication.
 *
@@ -59,7 +59,7 @@ export { default as ModelsSelectorList } from './ModelsSelectorList.svelte';
/**
 * **ModelsSelectorOption** - Single model option row
 *
 * Renders a single model option with selection state, favourite toggle,
 * Renders a single model option with selection state, favorite toggle,
 * load/unload actions, status indicators, and an info button.
 * Used inside ModelsSelectorList or directly in custom render snippets.
 */

@@ -13,7 +13,7 @@ export interface OrgGroup {

export interface GroupedModelOptions {
    loaded: ModelItem[];
    favourites: ModelItem[];
    favorites: ModelItem[];
    available: OrgGroup[];
}

@@ -32,7 +32,7 @@ export function filterModelOptions(options: ModelOption[], searchTerm: string):

export function groupModelOptions(
    filteredOptions: ModelOption[],
    favouriteIds: Set<string>,
    favoriteIds: Set<string>,
    isModelLoaded: (model: string) => boolean
): GroupedModelOptions {
    // Loaded models
@@ -43,24 +43,24 @@ export function groupModelOptions(
        }
    }

    // Favourites (excluding loaded)
    // Favorites (excluding loaded)
    const loadedModelIds = new Set(loaded.map((item) => item.option.model));
    const favourites: ModelItem[] = [];
    const favorites: ModelItem[] = [];
    for (let i = 0; i < filteredOptions.length; i++) {
        if (
            favouriteIds.has(filteredOptions[i].model) &&
            favoriteIds.has(filteredOptions[i].model) &&
            !loadedModelIds.has(filteredOptions[i].model)
        ) {
            favourites.push({ option: filteredOptions[i], flatIndex: i });
            favorites.push({ option: filteredOptions[i], flatIndex: i });
        }
    }

    // Available models grouped by org (excluding loaded and favourites)
    // Available models grouped by org (excluding loaded and favorites)
    const available: OrgGroup[] = [];
    const orgGroups = new SvelteMap<string, ModelItem[]>();
    for (let i = 0; i < filteredOptions.length; i++) {
        const option = filteredOptions[i];
        if (loadedModelIds.has(option.model) || favouriteIds.has(option.model)) continue;
        if (loadedModelIds.has(option.model) || favoriteIds.has(option.model)) continue;

        const key = option.parsedId?.orgName ?? '';
        if (!orgGroups.has(key)) orgGroups.set(key, []);
@@ -71,5 +71,5 @@ export function groupModelOptions(
        available.push({ orgName: orgName || null, items });
    }

    return { loaded, favourites, available };
    return { loaded, favorites, available };
}

@@ -1,4 +1,4 @@
export const CONFIG_LOCALSTORAGE_KEY = 'LlamaCppWebui.config';
export const USER_OVERRIDES_LOCALSTORAGE_KEY = 'LlamaCppWebui.userOverrides';
export const FAVOURITE_MODELS_LOCALSTORAGE_KEY = 'LlamaCppWebui.favouriteModels';
export const FAVORITE_MODELS_LOCALSTORAGE_KEY = 'LlamaCppWebui.favoriteModels';
export const MCP_DEFAULT_ENABLED_LOCALSTORAGE_KEY = 'LlamaCppWebui.mcpDefaultEnabled';
@@ -16,5 +16,6 @@ export enum ServerModelStatus {
    UNLOADED = 'unloaded',
    LOADING = 'loading',
    LOADED = 'loaded',
    SLEEPING = 'sleeping',
    FAILED = 'failed'
}

@@ -1207,7 +1207,6 @@ class ChatStore {
            await conversationsStore.updateCurrentNode(newMessage.id);
        } else {
            await DatabaseService.updateMessage(msg.id, { content: newContent });
            await conversationsStore.updateCurrentNode(msg.id);
            conversationsStore.updateMessageAtIndex(idx, { content: newContent });
        }

@@ -7,7 +7,7 @@ import { TTLCache } from '$lib/utils';
import {
    MODEL_PROPS_CACHE_TTL_MS,
    MODEL_PROPS_CACHE_MAX_ENTRIES,
    FAVOURITE_MODELS_LOCALSTORAGE_KEY
    FAVORITE_MODELS_LOCALSTORAGE_KEY
} from '$lib/constants';

/**
@@ -57,7 +57,7 @@ class ModelsStore {
    private modelUsage = $state<Map<string, SvelteSet<string>>>(new Map());
    private modelLoadingStates = new SvelteMap<string, boolean>();

    favouriteModelIds = $state<Set<string>>(this.loadFavouritesFromStorage());
    favoriteModelIds = $state<Set<string>>(this.loadFavoritesFromStorage());

    /**
     * Model-specific props cache with TTL
@@ -90,7 +90,11 @@ class ModelsStore {

    get loadedModelIds(): string[] {
        return this.routerModels
            .filter((m) => m.status.value === ServerModelStatus.LOADED)
            .filter(
                (m) =>
                    m.status.value === ServerModelStatus.LOADED ||
                    m.status.value === ServerModelStatus.SLEEPING
            )
            .map((m) => m.id);
    }

@@ -215,7 +219,11 @@ class ModelsStore {

    isModelLoaded(modelId: string): boolean {
        const model = this.routerModels.find((m) => m.id === modelId);
        return model?.status.value === ServerModelStatus.LOADED || false;
        return (
            model?.status.value === ServerModelStatus.LOADED ||
            model?.status.value === ServerModelStatus.SLEEPING ||
            false
        );
    }

    isModelOperationInProgress(modelId: string): boolean {
@@ -621,17 +629,17 @@ class ModelsStore {
    /**
     *
     *
     * Favourites
     * Favorites
     *
     *
     */

    isFavourite(modelId: string): boolean {
        return this.favouriteModelIds.has(modelId);
    isFavorite(modelId: string): boolean {
        return this.favoriteModelIds.has(modelId);
    }

    toggleFavourite(modelId: string): void {
        const next = new SvelteSet(this.favouriteModelIds);
    toggleFavorite(modelId: string): void {
        const next = new SvelteSet(this.favoriteModelIds);

        if (next.has(modelId)) {
            next.delete(modelId);
@@ -639,22 +647,22 @@ class ModelsStore {
            next.add(modelId);
        }

        this.favouriteModelIds = next;
        this.favoriteModelIds = next;

        try {
            localStorage.setItem(FAVOURITE_MODELS_LOCALSTORAGE_KEY, JSON.stringify([...next]));
            localStorage.setItem(FAVORITE_MODELS_LOCALSTORAGE_KEY, JSON.stringify([...next]));
        } catch {
            toast.error('Failed to save favourite models to local storage');
            toast.error('Failed to save favorite models to local storage');
        }
    }

    private loadFavouritesFromStorage(): Set<string> {
    private loadFavoritesFromStorage(): Set<string> {
        try {
            const raw = localStorage.getItem(FAVOURITE_MODELS_LOCALSTORAGE_KEY);
            const raw = localStorage.getItem(FAVORITE_MODELS_LOCALSTORAGE_KEY);

            return raw ? new Set(JSON.parse(raw) as string[]) : new Set();
        } catch {
            toast.error('Failed to load favourite models from local storage');
            toast.error('Failed to load favorite models from local storage');

            return new Set();
        }
@@ -713,4 +721,4 @@ export const loadingModelIds = () => modelsStore.loadingModelIds;
export const propsCacheVersion = () => modelsStore.propsCacheVersion;
export const singleModelName = () => modelsStore.singleModelName;
export const selectedModelContextSize = () => modelsStore.selectedModelContextSize;
export const favouriteModelIds = () => modelsStore.favouriteModelIds;
export const favoriteModelIds = () => modelsStore.favoriteModelIds;
2
tools/server/webui/src/lib/types/api.d.ts
vendored
@@ -54,7 +54,7 @@ export interface ApiChatMessageData {
 * Model status object from /models endpoint
 */
export interface ApiModelStatus {
    /** Status value: loaded, unloaded, loading, failed */
    /** Status value: loaded, unloaded, loading, sleeping, failed */
    value: ServerModelStatus;
    /** Command line arguments used when loading (only for loaded models) */
    args?: string[];