Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2026-02-05 13:53:23 +02:00)

Compare commits (83 commits):

cf2270e4d3, f07690c930, 891c63956d, 2f21123c1d, 374101fd74, b3c9a65673, 8ba95dca20, dc079cfdff,
7b61bcc87c, f4c3dd5daa, 3d35d87b41, b19bd064c0, 92a391327e, 9f2250ba72, 774973b8f3, 8fcb563613,
add2a3aa5a, c522ce4143, 081bee8c64, 84d5475541, be7c303410, e0dbec0bc6, 2048b5913d, f08f4b3187,
80a02aa858, 363f8c5d67, 34c961b181, 7841fc723e, bf69cfe62f, 10f2e81809, ba7654380a, 6ab2e4765a,
96e1280839, 2c9f833d17, 251364549f, 8acdacb3ea, 89b2b56e86, e128a1bf5b, 6ef79a67ca, 4e39a3c332,
be421fc429, 87c2630546, 2b3a25c212, 8352cdc87b, 1e2f78a004, 0fd7ca7a21, 6fefc05a7a, 7ab364390f,
7c7f3b7f43, 102ac1891d, d6ae2fa061, 68d0027f3d, ea002810a2, 8fad3c7a7c, 7cf64f6bee, 5e2d57b2b2,
f1648e91cf, d6c95b0740, d76a86d967, 776f9e59cc, 3d652bfddf, 5220a16d18, 3ffbbd5ce1, 42994048a3,
e9b2f84f14, e721c05c93, 57b6abf85a, 94bb63e4f0, f79243992c, ed4ce0dda2, 07d1572347, 5e43f104cc,
16e4b22c5e, 074c4fd39d, 669912d9a5, fa31c438e0, 3ccbfe5a71, 06a92a193a, a057897ad4, 5bbe6a9fe9,
20a9b8f5e1, 56d7a9f812, 1a24c4621f
.github/workflows/build.yml (vendored, 54 changes)

@@ -467,6 +467,7 @@ jobs:
      run: |
        cmake -B build -S . \
          -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
          -DGGML_HIP_ROCWMMA_FATTN=ON \
          -DGGML_HIP=ON
        cmake --build build --config Release -j $(nproc)

@@ -476,6 +477,7 @@ jobs:
        cmake -B build2 -S . \
          -DCMAKE_C_COMPILER=hipcc \
          -DCMAKE_CXX_COMPILER=hipcc \
          -DGGML_HIP_ROCWMMA_FATTN=ON \
          -DGGML_HIP=ON
        cmake --build build2 --config Release -j $(nproc)

@@ -710,12 +712,11 @@ jobs:
          -DLLAMA_BUILD_SERVER=OFF \
          -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"
        cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
        sudo cmake --install build --config Release

    - name: xcodebuild for swift package
      id: xcodebuild
      run: |
        xcodebuild -scheme llama-Package -destination "${{ matrix.destination }}"
        ./build-xcframework.sh

  windows-msys2:
    runs-on: windows-latest

@@ -773,7 +774,7 @@ jobs:
    env:
      OPENBLAS_VERSION: 0.3.23
      SDE_VERSION: 9.33.0-2024-01-07
      VULKAN_VERSION: 1.3.261.1
      VULKAN_VERSION: 1.4.304.1

    strategy:
      matrix:

@@ -1203,6 +1204,11 @@ jobs:
        id: checkout
        uses: actions/checkout@v4

      - name: Clone rocWMMA repository
        id: clone_rocwmma
        run: |
          git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1

      - name: Install
        id: depends
        run: |

@@ -1232,8 +1238,10 @@ jobs:
          cmake -G "Unix Makefiles" -B build -S . `
            -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
            -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
            -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/" `
            -DCMAKE_BUILD_TYPE=Release `
            -DGGML_HIP=ON `
            -DGGML_HIP_ROCWMMA_FATTN=ON `
            -DGGML_RPC=ON
          cmake --build build -j ${env:NUMBER_OF_PROCESSORS}

@@ -1252,6 +1260,11 @@ jobs:
        with:
          fetch-depth: 0

      - name: Clone rocWMMA repository
        id: clone_rocwmma
        run: |
          git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1

      - name: ccache
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:

@@ -1281,8 +1294,10 @@ jobs:
          cmake -G "Unix Makefiles" -B build -S . `
            -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
            -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
            -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/" `
            -DCMAKE_BUILD_TYPE=Release `
            -DAMDGPU_TARGETS=${{ matrix.gpu_target }} `
            -DGGML_HIP_ROCWMMA_FATTN=ON `
            -DGGML_HIP=ON `
            -DGGML_RPC=ON
          cmake --build build -j ${env:NUMBER_OF_PROCESSORS}

@@ -1321,6 +1336,8 @@ jobs:
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Build
        id: cmake_build

@@ -1336,15 +1353,40 @@ jobs:
          -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
          -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
        cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
        sudo cmake --install build --config Release

    - name: xcodebuild for swift package
      id: xcodebuild
      run: |
        xcodebuild -scheme llama-Package -destination 'generic/platform=iOS'
        ./build-xcframework.sh

    - name: Build Xcode project
      run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
      run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build

    - name: Determine tag name
      id: tag
      shell: bash
      run: |
        BUILD_NUMBER="$(git rev-list --count HEAD)"
        SHORT_HASH="$(git rev-parse --short=7 HEAD)"
        if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
          echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
        else
          SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
          echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
        fi

    - name: Pack artifacts
      id: pack_artifacts
      if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
      run: |
        zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework

    - name: Upload artifacts
      if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
      uses: actions/upload-artifact@v4
      with:
        path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
        name: llama-${{ steps.tag.outputs.name }}-xcframework

  android-build:
    runs-on: ubuntu-latest
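The Linux HIP jobs above now pass GGML_HIP_ROCWMMA_FATTN alongside GGML_HIP, enabling the rocWMMA-backed FlashAttention path. A minimal local sketch of the same configure step, assuming a ROCm installation that provides hipconfig on the PATH:

```bash
# mirrors the CI configure/build step; adjust the compiler lookup to your ROCm install
cmake -B build -S . \
    -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
    -DGGML_HIP=ON \
    -DGGML_HIP_ROCWMMA_FATTN=ON
cmake --build build --config Release -j "$(nproc)"
```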
AUTHORS (61 changes)
@@ -1,4 +1,4 @@
|
||||
# date: Tue Feb 4 13:04:05 EET 2025
|
||||
# date: Sat Mar 8 18:23:52 EET 2025
|
||||
# this file is auto-generated by scripts/gen-authors.sh
|
||||
|
||||
0cc4m <picard12@live.de>
|
||||
@@ -8,10 +8,12 @@
|
||||
3ooabkhxtn <31479382+3ooabkhxtn@users.noreply.github.com>
|
||||
44670 <44670@users.noreply.github.com>
|
||||
65a <10104049+65a@users.noreply.github.com>
|
||||
708-145 <40387547+708-145@users.noreply.github.com>
|
||||
AN Long <aisk@users.noreply.github.com>
|
||||
AT <manyoso@users.noreply.github.com>
|
||||
Aarni Koskela <akx@iki.fi>
|
||||
Aaron Miller <apage43@ninjawhale.com>
|
||||
Aaron Teo <57927438+taronaeo@users.noreply.github.com>
|
||||
Aaryaman Vasishta <aaryaman.vasishta@amd.com>
|
||||
Abheek Gulati <abheekg@hotmail.com>
|
||||
Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
|
||||
@@ -20,6 +22,7 @@ Adithya Balaji <adithya.b94@gmail.com>
|
||||
AdithyanI <adithyan.i4internet@gmail.com>
|
||||
Adrian <smith.adriane@gmail.com>
|
||||
Adrian Hesketh <a-h@users.noreply.github.com>
|
||||
Adrian Kretz <me@akretz.com>
|
||||
Adrien Gallouët <adrien@gallouet.fr>
|
||||
Adrien Gallouët <angt@huggingface.co>
|
||||
Ahmad Tameem <113388789+Tameem-10xE@users.noreply.github.com>
|
||||
@@ -28,15 +31,18 @@ AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com>
|
||||
AidanBeltonS <aidan.belton@codeplay.com>
|
||||
Aisuko <urakiny@gmail.com>
|
||||
Akarshan Biswas <akarshan.biswas@gmail.com>
|
||||
Akarshan Biswas <akarshan@menlo.ai>
|
||||
Akarshan Biswas <akarshanbiswas@fedoraproject.org>
|
||||
Al Mochkin <14274697+amochkin@users.noreply.github.com>
|
||||
Albert Jin <albert.jin@gmail.com>
|
||||
Alberto <57916483+albbus-stack@users.noreply.github.com>
|
||||
Alberto Cabrera Pérez <alberto.cabrera@codeplay.com>
|
||||
Alberto Cabrera Pérez <alberto.cabrera@intel.com>
|
||||
Aleksei Nikiforov <103434461+AlekseiNikiforovIBM@users.noreply.github.com>
|
||||
Alex <awhill19@icloud.com>
|
||||
Alex Azarov <alex@azarov.by>
|
||||
Alex Azarov <alexander.azarov@mapbox.com>
|
||||
Alex Brooks <alex.brooks@ibm.com>
|
||||
Alex Klinkhamer <from.github.com.917@grencez.dev>
|
||||
Alex Klinkhamer <git@grencez.dev>
|
||||
Alex Nguyen <tiendung@users.noreply.github.com>
|
||||
@@ -67,6 +73,7 @@ Andrew Minh Nguyen <40281306+amqdn@users.noreply.github.com>
|
||||
Andy Salerno <andysalerno@gmail.com>
|
||||
Andy Tai <andy-tai@users.noreply.github.com>
|
||||
Anthony Van de Gejuchte <anthonyvdgent@gmail.com>
|
||||
Antoine Viallon <antoine@lesviallon.fr>
|
||||
Antonis Makropoulos <benuix@gmail.com>
|
||||
Arik Poznanski <arikpoz@users.noreply.github.com>
|
||||
Armen Kaleshian <kriation@users.noreply.github.com>
|
||||
@@ -83,6 +90,7 @@ Atsushi Tatsuma <yoshoku@outlook.com>
|
||||
Austin <77757836+teleprint-me@users.noreply.github.com>
|
||||
AustinMroz <austinmroz@utexas.edu>
|
||||
BADR <contact@pythops.com>
|
||||
BB-fat <45072480+BB-fat@users.noreply.github.com>
|
||||
Bach Le <bach@bullno1.com>
|
||||
Bailey Chittle <39804642+bachittle@users.noreply.github.com>
|
||||
BarfingLemurs <128182951+BarfingLemurs@users.noreply.github.com>
|
||||
@@ -101,6 +109,7 @@ Bert Wagner <github@bertwagner.com>
|
||||
Billel Mokeddem <billel.mokeddem.ml@gmail.com>
|
||||
Bingan <70050083+binganao@users.noreply.github.com>
|
||||
Bjarke Viksøe <164612031+bviksoe@users.noreply.github.com>
|
||||
Bodhi <3882561+BodhiHu@users.noreply.github.com>
|
||||
Bodo Graumann <mail@bodograumann.de>
|
||||
Bono Lv <lvscar@users.noreply.github.com>
|
||||
Borislav Stanimirov <b.stanimirov@abv.bg>
|
||||
@@ -128,6 +137,7 @@ CentricStorm <CentricStorm@users.noreply.github.com>
|
||||
Chad Brewbaker <crb002@gmail.com>
|
||||
Changyeon Kim <cyzero.kim@samsung.com>
|
||||
Chao Jiang <jc19chaoj@zoho.com>
|
||||
Charles Duffy <charles@dyfis.net>
|
||||
Charles Xu <63788048+chaxu01@users.noreply.github.com>
|
||||
Charles Xu <charles.xu@arm.com>
|
||||
Chen Xi <xi2.chen@intel.com>
|
||||
@@ -139,12 +149,14 @@ Chris Kuehl <ckuehl@ckuehl.me>
|
||||
Christian Demsar <christian@github.email.demsar.us>
|
||||
Christian Demsar <crasm@git.vczf.us>
|
||||
Christian Falch <875252+chrfalch@users.noreply.github.com>
|
||||
Christian Fillion <cfillion@users.noreply.github.com>
|
||||
Christian Kastner <ckk@kvr.at>
|
||||
Christian Kögler <ck3d@gmx.de>
|
||||
Christian Köhnenkamp <cvk5@me.com>
|
||||
Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com>
|
||||
Christopher Nielsen <62156882+mascguy@users.noreply.github.com>
|
||||
Clark Saben <76020733+csaben@users.noreply.github.com>
|
||||
Clauszy <zhangyub@uniontech.com>
|
||||
Clint Herron <hanclinto@gmail.com>
|
||||
Conrad Kramer <conrad@conradkramer.com>
|
||||
Corentin REGAL <corentin.regal@gmail.com>
|
||||
@@ -163,6 +175,7 @@ Daniel Hiltgen <dhiltgen@users.noreply.github.com>
|
||||
Daniel Illescas Romero <illescas.daniel@protonmail.com>
|
||||
Daniel Kleine <53251018+d-kleine@users.noreply.github.com>
|
||||
Daniele <57776841+daniandtheweb@users.noreply.github.com>
|
||||
Danny Milosavljevic <dannym@friendly-machines.com>
|
||||
DannyDaemonic <DannyDaemonic@gmail.com>
|
||||
Dat Quoc Nguyen <2412555+datquocnguyen@users.noreply.github.com>
|
||||
Dave <dave-fl@users.noreply.github.com>
|
||||
@@ -170,6 +183,7 @@ Dave Airlie <airlied@gmail.com>
|
||||
Dave Airlie <airlied@redhat.com>
|
||||
Dave Della Costa <ddellacosta+github@gmail.com>
|
||||
David Friehs <david@friehs.info>
|
||||
David Huang <1969802+hjc4869@users.noreply.github.com>
|
||||
David Kennedy <dakennedyd@gmail.com>
|
||||
David Pflug <david@pflug.email>
|
||||
David Renshaw <dwrenshaw@gmail.com>
|
||||
@@ -236,6 +250,7 @@ Felix <stenbackfelix@gmail.com>
|
||||
Finn Voorhees <finnvoorhees@gmail.com>
|
||||
Firat <firatkiral@gmail.com>
|
||||
FirstTimeEZ <179362031+FirstTimeEZ@users.noreply.github.com>
|
||||
Florent BENOIT <fbenoit@redhat.com>
|
||||
Folko-Ven <71110216+Folko-Ven@users.noreply.github.com>
|
||||
Foul-Tarnished <107711110+Foul-Tarnished@users.noreply.github.com>
|
||||
Francisco Melo <43780565+francis2tm@users.noreply.github.com>
|
||||
@@ -254,6 +269,7 @@ Gary Mulder <gjmulder@gmail.com>
|
||||
Gavin Zhao <gavinzhaojw@protonmail.com>
|
||||
Genkagaku.GPT <hlhr202@163.com>
|
||||
Georgi Gerganov <ggerganov@gmail.com>
|
||||
Gian-Carlo Pascutto <gcp@sjeng.org>
|
||||
Gilad S <giladgd@users.noreply.github.com>
|
||||
Gilad S. <7817232+giladgd@users.noreply.github.com>
|
||||
Giuseppe Scrivano <giuseppe@scrivano.org>
|
||||
@@ -267,7 +283,9 @@ Guspan Tanadi <36249910+guspan-tanadi@users.noreply.github.com>
|
||||
Gustavo Rocha Dias <91472747+gustrd@users.noreply.github.com>
|
||||
Haggai Nuchi <h.nuchi@gmail.com>
|
||||
Halalaluyafail3 <55773281+Halalaluyafail3@users.noreply.github.com>
|
||||
Hale Chan <halechan@qq.com>
|
||||
Hamdoud Hakem <90524568+hamdoudhakem@users.noreply.github.com>
|
||||
Han Yin <han.yin@arm.com>
|
||||
HanishKVC <hanishkvc@gmail.com>
|
||||
Haohui Mai <ricetons@gmail.com>
|
||||
Haoxiang Fei <tonyfettes@tonyfettes.com>
|
||||
@@ -278,6 +296,7 @@ Haus1 <haus.xda@gmail.com>
|
||||
Henk Poley <HenkPoley@gmail.com>
|
||||
Henri Vasserman <henv@hot.ee>
|
||||
Henrik Forstén <henrik.forsten@gmail.com>
|
||||
Henry Linjamäki <henry.linjamaki@gmail.com>
|
||||
Herman Semenov <GermanAizek@yandex.ru>
|
||||
Hesen Peng <hesen.peng@gmail.com>
|
||||
HimariO <dsfhe49854@gmail.com>
|
||||
@@ -307,6 +326,7 @@ Ivan <nekotekina@gmail.com>
|
||||
Ivan Filipov <159561759+vanaka11@users.noreply.github.com>
|
||||
Ivan Komarov <Ivan.Komarov@dfyz.info>
|
||||
Ivan Stepanov <ivanstepanovftw@gmail.com>
|
||||
JC <43374599+MrSMlT@users.noreply.github.com>
|
||||
JFLFY2255 <JFLFY2255@163.com>
|
||||
JH23X <165871467+JH23X@users.noreply.github.com>
|
||||
Jack Mousseau <jack@software.inc>
|
||||
@@ -325,6 +345,7 @@ Jan Ploski <jpl@plosquare.com>
|
||||
Jannis Schönleber <joennlae@gmail.com>
|
||||
Jared Van Bortel <cebtenzzre@gmail.com>
|
||||
Jared Van Bortel <jared@nomic.ai>
|
||||
Jason C.H <ctrysbita@outlook.com>
|
||||
Jason McCartney <jmac@theroot.org>
|
||||
Jason Stillerman <jason.t.stillerman@gmail.com>
|
||||
Jean-Christophe Hoelt <hoelt@fovea.cc>
|
||||
@@ -342,6 +363,7 @@ Jiahao Li <liplus17@163.com>
|
||||
Jian Liao <jianliao@users.noreply.github.com>
|
||||
JidongZhang-THU <1119708529@qq.com>
|
||||
Jinwoo Jeong <33892306+williamjeong2@users.noreply.github.com>
|
||||
Jinyang He <hejinyang@loongson.cn>
|
||||
Jiří Podivín <66251151+jpodivin@users.noreply.github.com>
|
||||
Jiří Sejkora <Sejseloid@gmail.com>
|
||||
Joan Fontanals <jfontanalsmartinez@gmail.com>
|
||||
@@ -379,6 +401,7 @@ Justine Tunney <jtunney@mozilla.com>
|
||||
Juuso Alasuutari <juuso.alasuutari@gmail.com>
|
||||
KASR <karim.asrih@gmail.com>
|
||||
Kamil Tomšík <info@tomsik.cz>
|
||||
Kante Yin <kerthcet@gmail.com>
|
||||
Karol Kontny <82021046+kkontny@users.noreply.github.com>
|
||||
Karsten Weiss <knweiss@gmail.com>
|
||||
Karthick <j.karthic2004@gmail.com>
|
||||
@@ -419,6 +442,7 @@ LoganDark <github@logandark.mozmail.com>
|
||||
Loïc Carrère <loic.carrere@gmail.com>
|
||||
LostRuins <39025047+LostRuins@users.noreply.github.com>
|
||||
LostRuins Concedo <39025047+LostRuins@users.noreply.github.com>
|
||||
Lucas Moura Belo <lucas.belo@live.com>
|
||||
Luciano <lucianostrika44@gmail.com>
|
||||
Luo Tian <lt@basecity.com>
|
||||
Lyle Dean <dean@lyle.dev>
|
||||
@@ -463,6 +487,7 @@ Matthew Tejo <matthew.tejo@gmail.com>
|
||||
Matvey Soloviev <blackhole89@gmail.com>
|
||||
Max Krasnyansky <max.krasnyansky@gmail.com>
|
||||
Max Krasnyansky <quic_maxk@quicinc.com>
|
||||
Maxim Evtush <154841002+maximevtush@users.noreply.github.com>
|
||||
Maxime <672982+maximegmd@users.noreply.github.com>
|
||||
Maximilian Winter <maximilian.winter.91@gmail.com>
|
||||
Meng Zhang <meng@tabbyml.com>
|
||||
@@ -494,6 +519,7 @@ Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com>
|
||||
Mohammadreza Hendiani <hendiani.mohammadreza@gmail.com>
|
||||
Mohammadreza Hendiani <mohammad.r.hendiani@gmail.com>
|
||||
Molly Sophia <mollysophia379@gmail.com>
|
||||
MoonRide303 <130458190+MoonRide303@users.noreply.github.com>
|
||||
MorganRO8 <47795945+MorganRO8@users.noreply.github.com>
|
||||
Murilo Santana <mvrilo@gmail.com>
|
||||
Musab Gultekin <musabgultekin@users.noreply.github.com>
|
||||
@@ -524,6 +550,7 @@ Nikolas <127742645+nneubacher@users.noreply.github.com>
|
||||
Nindaleth <Nindaleth@users.noreply.github.com>
|
||||
Nuno <rare-magma@posteo.eu>
|
||||
OSecret <135510162+OLSecret@users.noreply.github.com>
|
||||
Oleksandr Kuvshynov <661042+okuvshynov@users.noreply.github.com>
|
||||
Oleksandr Nikitin <oleksandr@tvori.info>
|
||||
Oleksii Maryshchenko <oleksii.maryshchenko@gmail.com>
|
||||
Olivier Chafik <ochafik@users.noreply.github.com>
|
||||
@@ -533,6 +560,7 @@ PAB <pierreantoine.bannier@gmail.com>
|
||||
Pablo Duboue <pablo.duboue@gmail.com>
|
||||
Pascal Patry <ppatry@mtacitlabs.com>
|
||||
Patrice Ferlet <metal3d@gmail.com>
|
||||
Patrick Peng <retr0@retr0.blog>
|
||||
Paul Tsochantaris <ptsochantaris@icloud.com>
|
||||
Pavel Zloi <github.com@drteam.rocks>
|
||||
Pavol Rusnak <pavol@rusnak.io>
|
||||
@@ -549,6 +577,7 @@ Pieter Ouwerkerk <pieter.ouwerkerk@gmail.com>
|
||||
Plamen Minev <pacominev@gmail.com>
|
||||
Prashant Vithule <119530321+Vithulep@users.noreply.github.com>
|
||||
Przemysław Pawełczyk <przemoc@gmail.com>
|
||||
PureJourney <edward.pong@qq.com>
|
||||
Qin Yue Chen <71813199+chenqiny@users.noreply.github.com>
|
||||
Qingyou Meng <meng.qingyou@gmail.com>
|
||||
Qu Zongfu <43257352+yancaoweidaode@users.noreply.github.com>
|
||||
@@ -564,14 +593,17 @@ Rand Xie <randxiexyy29@gmail.com>
|
||||
Randall Fitzgerald <randall@dasaku.net>
|
||||
Random Fly <renfei8@live.cn>
|
||||
Reinforce-II <fate@eastal.com>
|
||||
Rémy O <remyoudompheng@gmail.com>
|
||||
Rémy Oudompheng <oudomphe@phare.normalesup.org>
|
||||
Ren Xuancheng <jklj077@users.noreply.github.com>
|
||||
Rene Leonhardt <65483435+reneleonhardt@users.noreply.github.com>
|
||||
Reza Kakhki <rezakakhki.de@gmail.com>
|
||||
Reza Rahemtola <49811529+RezaRahemtola@users.noreply.github.com>
|
||||
RhinoDevel <RhinoDevel@users.noreply.github.com>
|
||||
Riccardo Orlando <Riccorl@users.noreply.github.com>
|
||||
Riceball LEE <snowyu.lee@gmail.com>
|
||||
Rich Dougherty <rich@rd.nz>
|
||||
Richard <r-burton@hotmail.co.uk>
|
||||
Richard Kiss <him@richardkiss.com>
|
||||
Richard Roberson <richardr1126@gmail.com>
|
||||
Rick G <26732651+TheFlipbook@users.noreply.github.com>
|
||||
@@ -588,6 +620,7 @@ Robert Sung-wook Shin <edp1096@users.noreply.github.com>
|
||||
Robey Holderith <robey@flaminglunchbox.net>
|
||||
Robyn <robyngraf@users.noreply.github.com>
|
||||
Roger Meier <r.meier@siemens.com>
|
||||
Rohanjames1997 <rohan.james4@gmail.com>
|
||||
Roland <14355895+rbur0425@users.noreply.github.com>
|
||||
Romain Biessy <romain.biessy@codeplay.com>
|
||||
Romain D <90720+Artefact2@users.noreply.github.com>
|
||||
@@ -610,6 +643,7 @@ Ryan Landay <rlanday@gmail.com>
|
||||
Ryder Wishart <ryderwishart@gmail.com>
|
||||
Ryuei <louixs@users.noreply.github.com>
|
||||
Rőczey Barnabás <31726601+An0nie@users.noreply.github.com>
|
||||
SAMI <samuel.koesnadi@stud.uni-due.de>
|
||||
SRHMorris <69468379+SRHMorris@users.noreply.github.com>
|
||||
SXX <sxx1136965276@gmail.com>
|
||||
SakuraUmi <yukinon244@gmail.com>
|
||||
@@ -634,6 +668,8 @@ Shane A <shanea@allenai.org>
|
||||
Shangning Xu <32517059+xushangning@users.noreply.github.com>
|
||||
Shankar <gshankar.87@gmail.com>
|
||||
Shanshan Shen <467638484@qq.com>
|
||||
Shelby Jenkins <47464908+ShelbyJenkins@users.noreply.github.com>
|
||||
Sheldon Robinson <sheldon.robinson@live.com>
|
||||
Shijie <821898965@qq.com>
|
||||
Shintarou Okada <kokuzen@gmail.com>
|
||||
Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com>
|
||||
@@ -713,18 +749,24 @@ Victor Nogueira <felladrin@gmail.com>
|
||||
Victor Z. Peng <ziliangdotme@gmail.com>
|
||||
Viet-Anh NGUYEN (Andrew) <vietanh.dev@gmail.com>
|
||||
Vinesh Janarthanan <36610342+VJHack@users.noreply.github.com>
|
||||
Vitali Lovich <vlovich+github@gmail.com>
|
||||
Vivian <vynride@gmail.com>
|
||||
Vlad <spitfireage@gmail.com>
|
||||
Vladimir <bogdad@gmail.com>
|
||||
Vladimir Malyutin <first-leon@yandex.ru>
|
||||
Vladimir Vuksanovic <109677816+vvuksanovic@users.noreply.github.com>
|
||||
Vladimir Zorin <vladimir@deviant.guru>
|
||||
VoidIsVoid <343750470@qq.com>
|
||||
Volodymyr Vitvitskyi <72226+signalpillar@users.noreply.github.com>
|
||||
Wagner Bruna <wbruna@users.noreply.github.com>
|
||||
Wang Qin <37098874+wangqin0@users.noreply.github.com>
|
||||
Wang Ran (汪然) <wangr@smail.nju.edu.cn>
|
||||
WangHaoranRobin <56047610+WangHaoranRobin@users.noreply.github.com>
|
||||
Weird Constructor <weirdconstructor@gmail.com>
|
||||
Weizhao Ouyang <o451686892@gmail.com>
|
||||
Welby Seely <welbyseely@gmail.com>
|
||||
Wentai Zhang <rchardx@gmail.com>
|
||||
Wilken Gottwalt <12194808+wgottwalt@users.noreply.github.com>
|
||||
WillCorticesAI <150854901+WillCorticesAI@users.noreply.github.com>
|
||||
William Tambellini <william.tambellini@gmail.com>
|
||||
William Tambellini <wtambellini@sdl.com>
|
||||
@@ -816,6 +858,8 @@ chaihahaha <chai836275709@gmail.com>
|
||||
chiranko <96988916+chiranko@users.noreply.github.com>
|
||||
clibdev <52199778+clibdev@users.noreply.github.com>
|
||||
clyang <clyang@clyang.net>
|
||||
cmdr2 <secondary.cmdr2@gmail.com>
|
||||
cmdr2 <shashank.shekhar.global@gmail.com>
|
||||
cocktailpeanut <121128867+cocktailpeanut@users.noreply.github.com>
|
||||
codezjx <code.zjx@gmail.com>
|
||||
coezbek <c.oezbek@gmail.com>
|
||||
@@ -835,6 +879,7 @@ deepdiffuser <112834445+deepdiffuser@users.noreply.github.com>
|
||||
devojony <61173062+devojony@users.noreply.github.com>
|
||||
ditsuke <ditsuke@protonmail.com>
|
||||
divinity76 <divinity76@gmail.com>
|
||||
dm4 <dm4@secondstate.io>
|
||||
dm4 <sunrisedm4@gmail.com>
|
||||
dotpy314 <33351922+dotpy314@users.noreply.github.com>
|
||||
drbh <david.richard.holtz@gmail.com>
|
||||
@@ -849,6 +894,7 @@ fairydreaming <166155368+fairydreaming@users.noreply.github.com>
|
||||
fengerhu1 <2748250768@qq.com>
|
||||
fj-y-saito <85871716+fj-y-saito@users.noreply.github.com>
|
||||
fraxy-v <65565042+fraxy-v@users.noreply.github.com>
|
||||
fxzjshm <11426482+fxzjshm@users.noreply.github.com>
|
||||
github-actions[bot] <github-actions[bot]@users.noreply.github.com>
|
||||
gliptic <gliptic@users.noreply.github.com>
|
||||
gn64 <yukikaze.jp@gmail.com>
|
||||
@@ -873,6 +919,7 @@ hydai <z54981220@gmail.com>
|
||||
iSma <ismail.senhaji@gmail.com>
|
||||
iacore <74560659+iacore@users.noreply.github.com>
|
||||
icppWorld <124377669+icppWorld@users.noreply.github.com>
|
||||
igardev <49397134+igardev@users.noreply.github.com>
|
||||
igarnier <igarnier@protonmail.com>
|
||||
intelmatt <61025942+intelmatt@users.noreply.github.com>
|
||||
iohub <rickyang.pro@gmail.com>
|
||||
@@ -880,6 +927,7 @@ issixx <46835150+issixx@users.noreply.github.com>
|
||||
jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com>
|
||||
jaime-m-p <167997752+jaime-m-p@users.noreply.github.com>
|
||||
jameswu2014 <545426914@qq.com>
|
||||
jason_w <jason.wang@126.com>
|
||||
jdomke <28772296+jdomke@users.noreply.github.com>
|
||||
jiahao su <damow890@gmail.com>
|
||||
jiez <373447296@qq.com>
|
||||
@@ -891,6 +939,7 @@ jon-chuang <9093549+jon-chuang@users.noreply.github.com>
|
||||
jp-x-g <jpxg-dev@protonmail.com>
|
||||
jukofyork <69222624+jukofyork@users.noreply.github.com>
|
||||
junchao-loongson <68935141+junchao-loongson@users.noreply.github.com>
|
||||
junchao-zhao <68935141+junchao-loongson@users.noreply.github.com>
|
||||
jwj7140 <32943891+jwj7140@users.noreply.github.com>
|
||||
k.h.lai <adrian.k.h.lai@outlook.com>
|
||||
kaizau <kaizau@users.noreply.github.com>
|
||||
@@ -925,6 +974,7 @@ ltoniazzi <61414566+ltoniazzi@users.noreply.github.com>
|
||||
luoyu-intel <yu.luo@intel.com>
|
||||
m3ndax <adrian.goessl@outlook.com>
|
||||
maddes8cht <55592906+maddes8cht@users.noreply.github.com>
|
||||
magicse <magicse@users.noreply.github.com>
|
||||
mahorozte <41834471+mahorozte@users.noreply.github.com>
|
||||
makomk <makosoft@googlemail.com>
|
||||
manikbhandari <mbbhandarimanik2@gmail.com>
|
||||
@@ -935,6 +985,7 @@ matt23654 <matthew.webber@protonmail.com>
|
||||
matteo <matteogeniaccio@yahoo.it>
|
||||
mdrokz <mohammadmunshi@gmail.com>
|
||||
mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com>
|
||||
midnight <midnightmagic@users.noreply.github.com>
|
||||
minarchist <minarchist@users.noreply.github.com>
|
||||
mj-shifu <77107165+mj-shifu@users.noreply.github.com>
|
||||
mmyjona <jonathan.gonse@gmail.com>
|
||||
@@ -958,10 +1009,12 @@ omahs <73983677+omahs@users.noreply.github.com>
|
||||
oobabooga <112222186+oobabooga@users.noreply.github.com>
|
||||
opparco <parco.opaai@gmail.com>
|
||||
ostix360 <55257054+ostix360@users.noreply.github.com>
|
||||
pascal-lc <49066376+pascal-lc@users.noreply.github.com>
|
||||
pculliton <phillipculliton@gmail.com>
|
||||
peidaqi <peidaqi@gmail.com>
|
||||
pengxin99 <pengxin.yuan@intel.com>
|
||||
perserk <perserk@gmail.com>
|
||||
petterreinholdtsen <pere-github@hungry.com>
|
||||
piDack <104877312+piDack@users.noreply.github.com>
|
||||
pmysl <piotr.myslinski@outlook.com>
|
||||
postmasters <namnguyen@google.com>
|
||||
@@ -983,6 +1036,7 @@ semidark <me@semidark.net>
|
||||
serhii-nakon <57632032+serhii-nakon@users.noreply.github.com>
|
||||
sharpHL <132747147+sharpHL@users.noreply.github.com>
|
||||
shibe2 <shibe@tuta.io>
|
||||
simon886212 <37953122+simon886212@users.noreply.github.com>
|
||||
singularity <12184989+singularity-s0@users.noreply.github.com>
|
||||
sjinzh <sjinzh@gmail.com>
|
||||
sjxx <63994076+ylsdamxssjxxdd@users.noreply.github.com>
|
||||
@@ -1000,10 +1054,12 @@ tarcey <cey.tarik@gmail.com>
|
||||
tc-mb <157115220+tc-mb@users.noreply.github.com>
|
||||
texmex76 <40733439+texmex76@users.noreply.github.com>
|
||||
thement <40525767+thement@users.noreply.github.com>
|
||||
theraininsky <76763719+theraininsky@users.noreply.github.com>
|
||||
thewh1teagle <61390950+thewh1teagle@users.noreply.github.com>
|
||||
tjohnman <tjohnman@users.noreply.github.com>
|
||||
toyer <2042519524@qq.com>
|
||||
tslmy <tslmy@users.noreply.github.com>
|
||||
tv1wnd <55383215+tv1wnd@users.noreply.github.com>
|
||||
ubik2 <ubik2@users.noreply.github.com>
|
||||
uint256_t <konndennsa@gmail.com>
|
||||
uint256_t <maekawatoshiki1017@gmail.com>
|
||||
@@ -1014,6 +1070,7 @@ valiray <133289098+valiray@users.noreply.github.com>
|
||||
vb <vaibhavs10@gmail.com>
|
||||
vik <vikhyatk@gmail.com>
|
||||
viric <viric@viric.name>
|
||||
vmobilis <75476228+vmobilis@users.noreply.github.com>
|
||||
vodkaslime <646329483@qq.com>
|
||||
vvhg1 <94630311+vvhg1@users.noreply.github.com>
|
||||
vxiiduu <73044267+vxiiduu@users.noreply.github.com>
|
||||
@@ -1028,6 +1085,8 @@ wzy <32936898+Freed-Wu@users.noreply.github.com>
|
||||
xaedes <xaedes@gmail.com>
|
||||
xaedes <xaedes@googlemail.com>
|
||||
xctan <axunlei@gmail.com>
|
||||
xiaobing318 <71554036+xiaobing318@users.noreply.github.com>
|
||||
xiaofei <hbuxiaofei@gmail.com>
|
||||
xloem <0xloem@gmail.com>
|
||||
yangli2 <yangli2@gmail.com>
|
||||
ymcki <84055651+ymcki@users.noreply.github.com>
|
||||
CMakeLists.txt

@@ -29,6 +29,8 @@ else()
    set(LLAMA_STANDALONE OFF)
endif()

option(LLAMA_USE_SYSTEM_GGML "Use system libggml" OFF)

if (EMSCRIPTEN)
    set(BUILD_SHARED_LIBS_DEFAULT OFF)

@@ -145,7 +147,13 @@ endif()
# 3rd-party
#

if (NOT TARGET ggml)
if (LLAMA_USE_SYSTEM_GGML)
    message(STATUS "Using system-provided libggml, skipping ggml build")
    find_package(ggml REQUIRED)
    add_library(ggml ALIAS ggml::ggml)
endif()

if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML)
    add_subdirectory(ggml)
    # ... otherwise assume ggml is added by a parent CMakeLists.txt
endif()
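The new LLAMA_USE_SYSTEM_GGML option skips the vendored ggml subdirectory and resolves ggml through find_package instead. A configure sketch, assuming ggml has already been built and installed to a prefix such as /opt/ggml (the prefix path is hypothetical):

```bash
# use an externally installed libggml instead of the bundled ggml/ subproject
cmake -B build -S . \
    -DLLAMA_USE_SYSTEM_GGML=ON \
    -DCMAKE_PREFIX_PATH=/opt/ggml
cmake --build build --config Release -j
```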
Makefile (2 changes)

@@ -836,7 +836,7 @@ ifdef GGML_MUSA
    else
        MUSA_PATH ?= /opt/musa
    endif
    MUSA_ARCHITECTURES ?= 21;22
    MUSA_ARCHITECTURES ?= 21;22;31

    MK_CPPFLAGS += -DGGML_USE_MUSA -DGGML_USE_CUDA
    MK_LDFLAGS += -L$(MUSA_PATH)/lib -Wl,-rpath=$(MUSA_PATH)/lib
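The default MUSA architecture list gains 31. A sketch of a legacy Makefile build that overrides the list explicitly (assumes the MUSA toolkit is installed under the default /opt/musa path):

```bash
# GGML_MUSA=1 selects the MUSA backend in the Makefile build
make GGML_MUSA=1 MUSA_ARCHITECTURES="21;22;31" -j"$(nproc)"
```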
Package.swift (removed)

@@ -1,19 +0,0 @@
// swift-tools-version:5.5

import PackageDescription

let package = Package(
    name: "llama",
    platforms: [
        .macOS(.v12),
        .iOS(.v14),
        .watchOS(.v4),
        .tvOS(.v14)
    ],
    products: [
        .library(name: "llama", targets: ["llama"]),
    ],
    targets: [
        .systemLibrary(name: "llama", pkgConfig: "llama"),
    ]
)
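With the system-library Swift package manifest removed, Apple platforms are expected to consume the prebuilt XCFramework produced by the build-xcframework.sh script added in this comparison. A usage sketch (the output location matches the CI packaging step):

```bash
./build-xcframework.sh
# the bundle ends up under build-apple/ and can be embedded in an Xcode project
ls build-apple/llama.xcframework
```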
README.md

@@ -5,7 +5,7 @@
[](https://opensource.org/licenses/MIT)
[](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml)

[Roadmap](https://github.com/users/ggml-org/projects/7) / [Project status](https://github.com/ggml-org/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml)
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggml-org/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml)

Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++

@@ -25,7 +25,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

- **How to use [MTLResidencySet](https://developer.apple.com/documentation/metal/mtlresidencyset?language=objc) to keep the GPU memory active?** https://github.com/ggml-org/llama.cpp/pull/11427
- **VS Code extension for FIM completions:** https://github.com/ggml-org/llama.vscode
- Universal tool call support in `llama-server`: https://github.com/ggml-org/llama.cpp/pull/9639
- Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
- Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
- Introducing GGUF-my-LoRA https://github.com/ggml-org/llama.cpp/discussions/10123
- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggml-org/llama.cpp/discussions/9669

@@ -157,6 +157,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp)
- Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift)
- Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama)
- Delphi [Embarcadero/llama-cpp-delphi](https://github.com/Embarcadero/llama-cpp-delphi)

</details>

@@ -171,6 +172,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
- [iohub/collama](https://github.com/iohub/coLLaMA) (Apache-2.0)
- [janhq/jan](https://github.com/janhq/jan) (AGPL)
- [johnbean393/Sidekick](https://github.com/johnbean393/Sidekick) (MIT)
- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file) (Apache-2.0)
- [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
- [llama.vim](https://github.com/ggml-org/llama.vim) (MIT)
@@ -1,4 +0,0 @@
#pragma once

#include <llama.h>

@@ -1,5 +0,0 @@
module llama [system] {
    header "llama.h"
    link "llama"
    export *
}
build-xcframework.sh (new executable file, 519 lines)
@@ -0,0 +1,519 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Options
|
||||
IOS_MIN_OS_VERSION=16.4
|
||||
MACOS_MIN_OS_VERSION=13.3
|
||||
VISIONOS_MIN_OS_VERSION=1.0
|
||||
TVOS_MIN_OS_VERSION=16.4
|
||||
|
||||
BUILD_SHARED_LIBS=OFF
|
||||
LLAMA_BUILD_EXAMPLES=OFF
|
||||
LLAMA_BUILD_TESTS=OFF
|
||||
LLAMA_BUILD_SERVER=OFF
|
||||
GGML_METAL=ON
|
||||
GGML_METAL_EMBED_LIBRARY=ON
|
||||
GGML_BLAS_DEFAULT=ON
|
||||
GGML_METAL_USE_BF16=ON
|
||||
GGML_OPENMP=OFF
|
||||
|
||||
COMMON_C_FLAGS="-Wno-macro-redefined -Wno-shorten-64-to-32 -Wno-unused-command-line-argument -g"
|
||||
COMMON_CXX_FLAGS="-Wno-macro-redefined -Wno-shorten-64-to-32 -Wno-unused-command-line-argument -g"
|
||||
|
||||
# Common options for all builds
|
||||
COMMON_CMAKE_ARGS=(
|
||||
-DCMAKE_XCODE_ATTRIBUTE_CODE_SIGNING_REQUIRED=NO
|
||||
-DCMAKE_XCODE_ATTRIBUTE_CODE_SIGN_IDENTITY=""
|
||||
-DCMAKE_XCODE_ATTRIBUTE_CODE_SIGNING_ALLOWED=NO
|
||||
-DCMAKE_XCODE_ATTRIBUTE_DEBUG_INFORMATION_FORMAT="dwarf-with-dsym"
|
||||
-DCMAKE_XCODE_ATTRIBUTE_GCC_GENERATE_DEBUGGING_SYMBOLS=YES
|
||||
-DCMAKE_XCODE_ATTRIBUTE_COPY_PHASE_STRIP=NO
|
||||
-DCMAKE_XCODE_ATTRIBUTE_STRIP_INSTALLED_PRODUCT=NO
|
||||
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||
-DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS}
|
||||
-DLLAMA_BUILD_EXAMPLES=${LLAMA_BUILD_EXAMPLES}
|
||||
-DLLAMA_BUILD_TESTS=${LLAMA_BUILD_TESTS}
|
||||
-DLLAMA_BUILD_SERVER=${LLAMA_BUILD_SERVER}
|
||||
-DGGML_METAL_EMBED_LIBRARY=${GGML_METAL_EMBED_LIBRARY}
|
||||
-DGGML_BLAS_DEFAULT=${GGML_BLAS_DEFAULT}
|
||||
-DGGML_METAL=${GGML_METAL}
|
||||
-DGGML_METAL_USE_BF16=${GGML_METAL_USE_BF16}
|
||||
-DGGML_NATIVE=OFF
|
||||
-DGGML_OPENMP=${GGML_OPENMP}
|
||||
)
|
||||
|
||||
check_required_tool() {
|
||||
local tool=$1
|
||||
local install_message=$2
|
||||
|
||||
if ! command -v $tool &> /dev/null; then
|
||||
echo "Error: $tool is required but not found."
|
||||
echo "$install_message"
|
||||
exit 1
|
||||
fi
|
||||
}
|
||||
echo "Checking for required tools..."
|
||||
check_required_tool "cmake" "Please install CMake 3.28.0 or later (brew install cmake)"
|
||||
check_required_tool "xcodebuild" "Please install Xcode and Xcode Command Line Tools (xcode-select --install)"
|
||||
check_required_tool "libtool" "Please install libtool which should be available with Xcode Command Line Tools (CLT). Make sure Xcode CLT is installed (xcode-select --install)"
|
||||
check_required_tool "dsymutil" "Please install Xcode and Xcode Command Line Tools (xcode-select --install)"
|
||||
|
||||
set -e
|
||||
|
||||
## Clean up previous builds
|
||||
rm -rf build-apple
|
||||
rm -rf build-ios-sim
|
||||
rm -rf build-ios-device
|
||||
rm -rf build-macos
|
||||
rm -rf build-visionos
|
||||
rm -rf build-visionos-sim
|
||||
rm -rf build-tvos-sim
|
||||
rm -rf build-tvos-device
|
||||
|
||||
# Setup the xcframework build directory structure
|
||||
setup_framework_structure() {
|
||||
local build_dir=$1
|
||||
local min_os_version=$2
|
||||
local platform=$3 # "ios", "macos", "visionos", or "tvos"
|
||||
local framework_name="llama"
|
||||
|
||||
echo "Creating ${platform}-style framework structure for ${build_dir}"
|
||||
|
||||
if [[ "$platform" == "macos" ]]; then
|
||||
# macOS versioned structure uses versioned directories
|
||||
mkdir -p ${build_dir}/framework/${framework_name}.framework/Versions/A/Headers
|
||||
mkdir -p ${build_dir}/framework/${framework_name}.framework/Versions/A/Modules
|
||||
mkdir -p ${build_dir}/framework/${framework_name}.framework/Versions/A/Resources
|
||||
|
||||
# Create symbolic links
|
||||
ln -sf A ${build_dir}/framework/${framework_name}.framework/Versions/Current
|
||||
ln -sf Versions/Current/Headers ${build_dir}/framework/${framework_name}.framework/Headers
|
||||
ln -sf Versions/Current/Modules ${build_dir}/framework/${framework_name}.framework/Modules
|
||||
ln -sf Versions/Current/Resources ${build_dir}/framework/${framework_name}.framework/Resources
|
||||
ln -sf Versions/Current/${framework_name} ${build_dir}/framework/${framework_name}.framework/${framework_name}
|
||||
|
||||
# Set header and module paths
|
||||
local header_path=${build_dir}/framework/${framework_name}.framework/Versions/A/Headers/
|
||||
local module_path=${build_dir}/framework/${framework_name}.framework/Versions/A/Modules/
|
||||
else
|
||||
# iOS/VisionOS/tvOS use a flat structure
|
||||
mkdir -p ${build_dir}/framework/${framework_name}.framework/Headers
|
||||
mkdir -p ${build_dir}/framework/${framework_name}.framework/Modules
|
||||
|
||||
# Remove any existing structure to ensure clean build
|
||||
rm -rf ${build_dir}/framework/${framework_name}.framework/Versions
|
||||
|
||||
# Set header and module paths
|
||||
local header_path=${build_dir}/framework/${framework_name}.framework/Headers/
|
||||
local module_path=${build_dir}/framework/${framework_name}.framework/Modules/
|
||||
fi
|
||||
|
||||
# Copy all required headers (common for all platforms)
|
||||
cp include/llama.h ${header_path}
|
||||
cp ggml/include/ggml.h ${header_path}
|
||||
cp ggml/include/ggml-alloc.h ${header_path}
|
||||
cp ggml/include/ggml-backend.h ${header_path}
|
||||
cp ggml/include/ggml-metal.h ${header_path}
|
||||
cp ggml/include/ggml-cpu.h ${header_path}
|
||||
cp ggml/include/ggml-blas.h ${header_path}
|
||||
cp ggml/include/gguf.h ${header_path}
|
||||
|
||||
# Create module map (common for all platforms)
|
||||
cat > ${module_path}module.modulemap << EOF
|
||||
framework module llama {
|
||||
header "llama.h"
|
||||
header "ggml.h"
|
||||
header "ggml-alloc.h"
|
||||
header "ggml-backend.h"
|
||||
header "ggml-metal.h"
|
||||
header "ggml-cpu.h"
|
||||
header "ggml-blas.h"
|
||||
header "gguf.h"
|
||||
|
||||
link "c++"
|
||||
link framework "Accelerate"
|
||||
link framework "Metal"
|
||||
link framework "Foundation"
|
||||
|
||||
export *
|
||||
}
|
||||
EOF
|
||||
|
||||
# Platform-specific settings for Info.plist
|
||||
local platform_name=""
|
||||
local sdk_name=""
|
||||
local supported_platform=""
|
||||
|
||||
case "$platform" in
|
||||
"ios")
|
||||
platform_name="iphoneos"
|
||||
sdk_name="iphoneos${min_os_version}"
|
||||
supported_platform="iPhoneOS"
|
||||
local plist_path="${build_dir}/framework/${framework_name}.framework/Info.plist"
|
||||
local device_family=' <key>UIDeviceFamily</key>
|
||||
<array>
|
||||
<integer>1</integer>
|
||||
<integer>2</integer>
|
||||
</array>'
|
||||
;;
|
||||
"macos")
|
||||
platform_name="macosx"
|
||||
sdk_name="macosx${min_os_version}"
|
||||
supported_platform="MacOSX"
|
||||
local plist_path="${build_dir}/framework/${framework_name}.framework/Versions/A/Resources/Info.plist"
|
||||
local device_family=""
|
||||
;;
|
||||
"visionos")
|
||||
platform_name="xros"
|
||||
sdk_name="xros${min_os_version}"
|
||||
supported_platform="XRPlatform"
|
||||
local plist_path="${build_dir}/framework/${framework_name}.framework/Info.plist"
|
||||
local device_family=""
|
||||
;;
|
||||
"tvos")
|
||||
platform_name="appletvos"
|
||||
sdk_name="appletvos${min_os_version}"
|
||||
supported_platform="AppleTVOS"
|
||||
local plist_path="${build_dir}/framework/${framework_name}.framework/Info.plist"
|
||||
local device_family=' <key>UIDeviceFamily</key>
|
||||
<array>
|
||||
<integer>3</integer>
|
||||
</array>'
|
||||
;;
|
||||
esac
|
||||
|
||||
# Create Info.plist
|
||||
cat > ${plist_path} << EOF
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
|
||||
<plist version="1.0">
|
||||
<dict>
|
||||
<key>CFBundleDevelopmentRegion</key>
|
||||
<string>en</string>
|
||||
<key>CFBundleExecutable</key>
|
||||
<string>llama</string>
|
||||
<key>CFBundleIdentifier</key>
|
||||
<string>org.ggml.llama</string>
|
||||
<key>CFBundleInfoDictionaryVersion</key>
|
||||
<string>6.0</string>
|
||||
<key>CFBundleName</key>
|
||||
<string>llama</string>
|
||||
<key>CFBundlePackageType</key>
|
||||
<string>FMWK</string>
|
||||
<key>CFBundleShortVersionString</key>
|
||||
<string>1.0</string>
|
||||
<key>CFBundleVersion</key>
|
||||
<string>1</string>
|
||||
<key>MinimumOSVersion</key>
|
||||
<string>${min_os_version}</string>
|
||||
<key>CFBundleSupportedPlatforms</key>
|
||||
<array>
|
||||
<string>${supported_platform}</string>
|
||||
</array>${device_family}
|
||||
<key>DTPlatformName</key>
|
||||
<string>${platform_name}</string>
|
||||
<key>DTSDKName</key>
|
||||
<string>${sdk_name}</string>
|
||||
</dict>
|
||||
</plist>
|
||||
EOF
|
||||
}
|
||||
|
||||
# Create dynamic libraries from static libraries.
|
||||
combine_static_libraries() {
|
||||
local build_dir="$1"
|
||||
local release_dir="$2"
|
||||
local platform="$3" # "ios", "macos", "visionos", or "tvos"
|
||||
local is_simulator="$4"
|
||||
local base_dir="$(pwd)"
|
||||
local framework_name="llama"
|
||||
|
||||
# Determine output path based on platform
|
||||
local output_lib=""
|
||||
if [[ "$platform" == "macos" ]]; then
|
||||
# macOS uses versioned structure
|
||||
output_lib="${build_dir}/framework/${framework_name}.framework/Versions/A/${framework_name}"
|
||||
else
|
||||
# iOS, visionOS, and tvOS use a directory flat structure
|
||||
output_lib="${build_dir}/framework/${framework_name}.framework/${framework_name}"
|
||||
fi
|
||||
|
||||
local libs=(
|
||||
"${base_dir}/${build_dir}/src/${release_dir}/libllama.a"
|
||||
"${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml.a"
|
||||
"${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml-base.a"
|
||||
"${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml-cpu.a"
|
||||
"${base_dir}/${build_dir}/ggml/src/ggml-metal/${release_dir}/libggml-metal.a"
|
||||
"${base_dir}/${build_dir}/ggml/src/ggml-blas/${release_dir}/libggml-blas.a"
|
||||
)
|
||||
|
||||
# Create temporary directory for processing
|
||||
local temp_dir="${base_dir}/${build_dir}/temp"
|
||||
mkdir -p "${temp_dir}"
|
||||
|
||||
# Since we have multiple architectures libtool will find object files that do not
|
||||
# match the target architecture. We suppress these warnings.
|
||||
libtool -static -o "${temp_dir}/combined.a" "${libs[@]}" 2> /dev/null
|
||||
|
||||
# Determine SDK, architectures, and install_name based on platform and simulator flag.
|
||||
local sdk=""
|
||||
local archs=""
|
||||
local min_version_flag=""
|
||||
local install_name=""
|
||||
|
||||
case "$platform" in
|
||||
"ios")
|
||||
if [[ "$is_simulator" == "true" ]]; then
|
||||
sdk="iphonesimulator"
|
||||
archs="arm64 x86_64"
|
||||
min_version_flag="-mios-simulator-version-min=${IOS_MIN_OS_VERSION}"
|
||||
else
|
||||
sdk="iphoneos"
|
||||
archs="arm64"
|
||||
min_version_flag="-mios-version-min=${IOS_MIN_OS_VERSION}"
|
||||
fi
|
||||
install_name="@rpath/llama.framework/llama"
|
||||
;;
|
||||
"macos")
|
||||
sdk="macosx"
|
||||
archs="arm64 x86_64"
|
||||
min_version_flag="-mmacosx-version-min=${MACOS_MIN_OS_VERSION}"
|
||||
install_name="@rpath/llama.framework/Versions/Current/llama"
|
||||
;;
|
||||
"visionos")
|
||||
if [[ "$is_simulator" == "true" ]]; then
|
||||
sdk="xrsimulator"
|
||||
archs="arm64 x86_64"
|
||||
min_version_flag="-mtargetos=xros${VISIONOS_MIN_OS_VERSION}-simulator"
|
||||
else
|
||||
sdk="xros"
|
||||
archs="arm64"
|
||||
min_version_flag="-mtargetos=xros${VISIONOS_MIN_OS_VERSION}"
|
||||
fi
|
||||
# Use flat structure for visionOS, same as iOS
|
||||
install_name="@rpath/llama.framework/llama"
|
||||
;;
|
||||
"tvos")
|
||||
if [[ "$is_simulator" == "true" ]]; then
|
||||
sdk="appletvsimulator"
|
||||
archs="arm64 x86_64"
|
||||
min_version_flag="-mtvos-simulator-version-min=${TVOS_MIN_OS_VERSION}"
|
||||
else
|
||||
sdk="appletvos"
|
||||
archs="arm64"
|
||||
min_version_flag="-mtvos-version-min=${TVOS_MIN_OS_VERSION}"
|
||||
fi
|
||||
install_name="@rpath/llama.framework/llama"
|
||||
;;
|
||||
esac
|
||||
|
||||
# Build architecture flags
|
||||
local arch_flags=""
|
||||
for arch in $archs; do
|
||||
arch_flags+=" -arch $arch"
|
||||
done
|
||||
|
||||
# Create dynamic library
|
||||
echo "Creating dynamic library for ${platform}."
|
||||
xcrun -sdk $sdk clang++ -dynamiclib \
|
||||
-isysroot $(xcrun --sdk $sdk --show-sdk-path) \
|
||||
$arch_flags \
|
||||
$min_version_flag \
|
||||
-Wl,-force_load,"${temp_dir}/combined.a" \
|
||||
-framework Foundation -framework Metal -framework Accelerate \
|
||||
-install_name "$install_name" \
|
||||
-o "${base_dir}/${output_lib}"
|
||||
|
||||
# Platform-specific post-processing for device builds
|
||||
if [[ "$is_simulator" == "false" ]]; then
|
||||
if command -v vtool &>/dev/null; then
|
||||
case "$platform" in
|
||||
"ios")
|
||||
echo "Marking binary as a framework binary for iOS..."
|
||||
vtool -set-build-version ios ${IOS_MIN_OS_VERSION} ${IOS_MIN_OS_VERSION} -replace \
|
||||
-output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}"
|
||||
;;
|
||||
"visionos")
|
||||
echo "Marking binary as a framework binary for visionOS..."
|
||||
vtool -set-build-version xros ${VISIONOS_MIN_OS_VERSION} ${VISIONOS_MIN_OS_VERSION} -replace \
|
||||
-output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}"
|
||||
;;
|
||||
"tvos")
|
||||
echo "Marking binary as a framework binary for tvOS..."
|
||||
vtool -set-build-version tvos ${TVOS_MIN_OS_VERSION} ${TVOS_MIN_OS_VERSION} -replace \
|
||||
-output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}"
|
||||
;;
|
||||
esac
|
||||
else
|
||||
echo "Warning: vtool not found. Binary may not pass App Store validation."
|
||||
fi
|
||||
fi
|
||||
|
||||
echo "Creating properly formatted dSYM..."
|
||||
# Create a separate directory for dSYMs for all platforms
|
||||
mkdir -p "${base_dir}/${build_dir}/dSYMs"
|
||||
|
||||
# iOS and visionOS style dSYM (flat structure)
|
||||
if [[ "$platform" == "ios" || "$platform" == "visionos" || "$platform" == "tvos" ]]; then
|
||||
# Generate dSYM in the dSYMs directory
|
||||
xcrun dsymutil "${base_dir}/${output_lib}" -o "${base_dir}/${build_dir}/dSYMs/llama.dSYM"
|
||||
|
||||
# Create a copy of the binary that will be stripped
|
||||
cp "${base_dir}/${output_lib}" "${temp_dir}/binary_to_strip"
|
||||
|
||||
# Strip debug symbols from the copy
|
||||
xcrun strip -S "${temp_dir}/binary_to_strip" -o "${temp_dir}/stripped_lib"
|
||||
|
||||
# Replace the original with the stripped version
|
||||
mv "${temp_dir}/stripped_lib" "${base_dir}/${output_lib}"
|
||||
else
|
||||
# macOS style dSYM
|
||||
# First strip debug info to a separate file
|
||||
xcrun strip -S "${base_dir}/${output_lib}" -o "${temp_dir}/stripped_lib"
|
||||
|
||||
# Generate dSYM in the dSYMs directory
|
||||
xcrun dsymutil "${base_dir}/${output_lib}" -o "${base_dir}/${build_dir}/dSYMs/llama.dSYM"
|
||||
|
||||
# Replace original binary with stripped version
|
||||
mv "${temp_dir}/stripped_lib" "${base_dir}/${output_lib}"
|
||||
fi
|
||||
|
||||
# Remove any automatically generated dSYM files in the framework structure as they will
|
||||
# otherwise cause Invalid Bundle Structure validation errors.
|
||||
if [ -d "${base_dir}/${output_lib}.dSYM" ]; then
|
||||
echo "Removing generated dSYM file in framework structure: ${base_dir}/${output_lib}.dSYM"
|
||||
rm -rf "${base_dir}/${output_lib}.dSYM"
|
||||
fi
|
||||
|
||||
# Clean up
|
||||
rm -rf "${temp_dir}"
|
||||
}
|
||||
|
||||
echo "Building for iOS simulator..."
|
||||
cmake -B build-ios-sim -G Xcode \
|
||||
"${COMMON_CMAKE_ARGS[@]}" \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=${IOS_MIN_OS_VERSION} \
|
||||
-DIOS=ON \
|
||||
-DCMAKE_SYSTEM_NAME=iOS \
|
||||
-DCMAKE_OSX_SYSROOT=iphonesimulator \
|
||||
-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphonesimulator \
|
||||
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
|
||||
-S .
|
||||
cmake --build build-ios-sim --config Release -- -quiet
|
||||
|
||||
echo "Building for iOS devices..."
|
||||
cmake -B build-ios-device -G Xcode \
|
||||
"${COMMON_CMAKE_ARGS[@]}" \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=${IOS_MIN_OS_VERSION} \
|
||||
-DCMAKE_OSX_SYSROOT=iphoneos \
|
||||
-DCMAKE_OSX_ARCHITECTURES="arm64" \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphoneos \
|
||||
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
|
||||
-S .
|
||||
cmake --build build-ios-device --config Release -- -quiet
|
||||
|
||||
echo "Building for macOS..."
|
||||
cmake -B build-macos -G Xcode \
|
||||
"${COMMON_CMAKE_ARGS[@]}" \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=${MACOS_MIN_OS_VERSION} \
|
||||
-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \
|
||||
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
|
||||
-S .
|
||||
cmake --build build-macos --config Release -- -quiet
|
||||
|
||||
echo "Building for visionOS..."
|
||||
cmake -B build-visionos -G Xcode \
|
||||
"${COMMON_CMAKE_ARGS[@]}" \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=${VISIONOS_MIN_OS_VERSION} \
|
||||
-DCMAKE_OSX_ARCHITECTURES="arm64" \
|
||||
-DCMAKE_SYSTEM_NAME=visionOS \
|
||||
-DCMAKE_OSX_SYSROOT=xros \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xros \
|
||||
-DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_CXX_FLAGS}" \
|
||||
-S .
|
||||
cmake --build build-visionos --config Release -- -quiet
|
||||
|
||||
echo "Building for visionOS simulator..."
|
||||
cmake -B build-visionos-sim -G Xcode \
|
||||
"${COMMON_CMAKE_ARGS[@]}" \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=${VISIONOS_MIN_OS_VERSION} \
|
||||
-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \
|
||||
-DCMAKE_SYSTEM_NAME=visionOS \
|
||||
-DCMAKE_OSX_SYSROOT=xrsimulator \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xrsimulator \
|
||||
-DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_CXX_FLAGS}" \
|
||||
-S .
|
||||
cmake --build build-visionos-sim --config Release -- -quiet
|
||||
|
||||
# Add tvOS builds (might need the same u_int definitions as watchOS and visionOS)
|
||||
echo "Building for tvOS simulator..."
|
||||
cmake -B build-tvos-sim -G Xcode \
|
||||
"${COMMON_CMAKE_ARGS[@]}" \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=${TVOS_MIN_OS_VERSION} \
|
||||
-DCMAKE_SYSTEM_NAME=tvOS \
|
||||
-DCMAKE_OSX_SYSROOT=appletvsimulator \
|
||||
-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \
|
||||
-DGGML_METAL=ON \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvsimulator \
|
||||
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
|
||||
-S .
|
||||
cmake --build build-tvos-sim --config Release -- -quiet
|
||||
|
||||
echo "Building for tvOS devices..."
|
||||
cmake -B build-tvos-device -G Xcode \
|
||||
"${COMMON_CMAKE_ARGS[@]}" \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=${TVOS_MIN_OS_VERSION} \
|
||||
-DCMAKE_SYSTEM_NAME=tvOS \
|
||||
-DCMAKE_OSX_SYSROOT=appletvos \
|
||||
-DCMAKE_OSX_ARCHITECTURES="arm64" \
|
||||
-DGGML_METAL=ON \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvos \
|
||||
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
|
||||
-S .
|
||||
cmake --build build-tvos-device --config Release -- -quiet
|
||||
|
||||
# Setup frameworks and copy binaries and headers
|
||||
echo "Setting up framework structures..."
|
||||
setup_framework_structure "build-ios-sim" ${IOS_MIN_OS_VERSION} "ios"
|
||||
setup_framework_structure "build-ios-device" ${IOS_MIN_OS_VERSION} "ios"
|
||||
setup_framework_structure "build-macos" ${MACOS_MIN_OS_VERSION} "macos"
|
||||
setup_framework_structure "build-visionos" ${VISIONOS_MIN_OS_VERSION} "visionos"
|
||||
setup_framework_structure "build-visionos-sim" ${VISIONOS_MIN_OS_VERSION} "visionos"
|
||||
setup_framework_structure "build-tvos-sim" ${TVOS_MIN_OS_VERSION} "tvos"
|
||||
setup_framework_structure "build-tvos-device" ${TVOS_MIN_OS_VERSION} "tvos"
|
||||
|
||||
# Create dynamic libraries from static libraries
|
||||
echo "Creating dynamic libraries from static libraries..."
|
||||
combine_static_libraries "build-ios-sim" "Release-iphonesimulator" "ios" "true"
|
||||
combine_static_libraries "build-ios-device" "Release-iphoneos" "ios" "false"
|
||||
combine_static_libraries "build-macos" "Release" "macos" "false"
|
||||
combine_static_libraries "build-visionos" "Release-xros" "visionos" "false"
|
||||
combine_static_libraries "build-visionos-sim" "Release-xrsimulator" "visionos" "true"
|
||||
combine_static_libraries "build-tvos-sim" "Release-appletvsimulator" "tvos" "true"
|
||||
combine_static_libraries "build-tvos-device" "Release-appletvos" "tvos" "false"
|
||||
|
||||
# Create XCFramework with correct debug symbols paths
|
||||
echo "Creating XCFramework..."
|
||||
xcodebuild -create-xcframework \
|
||||
-framework $(pwd)/build-ios-sim/framework/llama.framework \
|
||||
-debug-symbols $(pwd)/build-ios-sim/dSYMs/llama.dSYM \
|
||||
-framework $(pwd)/build-ios-device/framework/llama.framework \
|
||||
-debug-symbols $(pwd)/build-ios-device/dSYMs/llama.dSYM \
|
||||
-framework $(pwd)/build-macos/framework/llama.framework \
|
||||
-debug-symbols $(pwd)/build-macos/dSYMs/llama.dSYM \
|
||||
-framework $(pwd)/build-visionos/framework/llama.framework \
|
||||
-debug-symbols $(pwd)/build-visionos/dSYMs/llama.dSYM \
|
||||
-framework $(pwd)/build-visionos-sim/framework/llama.framework \
|
||||
-debug-symbols $(pwd)/build-visionos-sim/dSYMs/llama.dSYM \
|
||||
-framework $(pwd)/build-tvos-device/framework/llama.framework \
|
||||
-debug-symbols $(pwd)/build-tvos-device/dSYMs/llama.dSYM \
|
||||
-framework $(pwd)/build-tvos-sim/framework/llama.framework \
|
||||
-debug-symbols $(pwd)/build-tvos-sim/dSYMs/llama.dSYM \
|
||||
-output $(pwd)/build-apple/llama.xcframework
|
||||
ci/run.sh

@@ -352,10 +352,10 @@ function gg_run_open_llama_7b_v2 {

    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

    (time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 10 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
    (time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
    (time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
    (time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

function check_ppl {
    qnt="$1"
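The ci/run.sh fix restores the space between the binary name and --model. Run outside CI, the corrected invocation looks like this (the model path is a placeholder):

```bash
./build/bin/llama-save-load-state --model models/ggml-model-q4_0.gguf -ngl 99 -c 0 -fa
```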
cmake/common.cmake

@@ -1,3 +1,5 @@
include("ggml/cmake/common.cmake")

function(llama_add_compile_flags)
    if (LLAMA_FATAL_WARNINGS)
        if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
common/arg.cpp

@@ -764,7 +764,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_env("LLAMA_ARG_CTX_SIZE"));
    add_opt(common_arg(
        {"-n", "--predict", "--n-predict"}, "N",
        string_format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict),
        string_format(
            ex == LLAMA_EXAMPLE_MAIN || ex == LLAMA_EXAMPLE_INFILL
                ? "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)"
                : "number of tokens to predict (default: %d, -1 = infinity)",
            params.n_predict),
        [](common_params & params, int value) {
            params.n_predict = value;
        }

@@ -849,6 +853,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            }
        }
    ).set_excludes({LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"-sysf", "--system-prompt-file"}, "FNAME",
        "a file containing the system prompt (default: none)",
        [](common_params & params, const std::string & value) {
            std::ifstream file(value);
            if (!file) {
                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
            }
            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.system_prompt));
            if (!params.system_prompt.empty() && params.system_prompt.back() == '\n') {
                params.system_prompt.pop_back();
            }
        }
    ).set_examples({LLAMA_EXAMPLE_MAIN}));
add_opt(common_arg(
|
||||
{"--in-file"}, "FNAME",
|
||||
"an input file (repeat to specify multiple files)",
|
||||
@@ -949,6 +967,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
||||
add_opt(common_arg(
|
||||
{"-st", "--single-turn"},
|
||||
"run conversation for a single turn only, then exit when done\n"
|
||||
"will not be interactive if first turn is predefined with --prompt\n"
|
||||
"(default: false)",
|
||||
[](common_params & params) {
|
||||
params.single_turn = true;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
||||
add_opt(common_arg(
|
||||
{"-i", "--interactive"},
|
||||
string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"),
|
||||
@@ -1858,18 +1885,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
).set_examples({LLAMA_EXAMPLE_PASSKEY}));
|
||||
add_opt(common_arg(
|
||||
{"-o", "--output", "--output-file"}, "FNAME",
|
||||
string_format("output file (default: '%s')",
|
||||
ex == LLAMA_EXAMPLE_EXPORT_LORA
|
||||
? params.lora_outfile.c_str()
|
||||
: ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR
|
||||
? params.cvector_outfile.c_str()
|
||||
: params.out_file.c_str()),
|
||||
string_format("output file (default: '%s')", params.out_file.c_str()),
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.out_file = value;
|
||||
params.cvector_outfile = value;
|
||||
params.lora_outfile = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA}));
|
||||
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS}));
|
||||
add_opt(common_arg(
|
||||
{"-ofreq", "--output-frequency"}, "N",
|
||||
string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
|
||||
@@ -2562,5 +2582,43 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||
|
||||
add_opt(common_arg(
|
||||
{"--fim-qwen-7b-spec"},
|
||||
string_format("use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
|
||||
[](common_params & params) {
|
||||
params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
|
||||
params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
|
||||
params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
|
||||
params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
|
||||
params.speculative.n_gpu_layers = 99;
|
||||
params.port = 8012;
|
||||
params.n_gpu_layers = 99;
|
||||
params.flash_attn = true;
|
||||
params.n_ubatch = 1024;
|
||||
params.n_batch = 1024;
|
||||
params.n_ctx = 0;
|
||||
params.n_cache_reuse = 256;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||
|
||||
add_opt(common_arg(
|
||||
{"--fim-qwen-14b-spec"},
|
||||
string_format("use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
|
||||
[](common_params & params) {
|
||||
params.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF";
|
||||
params.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
|
||||
params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
|
||||
params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
|
||||
params.speculative.n_gpu_layers = 99;
|
||||
params.port = 8012;
|
||||
params.n_gpu_layers = 99;
|
||||
params.flash_attn = true;
|
||||
params.n_ubatch = 1024;
|
||||
params.n_batch = 1024;
|
||||
params.n_ctx = 0;
|
||||
params.n_cache_reuse = 256;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||
|
||||
return ctx_arg;
|
||||
}
|
||||
|
||||
common/chat.cpp
@@ -60,7 +60,9 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
|
||||
}
|
||||
msg.role = message.at("role");
|
||||
|
||||
if (message.contains("content")) {
|
||||
auto has_content = message.contains("content");
|
||||
auto has_tool_calls = message.contains("tool_calls");
|
||||
if (has_content) {
|
||||
const auto & content = message.at("content");
|
||||
if (content.is_string()) {
|
||||
msg.content = content;
|
||||
@@ -81,19 +83,8 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
|
||||
} else if (!content.is_null()) {
|
||||
throw std::runtime_error("Invalid 'content' type: expected string or array, got " + content.dump() + " (ref: https://github.com/ggml-org/llama.cpp/issues/8367)");
|
||||
}
|
||||
} else {
|
||||
throw std::runtime_error("Expected 'content' (ref: https://github.com/ggml-org/llama.cpp/issues/8367)");
|
||||
}
|
||||
if (message.contains("reasoning_content")) {
|
||||
msg.reasoning_content = message.at("reasoning_content");
|
||||
}
|
||||
if (message.contains("name")) {
|
||||
msg.tool_name = message.at("name");
|
||||
}
|
||||
if (message.contains("tool_call_id")) {
|
||||
msg.tool_call_id = message.at("tool_call_id");
|
||||
}
|
||||
if (message.contains("tool_calls")) {
|
||||
if (has_tool_calls) {
|
||||
for (const auto & tool_call : message.at("tool_calls")) {
|
||||
common_chat_tool_call tc;
|
||||
if (!tool_call.contains("type")) {
|
||||
@@ -118,6 +109,18 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
|
||||
msg.tool_calls.push_back(tc);
|
||||
}
|
||||
}
|
||||
if (!has_content && !has_tool_calls) {
|
||||
throw std::runtime_error("Expected 'content' or 'tool_calls' (ref: https://github.com/ggml-org/llama.cpp/issues/8367 & https://github.com/ggml-org/llama.cpp/issues/12279)");
|
||||
}
|
||||
if (message.contains("reasoning_content")) {
|
||||
msg.reasoning_content = message.at("reasoning_content");
|
||||
}
|
||||
if (message.contains("name")) {
|
||||
msg.tool_name = message.at("name");
|
||||
}
|
||||
if (message.contains("tool_call_id")) {
|
||||
msg.tool_call_id = message.at("tool_call_id");
|
||||
}
|
||||
|
||||
msgs.push_back(msg);
|
||||
}
|
||||
@@ -442,6 +445,7 @@ std::string common_chat_format_name(common_chat_format format) {
|
||||
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2: return "Functionary v3.2";
|
||||
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
|
||||
case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
|
||||
case COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING: return "Hermes 2 Pro (extract reasoning)";
|
||||
case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
|
||||
case COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING: return "Command R7B (extract reasoning)";
|
||||
default:
|
||||
@@ -449,12 +453,6 @@ std::string common_chat_format_name(common_chat_format format) {
|
||||
}
|
||||
}
|
||||
|
||||
const common_grammar_options grammar_options {
|
||||
/* .dotall = */ false,
|
||||
/* .compact_spaces = */ false,
|
||||
// /* .compact_spaces = */ true,
|
||||
};
|
||||
|
||||
static bool parse_json(std::string::const_iterator & it, const std::string::const_iterator & end, json & out) {
|
||||
// // https://json.nlohmann.me/features/parsing/sax_interface/
|
||||
struct json_error_locator : public nlohmann::json_sax<json> {
|
||||
@@ -500,6 +498,34 @@ static bool parse_json(std::string::const_iterator & it, const std::string::cons
|
||||
}
|
||||
}
|
||||
|
||||
static bool parse_literal(std::string::const_iterator & it, const std::string::const_iterator & end, const std::string & expected) {
auto expected_it = expected.begin();
auto tmp_it = it;
while (tmp_it != end && expected_it != expected.end() && *tmp_it == *expected_it) {
++tmp_it;
++expected_it;
}
if (expected_it == expected.end()) {
it = tmp_it;
return true;
}
return false;
}

static std::optional<std::smatch> parse_pattern(std::string::const_iterator & it, const std::string::const_iterator & end, const std::regex & expected) {
std::smatch match;
if (std::regex_match(it, end, match, expected)) {
it = match.suffix().first;
return match;
}
return std::nullopt;
}

static void consume_spaces(std::string::const_iterator & it, const std::string::const_iterator & end) {
while (it != end && std::isspace(*it)) {
++it;
}
}

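The cursor helpers above (parse_literal, parse_pattern, consume_spaces) are the primitives the format-specific parsers below are built from: each one advances a string iterator only when its expectation matches, and leaves it untouched otherwise. A minimal standalone sketch of the same idea, using simplified copies of two of the helpers; the input string and the "all\n" header are illustrative only, loosely modelled on the Functionary v3.2 parser further down:

#include <cctype>
#include <iostream>
#include <string>

// Simplified copies of the cursor helpers shown in the diff above.
static bool parse_literal(std::string::const_iterator & it, const std::string::const_iterator & end, const std::string & expected) {
    auto expected_it = expected.begin();
    auto tmp_it = it;
    while (tmp_it != end && expected_it != expected.end() && *tmp_it == *expected_it) {
        ++tmp_it;
        ++expected_it;
    }
    if (expected_it != expected.end()) {
        return false;  // partial match: leave the cursor untouched
    }
    it = tmp_it;       // full match: commit the cursor
    return true;
}

static void consume_spaces(std::string::const_iterator & it, const std::string::const_iterator & end) {
    while (it != end && std::isspace(static_cast<unsigned char>(*it))) {
        ++it;
    }
}

int main() {
    const std::string input = "  all\nplain text response";
    auto it        = input.cbegin();
    const auto end = input.cend();

    consume_spaces(it, end);
    if (parse_literal(it, end, "all\n")) {
        // the cursor now points past the header, at the plain content
        std::cout << std::string(it, end) << "\n";  // prints: plain text response
    }
    return 0;
}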
/**
|
||||
* Takes a prefix regex that must have 1 group to capture the function name, a closing suffix, and expects json parameters in between.
|
||||
@@ -509,7 +535,8 @@ static common_chat_msg parse_json_tool_calls(
|
||||
const std::string& input,
|
||||
const std::optional<std::regex> & trigger_opt,
|
||||
const std::regex & function_regex,
|
||||
const std::regex & close_regex) {
|
||||
const std::regex & close_regex,
|
||||
bool allow_raw_python = false) {
|
||||
std::smatch match;
|
||||
|
||||
common_chat_msg result;
|
||||
@@ -540,14 +567,19 @@ static common_chat_msg parse_json_tool_calls(
|
||||
it = rit->suffix().first;
|
||||
|
||||
json arguments;
|
||||
if (!parse_json(it, end, arguments)) {
|
||||
if (parse_json(it, end, arguments)) {
|
||||
if (!std::regex_search(it, end, match, close_regex)) {
|
||||
throw std::runtime_error("Malformed input, missing closing pattern: " + input);
|
||||
}
|
||||
it = match.suffix().first;
|
||||
result.tool_calls.push_back({name, arguments.is_string() ? arguments.get<std::string>() : arguments.dump(), /* id= */ ""});
|
||||
} else {
|
||||
if (allow_raw_python && name == "python") {
|
||||
result.tool_calls.push_back({name, json({{"code", std::string(it, end)}}).dump(), /* id= */ ""});
|
||||
break;
|
||||
}
|
||||
throw std::runtime_error("Failed to parse json tool call arguments: " + input);
|
||||
}
|
||||
if (!std::regex_search(it, end, match, close_regex)) {
|
||||
throw std::runtime_error("Malformed input, missing closing pattern: " + input);
|
||||
}
|
||||
it = match.suffix().first;
|
||||
result.tool_calls.push_back({name, arguments.is_string() ? arguments.get<std::string>() : arguments.dump(), /* id= */ ""});
|
||||
}
|
||||
|
||||
if (!result.tool_calls.empty()) {
|
||||
@@ -559,29 +591,29 @@ static common_chat_msg parse_json_tool_calls(
|
||||
return result;
|
||||
}
|
||||
|
||||
static common_chat_tool_call process_tool_call(const json & tool_call) {
|
||||
const auto & arguments = tool_call.at("arguments");
|
||||
return {
|
||||
/* .name = */ tool_call.at("name"),
|
||||
/* .arguments = */ arguments.is_string() ? arguments.get<std::string>() : arguments.dump(),
|
||||
/* .id = */ tool_call.contains("id") ? tool_call.at("id") : "",
|
||||
};
|
||||
}
|
||||
static common_chat_msg parse_prefixed_json_tool_call_array(const std::string& input, const std::string & prefix, size_t rstrip_prefix = 0) {
|
||||
auto content_end = input.find(prefix);
|
||||
size_t tc_start = std::string::npos;
|
||||
|
||||
common_chat_msg result;
|
||||
result.role = "assistant";
|
||||
const auto process_tool_calls = [&](const json & tool_calls) {
|
||||
for (const auto & tool_call : tool_calls) {
|
||||
const auto & arguments = tool_call.at("arguments");
|
||||
result.tool_calls.push_back({
|
||||
tool_call.at("name"),
|
||||
arguments.is_string() ? arguments.get<std::string>() : arguments.dump(),
|
||||
tool_call.contains("id") ? tool_call.at("id") : "",
|
||||
});
|
||||
}
|
||||
};
|
||||
if (content_end == std::string::npos) {
|
||||
result.content = input;
|
||||
} else {
|
||||
tc_start = content_end + prefix.size() - rstrip_prefix;
|
||||
result.content = input.substr(0, content_end);
|
||||
auto tool_calls = json::parse(input.substr(tc_start));
|
||||
process_tool_calls(tool_calls);
|
||||
for (const auto & tool_call : tool_calls) {
|
||||
result.tool_calls.emplace_back(process_tool_call(tool_call));
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
@@ -700,7 +732,7 @@ static common_chat_params common_chat_params_init_generic(const common_chat_temp
|
||||
data.grammar_lazy = false;
|
||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
||||
builder.add_schema("root", schema);
|
||||
}, grammar_options);
|
||||
});
|
||||
|
||||
auto tweaked_messages = common_chat_template::add_system(
|
||||
inputs.messages,
|
||||
@@ -770,8 +802,11 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat
|
||||
schema["maxItems"] = 1;
|
||||
}
|
||||
builder.add_rule("root", "\"[TOOL_CALLS]\" " + builder.add_schema("tool_calls", schema));
|
||||
}, grammar_options);
|
||||
data.grammar_triggers.push_back({"[TOOL_CALLS]", /* .at_start = */ true});
|
||||
});
|
||||
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "[TOOL_CALLS]"});
|
||||
data.preserved_tokens = {
|
||||
"[TOOL_CALLS]",
|
||||
};
|
||||
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
|
||||
data.format = COMMON_CHAT_FORMAT_MISTRAL_NEMO;
|
||||
return data;
|
||||
@@ -813,14 +848,18 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_
|
||||
schema["maxItems"] = 1;
|
||||
}
|
||||
builder.add_rule("root", "\"<|START_ACTION|>\" " + builder.add_schema("tool_calls", schema) + " \"<|END_ACTION|>\"");
|
||||
}, grammar_options);
|
||||
data.grammar_triggers.push_back({"<|START_ACTION|>", /* .at_start = */ false});
|
||||
});
|
||||
data.grammar_triggers.push_back({
|
||||
COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
|
||||
"<|START_ACTION|>",
|
||||
});
|
||||
data.preserved_tokens = {
|
||||
"<|START_ACTION|>",
|
||||
"<|END_ACTION|>",
|
||||
"<|START_RESPONSE|>",
|
||||
"<|END_RESPONSE|>",
|
||||
"<|START_THINKING|>",
|
||||
"<|END_THINKING|>",
|
||||
"<|END_ACTION|>",
|
||||
};
|
||||
auto adjusted_messages = json::array();
|
||||
for (const auto & msg : inputs.messages) {
|
||||
@@ -840,9 +879,9 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_
|
||||
return data;
|
||||
}
|
||||
static common_chat_msg common_chat_parse_command_r7b(const std::string & input, bool extract_reasoning) {
|
||||
static std::regex thought_regex("(<\\|START_THINKING\\|>([\\s\\S\\n\\r]*?)<\\|END_THINKING\\|>)([\\s\\S\\n\\r]*)");
|
||||
static std::regex action_regex("<\\|START_ACTION\\|>([\\s\\S\\n\\r]*?)<\\|END_ACTION\\|>");
|
||||
static std::regex response_regex("(?:<\\|START_RESPONSE\\|>)?([\\s\\S\\n\\r]*?)<\\|END_RESPONSE\\|>");
|
||||
static const std::regex thought_regex("(<\\|START_THINKING\\|>([\\s\\S]*?)<\\|END_THINKING\\|>)([\\s\\S]*)");
|
||||
static const std::regex action_regex("<\\|START_ACTION\\|>([\\s\\S]*?)<\\|END_ACTION\\|>");
|
||||
static const std::regex response_regex("(?:<\\|START_RESPONSE\\|>)?([\\s\\S]*?)<\\|END_RESPONSE\\|>");
|
||||
|
||||
std::smatch match;
|
||||
|
||||
@@ -945,23 +984,23 @@ static common_chat_params common_chat_params_init_llama_3_1_tool_calls(const com
|
||||
builder.add_rule(
|
||||
name + "-call",
|
||||
"\"{\" space "
|
||||
"( \"\\\"type\\\":\" space \"\\\"function\\\",\" space )? "
|
||||
"\"\\\"name\\\": \\\"" + name + "\\\", \\\"parameters\\\": \" " +
|
||||
builder.add_schema(name + "-args", parameters) +
|
||||
" \"}\""));
|
||||
data.grammar_triggers.push_back({"{\"name\": \"" + name + "\"", /* .at_start = */ true});
|
||||
"( \"\\\"type\\\"\" space \":\" space \"\\\"function\\\"\" space \",\" space )? "
|
||||
" \"\\\"name\\\"\" space \":\" space \"\\\"" + name + "\\\"\" space \",\" space "
|
||||
" \"\\\"parameters\\\"\" space \":\" space " + builder.add_schema(name + "-args", parameters) + " "
|
||||
"\"}\" space"));
|
||||
});
|
||||
// Small models may hallucinate function names so we match anything (*at the start*) that looks like the JSON of a function call, regardless of the name.
|
||||
data.grammar_triggers.push_back({
|
||||
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
|
||||
"\\{\\s*(?:\"type\"\\s*:\\s*\"function\"\\s*,\\s*)?\"name\"\\s*:\\s*\"", // + name + "\"[\\s\\S]*",
|
||||
});
|
||||
data.grammar_triggers.push_back({"{\"name\":", /* .at_start = */ true});
|
||||
data.grammar_triggers.push_back({"{\n \"name\":", /* .at_start = */ true});
|
||||
data.grammar_triggers.push_back({"{\n \"name\":", /* .at_start = */ true});
|
||||
data.grammar_triggers.push_back({"{\"type\": \"function\"", /* .at_start = */ true});
|
||||
data.grammar_triggers.push_back({"{\n \"type\": \"function\"", /* .at_start = */ true});
|
||||
data.grammar_triggers.push_back({"{\n \"type\": \"function\"", /* .at_start = */ true});
|
||||
if (!builtin_tools.empty()) {
|
||||
data.grammar_triggers.push_back({"<|python_tag|>", /* .at_start = */ false});
|
||||
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|python_tag|>"});
|
||||
data.preserved_tokens.push_back("<|python_tag|>");
|
||||
}
|
||||
// Allow a few empty lines on top of the usual constrained json schema space rule.
|
||||
builder.add_rule("root", string_join(tool_rules, " | "));
|
||||
}, grammar_options);
|
||||
});
|
||||
data.additional_stops.push_back("<|eom_id|>");
|
||||
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {
|
||||
{"tools_in_user_message", false},
|
||||
@@ -974,33 +1013,33 @@ static common_chat_params common_chat_params_init_llama_3_1_tool_calls(const com
|
||||
}
|
||||
static common_chat_msg common_chat_parse_llama_3_1(const std::string & input, bool with_builtin_tools = false) {
|
||||
// TODO: tighten & simplify the parser, don't accept leading text context.
|
||||
static std::regex function_regex("\\{[\\s\\n\\r]*(?:\"type\"[\\s\\n\\r]*:[\\s\\n\\r]*\"function\"[\\s\\n\\r]*,[\\s\\n\\r]*|[\\s\\n\\r]*)\"name\"[\\s\\n\\r]*:[\\s\\n\\r]*\"([^\"]+)\"[\\s\\n\\r]*,[\\s\\n\\r]*\"parameters\": ");
|
||||
static std::regex close_regex("\\}");
|
||||
static std::regex builtin_call_regex("<\\|python_tag\\|>([^.(]+)\\.call\\((.*)\\)");
|
||||
static const std::regex function_regex(
|
||||
"\\s*\\{\\s*(?:\"type\"\\s*:\\s*\"function\"\\s*,\\s*)?\"name\"\\s*:\\s*\"([^\"]+)\"\\s*,\\s*\"parameters\"\\s*: ");
|
||||
static const std::regex close_regex("\\}\\s*");
|
||||
static const std::regex builtin_call_regex("<\\|python_tag\\|>\\s*([^.(]+)\\s*\\.\\s*call\\s*\\(\\s*([\\w]+)\\s*=\\s*([\\s\\S]*?)\\)");
|
||||
|
||||
if (with_builtin_tools) {
|
||||
std::smatch match;
|
||||
if (std::regex_match(input, match, builtin_call_regex)) {
|
||||
auto name = match[1].str();
|
||||
auto raw_args = match[2].str();
|
||||
try {
|
||||
auto name = match[1].str();
|
||||
auto arg_name = match[2].str();
|
||||
auto arg_value_str = match[3].str();
|
||||
auto arg_value = json::parse(arg_value_str);
|
||||
|
||||
// TODO: if/when builtin tools start accepting more than 1 argument, use parse_json for real parsing.
|
||||
auto it_eq = raw_args.find('=');
|
||||
auto arg_name = raw_args.substr(0, it_eq);
|
||||
auto arg_value_str = raw_args.substr(it_eq + 1);
|
||||
auto arg_value = json::parse(arg_value_str);
|
||||
|
||||
common_chat_msg msg;
|
||||
msg.role = "assistant";
|
||||
msg.content = match.prefix().str();
|
||||
msg.tool_calls.push_back({
|
||||
/* .name = */ name,
|
||||
/* .arguments = */ (json {
|
||||
{arg_name, arg_value},
|
||||
}).dump(),
|
||||
/* .id = */ "",
|
||||
});
|
||||
return msg;
|
||||
common_chat_msg msg;
|
||||
msg.role = "assistant";
|
||||
msg.tool_calls.push_back({
|
||||
/* .name = */ name,
|
||||
/* .arguments = */ (json {
|
||||
{arg_name, arg_value},
|
||||
}).dump(),
|
||||
/* .id = */ "",
|
||||
});
|
||||
return msg;
|
||||
} catch (const std::exception & e) {
|
||||
LOG_WRN("Failed to parse builtin tool call arguments (%s): %s", e.what(), input.c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex);
|
||||
@@ -1017,10 +1056,10 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_
|
||||
std::string name = function.at("name");
|
||||
auto parameters = function.at("parameters");
|
||||
builder.resolve_refs(parameters);
|
||||
auto args_rule = builder.add_schema(name + "-args", parameters);
|
||||
tool_rules.push_back(builder.add_rule(name + "-call",
|
||||
"\"<|tool▁call▁begin|>function<|tool▁sep|>" + name + "\\n"
|
||||
"```json\\n\" " + args_rule + " \"```<|tool▁call▁end|>\""));
|
||||
"```json\\n\" " + builder.add_schema(name + "-args", parameters) + " "
|
||||
"\"```<|tool▁call▁end|>\""));
|
||||
});
|
||||
// Distill Qwen 7B & 32B models seem confused re/ syntax of their tool call opening tag,
|
||||
// so we accept common variants (then it's all constrained)
|
||||
@@ -1029,18 +1068,20 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_
|
||||
"(" + string_join(tool_rules, " | ") + ")" + (inputs.parallel_tool_calls ? "*" : "") + " "
|
||||
"\"<|tool▁calls▁end|>\""
|
||||
" space");
|
||||
data.grammar_triggers.push_back({"<|tool▁calls▁begin|>", /* .at_start = */ false});
|
||||
data.grammar_triggers.push_back({"<|tool_calls_begin|>", /* .at_start = */ false});
|
||||
data.grammar_triggers.push_back({"<|tool calls begin|>", /* .at_start = */ false});
|
||||
data.grammar_triggers.push_back({"<|tool\\_calls\\_begin|>", /* .at_start = */ false});
|
||||
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|tool▁calls▁begin|>"});
|
||||
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|tool_calls_begin|>"});
|
||||
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|tool calls begin|>"});
|
||||
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|tool\\_calls\\_begin|>"});
|
||||
data.preserved_tokens = {
|
||||
"<think>",
|
||||
"</think>",
|
||||
"<|tool▁calls▁begin|>",
|
||||
"<|tool▁call▁begin|>",
|
||||
"<|tool▁sep|>",
|
||||
"<|tool▁calls▁end|",
|
||||
"<|tool▁call▁end|>",
|
||||
"<|tool▁calls▁end|",
|
||||
};
|
||||
}, grammar_options);
|
||||
});
|
||||
}
|
||||
auto prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
|
||||
|
||||
@@ -1065,34 +1106,42 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_
|
||||
data.format = inputs.extract_reasoning ? COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING : COMMON_CHAT_FORMAT_DEEPSEEK_R1;
|
||||
return data;
|
||||
}
|
||||
static common_chat_msg common_chat_parse_deepseek_r1(const std::string & input, bool extract_reasoning) {
|
||||
static std::regex function_regex("<|tool▁call▁begin|>function<|tool▁sep|>([^\n]+)\n```json\n");
|
||||
static std::regex close_regex("```[\\s\\r\\n]*<|tool▁call▁end|>");
|
||||
static std::regex reasoning_content_regex("((?:<think>)?([\\s\\S\\r\\n]*?)</think>)?([\\s\\S\\r\\n]*)");
|
||||
static std::regex tool_calls_regex("[\\s\\r\\n]*(?:<|tool▁calls▁begin|>|<|tool_calls_begin|>|<|tool calls begin|>|<|tool\\\\_calls\\\\_begin|>)([\\s\\S\\r\\n]*?)<|tool▁calls▁end|>");
|
||||
common_chat_msg msg;
|
||||
msg.role = "assistant";
|
||||
static common_chat_msg handle_think_tag_prelude(const std::string & input, bool extract_reasoning, const std::function<common_chat_msg(const std::string &)> & rest_parser) {
|
||||
std::smatch match;
|
||||
static const std::regex reasoning_content_regex("((?:<think>)?([\\s\\S\\r\\n]*?)</think>)?([\\s\\S\\r\\n]*)");
|
||||
if (std::regex_match(input, match, reasoning_content_regex)) {
|
||||
std::string rest;
|
||||
auto rest = match[3].str();
|
||||
auto msg = rest_parser(rest);
|
||||
auto reasoning_content = string_strip(match[2].str());
|
||||
if (extract_reasoning) {
|
||||
msg.reasoning_content = string_strip(match[2].str());
|
||||
} else {
|
||||
msg.content = match[1].str();
|
||||
msg.reasoning_content = reasoning_content;
|
||||
} else if (!reasoning_content.empty()) {
|
||||
std::ostringstream content;
|
||||
content << "<think>" << reasoning_content << "</think>" << msg.content;
|
||||
msg.content = content.str();
|
||||
}
|
||||
rest = match[3].str();
|
||||
return msg;
|
||||
}
|
||||
return rest_parser(input);
|
||||
}
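handle_think_tag_prelude factors out the <think>...</think> handling that the DeepSeek R1 parser (and, further down, the Hermes 2 Pro parser) now share: the optional reasoning prelude is split off, the remainder is handed to the format-specific rest_parser, and the reasoning either lands in reasoning_content or is re-prefixed onto the content. A standalone sketch of just the splitting step, using a regex of the same shape as reasoning_content_regex; the input is illustrative, and the real function also covers the non-extract and missing-prelude cases:

#include <iostream>
#include <regex>
#include <string>

int main() {
    // Same shape as reasoning_content_regex above: optional <think>...</think>
    // prelude (group 2), then the rest of the message (group 3).
    static const std::regex reasoning_content_regex("((?:<think>)?([\\s\\S]*?)</think>)?([\\s\\S]*)");

    const std::string input = "<think>need the current weather first</think>{\"name\": \"get_weather\"}";

    std::smatch match;
    if (std::regex_match(input, match, reasoning_content_regex)) {
        std::cout << "reasoning: " << match[2].str() << "\n";  // need the current weather first
        std::cout << "rest     : " << match[3].str() << "\n";  // {"name": "get_weather"} -> goes to rest_parser
    }
    return 0;
}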
|
||||
static common_chat_msg common_chat_parse_deepseek_r1(const std::string & input, bool extract_reasoning) {
|
||||
return handle_think_tag_prelude(input, extract_reasoning, [](const std::string & input) {
|
||||
static const std::regex function_regex("<|tool▁call▁begin|>function<|tool▁sep|>([^\n]+)\n```json\n");
|
||||
static const std::regex close_regex("```[\\s\\r\\n]*<|tool▁call▁end|>");
|
||||
static const std::regex tool_calls_regex("[\\s\\r\\n]*(?:<|tool▁calls▁begin|>|<|tool_calls_begin|>|<|tool calls begin|>|<|tool\\\\_calls\\\\_begin|>)([\\s\\S\\r\\n]*?)<|tool▁calls▁end|>");
|
||||
|
||||
if (std::regex_search(rest, match, tool_calls_regex)) {
|
||||
common_chat_msg msg;
|
||||
msg.role = "assistant";
|
||||
std::smatch match;
|
||||
if (std::regex_search(input, match, tool_calls_regex)) {
|
||||
auto tool_calls = match[1].str();
|
||||
auto msg2 = parse_json_tool_calls(tool_calls, std::nullopt, function_regex, close_regex);
|
||||
msg.tool_calls = std::move(msg2.tool_calls);
|
||||
} else {
|
||||
msg.content += std::string(rest.begin() + rest.find_first_not_of(" \r\n"), rest.end());
|
||||
msg.content = input;
|
||||
}
|
||||
} else {
|
||||
msg.content = input;
|
||||
}
|
||||
return msg;
|
||||
return msg;
|
||||
});
|
||||
}
|
||||
|
||||
static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
|
||||
@@ -1129,8 +1178,11 @@ static common_chat_params common_chat_params_init_firefunction_v2(const common_c
|
||||
schema["maxItems"] = 1;
|
||||
}
|
||||
builder.add_rule("root", "\" functools\"? " + builder.add_schema("tool_calls", schema));
|
||||
}, grammar_options);
|
||||
data.grammar_triggers.push_back({" functools[", /* .at_start = */ false});
|
||||
});
|
||||
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, " functools["});
|
||||
data.preserved_tokens = {
|
||||
" functools[",
|
||||
};
|
||||
data.format = COMMON_CHAT_FORMAT_FIREFUNCTION_V2;
|
||||
} else {
|
||||
data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
|
||||
@@ -1158,11 +1210,28 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
|
||||
auto parameters = function.at("parameters");
|
||||
builder.resolve_refs(parameters);
|
||||
auto args_rule = builder.add_schema(name + "-args", parameters);
|
||||
first_tool_rules.push_back(builder.add_rule(name + "-call", "\"" + name + "\\n\" " + args_rule));
|
||||
first_tool_rules.push_back(builder.add_rule(name + "-call", "( \"assistant<|end_header_id|>\\n\" )? \"" + name + "\\n\" " + args_rule));
|
||||
subsequent_tool_rules.push_back(builder.add_rule(name + "-call2", "\">>>" + name + "\\n\" " + args_rule));
|
||||
data.grammar_triggers.push_back({name, /* .at_start = */ true});
|
||||
data.grammar_triggers.push_back({">>>" + name, /* .at_start = */ false});
|
||||
data.grammar_triggers.push_back({
|
||||
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
|
||||
regex_escape(name + "\n"),
|
||||
});
|
||||
data.grammar_triggers.push_back({
|
||||
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
|
||||
regex_escape("assistant<|end_header_id|>\n" + name + "\n"),
|
||||
});
|
||||
data.grammar_triggers.push_back({
|
||||
COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
|
||||
regex_escape(">>>" + name + "\n"),
|
||||
});
|
||||
data.grammar_triggers.push_back({
|
||||
COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
|
||||
">>>assistant<|end_header_id|>\n" + name,
|
||||
});
|
||||
});
|
||||
data.preserved_tokens = {
|
||||
"<|end_header_id|>",
|
||||
};
|
||||
auto first_rule = first_tool_rules.empty() ? "" : builder.add_rule("first_tool_call", string_join(first_tool_rules, " | ")) + " space";
|
||||
if (inputs.parallel_tool_calls) {
|
||||
auto subsequent_rule = builder.add_rule("subsequent_tool_call", string_join(subsequent_tool_rules, " | ")) + " space";
|
||||
@@ -1171,34 +1240,20 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
|
||||
builder.add_rule("root", first_rule);
|
||||
}
|
||||
|
||||
}, grammar_options);
|
||||
});
|
||||
}
|
||||
return data;
|
||||
}
|
||||
|
||||
static bool consume(std::string::const_iterator & it, const std::string::const_iterator & end, const std::string & expected) {
|
||||
auto expected_it = expected.begin();
|
||||
auto tmp_it = it;
|
||||
while (tmp_it != end && expected_it != expected.end() && *tmp_it == *expected_it) {
|
||||
++tmp_it;
|
||||
++expected_it;
|
||||
}
|
||||
if (expected_it == expected.end()) {
|
||||
it = tmp_it;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
static common_chat_msg common_chat_parse_functionary_v3_2(const std::string & input) {
|
||||
static std::regex function_regex(R"((?:>>>)?(\w+)\n)");
|
||||
static std::regex close_regex(R"($|(?=>>>))");
|
||||
static const std::regex function_regex(R"((?:>>>)?(?:assistant<|end_header_id|>\n)?(\w+)\n)");
|
||||
static const std::regex close_regex(R"($|(?=>>>))");
|
||||
|
||||
std::string content;
|
||||
auto it = input.begin();
|
||||
const auto end = input.end();
|
||||
|
||||
if (consume(it, end, "all\n")) {
|
||||
if (parse_literal(it, end, "all\n")) {
|
||||
std::smatch match;
|
||||
if (std::regex_search(it, end, match, function_regex)) {
|
||||
auto fun_it = match.prefix().second;
|
||||
@@ -1213,7 +1268,7 @@ static common_chat_msg common_chat_parse_functionary_v3_2(const std::string & in
|
||||
}
|
||||
// TODO: tighten & simplify.
|
||||
try {
|
||||
auto res = parse_json_tool_calls(std::string(it, end), std::nullopt, function_regex, close_regex);
|
||||
auto res = parse_json_tool_calls(std::string(it, end), std::nullopt, function_regex, close_regex, /* allow_raw_python= */ true);
|
||||
res.content = content + res.content;
|
||||
return res;
|
||||
} catch (const std::exception & e) {
|
||||
@@ -1266,12 +1321,13 @@ static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(con
|
||||
});
|
||||
if (has_raw_python) {
|
||||
tool_rules.push_back(builder.add_rule("python-call", "\"<|python_tag|>\" .*"));
|
||||
data.grammar_triggers.push_back({"<|python_tag|>", /* .at_start = */ false});
|
||||
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|python_tag|>"});
|
||||
data.preserved_tokens.push_back("<|python_tag|>");
|
||||
}
|
||||
auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | ")) + " space";
|
||||
builder.add_rule("root", inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call);
|
||||
data.grammar_triggers.push_back({"<function=", /* .at_start = */ false});
|
||||
}, grammar_options);
|
||||
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<function="});
|
||||
});
|
||||
|
||||
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
|
||||
// TODO: if (has_raw_python)
|
||||
@@ -1280,7 +1336,7 @@ static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(con
|
||||
}
|
||||
static common_chat_msg common_chat_parse_functionary_v3_1_llama_3_1(const std::string & input) {
|
||||
// This version of Functionary still supports the llama 3.1 tool call format for the python tool.
|
||||
static std::regex python_tag_regex(R"(<\|python_tag\|>([\s\S\n]*)$)");
|
||||
static const std::regex python_tag_regex(R"(<\|python_tag\|>([\s\S\n]*)$)");
|
||||
std::smatch match;
|
||||
if (std::regex_search(input, match, python_tag_regex)) {
|
||||
auto code = match[1].str();
|
||||
@@ -1294,8 +1350,8 @@ static common_chat_msg common_chat_parse_functionary_v3_1_llama_3_1(const std::s
|
||||
});
|
||||
return msg;
|
||||
}
|
||||
static std::regex function_regex(R"(<function=(\w+)>)");
|
||||
static std::regex close_regex(R"(</function>)");
|
||||
static const std::regex function_regex(R"(<function=(\w+)>)");
|
||||
static const std::regex close_regex(R"(</function>)");
|
||||
// TODO: tighten & simplify.
|
||||
return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex);
|
||||
}
|
||||
@@ -1306,6 +1362,7 @@ static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat
|
||||
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
|
||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
||||
std::vector<std::string> tool_rules;
|
||||
std::vector<std::string> tool_call_alts;
|
||||
foreach_function(inputs.tools, [&](const json & tool) {
|
||||
const auto & function = tool.at("function");
|
||||
std::string name = function.at("name");
|
||||
@@ -1319,68 +1376,187 @@ static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat
|
||||
}},
|
||||
{"required", json::array({"name", "arguments"})},
|
||||
}));
|
||||
tool_call_alts.push_back(builder.add_rule(
|
||||
name + "-function-tag",
|
||||
"\"<function\" ( \"=" + name + "\" | \" name=\\\"" + name + "\\\"\" ) \">\" space " +
|
||||
builder.add_schema(name + "-args", parameters) + " "
|
||||
"\"</function>\" space"));
|
||||
|
||||
data.grammar_triggers.push_back({
|
||||
COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
|
||||
"<function=" + name + ">",
|
||||
});
|
||||
auto escaped_name = regex_escape(name);
|
||||
data.grammar_triggers.push_back({
|
||||
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
|
||||
"<function\\s+name\\s*=\\s*\"" + escaped_name + "\"",
|
||||
});
|
||||
});
|
||||
auto tool_call = "\"<tool_call>\" space " + builder.add_rule("tool_call", string_join(tool_rules, " | ")) + " \"</tool_call>\" space";
|
||||
auto any_tool_call = builder.add_rule("any_tool_call", "( " + string_join(tool_rules, " | ") + " ) space");
|
||||
std::vector<std::string> alt_tags {
|
||||
any_tool_call,
|
||||
"\"<tool_call>\" space " + any_tool_call + " \"</tool_call>\"",
|
||||
// The rest is just to accommodate common "good bad" outputs.
|
||||
"\"<function_call>\" space " + any_tool_call + " \"</function_call>\"",
|
||||
"\"<response>\" space " + any_tool_call + " \"</response>\"",
|
||||
"\"<tools>\" space " + any_tool_call + " \"</tools>\"",
|
||||
"\"<json>\" space " + any_tool_call + " \"</json>\"",
|
||||
"\"<xml>\" space " + any_tool_call + " \"</xml>\"",
|
||||
"\"<JSON>\" space " + any_tool_call + " \"</JSON>\"",
|
||||
};
|
||||
auto wrappable_tool_call = builder.add_rule("wrappable_tool_call", "( " + string_join(alt_tags, " | ") + " ) space");
|
||||
tool_call_alts.push_back(wrappable_tool_call);
|
||||
tool_call_alts.push_back(
|
||||
"( \"```\\n\" | \"```json\\n\" | \"```xml\\n\" ) space " + wrappable_tool_call + " space \"```\" space ");
|
||||
auto tool_call = builder.add_rule("tool_call", string_join(tool_call_alts, " | "));
|
||||
builder.add_rule("root", inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call);
|
||||
data.grammar_triggers.push_back({"<tool_call>", /* .at_start = */ false});
|
||||
data.preserved_tokens = { "</tool_call>" };
|
||||
}, grammar_options);
|
||||
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<tool_call>"});
|
||||
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<function"});
|
||||
// Trigger on some common known "good bad" outputs (only from the start and with a json that's about a specific argument name to avoid false positives)
|
||||
data.grammar_triggers.push_back({
|
||||
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
|
||||
"(?:```(?:json|xml)?\n\\s*)?(?:<function_call>|<tools>|<xml><json>|<response>)?\\s*\\{\\s*\"", //name\"\\s*:\\s*\"" + escaped_name + "\"",
|
||||
});
|
||||
data.preserved_tokens = {
|
||||
"<think>",
|
||||
"</think>",
|
||||
"<tool_call>",
|
||||
"</tool_call>",
|
||||
"<function",
|
||||
"<tools>",
|
||||
"</tools>",
|
||||
"<response>",
|
||||
"</response>",
|
||||
"<function_call>",
|
||||
"</function_call>",
|
||||
"<json>",
|
||||
"</json>",
|
||||
"<JSON>",
|
||||
"</JSON>",
|
||||
"```",
|
||||
"```json",
|
||||
"```xml",
|
||||
};
|
||||
});
|
||||
|
||||
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
|
||||
data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO;
|
||||
data.format = inputs.extract_reasoning ? COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING : COMMON_CHAT_FORMAT_HERMES_2_PRO;
|
||||
return data;
|
||||
}
|
||||
static common_chat_msg common_chat_parse_hermes_2_pro(const std::string & input) {
|
||||
try {
|
||||
std::regex start_pattern(R"([\n\s]*<tool_call>)");
|
||||
std::regex middle_pattern(R"([\n\s]*</tool_call>[\n\s]*<tool_call>)");
|
||||
std::regex end_pattern(R"([\n\s]*</tool_call>[\n\s]*$)");
|
||||
static common_chat_msg common_chat_parse_hermes_2_pro(const std::string& input, bool extract_reasoning) {
|
||||
return handle_think_tag_prelude(input, extract_reasoning, [](const std::string & input) {
|
||||
static const std::regex open_regex(
|
||||
"(?:"
|
||||
"(```(?:xml|json)?\\n\\s*)?" // match 1 (block_start)
|
||||
"(<tool_call>" // match 2 (open_tag)
|
||||
"|<function_call>"
|
||||
"|<tool>"
|
||||
"|<tools>"
|
||||
"|<response>"
|
||||
"|<json>"
|
||||
"|<xml>"
|
||||
"|<JSON>"
|
||||
")?"
|
||||
"(\\s*\\{\\s*\"name\"\\s*:[\\s\\S]*)" // match 3 (named tool call + rest)
|
||||
")"
|
||||
"|"
|
||||
"(?:<function=([^>]+)>" // match 4 (function name)
|
||||
"|<function name=\"([^\"]+)\">)" // match 5 (function name again)
|
||||
"([\\s\\S]*)" // match 6 (function arguments + rest)})"
|
||||
);
|
||||
|
||||
common_chat_msg msg;
|
||||
msg.role = "assistant";
|
||||
try {
|
||||
common_chat_msg msg;
|
||||
msg.role = "assistant";
|
||||
|
||||
auto end = input.end();
|
||||
std::sregex_iterator rend;
|
||||
std::sregex_iterator rit(input.begin(), end, start_pattern);
|
||||
if (rit == rend) {
|
||||
std::string::const_iterator it = input.begin();
|
||||
const std::string::const_iterator end = input.end();
|
||||
std::smatch match;
|
||||
|
||||
while (it != end) {
|
||||
if (std::regex_search(it, end, match, open_regex)) {
|
||||
// Add content before the match
|
||||
msg.content += std::string(it, match[0].first);
|
||||
|
||||
auto block_start = match[1].str();
|
||||
std::string block_end = block_start.empty() ? "" : "```";
|
||||
|
||||
auto open_tag = match[2].str();
|
||||
std::string close_tag;
|
||||
|
||||
if (match[3].matched) {
|
||||
close_tag = open_tag.empty() ? "" : "</" + open_tag.substr(1);
|
||||
auto json_it = match[3].first;
|
||||
json tool_call;
|
||||
if (parse_json(json_it, end, tool_call) && tool_call.contains("name") && tool_call.contains("arguments")) {
|
||||
|
||||
msg.tool_calls.emplace_back(process_tool_call(tool_call));
|
||||
it = json_it; // Move iterator past parsed JSON
|
||||
|
||||
// Handle close tags
|
||||
consume_spaces(it, end);
|
||||
if (!close_tag.empty() && !parse_literal(it, end, close_tag)) {
|
||||
throw std::runtime_error("Failed to parse closing tag");
|
||||
}
|
||||
consume_spaces(it, end);
|
||||
if (!block_end.empty() && !parse_literal(it, end, block_end)) {
|
||||
throw std::runtime_error("Failed to parse block end");
|
||||
}
|
||||
consume_spaces(it, end);
|
||||
} else {
|
||||
// Not a valid tool call, treat as content
|
||||
msg.content += std::string(match[0].first, match[0].second);
|
||||
it = match[0].second;
|
||||
}
|
||||
} else {
|
||||
auto function_name = match[4].str();
|
||||
if (function_name.empty()) {
|
||||
function_name = match[5].str();
|
||||
}
|
||||
GGML_ASSERT(!function_name.empty());
|
||||
|
||||
close_tag = "</function>";
|
||||
// Start parsing from after the opening tags
|
||||
auto json_it = match[6].first;
|
||||
json arguments;
|
||||
if (parse_json(json_it, end, arguments)) {
|
||||
msg.tool_calls.emplace_back(process_tool_call({
|
||||
{"name", function_name},
|
||||
{"arguments", arguments},
|
||||
}));
|
||||
it = json_it; // Move iterator past parsed JSON
|
||||
|
||||
// Handle close tags
|
||||
consume_spaces(it, end);
|
||||
if (!close_tag.empty() && !parse_literal(it, end, close_tag)) {
|
||||
throw std::runtime_error("Failed to parse closing tag");
|
||||
}
|
||||
consume_spaces(it, end);
|
||||
if (!block_end.empty() && !parse_literal(it, end, block_end)) {
|
||||
throw std::runtime_error("Failed to parse block end");
|
||||
}
|
||||
consume_spaces(it, end);
|
||||
} else {
|
||||
// Not a valid tool call, treat as content
|
||||
msg.content += std::string(match[0].first, match[0].second);
|
||||
it = match[0].second;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Add remaining content
|
||||
msg.content += std::string(it, end);
|
||||
break;
|
||||
}
|
||||
}
|
||||
return msg;
|
||||
} catch (const std::exception & e) {
|
||||
LOG_ERR("Failed to parse hermes 2 pro input: %s\n", e.what());
|
||||
common_chat_msg msg;
|
||||
msg.role = "assistant";
|
||||
msg.content = input;
|
||||
return msg;
|
||||
}
|
||||
|
||||
msg.content = rit->prefix();
|
||||
|
||||
auto it = rit->suffix().first;
|
||||
while (it != end) {
|
||||
json call;
|
||||
if (!parse_json(it, end, call)) {
|
||||
throw std::runtime_error("Failed to parse json tool call");
|
||||
}
|
||||
const auto & arguments = call.at("arguments");
|
||||
msg.tool_calls.push_back({
|
||||
call.at("name"),
|
||||
arguments.dump(),
|
||||
// arguments.is_string() ? arguments.get<std::string>() : arguments.dump(),
|
||||
/* id= */ "",
|
||||
});
|
||||
rit = {it, end, middle_pattern};
|
||||
if (rit != rend) {
|
||||
it = rit->suffix().first;
|
||||
} else {
|
||||
rit = {it, end, end_pattern};
|
||||
if (rit == rend) {
|
||||
throw std::runtime_error("Malformed input, missing </tool_call>");
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
return msg;
|
||||
} catch (const std::exception & e) {
|
||||
LOG_ERR("Failed to parse hermes 2 pro input: %s\n", e.what());
|
||||
common_chat_msg msg;
|
||||
msg.role = "assistant";
|
||||
msg.content = input;
|
||||
return msg;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
|
||||
@@ -1445,6 +1621,11 @@ static common_chat_params common_chat_templates_apply_jinja(
|
||||
return common_chat_params_init_command_r7b(tmpl, params);
|
||||
}
|
||||
|
||||
// Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
|
||||
if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null()) {
|
||||
return common_chat_params_init_hermes_2_pro(tmpl, params);
|
||||
}
|
||||
|
||||
// Use generic handler when mixing tools + JSON schema.
|
||||
// TODO: support that mix in handlers below.
|
||||
if ((params.tools.is_array() && params.json_schema.is_object())) {
|
||||
@@ -1466,11 +1647,6 @@ static common_chat_params common_chat_templates_apply_jinja(
|
||||
return common_chat_params_init_without_tools(tmpl, params);
|
||||
}
|
||||
|
||||
// Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
|
||||
if (src.find("<tool_call>") != std::string::npos) {
|
||||
return common_chat_params_init_hermes_2_pro(tmpl, params);
|
||||
}
|
||||
|
||||
// Functionary v3.1 (w/ tools)
|
||||
if (src.find("<|start_header_id|>") != std::string::npos
|
||||
&& src.find("<function=") != std::string::npos) {
|
||||
@@ -1588,7 +1764,9 @@ common_chat_msg common_chat_parse(const std::string & input, common_chat_format
|
||||
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1:
|
||||
return common_chat_parse_functionary_v3_1_llama_3_1(input);
|
||||
case COMMON_CHAT_FORMAT_HERMES_2_PRO:
|
||||
return common_chat_parse_hermes_2_pro(input);
|
||||
return common_chat_parse_hermes_2_pro(input, /* extract_reasoning= */ false);
|
||||
case COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING:
|
||||
return common_chat_parse_hermes_2_pro(input, /* extract_reasoning= */ true);
|
||||
case COMMON_CHAT_FORMAT_FIREFUNCTION_V2:
|
||||
return common_chat_parse_firefunction_v2(input);
|
||||
case COMMON_CHAT_FORMAT_COMMAND_R7B:
|
||||
|
||||
@@ -53,6 +53,7 @@ enum common_chat_format {
|
||||
COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
|
||||
COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
|
||||
COMMON_CHAT_FORMAT_HERMES_2_PRO,
|
||||
COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING,
|
||||
COMMON_CHAT_FORMAT_COMMAND_R7B,
|
||||
COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING,
|
||||
|
||||
|
||||
@@ -10,7 +10,6 @@
|
||||
// Change JSON_ASSERT from assert() to GGML_ASSERT:
|
||||
#define JSON_ASSERT GGML_ASSERT
|
||||
#include "json.hpp"
|
||||
#include "json-schema-to-grammar.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <algorithm>
|
||||
@@ -483,6 +482,11 @@ void string_replace_all(std::string & s, const std::string & search, const std::
|
||||
s = std::move(builder);
|
||||
}
|
||||
|
||||
std::string regex_escape(const std::string & s) {
|
||||
static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
|
||||
return std::regex_replace(s, special_chars, "\\$0");
|
||||
}
|
||||
|
||||
std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
|
||||
std::ostringstream result;
|
||||
for (size_t i = 0; i < values.size(); ++i) {
|
||||
@@ -951,8 +955,8 @@ struct common_init_result common_init_from_params(common_params & params) {
|
||||
return iparams;
|
||||
}
|
||||
|
||||
if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
|
||||
LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__);
|
||||
if (params.ctx_shift && !llama_kv_self_can_shift(lctx)) {
|
||||
LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
|
||||
params.ctx_shift = false;
|
||||
}
|
||||
|
||||
@@ -1029,6 +1033,8 @@ struct common_init_result common_init_from_params(common_params & params) {
|
||||
if (params.warmup) {
|
||||
LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
|
||||
|
||||
llama_set_warmup(lctx, true);
|
||||
|
||||
std::vector<llama_token> tmp;
|
||||
llama_token bos = llama_vocab_bos(vocab);
|
||||
llama_token eos = llama_vocab_eos(vocab);
|
||||
@@ -1056,9 +1062,10 @@ struct common_init_result common_init_from_params(common_params & params) {
|
||||
if (llama_model_has_decoder(model)) {
|
||||
llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
|
||||
}
|
||||
llama_kv_cache_clear(lctx);
|
||||
llama_kv_self_clear(lctx);
|
||||
llama_synchronize(lctx);
|
||||
llama_perf_context_reset(lctx);
|
||||
llama_set_warmup(lctx, false);
|
||||
}
|
||||
|
||||
iparams.model.reset(model);
|
||||
@@ -2026,3 +2033,25 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
|
||||
return result;
|
||||
}
|
||||
|
||||
template <>
|
||||
json common_grammar_trigger::to_json() const {
|
||||
json out {
|
||||
{"type", (int) type},
|
||||
{"value", value},
|
||||
};
|
||||
if (type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
|
||||
out["token"] = (int) token;
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
template <>
|
||||
common_grammar_trigger common_grammar_trigger::from_json(const json & in) {
|
||||
common_grammar_trigger out;
|
||||
out.type = (common_grammar_trigger_type) in.at("type").get<int>();
|
||||
out.value = in.at("value").get<std::string>();
|
||||
if (out.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
|
||||
out.token = (llama_token) in.at("token").get<int>();
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
@@ -110,9 +110,21 @@ enum common_conversation_mode {
COMMON_CONVERSATION_MODE_AUTO = 2,
};

enum common_grammar_trigger_type {
COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN,
COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
};

struct common_grammar_trigger {
std::string word;
bool at_start;
common_grammar_trigger_type type;
std::string value;
llama_token token = LLAMA_TOKEN_NULL;

// T can only be nlohmann::ordered_json
template <class T> T to_json() const;
template <class T> static common_grammar_trigger from_json(const T & in);
};
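The struct now carries an explicit trigger type plus a value (and an optional token) instead of the old word/at_start pair, which is what lets the chat handlers above register literal words, free patterns and start-anchored patterns uniformly. A standalone sketch of how a call site migrates; the enum and struct here are simplified local copies for illustration, not the header itself:

#include <string>
#include <vector>

// Simplified local copies of the types above (a plain int stands in for llama_token here).
enum common_grammar_trigger_type {
    COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN,
    COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
};

struct common_grammar_trigger {
    common_grammar_trigger_type type;
    std::string value;  // literal word or regex pattern, depending on type
    int token = -1;     // only used for TOKEN triggers
};

int main() {
    std::vector<common_grammar_trigger> triggers;

    // was: {"<tool_call>", /* .at_start = */ false}
    triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<tool_call>"});

    // was: {"{\"name\": ...", /* .at_start = */ true} -- now a start-anchored pattern
    triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START, "\\{\\s*\"name\"\\s*:\\s*\""});

    return triggers.empty() ? 1 : 0;
}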
|
||||
// sampling parameters
|
||||
@@ -163,8 +175,7 @@ struct common_params_sampling {
|
||||
|
||||
std::string grammar; // optional BNF-like grammar to constrain sampling
|
||||
bool grammar_lazy = false;
|
||||
std::vector<common_grammar_trigger> grammar_trigger_words; // optional trigger words to trigger lazy grammar
|
||||
std::vector<llama_token> grammar_trigger_tokens; // optional trigger tokens to trigger lazy grammar and print trigger special tokens.
|
||||
std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
|
||||
std::set<llama_token> preserved_tokens;
|
||||
|
||||
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
|
||||
@@ -328,6 +339,8 @@ struct common_params {
|
||||
bool warmup = true; // warmup run
|
||||
bool check_tensors = false; // validate tensor data
|
||||
|
||||
bool single_turn = false; // single turn chat conversation
|
||||
|
||||
ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
|
||||
ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
|
||||
|
||||
@@ -394,8 +407,6 @@ struct common_params {
|
||||
int32_t i_pos = -1; // position of the passkey in the junk text
|
||||
|
||||
// imatrix params
|
||||
std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
|
||||
|
||||
int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
|
||||
int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
|
||||
int32_t i_chunk = 0; // start processing from this chunk
|
||||
@@ -407,16 +418,16 @@ struct common_params {
|
||||
int n_pca_batch = 100;
|
||||
int n_pca_iterations = 1000;
|
||||
dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
|
||||
std::string cvector_outfile = "control_vector.gguf";
|
||||
std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
|
||||
std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
|
||||
|
||||
bool spm_infill = false; // suffix/prefix/middle pattern for infill
|
||||
|
||||
std::string lora_outfile = "ggml-lora-merged-f16.gguf";
|
||||
|
||||
// batched-bench params
|
||||
bool batched_bench_output_jsonl = false;
|
||||
|
||||
// common params
|
||||
std::string out_file; // output filename for all example programs
|
||||
};
|
||||
|
||||
// call once at the start of a program if it uses libcommon
|
||||
@@ -456,6 +467,8 @@ std::string string_repeat(const std::string & str, size_t n);
|
||||
|
||||
void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
|
||||
|
||||
std::string regex_escape(const std::string & s);
|
||||
|
||||
template<class T>
|
||||
static std::vector<T> string_split(const std::string & str, char delim) {
|
||||
static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");
|
||||
|
||||
@@ -264,7 +264,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
|
||||
throw std::runtime_error("At least one of min_value or max_value must be set");
|
||||
}
|
||||
|
||||
const std::string SPACE_RULE = "| \" \" | \"\\n\" [ \\t]{0,20}";
|
||||
const std::string SPACE_RULE = "| \" \" | \"\\n\"{1,2} [ \\t]{0,20}";
|
||||
|
||||
struct BuiltinRule {
|
||||
std::string content;
|
||||
@@ -764,11 +764,10 @@ private:
|
||||
public:
|
||||
SchemaConverter(
|
||||
const std::function<json(const std::string &)> & fetch_json,
|
||||
bool dotall,
|
||||
bool compact_spaces)
|
||||
bool dotall)
|
||||
: _fetch_json(fetch_json), _dotall(dotall)
|
||||
{
|
||||
_rules["space"] = compact_spaces ? "\" \"?" : SPACE_RULE;
|
||||
_rules["space"] = SPACE_RULE;
|
||||
}
|
||||
|
||||
void resolve_refs(json & schema, const std::string & url) {
|
||||
@@ -1007,7 +1006,7 @@ std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
|
||||
}
|
||||
|
||||
std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options) {
|
||||
SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall, options.compact_spaces);
|
||||
SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall);
|
||||
common_grammar_builder builder {
|
||||
/* .add_rule = */ [&](const std::string & name, const std::string & rule) {
|
||||
return converter._add_rule(name, rule);
|
||||
|
||||
@@ -16,7 +16,6 @@ struct common_grammar_builder {
|
||||
|
||||
struct common_grammar_options {
|
||||
bool dotall = false;
|
||||
bool compact_spaces = false;
|
||||
};
|
||||
|
||||
std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options = {});
|
||||
|
||||
@@ -1378,13 +1378,27 @@ struct ArgumentsExpression {
|
||||
}
|
||||
};
|
||||
|
||||
static std::string strip(const std::string & s) {
|
||||
auto start = s.find_first_not_of(" \t\n\r");
|
||||
static std::string strip(const std::string & s, const std::string & chars = "", bool left = true, bool right = true) {
|
||||
auto charset = chars.empty() ? " \t\n\r" : chars;
|
||||
auto start = left ? s.find_first_not_of(charset) : 0;
|
||||
if (start == std::string::npos) return "";
|
||||
auto end = s.find_last_not_of(" \t\n\r");
|
||||
auto end = right ? s.find_last_not_of(charset) : s.size() - 1;
|
||||
return s.substr(start, end - start + 1);
|
||||
}
|
||||
|
||||
static std::vector<std::string> split(const std::string & s, const std::string & sep) {
|
||||
std::vector<std::string> result;
|
||||
size_t start = 0;
|
||||
size_t end = s.find(sep);
|
||||
while (end != std::string::npos) {
|
||||
result.push_back(s.substr(start, end - start));
|
||||
start = end + sep.length();
|
||||
end = s.find(sep, start);
|
||||
}
|
||||
result.push_back(s.substr(start));
|
||||
return result;
|
||||
}
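strip now takes optional chars/left/right arguments (backing the new lstrip, rstrip and split string methods a few hunks below), and split mirrors Python's str.split with a literal separator. A standalone sketch reusing copies of the two helpers to show the intended Python-style behaviour; the sample strings are illustrative only:

#include <iostream>
#include <string>
#include <vector>

// Copies of the two helpers shown above.
static std::string strip(const std::string & s, const std::string & chars = "", bool left = true, bool right = true) {
    auto charset = chars.empty() ? " \t\n\r" : chars;
    auto start = left ? s.find_first_not_of(charset) : 0;
    if (start == std::string::npos) return "";
    auto end = right ? s.find_last_not_of(charset) : s.size() - 1;
    return s.substr(start, end - start + 1);
}

static std::vector<std::string> split(const std::string & s, const std::string & sep) {
    std::vector<std::string> result;
    size_t start = 0;
    size_t end = s.find(sep);
    while (end != std::string::npos) {
        result.push_back(s.substr(start, end - start));
        start = end + sep.length();
        end = s.find(sep, start);
    }
    result.push_back(s.substr(start));
    return result;
}

int main() {
    std::cout << "[" << strip("  hello  ")                << "]\n";  // [hello]
    std::cout << "[" << strip("xxhixx", "x")              << "]\n";  // [hi]
    std::cout << "[" << strip("  hi  ", "", true,  false) << "]\n";  // [hi  ]  (lstrip)
    std::cout << "[" << strip("  hi  ", "", false, true)  << "]\n";  // [  hi]  (rstrip)
    for (const auto & part : split("a,b,,c", ",")) {
        std::cout << "<" << part << ">";                             // <a><b><><c>
    }
    std::cout << "\n";
    return 0;
}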
|
||||
|
||||
static std::string capitalize(const std::string & s) {
|
||||
if (s.empty()) return s;
|
||||
auto result = s;
|
||||
@@ -1467,8 +1481,26 @@ public:
|
||||
} else if (obj.is_string()) {
|
||||
auto str = obj.get<std::string>();
|
||||
if (method->get_name() == "strip") {
|
||||
vargs.expectArgs("strip method", {0, 0}, {0, 0});
|
||||
return Value(strip(str));
|
||||
vargs.expectArgs("strip method", {0, 1}, {0, 0});
|
||||
auto chars = vargs.args.empty() ? "" : vargs.args[0].get<std::string>();
|
||||
return Value(strip(str, chars));
|
||||
} else if (method->get_name() == "lstrip") {
|
||||
vargs.expectArgs("lstrip method", {0, 1}, {0, 0});
|
||||
auto chars = vargs.args.empty() ? "" : vargs.args[0].get<std::string>();
|
||||
return Value(strip(str, chars, /* left= */ true, /* right= */ false));
|
||||
} else if (method->get_name() == "rstrip") {
|
||||
vargs.expectArgs("rstrip method", {0, 1}, {0, 0});
|
||||
auto chars = vargs.args.empty() ? "" : vargs.args[0].get<std::string>();
|
||||
return Value(strip(str, chars, /* left= */ false, /* right= */ true));
|
||||
} else if (method->get_name() == "split") {
|
||||
vargs.expectArgs("split method", {1, 1}, {0, 0});
|
||||
auto sep = vargs.args[0].get<std::string>();
|
||||
auto parts = split(str, sep);
|
||||
Value result = Value::array();
|
||||
for (const auto& part : parts) {
|
||||
result.push_back(Value(part));
|
||||
}
|
||||
return result;
|
||||
} else if (method->get_name() == "capitalize") {
|
||||
vargs.expectArgs("capitalize method", {0, 0}, {0, 0});
|
||||
return Value(capitalize(str));
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
#include <cstdio>
|
||||
#include <fstream>
|
||||
#include <thread>
|
||||
#include <algorithm>
|
||||
|
||||
void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
|
||||
std::vector<llama_token> & inp, int nnew, bool print_progress) {
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
|
||||
#include <cmath>
|
||||
#include <unordered_map>
|
||||
#include <algorithm>
|
||||
|
||||
// the ring buffer works similarly to std::deque, but with a fixed capacity
|
||||
// TODO: deduplicate with llama-impl.h
|
||||
@@ -159,16 +160,53 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
|
||||
GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
|
||||
#endif // LLAMA_USE_LLGUIDANCE
|
||||
} else {
|
||||
std::vector<const char *> trigger_words;
|
||||
trigger_words.reserve(params.grammar_trigger_words.size());
|
||||
for (const auto & str : params.grammar_trigger_words) {
|
||||
trigger_words.push_back(str.word.c_str());
|
||||
std::vector<std::string> patterns_at_start;
|
||||
std::vector<std::string> patterns_anywhere;
|
||||
std::vector<llama_token> trigger_tokens;
|
||||
for (const auto & trigger : params.grammar_triggers) {
|
||||
switch (trigger.type) {
|
||||
case COMMON_GRAMMAR_TRIGGER_TYPE_WORD:
|
||||
{
|
||||
const auto & word = trigger.value;
|
||||
patterns_anywhere.push_back(regex_escape(word));
|
||||
break;
|
||||
}
|
||||
case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN:
|
||||
case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START:
|
||||
{
|
||||
const auto & pattern = trigger.value;
|
||||
(trigger.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START ? patterns_at_start : patterns_anywhere).push_back(pattern);
|
||||
break;
|
||||
}
|
||||
case COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN:
|
||||
{
|
||||
const auto token = trigger.token;
|
||||
trigger_tokens.push_back(token);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
GGML_ASSERT(false && "unknown trigger type");
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::string> trigger_patterns;
|
||||
if (!patterns_at_start.empty()) {
|
||||
trigger_patterns.push_back("^(" + string_join(patterns_at_start, "|") + ")[\\s\\S]*");
|
||||
}
|
||||
if (!patterns_anywhere.empty()) {
|
||||
trigger_patterns.push_back("^[\\s\\S]*?(" + string_join(patterns_anywhere, "|") + ")[\\s\\S]*");
|
||||
}
|
||||
|
||||
std::vector<const char *> trigger_patterns_c;
|
||||
trigger_patterns_c.reserve(trigger_patterns.size());
|
||||
for (const auto & regex : trigger_patterns) {
|
||||
trigger_patterns_c.push_back(regex.c_str());
|
||||
}
|
||||
|
||||
grmr = params.grammar_lazy
|
||||
? llama_sampler_init_grammar_lazy(vocab, params.grammar.c_str(), "root",
|
||||
trigger_words.data(), trigger_words.size(),
|
||||
params.grammar_trigger_tokens.data(), params.grammar_trigger_tokens.size())
|
||||
? llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
|
||||
trigger_patterns_c.data(), trigger_patterns_c.size(),
|
||||
trigger_tokens.data(), trigger_tokens.size())
|
||||
: llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
|
||||
}
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
#include "sampling.h"
|
||||
|
||||
#include <cstring>
|
||||
#include <algorithm>
|
||||
|
||||
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 128
|
||||
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
|
||||
@@ -172,7 +173,7 @@ llama_tokens common_speculative_gen_draft(
|
||||
result.reserve(params.n_draft);
|
||||
|
||||
if (reuse_n == 0) {
|
||||
llama_kv_cache_clear(ctx);
|
||||
llama_kv_self_clear(ctx);
|
||||
|
||||
prompt.clear();
|
||||
} else {
|
||||
@@ -191,14 +192,14 @@ llama_tokens common_speculative_gen_draft(
|
||||
}
|
||||
|
||||
if (reuse_i > 0) {
|
||||
llama_kv_cache_seq_rm (ctx, 0, 0, reuse_i);
|
||||
llama_kv_cache_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
|
||||
llama_kv_self_seq_rm (ctx, 0, 0, reuse_i);
|
||||
llama_kv_self_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
|
||||
|
||||
prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
|
||||
}
|
||||
|
||||
if (reuse_n < (int) prompt.size()) {
|
||||
llama_kv_cache_seq_rm (ctx, 0, reuse_n, -1);
|
||||
llama_kv_self_seq_rm (ctx, 0, reuse_n, -1);
|
||||
|
||||
prompt.erase(prompt.begin() + reuse_n, prompt.end());
|
||||
}
|
||||
|
||||
@@ -861,6 +861,9 @@ class Model:
|
||||
for token_id, token_data in added_tokens_decoder.items():
|
||||
token_id = int(token_id)
|
||||
token: str = token_data["content"]
|
||||
if token_id >= vocab_size:
|
||||
logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
|
||||
continue
|
||||
if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
|
||||
if tokens[token_id] != token.encode("utf-8"):
|
||||
logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}')
|
||||
@@ -3322,6 +3325,83 @@ class Gemma2Model(Model):
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
|
||||
@Model.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration")
|
||||
class Gemma3Model(Model):
|
||||
model_arch = gguf.MODEL_ARCH.GEMMA3
|
||||
has_vision: bool = False
|
||||
|
||||
# we need to merge the text_config into the root level of hparams
|
||||
def __init__(self, *args, **kwargs):
|
||||
hparams = Model.load_hparams(kwargs["dir_model"])
|
||||
if "text_config" in hparams:
|
||||
hparams = {**hparams, **hparams["text_config"]}
|
||||
kwargs["hparams"] = hparams
|
||||
super().__init__(*args, **kwargs)
|
||||
if "vision_config" in hparams:
|
||||
logger.info("Has vision encoder, but it will be ignored")
|
||||
self.has_vision = True
|
||||
|
||||
def write(self):
|
||||
super().write()
|
||||
if self.has_vision:
|
||||
logger.info("NOTE: this script only convert the language model to GGUF")
|
||||
logger.info(" for the vision model, please use gemma3_convert_encoder_to_gguf.py")
|
||||
|
||||
def set_vocab(self):
|
||||
self._set_vocab_sentencepiece()
|
||||
|
||||
self.gguf_writer.add_add_space_prefix(False)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
hparams = self.hparams
|
||||
block_count = hparams["num_hidden_layers"]
|
||||
|
||||
# some default values are not specified in the hparams
|
||||
self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 131072))
|
||||
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
|
||||
self.gguf_writer.add_block_count(block_count)
|
||||
self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
|
||||
self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 8))
|
||||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-6))
|
||||
self.gguf_writer.add_key_length(hparams.get("head_dim", 256))
|
||||
self.gguf_writer.add_value_length(hparams.get("head_dim", 256))
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1_000_000.0)) # for global layers
|
||||
# both attn_logit_softcapping and final_logit_softcapping are removed in Gemma3
|
||||
assert hparams.get("attn_logit_softcapping") is None
|
||||
assert hparams.get("final_logit_softcapping") is None
|
||||
self.gguf_writer.add_sliding_window(hparams["sliding_window"])
|
||||
self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4))
|
||||
if hparams.get("rope_scaling") is not None:
|
||||
assert hparams["rope_scaling"]["rope_type"] == "linear"
|
||||
# important: this rope_scaling is only applied for global layers, and not used by 1B model
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
del bid # unused
|
||||
|
||||
if name.startswith("language_model."):
|
||||
name = name.replace("language_model.", "")
|
||||
elif name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \
|
||||
or name.startswith("multimodal_projector.") or name.startswith("vision_model."): # this is for old HF model, should be removed later
|
||||
# ignore vision tensors
|
||||
return []
|
||||
|
||||
# remove OOV (out-of-vocabulary) rows in token_embd
|
||||
if "embed_tokens.weight" in name:
|
||||
vocab = self._create_vocab_sentencepiece()
|
||||
tokens = vocab[0]
|
||||
data_torch = data_torch[:len(tokens)]
|
||||
|
||||
# ref code in Gemma3RMSNorm
|
||||
# output = output * (1.0 + self.weight.float())
|
||||
if name.endswith("norm.weight"):
|
||||
data_torch = data_torch + 1
|
||||
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
|
||||
@Model.register("Starcoder2ForCausalLM")
|
||||
class StarCoder2Model(Model):
|
||||
model_arch = gguf.MODEL_ARCH.STARCODER2
|
||||
|
||||
@@ -197,29 +197,53 @@ The following compilation options are also available to tweak performance:
|
||||
|
||||
## MUSA
|
||||
|
||||
This provides GPU acceleration using the MUSA cores of your Moore Threads MTT GPU. Make sure to have the MUSA SDK installed. You can download it from here: [MUSA SDK](https://developer.mthreads.com/sdk/download/musa).
|
||||
This provides GPU acceleration using a Moore Threads GPU. Make sure to have the [MUSA SDK](https://developer.mthreads.com/musa/musa-sdk) installed.
|
||||
|
||||
- Using `CMake`:
|
||||
#### Download directly from Moore Threads
|
||||
|
||||
```bash
|
||||
cmake -B build -DGGML_MUSA=ON
|
||||
cmake --build build --config Release
|
||||
You may find the official downloads here: [Moore Threads developer site](https://developer.mthreads.com/sdk/download/musa).
|
||||
|
||||
### Compilation
|
||||
|
||||
```bash
|
||||
cmake -B build -DGGML_MUSA=ON
|
||||
cmake --build build --config Release
|
||||
```
|
||||
|
||||
#### Override Compute Capability Specifications
|
||||
|
||||
By default, all supported compute capabilities are enabled. To customize this behavior, you can specify the `MUSA_ARCHITECTURES` option in the CMake command:
|
||||
|
||||
```bash
|
||||
cmake -B build -DGGML_MUSA=ON -DMUSA_ARCHITECTURES="21"
|
||||
```
|
||||
|
||||
This configuration enables only compute capability `2.1` (MTT S80) during compilation, which can help reduce compilation time.
|
||||
|
||||
#### Compilation options
|
||||
|
||||
Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet.
|
||||
|
||||
- For static builds, add `-DBUILD_SHARED_LIBS=OFF` and `-DCMAKE_POSITION_INDEPENDENT_CODE=ON`:
|
||||
```
|
||||
|
||||
For static build:
|
||||
|
||||
```bash
|
||||
cmake -B build -DGGML_MUSA=ON \
|
||||
-DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
|
||||
cmake --build build --config Release
|
||||
```
|
||||
|
||||
The environment variable [`MUSA_VISIBLE_DEVICES`](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) can be used to specify which GPU(s) will be used.
|
||||
### Runtime MUSA environment variables

You may set the [MUSA environment variables](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) at runtime.
|
||||
|
||||
```bash
|
||||
# Use `MUSA_VISIBLE_DEVICES` to hide the first compute device.
|
||||
MUSA_VISIBLE_DEVICES="-0" ./build/bin/llama-server --model /srv/models/llama.gguf
|
||||
```
|
||||
|
||||
### Unified Memory
|
||||
|
||||
The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted.
|
||||
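A quick usage sketch (the model path below is just an example; any of the built binaries can be launched this way):

```bash
# allow the allocator to spill into system RAM once GPU VRAM is exhausted
GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 ./build/bin/llama-server --model /srv/models/llama.gguf
```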
|
||||
Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet.
|
||||
|
||||
## HIP
|
||||
|
||||
This provides GPU acceleration on HIP-supported AMD GPUs.
|
||||
@@ -235,6 +259,12 @@ You can download it from your Linux distro's package manager or from here: [ROCm
|
||||
On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
|
||||
However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
|
||||
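As a minimal sketch, a UMA build just adds that flag on top of a regular HIP build (compiler selection is omitted here and may need to be set for your system):

```bash
cmake -B build -S . -DGGML_HIP=ON -DGGML_HIP_UMA=ON
cmake --build build --config Release
```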
|
||||
To enhance flash attention performance on RDNA3+ or CDNA architectures, you can utilize the rocWMMA library by enabling the `-DGGML_HIP_ROCWMMA_FATTN=ON` option. This requires rocWMMA headers to be installed on the build system.
|
||||
|
||||
The rocWMMA library is included by default when installing the ROCm SDK using the `rocm` meta package provided by AMD. Alternatively, if you are not using the meta package, you can install the library using the `rocwmma-dev` or `rocwmma-devel` package, depending on your system's package manager.
|
||||
|
||||
As an alternative, you can manually install the library by cloning it from the official [GitHub repository](https://github.com/ROCm/rocWMMA), checking out the corresponding version tag (e.g. `rocm-6.2.4`), and setting `-DCMAKE_CXX_FLAGS="-I<path/to/rocwmma>/library/include/"` in CMake. This also works on Windows, even though it is not officially supported by AMD.
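
Putting that together, a manual setup might look like the following (a sketch only: the clone location and include path are illustrative, and the usual HIP compiler settings still apply):

```bash
git clone https://github.com/ROCm/rocWMMA --branch rocm-6.2.4 --depth 1
cmake -B build -S . \
    -DGGML_HIP=ON \
    -DGGML_HIP_ROCWMMA_FATTN=ON \
    -DCMAKE_CXX_FLAGS="-I$(pwd)/rocWMMA/library/include/"
cmake --build build --config Release
```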
|
||||
Note that if you get the following error:
|
||||
```
|
||||
clang: error: cannot find ROCm device library; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library
|
||||
|
||||
@@ -287,30 +287,32 @@ Here are some models known to work (w/ chat template override when needed):
|
||||
|
||||
llama-server --jinja -fa -hf bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M
|
||||
llama-server --jinja -fa -hf bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q6_K_L
|
||||
llama-server --jinja -fa -hf bartowski/functionary-small-v3.2-GGUF:Q4_K_M
|
||||
llama-server --jinja -fa -hf bartowski/Llama-3.3-70B-Instruct-GGUF:Q4_K_M
|
||||
|
||||
# Native support for DeepSeek R1 works best w/ our own template (official template buggy)
|
||||
# Native support for DeepSeek R1 works best w/ our template override (official template is buggy, although we do work around it)
|
||||
|
||||
llama-server --jinja -fa -hf bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q6_K_L \
|
||||
--chat-template-file models/templates/llama-cpp-deepseek-r1.jinja
|
||||
--chat-template-file models/templates/llama-cpp-deepseek-r1.jinja
|
||||
|
||||
llama-server --jinja -fa -hf bartowski/DeepSeek-R1-Distill-Qwen-32B-GGUF:Q4_K_M \
|
||||
--chat-template-file models/templates/llama-cpp-deepseek-r1.jinja
|
||||
--chat-template-file models/templates/llama-cpp-deepseek-r1.jinja
|
||||
|
||||
# Native support requires the right template for these GGUFs:
|
||||
|
||||
llama-server --jinja -fa -hf bartowski/functionary-small-v3.2-GGUF:Q4_K_M \
--chat-template-file models/templates/meetkai-functionary-medium-v3.2.jinja
|
||||
|
||||
llama-server --jinja -fa -hf bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M \
|
||||
--chat-template-file <( python scripts/get_chat_template.py NousResearch/Hermes-2-Pro-Llama-3-8B tool_use )
|
||||
--chat-template-file models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja
|
||||
|
||||
llama-server --jinja -fa -hf bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M \
|
||||
--chat-template-file <( python scripts/get_chat_template.py NousResearch/Hermes-3-Llama-3.1-8B tool_use )
|
||||
--chat-template-file models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja
|
||||
|
||||
llama-server --jinja -fa -hf bartowski/firefunction-v2-GGUF -hff firefunction-v2-IQ1_M.gguf \
|
||||
--chat-template-file <( python scripts/get_chat_template.py fireworks-ai/llama-3-firefunction-v2 tool_use )
|
||||
--chat-template-file models/templates/fireworks-ai-llama-3-firefunction-v2.jinja
|
||||
|
||||
llama-server --jinja -fa -hf bartowski/c4ai-command-r7b-12-2024-GGUF:Q6_K_L \
|
||||
--chat-template-file <( python scripts/get_chat_template.py CohereForAI/c4ai-command-r7b-12-2024 tool_use )
|
||||
--chat-template-file models/templates/CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja
|
||||
|
||||
# Generic format support
|
||||
llama-server --jinja -fa -hf bartowski/phi-4-GGUF:Q4_0
|
||||
@@ -318,6 +320,8 @@ llama-server --jinja -fa -hf bartowski/gemma-2-2b-it-GGUF:Q8_0
|
||||
llama-server --jinja -fa -hf bartowski/c4ai-command-r-v01-GGUF:Q2_K
|
||||
```
|
||||
|
||||
To get the official template from the original HuggingFace repos, you can use [scripts/get_chat_template.py](../scripts/get_chat_template.py) (see example invocations in [models/templates/README.md](../models/templates/README.md)).
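
For instance, reusing one of the invocations shown above (the output path is just an example):

```bash
# fetch the tool_use variant of the official template and save it locally
python scripts/get_chat_template.py NousResearch/Hermes-2-Pro-Llama-3-8B tool_use \
    > models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja
```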
|
||||
> [!TIP]
|
||||
> If there is no official `tool_use` Jinja template, you may want to set `--chat-template chatml` to use a default that works with many models (YMMV!), or write your own (e.g. we provide a custom [llama-cpp-deepseek-r1.jinja](../models/templates/llama-cpp-deepseek-r1.jinja) for DeepSeek R1 distills)
|
||||
|
||||
|
||||
@@ -132,7 +132,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
const auto t_pp_start = ggml_time_us();
|
||||
|
||||
llama_kv_cache_clear(ctx);
|
||||
llama_kv_self_clear(ctx);
|
||||
|
||||
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
|
||||
LOG_ERR("%s: llama_decode() failed\n", __func__);
|
||||
@@ -141,7 +141,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
if (is_pp_shared) {
|
||||
for (int32_t i = 1; i < pl; ++i) {
|
||||
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
|
||||
llama_kv_self_seq_cp(ctx, 0, i, -1, -1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -116,7 +116,7 @@ if llama_decode(context, batch) != 0 {
|
||||
}
|
||||
|
||||
for i in 1 ..< n_parallel {
|
||||
llama_kv_cache_seq_cp(context, 0, Int32(i), 0, batch.n_tokens)
|
||||
llama_kv_self_seq_cp(context, 0, Int32(i), 0, batch.n_tokens)
|
||||
}
|
||||
|
||||
if n_parallel > 1 {
|
||||
|
||||
@@ -342,7 +342,7 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
|
||||
}
|
||||
|
||||
static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
|
||||
llama_kv_cache_clear(ctx);
|
||||
llama_kv_self_clear(ctx);
|
||||
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return false;
|
||||
@@ -394,6 +394,8 @@ static int prepare_entries(common_params & params, train_context & ctx_train) {
|
||||
int main(int argc, char ** argv) {
|
||||
common_params params;
|
||||
|
||||
params.out_file = "control_vector.gguf";
|
||||
|
||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
|
||||
return 1;
|
||||
}
|
||||
@@ -498,7 +500,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// write output vectors to gguf
|
||||
export_gguf(ctx_train.v_final, params.cvector_outfile, model_hint);
|
||||
export_gguf(ctx_train.v_final, params.out_file, model_hint);
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
#include "llama.h"
|
||||
|
||||
#include <ctime>
|
||||
#include <algorithm>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
@@ -37,7 +38,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
|
||||
const struct llama_model * model = llama_get_model(ctx);
|
||||
|
||||
// clear previous kv_cache values (irrelevant for embeddings)
|
||||
llama_kv_cache_clear(ctx);
|
||||
llama_kv_self_clear(ctx);
|
||||
|
||||
// run model
|
||||
LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
|
||||
|
||||
@@ -413,20 +413,22 @@ static void print_usage(int, char ** argv) {
|
||||
int main(int argc, char ** argv) {
|
||||
common_params params;
|
||||
|
||||
params.out_file = "ggml-lora-merged-f16.gguf";
|
||||
|
||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
g_verbose = (params.verbosity > 1);
|
||||
try {
|
||||
lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads);
|
||||
lora_merge_ctx ctx(params.model, params.lora_adapters, params.out_file, params.cpuparams.n_threads);
|
||||
ctx.run_merge();
|
||||
} catch (const std::exception & err) {
|
||||
fprintf(stderr, "%s\n", err.what());
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
printf("done, output file is %s\n", params.lora_outfile.c_str());
|
||||
printf("done, output file is %s\n", params.out_file.c_str());
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -45,7 +45,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
|
||||
}
|
||||
|
||||
// clear previous kv_cache values (irrelevant for embeddings)
|
||||
llama_kv_cache_clear(ctx);
|
||||
llama_kv_self_clear(ctx);
|
||||
llama_set_embeddings(ctx, true);
|
||||
llama_set_causal_attn(ctx, false);
|
||||
|
||||
@@ -102,7 +102,7 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std
|
||||
|
||||
llama_token eos_token = llama_vocab_eos(vocab);
|
||||
|
||||
llama_kv_cache_clear(ctx);
|
||||
llama_kv_self_clear(ctx);
|
||||
llama_set_embeddings(ctx, false);
|
||||
llama_set_causal_attn(ctx, true);
|
||||
|
||||
|
||||
@@ -206,9 +206,6 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
|
||||
|
||||
void IMatrixCollector::save_imatrix(int ncall) const {
|
||||
auto fname = m_params.out_file;
|
||||
if (fname.empty()) {
|
||||
fname = "imatrix.dat";
|
||||
}
|
||||
|
||||
if (ncall > 0) {
|
||||
fname += ".at_";
|
||||
@@ -498,7 +495,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
|
||||
const auto t_start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// clear the KV cache
|
||||
llama_kv_cache_clear(ctx);
|
||||
llama_kv_self_clear(ctx);
|
||||
|
||||
llama_batch batch = llama_batch_init(n_batch, 0, 1);
|
||||
|
||||
@@ -583,6 +580,8 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
|
||||
int main(int argc, char ** argv) {
|
||||
common_params params;
|
||||
|
||||
params.out_file = "imatrix.dat" ;
|
||||
|
||||
params.n_ctx = 512;
|
||||
params.logits_all = true;
|
||||
params.escape = false;
|
||||
|
||||
@@ -332,8 +332,8 @@ int main(int argc, char ** argv) {
|
||||
LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
|
||||
n_past, n_left, n_ctx, params.n_keep, n_discard);
|
||||
|
||||
llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
|
||||
llama_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
|
||||
llama_kv_self_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
|
||||
llama_kv_self_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
|
||||
|
||||
n_past -= n_discard;
|
||||
|
||||
|
||||
@@ -195,7 +195,7 @@ class BuiltinRule:
|
||||
self.deps = deps or []
|
||||
|
||||
# Constraining spaces to prevent model "running away".
|
||||
SPACE_RULE = '| " " | "\\n" [ \\t]{0,20}'
|
||||
SPACE_RULE = '| " " | "\\n"{1,2} [ \\t]{0,20}'
|
||||
|
||||
PRIMITIVE_RULES = {
|
||||
'boolean' : BuiltinRule('("true" | "false") space', []),
|
||||
|
||||
@@ -1578,7 +1578,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
test t(inst, lmodel, ctx);
|
||||
|
||||
llama_kv_cache_clear(ctx);
|
||||
llama_kv_self_clear(ctx);
|
||||
|
||||
// cool off before the test
|
||||
if (params.delay) {
|
||||
@@ -1618,7 +1618,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
for (int i = 0; i < params.reps; i++) {
|
||||
llama_kv_cache_clear(ctx);
|
||||
llama_kv_self_clear(ctx);
|
||||
|
||||
uint64_t t_start = get_time_ns();
|
||||
|
||||
|
||||
@@ -194,7 +194,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
|
||||
}
|
||||
|
||||
batch->logits[batch->n_tokens - 1] = true;
|
||||
llama_kv_cache_clear(context);
|
||||
llama_kv_self_clear(context);
|
||||
|
||||
const auto t_pp_start = ggml_time_us();
|
||||
if (llama_decode(context, *batch) != 0) {
|
||||
@@ -206,7 +206,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
|
||||
|
||||
LOGi("Benchmark text generation (tg)");
|
||||
|
||||
llama_kv_cache_clear(context);
|
||||
llama_kv_self_clear(context);
|
||||
const auto t_tg_start = ggml_time_us();
|
||||
for (i = 0; i < tg; i++) {
|
||||
|
||||
@@ -223,7 +223,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
|
||||
|
||||
const auto t_tg_end = ggml_time_us();
|
||||
|
||||
llama_kv_cache_clear(context);
|
||||
llama_kv_self_clear(context);
|
||||
|
||||
const auto t_pp = double(t_pp_end - t_pp_start) / 1000000.0;
|
||||
const auto t_tg = double(t_tg_end - t_tg_start) / 1000000.0;
|
||||
@@ -361,7 +361,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init(
|
||||
const auto tokens_list = common_tokenize(context, text, true, parse_special);
|
||||
|
||||
auto n_ctx = llama_n_ctx(context);
|
||||
auto n_kv_req = tokens_list.size() + (n_len - tokens_list.size());
|
||||
auto n_kv_req = tokens_list.size() + n_len;
|
||||
|
||||
LOGi("n_len = %d, n_ctx = %d, n_kv_req = %d", n_len, n_ctx, n_kv_req);
|
||||
|
||||
@@ -448,5 +448,5 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
|
||||
llama_kv_cache_clear(reinterpret_cast<llama_context *>(context));
|
||||
llama_kv_self_clear(reinterpret_cast<llama_context *>(context));
|
||||
}
|
||||
|
||||
@@ -5,6 +5,21 @@ point for more advanced projects.
|
||||
|
||||
For usage instructions and performance stats, check the following discussion: https://github.com/ggml-org/llama.cpp/discussions/4508
|
||||
|
||||
|
||||
### Building
|
||||
First, llama.cpp needs to be built and an XCFramework needs to be created. This can be done by running
the following script from the llama.cpp project root:
|
||||
```console
|
||||
$ ./build-xcframework.sh
|
||||
```
|
||||
Open the `llama.swiftui.xcodeproj` project in Xcode and you should be able to build and run the app on
a simulator or a real device.
|
||||
|
||||
To use the framework with a different project, add `build-apple/llama.xcframework` to that project by
dragging and dropping it into the project navigator, or by manually selecting the framework in the
"Frameworks, Libraries, and Embedded Content" section of the project settings.
|
||||
|
||||

|
||||
|
||||
Video demonstration:
|
||||
|
||||
@@ -210,7 +210,7 @@ actor LlamaContext {
|
||||
}
|
||||
batch.logits[Int(batch.n_tokens) - 1] = 1 // true
|
||||
|
||||
llama_kv_cache_clear(context)
|
||||
llama_kv_self_clear(context)
|
||||
|
||||
let t_pp_start = DispatchTime.now().uptimeNanoseconds / 1000;
|
||||
|
||||
@@ -223,7 +223,7 @@ actor LlamaContext {
|
||||
|
||||
// bench text generation
|
||||
|
||||
llama_kv_cache_clear(context)
|
||||
llama_kv_self_clear(context)
|
||||
|
||||
let t_tg_start = DispatchTime.now().uptimeNanoseconds / 1000;
|
||||
|
||||
@@ -242,7 +242,7 @@ actor LlamaContext {
|
||||
|
||||
let t_tg_end = DispatchTime.now().uptimeNanoseconds / 1000;
|
||||
|
||||
llama_kv_cache_clear(context)
|
||||
llama_kv_self_clear(context)
|
||||
|
||||
let t_pp = Double(t_pp_end - t_pp_start) / 1000000.0
|
||||
let t_tg = Double(t_tg_end - t_tg_start) / 1000000.0
|
||||
@@ -292,7 +292,7 @@ actor LlamaContext {
|
||||
func clear() {
|
||||
tokens_list.removeAll()
|
||||
temporary_invalid_cchars.removeAll()
|
||||
llama_kv_cache_clear(context)
|
||||
llama_kv_self_clear(context)
|
||||
}
|
||||
|
||||
private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
|
||||
|
||||
@@ -7,7 +7,6 @@
|
||||
objects = {
|
||||
|
||||
/* Begin PBXBuildFile section */
|
||||
1809696D2D05A39F00400EE8 /* llama in Frameworks */ = {isa = PBXBuildFile; productRef = 1809696C2D05A39F00400EE8 /* llama */; };
|
||||
549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; };
|
||||
79E1D9CD2B4CD16E005F8E46 /* InputButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 79E1D9CC2B4CD16E005F8E46 /* InputButton.swift */; };
|
||||
7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */; };
|
||||
@@ -18,9 +17,25 @@
|
||||
8A3F84242AC4C891005E2EE8 /* models in Resources */ = {isa = PBXBuildFile; fileRef = 8A3F84232AC4C891005E2EE8 /* models */; };
|
||||
8A907F332AC7138A006146EA /* LibLlama.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A907F322AC7134E006146EA /* LibLlama.swift */; };
|
||||
8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */; };
|
||||
DD84C9FD2D747FED007778EC /* llama.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = DD84C9FC2D747FED007778EC /* llama.xcframework */; };
|
||||
DD84C9FE2D747FED007778EC /* llama.xcframework in Embed Frameworks */ = {isa = PBXBuildFile; fileRef = DD84C9FC2D747FED007778EC /* llama.xcframework */; settings = {ATTRIBUTES = (CodeSignOnCopy, RemoveHeadersOnCopy, ); }; };
|
||||
F1FE20E22B465ECA00B45541 /* LoadCustomButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */; };
|
||||
/* End PBXBuildFile section */
|
||||
|
||||
/* Begin PBXCopyFilesBuildPhase section */
|
||||
DD84C9FF2D747FED007778EC /* Embed Frameworks */ = {
|
||||
isa = PBXCopyFilesBuildPhase;
|
||||
buildActionMask = 2147483647;
|
||||
dstPath = "";
|
||||
dstSubfolderSpec = 10;
|
||||
files = (
|
||||
DD84C9FE2D747FED007778EC /* llama.xcframework in Embed Frameworks */,
|
||||
);
|
||||
name = "Embed Frameworks";
|
||||
runOnlyForDeploymentPostprocessing = 0;
|
||||
};
|
||||
/* End PBXCopyFilesBuildPhase section */
|
||||
|
||||
/* Begin PBXFileReference section */
|
||||
549479CA2AC9E16000E0F78B /* Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = System/Library/Frameworks/Metal.framework; sourceTree = SDKROOT; };
|
||||
79E1D9CC2B4CD16E005F8E46 /* InputButton.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = InputButton.swift; sourceTree = "<group>"; };
|
||||
@@ -33,6 +48,7 @@
|
||||
8A3F84232AC4C891005E2EE8 /* models */ = {isa = PBXFileReference; lastKnownFileType = folder; name = models; path = llama.swiftui/Resources/models; sourceTree = "<group>"; };
|
||||
8A907F322AC7134E006146EA /* LibLlama.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibLlama.swift; sourceTree = "<group>"; };
|
||||
8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaState.swift; sourceTree = "<group>"; };
|
||||
DD84C9FC2D747FED007778EC /* llama.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = llama.xcframework; path = "../../build-apple/llama.xcframework"; sourceTree = "<group>"; };
|
||||
DF2D2FE72B4A59BE00FCB72D /* llama.cpp */ = {isa = PBXFileReference; lastKnownFileType = wrapper; name = llama.cpp; path = ../..; sourceTree = "<group>"; };
|
||||
F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LoadCustomButton.swift; sourceTree = "<group>"; };
|
||||
/* End PBXFileReference section */
|
||||
@@ -42,9 +58,9 @@
|
||||
isa = PBXFrameworksBuildPhase;
|
||||
buildActionMask = 2147483647;
|
||||
files = (
|
||||
1809696D2D05A39F00400EE8 /* llama in Frameworks */,
|
||||
549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */,
|
||||
8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */,
|
||||
DD84C9FD2D747FED007778EC /* llama.xcframework in Frameworks */,
|
||||
);
|
||||
runOnlyForDeploymentPostprocessing = 0;
|
||||
};
|
||||
@@ -86,6 +102,7 @@
|
||||
8A39BE082AC7601000BFEB40 /* Frameworks */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
DD84C9FC2D747FED007778EC /* llama.xcframework */,
|
||||
549479CA2AC9E16000E0F78B /* Metal.framework */,
|
||||
8A39BE092AC7601000BFEB40 /* Accelerate.framework */,
|
||||
);
|
||||
@@ -144,6 +161,7 @@
|
||||
8A1C836F2AC328BD0096AF73 /* Sources */,
|
||||
8A1C83702AC328BD0096AF73 /* Frameworks */,
|
||||
8A1C83712AC328BD0096AF73 /* Resources */,
|
||||
DD84C9FF2D747FED007778EC /* Embed Frameworks */,
|
||||
);
|
||||
buildRules = (
|
||||
);
|
||||
@@ -151,7 +169,6 @@
|
||||
);
|
||||
name = llama.swiftui;
|
||||
packageProductDependencies = (
|
||||
1809696C2D05A39F00400EE8 /* llama */,
|
||||
);
|
||||
productName = llama.swiftui;
|
||||
productReference = 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */;
|
||||
@@ -427,13 +444,6 @@
|
||||
defaultConfigurationName = Release;
|
||||
};
|
||||
/* End XCConfigurationList section */
|
||||
|
||||
/* Begin XCSwiftPackageProductDependency section */
|
||||
1809696C2D05A39F00400EE8 /* llama */ = {
|
||||
isa = XCSwiftPackageProductDependency;
|
||||
productName = llama;
|
||||
};
|
||||
/* End XCSwiftPackageProductDependency section */
|
||||
};
|
||||
rootObject = 8A1C836B2AC328BD0096AF73 /* Project object */;
|
||||
}
|
||||
|
||||
@@ -51,6 +51,13 @@ install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
set(TARGET llama-gemma3-cli)
|
||||
add_executable(${TARGET} gemma3-cli.cpp)
|
||||
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-gemma3-cli)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
set(TARGET llama-llava-clip-quantize-cli)
|
||||
add_executable(${TARGET} clip-quantize-cli.cpp)
|
||||
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-clip-quantize-cli)
|
||||
|
||||
examples/llava/README-gemma3.md (new file, 30 lines)
@@ -0,0 +1,30 @@
|
||||
# Gemma 3 vision
|
||||
|
||||
> [!IMPORTANT]
|
||||
>
|
||||
> This is very experimental and only intended for demo purposes.
|
||||
|
||||
## How to get mmproj.gguf?
|
||||
|
||||
```bash
|
||||
cd gemma-3-4b-it
|
||||
python ../llama.cpp/examples/llava/gemma3_convert_encoder_to_gguf.py .
|
||||
|
||||
# output file is mmproj.gguf
|
||||
```
|
||||
|
||||
## How to run it?
|
||||
|
||||
What you need:
|
||||
- The text model GGUF, which can be converted using `convert_hf_to_gguf.py`
- The mmproj file from the step above
|
||||
- An image file
|
||||
|
||||
```bash
|
||||
# build
|
||||
cmake -B build
|
||||
cmake --build build --target llama-gemma3-cli
|
||||
|
||||
# run it
|
||||
./build/bin/llama-gemma3-cli -m {text_model}.gguf --mmproj mmproj.gguf --image your_image.jpg
|
||||
```
|
||||
@@ -5,13 +5,25 @@ Currently, this readme only supports minicpm-omni's image capabilities, and we w
|
||||
|
||||
Download [MiniCPM-o-2_6](https://huggingface.co/openbmb/MiniCPM-o-2_6) PyTorch model from huggingface to "MiniCPM-o-2_6" folder.
|
||||
|
||||
|
||||
### Build llama.cpp
|
||||
Readme modification time: 20250206
|
||||
|
||||
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
|
||||
|
||||
Clone llama.cpp:
|
||||
```bash
|
||||
git clone git@github.com:OpenBMB/llama.cpp.git
|
||||
git clone https://github.com/ggerganov/llama.cpp
|
||||
cd llama.cpp
|
||||
git checkout minicpm-omni
|
||||
```
|
||||
|
||||
Build llama.cpp using `CMake`:
|
||||
```bash
|
||||
cmake -B build
|
||||
cmake --build build --config Release
|
||||
```
|
||||
|
||||
|
||||
### Usage of MiniCPM-o 2.6
|
||||
|
||||
Convert the PyTorch model to gguf files (you can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf) files provided by us)
|
||||
@@ -22,25 +34,15 @@ python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-
|
||||
python ./convert_hf_to_gguf.py ../MiniCPM-o-2_6/model
|
||||
|
||||
# quantize int4 version
|
||||
./llama-quantize ../MiniCPM-o-2_6/model/ggml-model-f16.gguf ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M
|
||||
./build/bin/llama-quantize ../MiniCPM-o-2_6/model/ggml-model-f16.gguf ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M
|
||||
```
|
||||
|
||||
Build llama.cpp using `CMake`:
|
||||
https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md
|
||||
|
||||
```bash
|
||||
cmake -B build
|
||||
cmake --build build --config Release
|
||||
```
|
||||
|
||||
Inference on Linux or Mac
|
||||
```
|
||||
```bash
|
||||
# run f16 version
|
||||
./llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
|
||||
./build/bin/llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
|
||||
|
||||
# run quantized int4 version
|
||||
./llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
|
||||
|
||||
# or run in interactive mode
|
||||
./llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
|
||||
./build/bin/llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
|
||||
```
|
||||
|
||||
@@ -4,13 +4,26 @@
|
||||
|
||||
Download [MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5) PyTorch model from huggingface to "MiniCPM-Llama3-V-2_5" folder.
|
||||
|
||||
|
||||
### Build llama.cpp
|
||||
Readme modification time: 20250206
|
||||
|
||||
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
|
||||
|
||||
Clone llama.cpp:
|
||||
```bash
|
||||
git clone https://github.com/ggml-org/llama.cpp
|
||||
cd llama.cpp
|
||||
```
|
||||
|
||||
### Usage
|
||||
Build llama.cpp using `CMake`:
|
||||
```bash
|
||||
cmake -B build
|
||||
cmake --build build --config Release
|
||||
```
|
||||
|
||||
|
||||
### Usage of MiniCPM-Llama3-V 2.5
|
||||
|
||||
Convert the PyTorch model to gguf files (you can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) files provided by us)
|
||||
|
||||
@@ -20,80 +33,15 @@ python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-
|
||||
python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model
|
||||
|
||||
# quantize int4 version
|
||||
./llama-quantize ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf Q4_K_M
|
||||
./build/bin/llama-quantize ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf Q4_K_M
|
||||
```
|
||||
|
||||
Build for Linux or Mac
|
||||
|
||||
```bash
|
||||
make
|
||||
make llama-minicpmv-cli
|
||||
```
|
||||
|
||||
Inference on Linux or Mac
|
||||
```
|
||||
```bash
|
||||
# run f16 version
|
||||
./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
|
||||
./build/bin/llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
|
||||
|
||||
# run quantized int4 version
|
||||
./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
|
||||
|
||||
# or run in interactive mode
|
||||
./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
|
||||
```
|
||||
|
||||
### Android
|
||||
|
||||
#### Build on Android device using Termux
|
||||
We found that building on the Android device brings better runtime performance, so we recommend building on the device.

[Termux](https://github.com/termux/termux-app#installation) is a terminal app for Android devices (no root required).
|
||||
|
||||
Install tools in Termux:
|
||||
```
|
||||
apt update && apt upgrade -y
|
||||
apt install git make cmake
|
||||
```
|
||||
|
||||
It's recommended to move your model inside the `~/` directory for best performance:
|
||||
```
|
||||
cd storage/downloads
|
||||
mv model.gguf ~/
|
||||
```
|
||||
|
||||
#### Building the Project using Android NDK
|
||||
Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
|
||||
|
||||
Execute the following commands on your computer to avoid downloading the NDK to your mobile device. Alternatively, you can also do this in Termux:
|
||||
|
||||
```bash
|
||||
mkdir build-android
|
||||
cd build-android
|
||||
export NDK=/your_ndk_path
|
||||
cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
|
||||
make
|
||||
```
|
||||
|
||||
Install [Termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (on Android 11+, run the command twice).
|
||||
|
||||
Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission:
|
||||
|
||||
(This assumes that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`.)
|
||||
```
|
||||
$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/
|
||||
$cd /data/data/com.termux/files/home/bin
|
||||
$chmod +x ./*
|
||||
```
|
||||
|
||||
Download the models and push them to `/sdcard/llama.cpp/`, then move them to `/data/data/com.termux/files/home/model/`
|
||||
|
||||
```
|
||||
$mv /sdcard/llama.cpp/ggml-model-Q4_K_M.gguf /data/data/com.termux/files/home/model/
|
||||
$mv /sdcard/llama.cpp/mmproj-model-f16.gguf /data/data/com.termux/files/home/model/
|
||||
```
|
||||
|
||||
Now, you can start chatting:
|
||||
```
|
||||
$cd /data/data/com.termux/files/home/bin
|
||||
$./llama-minicpmv-cli -m ../model/ggml-model-Q4_K_M.gguf --mmproj ../model/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
|
||||
./build/bin/llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
|
||||
```
|
||||
|
||||
@@ -4,13 +4,25 @@
|
||||
|
||||
Download [MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) PyTorch model from huggingface to "MiniCPM-V-2_6" folder.
|
||||
|
||||
|
||||
### Build llama.cpp
|
||||
Readme modification time: 20250206
|
||||
|
||||
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
|
||||
|
||||
Clone llama.cpp:
|
||||
```bash
|
||||
git clone git@github.com:OpenBMB/llama.cpp.git
|
||||
git clone https://github.com/ggerganov/llama.cpp
|
||||
cd llama.cpp
|
||||
git checkout minicpmv-main
|
||||
```
|
||||
|
||||
Build llama.cpp using `CMake`:
|
||||
```bash
|
||||
cmake -B build
|
||||
cmake --build build --config Release
|
||||
```
|
||||
|
||||
|
||||
### Usage of MiniCPM-V 2.6
|
||||
|
||||
Convert the PyTorch model to gguf files (you can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) files provided by us)
|
||||
@@ -21,87 +33,15 @@ python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-
|
||||
python ./convert_hf_to_gguf.py ../MiniCPM-V-2_6/model
|
||||
|
||||
# quantize int4 version
|
||||
./llama-quantize ../MiniCPM-V-2_6/model/ggml-model-f16.gguf ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M
|
||||
./build/bin/llama-quantize ../MiniCPM-V-2_6/model/ggml-model-f16.gguf ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M
|
||||
```
|
||||
|
||||
Build for Linux or Mac
|
||||
|
||||
```bash
|
||||
make
|
||||
make llama-minicpmv-cli
|
||||
```
|
||||
|
||||
Inference on Linux or Mac
|
||||
```
|
||||
```bash
|
||||
# run f16 version
|
||||
./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
|
||||
./build/bin/llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
|
||||
|
||||
# run quantized int4 version
|
||||
./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
|
||||
|
||||
# or run in interactive mode
|
||||
./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
|
||||
```
|
||||
|
||||
### Video
|
||||
Install FFmpeg
|
||||
```
|
||||
brew install ffmpeg
|
||||
brew install pkg-config
|
||||
```
|
||||
|
||||
### Android
|
||||
|
||||
#### Build on Android device using Termux
|
||||
We found that building on the Android device brings better runtime performance, so we recommend building on the device.

[Termux](https://github.com/termux/termux-app#installation) is a terminal app for Android devices (no root required).
|
||||
|
||||
Install tools in Termux:
|
||||
```
|
||||
apt update && apt upgrade -y
|
||||
apt install git make cmake
|
||||
```
|
||||
|
||||
It's recommended to move your model inside the `~/` directory for best performance:
|
||||
```
|
||||
cd storage/downloads
|
||||
mv model.gguf ~/
|
||||
```
|
||||
|
||||
#### Building the Project using Android NDK
|
||||
Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
|
||||
|
||||
Execute the following commands on your computer to avoid downloading the NDK to your mobile device. Alternatively, you can also do this in Termux:
|
||||
|
||||
```bash
|
||||
mkdir build-android
|
||||
cd build-android
|
||||
export NDK=/your_ndk_path
|
||||
cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
|
||||
make
|
||||
```
|
||||
|
||||
Install [Termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (on Android 11+, run the command twice).
|
||||
|
||||
Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission:
|
||||
|
||||
(This assumes that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`.)
|
||||
```
|
||||
$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/
|
||||
$cd /data/data/com.termux/files/home/bin
|
||||
$chmod +x ./*
|
||||
```
|
||||
|
||||
Download the models and push them to `/sdcard/llama.cpp/`, then move them to `/data/data/com.termux/files/home/model/`
|
||||
|
||||
```
|
||||
$mv /sdcard/llama.cpp/ggml-model-Q4_K_M.gguf /data/data/com.termux/files/home/model/
|
||||
$mv /sdcard/llama.cpp/mmproj-model-f16.gguf /data/data/com.termux/files/home/model/
|
||||
```
|
||||
|
||||
Now, you can start chatting:
|
||||
```
|
||||
$cd /data/data/com.termux/files/home/bin
|
||||
$./llama-minicpmv-cli -m ../model/ggml-model-Q4_K_M.gguf --mmproj ../model/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
|
||||
./build/bin/llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
|
||||
```
|
||||
|
||||
@@ -4,31 +4,12 @@
|
||||
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
|
||||
#include "clip.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml-cpp.h"
|
||||
#include "ggml-cpu.h"
|
||||
#include "ggml-alloc.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "gguf.h"
|
||||
|
||||
//#ifdef GGML_USE_CUDA
|
||||
//#include "ggml-cuda.h"
|
||||
//#endif
|
||||
//
|
||||
//#ifdef GGML_USE_SYCL
|
||||
//#include "ggml-sycl.h"
|
||||
//#endif
|
||||
//
|
||||
//#ifdef GGML_USE_METAL
|
||||
//#include "ggml-metal.h"
|
||||
//#endif
|
||||
//
|
||||
//#ifdef GGML_USE_CANN
|
||||
//#include "ggml-cann.h"
|
||||
//#endif
|
||||
//
|
||||
//#ifdef GGML_USE_VULKAN
|
||||
//#include "ggml-vulkan.h"
|
||||
//#endif
|
||||
|
||||
#define STB_IMAGE_IMPLEMENTATION
|
||||
#include "stb_image.h"
|
||||
|
||||
@@ -155,6 +136,8 @@ static std::string format(const char * fmt, ...) {
|
||||
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
|
||||
#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
|
||||
#define TN_IMAGE_NEWLINE "model.image_newline"
|
||||
#define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3
|
||||
#define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3
|
||||
|
||||
#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
|
||||
#define TN_MINICPMV_QUERY "resampler.query"
|
||||
@@ -181,6 +164,7 @@ enum projector_type {
|
||||
PROJECTOR_TYPE_RESAMPLER,
|
||||
PROJECTOR_TYPE_GLM_EDGE,
|
||||
PROJECTOR_TYPE_MERGER,
|
||||
PROJECTOR_TYPE_GEMMA3,
|
||||
PROJECTOR_TYPE_UNKNOWN,
|
||||
};
|
||||
|
||||
@@ -191,6 +175,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
||||
{ PROJECTOR_TYPE_RESAMPLER, "resampler"},
|
||||
{ PROJECTOR_TYPE_GLM_EDGE, "adapter"},
|
||||
{ PROJECTOR_TYPE_MERGER, "qwen2vl_merger"},
|
||||
{ PROJECTOR_TYPE_GEMMA3, "gemma3"},
|
||||
};
|
||||
|
||||
|
||||
@@ -317,7 +302,7 @@ static projector_type clip_projector_type_from_string(const std::string & name)
|
||||
return kv.first;
|
||||
}
|
||||
}
|
||||
return PROJECTOR_TYPE_UNKNOWN;
|
||||
throw std::runtime_error(format("Unknown projector type: %s", name.c_str()));
|
||||
}
|
||||
|
||||
#ifdef CLIP_DEBUG_FUNCTIONS
|
||||
@@ -574,6 +559,10 @@ struct clip_vision_model {
|
||||
struct ggml_tensor * mm_model_ln_kv_b;
|
||||
struct ggml_tensor * mm_model_ln_post_w;
|
||||
struct ggml_tensor * mm_model_ln_post_b;
|
||||
|
||||
// gemma3
|
||||
struct ggml_tensor * mm_input_proj_w;
|
||||
struct ggml_tensor * mm_soft_emb_norm_w;
|
||||
};
|
||||
|
||||
struct clip_ctx {
|
||||
@@ -588,7 +577,7 @@ struct clip_ctx {
|
||||
struct clip_vision_model vision_model;
|
||||
projector_type proj_type = PROJECTOR_TYPE_MLP;
|
||||
|
||||
int32_t max_feature_layer;
|
||||
int32_t max_feature_layer; // unused in newer models like gemma3
|
||||
float image_mean[3];
|
||||
float image_std[3];
|
||||
bool use_gelu = false;
|
||||
@@ -600,21 +589,209 @@ struct clip_ctx {
|
||||
bool has_post_norm = false;
|
||||
bool has_patch_bias = false;
|
||||
|
||||
struct gguf_context * ctx_gguf;
|
||||
struct ggml_context * ctx_data;
|
||||
struct gguf_context * ctx_gguf = nullptr;
|
||||
struct ggml_context * ctx_data = nullptr;
|
||||
|
||||
std::vector<uint8_t> buf_compute_meta;
|
||||
|
||||
// memory buffers to evaluate the model
|
||||
ggml_backend_buffer_t params_buffer = NULL;
|
||||
std::vector<ggml_backend_t> backend_ptrs;
|
||||
std::vector<ggml_backend_buffer_type_t> backend_buft;
|
||||
|
||||
ggml_backend_t backend = NULL;
|
||||
ggml_gallocr_t compute_alloc = NULL;
|
||||
ggml_backend_t backend = nullptr;
|
||||
ggml_backend_t backend_cpu = nullptr;
|
||||
ggml_backend_buffer_t buf = nullptr;
|
||||
|
||||
struct clip_image_size * load_image_size;
|
||||
ggml_backend_sched_ptr sched;
|
||||
|
||||
struct clip_image_size * load_image_size = nullptr;
|
||||
|
||||
clip_ctx(clip_context_params & ctx_params) {
|
||||
backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
|
||||
backend = ctx_params.use_gpu
|
||||
? ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr)
|
||||
: nullptr;
|
||||
|
||||
if (backend) {
|
||||
LOG_INF("%s: CLIP using %s backend\n", __func__, ggml_backend_name(backend));
|
||||
backend_ptrs.push_back(backend);
|
||||
backend_buft.push_back(ggml_backend_get_default_buffer_type(backend));
|
||||
} else {
|
||||
backend = backend_cpu;
|
||||
LOG_INF("%s: CLIP using CPU backend\n", __func__);
|
||||
}
|
||||
|
||||
backend_ptrs.push_back(backend_cpu);
|
||||
backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu));
|
||||
|
||||
sched.reset(
|
||||
ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false)
|
||||
);
|
||||
}
|
||||
|
||||
~clip_ctx() {
|
||||
ggml_free(ctx_data);
|
||||
gguf_free(ctx_gguf);
|
||||
ggml_backend_buffer_free(buf);
|
||||
ggml_backend_free(backend);
|
||||
if (backend_cpu != backend) {
|
||||
ggml_backend_free(backend_cpu);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
|
||||
static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
|
||||
const auto & model = ctx->vision_model;
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int image_size = hparams.image_size;
|
||||
int image_size_width = image_size;
|
||||
int image_size_height = image_size;
|
||||
|
||||
const int patch_size = hparams.patch_size;
|
||||
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
|
||||
const int hidden_size = hparams.hidden_size;
|
||||
const int n_head = hparams.n_head;
|
||||
const int d_head = hidden_size / n_head;
|
||||
const int n_layer = hparams.n_layer;
|
||||
const float eps = hparams.eps;
|
||||
|
||||
GGML_ASSERT(imgs->size == 1); // batch_size == 1
|
||||
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ ctx->buf_compute_meta.size(),
|
||||
/*.mem_buffer =*/ ctx->buf_compute_meta.data(),
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
|
||||
struct ggml_context * ctx0 = ggml_init(params);
|
||||
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
||||
|
||||
// input raw
|
||||
struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3);
|
||||
ggml_set_name(inp_raw, "inp_raw");
|
||||
ggml_set_input(inp_raw);
|
||||
|
||||
struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
|
||||
inp = ggml_reshape_2d(ctx0, inp, num_patches, hidden_size);
|
||||
inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
|
||||
inp = ggml_add(ctx0, inp, model.patch_bias);
|
||||
|
||||
// position embeddings
|
||||
struct ggml_tensor * embeddings = ggml_add(ctx0, inp, model.position_embeddings);
|
||||
|
||||
// loop over layers
|
||||
for (int il = 0; il < n_layer; il++) {
|
||||
struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
|
||||
|
||||
// layernorm1
|
||||
{
|
||||
cur = ggml_norm(ctx0, cur, eps);
|
||||
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_1_w), model.layers[il].ln_1_b);
|
||||
}
|
||||
|
||||
// self-attention
|
||||
{
|
||||
|
||||
struct ggml_tensor * Q =
|
||||
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b);
|
||||
|
||||
Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_patches);
|
||||
Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
|
||||
|
||||
struct ggml_tensor * K =
|
||||
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b);
|
||||
|
||||
K = ggml_reshape_3d(ctx0, K, d_head, n_head, num_patches);
|
||||
K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
|
||||
|
||||
struct ggml_tensor * V =
|
||||
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].v_w, cur), model.layers[il].v_b);
|
||||
|
||||
V = ggml_reshape_3d(ctx0, V, d_head, n_head, num_patches);
|
||||
V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
|
||||
|
||||
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
||||
KQ = ggml_scale_inplace(ctx0, KQ, 1.0f / sqrtf((float)d_head));
|
||||
KQ = ggml_soft_max_inplace(ctx0, KQ);
|
||||
|
||||
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
|
||||
KQV = ggml_reshape_3d(ctx0, KQV, d_head, num_patches, n_head);
|
||||
KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
||||
|
||||
cur = ggml_cont_2d(ctx0, KQV, hidden_size, num_patches);
|
||||
}
|
||||
|
||||
// attention output
|
||||
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].o_w, cur), model.layers[il].o_b);
|
||||
|
||||
// re-add the layer input, e.g., residual
|
||||
cur = ggml_add(ctx0, cur, embeddings);
|
||||
|
||||
embeddings = cur; // embeddings = residual, cur = hidden_states
|
||||
|
||||
// layernorm2
|
||||
{
|
||||
cur = ggml_norm(ctx0, cur, eps);
|
||||
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_2_w), model.layers[il].ln_2_b);
|
||||
}
|
||||
|
||||
cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur);
|
||||
cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b);
|
||||
|
||||
// siglip uses gelu
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
|
||||
cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur);
|
||||
cur = ggml_add(ctx0, cur, model.layers[il].ff_o_b);
|
||||
|
||||
// residual 2
|
||||
cur = ggml_add(ctx0, embeddings, cur);
|
||||
|
||||
embeddings = cur;
|
||||
}
|
||||
|
||||
// post-layernorm
|
||||
if (ctx->has_post_norm) {
|
||||
embeddings = ggml_norm(ctx0, embeddings, eps);
|
||||
ggml_set_name(embeddings, "post_ln");
|
||||
|
||||
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
|
||||
}
|
||||
|
||||
if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
|
||||
const int batch_size = 1;
|
||||
const int mm_tokens_per_image = 256; // default value for gemma3
|
||||
const int tokens_per_side = sqrt(mm_tokens_per_image);
|
||||
const int patches_per_image = sqrt(num_patches);
|
||||
const int kernel_size = patches_per_image / tokens_per_side;
|
||||
|
||||
embeddings = ggml_cont(ctx0, ggml_transpose(ctx0, embeddings));
|
||||
embeddings = ggml_reshape_4d(ctx0, embeddings, patches_per_image, patches_per_image, hidden_size, batch_size);
|
||||
|
||||
// doing a pool2d to reduce the number of output tokens to 256
|
||||
embeddings = ggml_pool_2d(ctx0, embeddings, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0);
|
||||
embeddings = ggml_reshape_3d(ctx0, embeddings, embeddings->ne[0] * embeddings->ne[0], hidden_size, batch_size);
|
||||
embeddings = ggml_cont(ctx0, ggml_transpose(ctx0, embeddings));
|
||||
|
||||
// apply norm before projection
|
||||
embeddings = ggml_rms_norm(ctx0, embeddings, eps);
|
||||
embeddings = ggml_mul(ctx0, embeddings, model.mm_soft_emb_norm_w);
|
||||
|
||||
// apply projection
|
||||
embeddings = ggml_mul_mat(ctx0,
|
||||
ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)),
|
||||
embeddings);
|
||||
}
|
||||
|
||||
// build the graph
|
||||
ggml_build_forward_expand(gf, embeddings);
|
||||
|
||||
ggml_free(ctx0);
|
||||
|
||||
return gf;
|
||||
}
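The PROJECTOR_TYPE_GEMMA3 branch above reduces the SigLIP patch grid to a fixed 256 tokens per image via average pooling. A small standalone sketch of the arithmetic, assuming the usual Gemma 3 SigLIP settings of a 896x896 input with 14x14 patches (these values come from the model config, not from this patch):

#include <cmath>
#include <cstdio>

int main() {
    const int image_size = 896;  // assumed SigLIP input resolution
    const int patch_size = 14;   // assumed patch size
    const int num_patches = (image_size / patch_size) * (image_size / patch_size); // 64 * 64 = 4096
    const int mm_tokens_per_image = 256;                                           // fixed for gemma3
    const int tokens_per_side   = (int) sqrt((double) mm_tokens_per_image);        // 16
    const int patches_per_image = (int) sqrt((double) num_patches);                // 64
    const int kernel_size = patches_per_image / tokens_per_side;                   // 4x4 average pool
    printf("%d patches -> pool %dx%d -> %d tokens\n",
           num_patches, kernel_size, kernel_size,
           (patches_per_image / kernel_size) * (patches_per_image / kernel_size)); // 256
    return 0;
}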

static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
    if (!ctx->has_vision_encoder) {
        LOG_ERR("This gguf file seems to have no vision encoder\n");
        return nullptr;

@@ -1160,7 +1337,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
        } else {
            GGML_ABORT("fatal error");
        }
    } else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
    }
    else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
        embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size);

        embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);

@@ -1182,8 +1360,25 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
    return gf;
}

static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
    if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
        return clip_image_build_graph_siglip(ctx, imgs);
    } else {
        // TODO: we should have one build_* function per model
        return clip_image_build_graph_legacy(ctx, imgs, load_image_size, is_inf);
    }
}

|
struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
    return clip_init(fname, clip_context_params{
        /* use_gpu */   true,
        /* verbosity */ verbosity,
    });
}

struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params) {
    int verbosity = ctx_params.verbosity;
    struct ggml_context * meta = NULL;

    struct gguf_init_params params = {
@@ -1277,7 +1472,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        }
    }

    clip_ctx * new_clip = new clip_ctx{};
    clip_ctx * new_clip = new clip_ctx(ctx_params);

// update projector type
|
||||
{
|
||||
@@ -1296,36 +1491,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
}
|
||||
}
|
||||
|
||||
//#ifdef GGML_USE_CUDA
|
||||
// new_clip->backend = ggml_backend_cuda_init(0);
|
||||
// LOG_INF("%s: CLIP using CUDA backend\n", __func__);
|
||||
//#endif
|
||||
//
|
||||
//#ifdef GGML_USE_METAL
|
||||
// new_clip->backend = ggml_backend_metal_init();
|
||||
// LOG_INF("%s: CLIP using Metal backend\n", __func__);
|
||||
//#endif
|
||||
//
|
||||
//#ifdef GGML_USE_CANN
|
||||
// new_clip->backend = ggml_backend_cann_init(0);
|
||||
// LOG_INF("%s: CLIP using CANN backend\n", __func__);
|
||||
//#endif
|
||||
//
|
||||
//#ifdef GGML_USE_VULKAN
|
||||
// new_clip->backend = ggml_backend_vk_init(0);
|
||||
// LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
|
||||
//#endif
|
||||
//
|
||||
//#ifdef GGML_USE_SYCL
|
||||
// new_clip->backend = ggml_backend_sycl_init(0);
|
||||
// LOG_INF("%s: CLIP using SYCL backend\n", __func__);
|
||||
//#endif
|
||||
|
||||
if (!new_clip->backend) {
|
||||
new_clip->backend = ggml_backend_cpu_init();
|
||||
LOG_INF("%s: CLIP using CPU backend\n", __func__);
|
||||
}
|
||||
|
||||
// model size and capabilities
|
||||
{
|
||||
int idx = get_key_idx(ctx, KEY_HAS_TEXT_ENC);
|
||||
@@ -1363,8 +1528,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
GGML_ASSERT(new_clip->has_vision_encoder);
|
||||
GGML_ASSERT(!new_clip->has_text_encoder);
|
||||
|
||||
idx = get_key_idx(ctx, KEY_USE_GELU);
|
||||
new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
|
||||
try {
|
||||
idx = get_key_idx(ctx, KEY_USE_GELU);
|
||||
new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
|
||||
} catch (std::runtime_error & /*e*/) {
|
||||
new_clip->use_gelu = false;
|
||||
}
|
||||
|
||||
try {
|
||||
idx = get_key_idx(ctx, KEY_USE_SILU);
|
||||
@@ -1378,6 +1547,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
|
||||
LOG_INF("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
|
||||
LOG_INF("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector);
|
||||
LOG_INF("%s: minicpmv_version: %d\n", __func__, new_clip->minicpmv_version);
|
||||
LOG_INF("%s: glm_projector: %d\n", __func__, new_clip->has_glm_projector);
|
||||
LOG_INF("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
|
||||
LOG_INF("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
|
||||
@@ -1420,7 +1590,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
}
|
||||
|
||||
// alloc memory and offload data
|
||||
new_clip->params_buffer = ggml_backend_alloc_ctx_tensors(new_clip->ctx_data, new_clip->backend);
|
||||
ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(new_clip->backend);
|
||||
new_clip->buf = ggml_backend_alloc_ctx_tensors_from_buft(new_clip->ctx_data, buft);
|
||||
ggml_backend_buffer_set_usage(new_clip->buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
|
||||
for (int i = 0; i < n_tensors; ++i) {
|
||||
const char * name = gguf_get_tensor_name(ctx, i);
|
||||
struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx_data, name);
|
||||
@@ -1433,7 +1605,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
return nullptr;
|
||||
}
|
||||
int num_bytes = ggml_nbytes(cur);
|
||||
if (ggml_backend_buffer_is_host(new_clip->params_buffer)) {
|
||||
if (ggml_backend_buft_is_host(buft)) {
|
||||
// for the CPU and Metal backend, we can read directly into the tensor
|
||||
fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
|
||||
} else {
|
||||
@@ -1569,11 +1741,17 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
}
|
||||
|
||||
try {
|
||||
vision_model.patch_embeddings_0 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
|
||||
vision_model.patch_embeddings_0 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
|
||||
} catch(const std::exception& /*e*/) {
|
||||
vision_model.patch_embeddings_0 = nullptr;
|
||||
}
|
||||
|
||||
try {
|
||||
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
|
||||
} catch(const std::exception& /*e*/) {
|
||||
LOG_ERR("%s: failed to load vision model tensors\n", __func__);
|
||||
vision_model.position_embeddings = nullptr;
|
||||
}
|
||||
|
||||
try {
|
||||
vision_model.patch_embeddings_1 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD_1);
|
||||
} catch(const std::exception& /*e*/) {
|
||||
@@ -1684,6 +1862,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
|
||||
vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
|
||||
}
|
||||
else if (new_clip->proj_type == PROJECTOR_TYPE_GEMMA3) {
|
||||
vision_model.mm_input_proj_w = get_tensor(new_clip->ctx_data, TN_MM_INP_PROJ);
|
||||
vision_model.mm_soft_emb_norm_w = get_tensor(new_clip->ctx_data, TN_MM_SOFT_EMB_N);
|
||||
}
|
||||
else {
|
||||
std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
|
||||
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
|
||||
@@ -1719,14 +1901,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
// measure mem requirement and allocate
|
||||
{
|
||||
new_clip->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
|
||||
new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
|
||||
clip_image_f32_batch batch;
|
||||
batch.size = 1;
|
||||
batch.data = nullptr;
|
||||
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
|
||||
ggml_gallocr_reserve(new_clip->compute_alloc, gf);
|
||||
size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
|
||||
LOG_INF("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
|
||||
ggml_backend_sched_reserve(new_clip->sched.get(), gf);
|
||||
for (size_t i = 0; i < new_clip->backend_ptrs.size(); ++i) {
|
||||
ggml_backend_t backend = new_clip->backend_ptrs[i];
|
||||
ggml_backend_buffer_type_t buft = new_clip->backend_buft[i];
|
||||
size_t size = ggml_backend_sched_get_buffer_size(new_clip->sched.get(), backend);
|
||||
if (size > 1) {
|
||||
LOG_INF("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
|
||||
ggml_backend_buft_name(buft),
|
||||
size / 1024.0 / 1024.0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return new_clip;
|
||||
@@ -2218,7 +2407,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
|
||||
return true;
|
||||
}
|
||||
|
||||
if (ctx->has_glm_projector) {
|
||||
if (ctx->has_glm_projector || ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
|
||||
res_imgs->size = 1;
|
||||
res_imgs->data = new clip_image_f32[res_imgs->size];
|
||||
clip_image_u8 resized_image;
|
||||
@@ -2407,12 +2596,6 @@ ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
|
||||
}
|
||||
|
||||
void clip_free(clip_ctx * ctx) {
|
||||
ggml_free(ctx->ctx_data);
|
||||
gguf_free(ctx->ctx_gguf);
|
||||
|
||||
ggml_backend_buffer_free(ctx->params_buffer);
|
||||
ggml_backend_free(ctx->backend);
|
||||
ggml_gallocr_free(ctx->compute_alloc);
|
||||
delete ctx;
|
||||
}
|
||||
|
||||
@@ -2608,8 +2791,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
}
|
||||
|
||||
// build the inference graph
|
||||
ggml_backend_sched_reset(ctx->sched.get());
|
||||
ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true);
|
||||
ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
|
||||
ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
|
||||
|
||||
// set inputs
|
||||
const auto & model = ctx->vision_model;
|
||||
@@ -2748,6 +2932,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
|
||||
free(positions_data);
|
||||
}
|
||||
else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
|
||||
// do nothing
|
||||
}
|
||||
else {
|
||||
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
|
||||
|
||||
@@ -2774,11 +2961,13 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
}
|
||||
}
|
||||
|
||||
if (ggml_backend_is_cpu(ctx->backend)) {
|
||||
ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
|
||||
}
|
||||
ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads);
|
||||
|
||||
ggml_backend_graph_compute(ctx->backend, gf);
|
||||
auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf);
|
||||
if (status != GGML_STATUS_SUCCESS) {
|
||||
LOG_ERR("%s: ggml_backend_sched_graph_compute failed with error %d\n", __func__, status);
|
||||
return false;
|
||||
}
|
||||
|
||||
// the last node is the embedding tensor
|
||||
struct ggml_tensor * embeddings = ggml_graph_node(gf, -1);
|
||||
@@ -2958,6 +3147,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
||||
if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
|
||||
return ctx->vision_model.mm_1_b->ne[0];
|
||||
}
|
||||
if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
|
||||
return ctx->vision_model.mm_input_proj_w->ne[0];
|
||||
}
|
||||
|
||||
std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
|
||||
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
|
||||
|
||||
@@ -39,8 +39,15 @@ struct clip_image_f32_batch {
    size_t size;
};

CLIP_API struct clip_ctx * clip_model_load    (const char * fname, int verbosity);
CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity);
struct clip_context_params {
    bool use_gpu;
    int verbosity;
};

// deprecated, use clip_init
CLIP_API struct clip_ctx * clip_model_load(const char * fname, int verbosity);

CLIP_API struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params);

CLIP_API void clip_free(struct clip_ctx * ctx);

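A hedged usage sketch of the new entry point declared above; open_mmproj is a hypothetical helper and "mmproj.gguf" is a placeholder path (real callers such as gemma3-cli.cpp below pass the --mmproj argument):

#include "clip.h"

static struct clip_ctx * open_mmproj(bool use_gpu) {
    struct clip_context_params params = {
        /* use_gpu   */ use_gpu,
        /* verbosity */ 1,
    };
    struct clip_ctx * ctx = clip_init("mmproj.gguf", params); // placeholder path
    if (ctx == nullptr) {
        return nullptr; // loading failed
    }
    return ctx; // the caller releases it with clip_free(ctx)
}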
@@ -89,6 +89,7 @@ def bytes_to_unicode():
|
||||
ap = argparse.ArgumentParser()
|
||||
ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True)
|
||||
ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
|
||||
ap.add_argument('--bigendian', action="store_true", default=False, help="Model is executed on big-endian machine")
|
||||
ap.add_argument("--text-only", action="store_true", required=False,
|
||||
help="Save a text-only model. It can't be used to encode images")
|
||||
ap.add_argument("--vision-only", action="store_true", required=False,
|
||||
@@ -191,7 +192,7 @@ output_dir = args.output_dir if args.output_dir is not None else dir_model
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
output_prefix = os.path.basename(output_dir).replace("ggml_", "")
|
||||
fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf")
|
||||
fout = GGUFWriter(path=fname_out, arch="clip")
|
||||
fout = GGUFWriter(path=fname_out, arch="clip", endianess=GGUFEndian.LITTLE if not args.bigendian else GGUFEndian.BIG)
|
||||
|
||||
fout.add_bool("clip.has_text_encoder", has_text_encoder)
|
||||
fout.add_bool("clip.has_vision_encoder", has_vision_encoder)
|
||||
|
||||
341
examples/llava/gemma3-cli.cpp
Normal file
@@ -0,0 +1,341 @@
|
||||
#include "arg.h"
|
||||
#include "log.h"
|
||||
#include "common.h"
|
||||
#include "sampling.h"
|
||||
#include "clip.h"
|
||||
#include "stb_image.h"
|
||||
#include "llama.h"
|
||||
#include "ggml.h"
|
||||
#include "console.h"
|
||||
|
||||
#include <vector>
|
||||
#include <limits.h>
|
||||
#include <inttypes.h>
|
||||
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
||||
#include <signal.h>
|
||||
#include <unistd.h>
|
||||
#elif defined (_WIN32)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#ifndef NOMINMAX
|
||||
#define NOMINMAX
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#include <signal.h>
|
||||
#endif
|
||||
|
||||
static bool g_is_generating = false;
|
||||
|
||||
/**
|
||||
* Please note that this is NOT production-ready code.
|
||||
* It is a playground for trying Gemma 3 vision capabilities.
|
||||
* For contributors: please keep this code simple and easy to understand.
|
||||
*/
|
||||
|
||||
static void show_additional_info(int /*argc*/, char ** argv) {
|
||||
LOG(
|
||||
"Experimental CLI for using Gemma 3 vision model\n\n"
|
||||
"Usage: %s [options] -m <model> --mmproj <mmproj> --image <image> -p <prompt>\n\n"
|
||||
" -m and --mmproj are required\n"
|
||||
" --image and -p are optional, if NOT provided, the CLI will run in chat mode\n",
|
||||
argv[0]
|
||||
);
|
||||
}
|
||||
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
||||
static void sigint_handler(int signo) {
|
||||
if (signo == SIGINT) {
|
||||
if (g_is_generating) {
|
||||
g_is_generating = false;
|
||||
} else {
|
||||
console::cleanup();
|
||||
LOG("\nInterrupted by user\n");
|
||||
_exit(130);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
struct gemma3_context {
|
||||
struct clip_ctx * ctx_clip = NULL;
|
||||
common_init_result llama_init;
|
||||
|
||||
llama_model * model;
|
||||
llama_context * lctx;
|
||||
const llama_vocab * vocab;
|
||||
llama_batch batch;
|
||||
|
||||
int n_threads = 1;
|
||||
llama_pos n_past = 0;
|
||||
|
||||
gemma3_context(common_params & params) : llama_init(common_init_from_params(params)) {
|
||||
model = llama_init.model.get();
|
||||
lctx = llama_init.context.get();
|
||||
vocab = llama_model_get_vocab(model);
|
||||
n_threads = params.cpuparams.n_threads;
|
||||
batch = llama_batch_init(params.n_batch, 0, 1);
|
||||
init_clip_model(params);
|
||||
}
|
||||
|
||||
void init_clip_model(common_params & params) {
|
||||
const char * clip_path = params.mmproj.c_str();
|
||||
ctx_clip = clip_model_load(clip_path, params.verbosity > 1);
|
||||
}
|
||||
|
||||
~gemma3_context() {
|
||||
clip_free(ctx_clip);
|
||||
}
|
||||
};
|
||||
|
||||
struct decode_embd_batch {
|
||||
std::vector<llama_pos> pos;
|
||||
std::vector<int32_t> n_seq_id;
|
||||
std::vector<llama_seq_id> seq_id_0;
|
||||
std::vector<llama_seq_id *> seq_ids;
|
||||
std::vector<int8_t> logits;
|
||||
llama_batch batch;
|
||||
decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
|
||||
pos .resize(n_tokens);
|
||||
n_seq_id.resize(n_tokens);
|
||||
seq_ids .resize(n_tokens + 1);
|
||||
logits .resize(n_tokens);
|
||||
seq_id_0.resize(1);
|
||||
seq_id_0[0] = seq_id;
|
||||
seq_ids [n_tokens] = nullptr;
|
||||
batch = {
|
||||
/*n_tokens =*/ n_tokens,
|
||||
/*tokens =*/ nullptr,
|
||||
/*embd =*/ embd,
|
||||
/*pos =*/ pos.data(),
|
||||
/*n_seq_id =*/ n_seq_id.data(),
|
||||
/*seq_id =*/ seq_ids.data(),
|
||||
/*logits =*/ logits.data(),
|
||||
};
|
||||
for (int i = 0; i < n_tokens; i++) {
|
||||
batch.pos [i] = pos_0 + i;
|
||||
batch.n_seq_id[i] = 1;
|
||||
batch.seq_id [i] = seq_id_0.data();
|
||||
batch.logits [i] = false;
|
||||
}
|
||||
}
|
||||
};
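decode_embd_batch packs n_tokens pre-computed embeddings into a llama_batch with consecutive positions on a single sequence. A condensed view of the call pattern eval_image uses below (lctx, image_embd_v, n_tokens and n_past are the variables from that function; causal attention is disabled while the image block is decoded):

// image_embd_v holds n_tokens * n_embd floats produced by clip_image_batch_encode
llama_set_causal_attn(lctx, false);                 // image tokens attend to each other bidirectionally
decode_embd_batch batch_img(image_embd_v.data(), n_tokens, n_past, /*seq_id*/ 0);
if (llama_decode(lctx, batch_img.batch)) {
    // handle decode failure
}
n_past += n_tokens;
llama_set_causal_attn(lctx, true);                  // restore causal attention for text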
|
||||
|
||||
static int eval_text(gemma3_context & ctx, std::string input, bool logits_last = false) {
|
||||
llama_tokens tokens = common_tokenize(ctx.lctx, input, false, true);
|
||||
common_batch_clear(ctx.batch);
|
||||
for (llama_token & t : tokens) {
|
||||
common_batch_add(ctx.batch, t, ctx.n_past++, {0}, false);
|
||||
}
|
||||
if (logits_last) {
|
||||
ctx.batch.logits[ctx.batch.n_tokens - 1] = true;
|
||||
}
|
||||
// LOG("eval_text (n_tokens = %d): %s\n", (int)tokens.size(), input.c_str());
|
||||
if (llama_decode(ctx.lctx, ctx.batch)) {
|
||||
LOG_ERR("Failed to decode text\n");
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int eval_image(gemma3_context & ctx, std::string & fname) {
|
||||
std::vector<float> image_embd_v;
|
||||
int n_embd = llama_model_n_embd(ctx.model);
|
||||
int n_tokens = 256;
|
||||
image_embd_v.resize(n_tokens * n_embd);
|
||||
|
||||
bool ok;
|
||||
struct clip_image_u8 * img_u8 = clip_image_u8_init();
|
||||
ok = clip_image_load_from_file(fname.c_str(), img_u8);
|
||||
if (!ok) {
|
||||
LOG_ERR("Unable to load image %s\n", fname.c_str());
|
||||
clip_image_u8_free(img_u8);
|
||||
return 2; // non-fatal error
|
||||
}
|
||||
|
||||
clip_image_f32_batch batch_f32;
|
||||
ok = clip_image_preprocess(ctx.ctx_clip, img_u8, &batch_f32);
|
||||
if (!ok) {
|
||||
LOG_ERR("Unable to preprocess image\n");
|
||||
clip_image_f32_batch_free(&batch_f32);
|
||||
clip_image_u8_free(img_u8);
|
||||
return 1;
|
||||
}
|
||||
|
||||
int64_t t0 = ggml_time_ms();
|
||||
LOG("Encoding image %s\n", fname.c_str());
|
||||
ok = clip_image_batch_encode(ctx.ctx_clip, ctx.n_threads, &batch_f32, image_embd_v.data());
|
||||
if (!ok) {
|
||||
LOG_ERR("Unable to encode image\n");
|
||||
clip_image_f32_batch_free(&batch_f32);
|
||||
clip_image_u8_free(img_u8);
|
||||
return 1;
|
||||
}
|
||||
LOG("Image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
|
||||
|
||||
clip_image_f32_batch_free(&batch_f32);
|
||||
clip_image_u8_free(img_u8);
|
||||
|
||||
// decode image embeddings
|
||||
int64_t t1 = ggml_time_ms();
|
||||
eval_text(ctx, "<start_of_image>");
|
||||
llama_set_causal_attn(ctx.lctx, false);
|
||||
decode_embd_batch batch_img(image_embd_v.data(), n_tokens, ctx.n_past, 0);
|
||||
if (llama_decode(ctx.lctx, batch_img.batch)) {
|
||||
LOG_ERR("failed to decode image\n");
|
||||
return 1;
|
||||
}
|
||||
ctx.n_past += n_tokens;
|
||||
llama_set_causal_attn(ctx.lctx, true);
|
||||
eval_text(ctx, "<end_of_image>");
|
||||
LOG("Image decoded in %" PRId64 " ms\n", ggml_time_ms() - t1);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int generate_response(gemma3_context & ctx, common_sampler * smpl, int n_predict) {
|
||||
for (int i = 0; i < n_predict; i++) {
|
||||
if (i > n_predict || !g_is_generating) {
|
||||
printf("\n");
|
||||
break;
|
||||
}
|
||||
|
||||
llama_token token_id = common_sampler_sample(smpl, ctx.lctx, -1);
|
||||
common_sampler_accept(smpl, token_id, true);
|
||||
|
||||
if (llama_vocab_is_eog(ctx.vocab, token_id)) {
|
||||
printf("\n");
|
||||
break; // end of generation
|
||||
}
|
||||
|
||||
printf("%s", common_token_to_piece(ctx.lctx, token_id).c_str());
|
||||
fflush(stdout);
|
||||
|
||||
// eval the token
|
||||
common_batch_clear(ctx.batch);
|
||||
common_batch_add(ctx.batch, token_id, ctx.n_past++, {0}, true);
|
||||
if (llama_decode(ctx.lctx, ctx.batch)) {
|
||||
LOG_ERR("failed to decode token\n");
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
ggml_time_init();
|
||||
|
||||
common_params params;
|
||||
params.sampling.temp = 0.2; // lower temp by default for better quality
|
||||
|
||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
common_init();
|
||||
|
||||
if (params.mmproj.empty()) {
|
||||
show_additional_info(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
|
||||
gemma3_context ctx(params);
|
||||
printf("%s: %s\n", __func__, params.model.c_str());
|
||||
|
||||
bool is_single_turn = !params.prompt.empty() && !params.image.empty();
|
||||
|
||||
struct common_sampler * smpl = common_sampler_init(ctx.model, params.sampling);
|
||||
int n_predict = params.n_predict < 0 ? INT_MAX : params.n_predict;
|
||||
|
||||
// ctrl+C handling
|
||||
{
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
||||
struct sigaction sigint_action;
|
||||
sigint_action.sa_handler = sigint_handler;
|
||||
sigemptyset (&sigint_action.sa_mask);
|
||||
sigint_action.sa_flags = 0;
|
||||
sigaction(SIGINT, &sigint_action, NULL);
|
||||
#elif defined (_WIN32)
|
||||
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
|
||||
return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
|
||||
};
|
||||
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
||||
#endif
|
||||
}
|
||||
|
||||
if (eval_text(ctx, "<bos>")) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (is_single_turn) {
|
||||
g_is_generating = true;
|
||||
if (eval_text(ctx, "<start_of_turn>user\n")) {
|
||||
return 1;
|
||||
}
|
||||
for (auto & fname : params.image) {
|
||||
if (eval_image(ctx, fname)) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
if (eval_text(ctx, params.prompt + "<end_of_turn><start_of_turn>model\n", true)) {
|
||||
return 1;
|
||||
}
|
||||
if (generate_response(ctx, smpl, n_predict)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
} else {
|
||||
LOG("\n Running in chat mode, available commands:");
|
||||
LOG("\n /image <path> load an image");
|
||||
LOG("\n /clear clear the chat history");
|
||||
LOG("\n /quit or /exit exit the program");
|
||||
LOG("\n");
|
||||
|
||||
if (eval_text(ctx, "<start_of_turn>user\n")) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
while (true) {
|
||||
g_is_generating = false;
|
||||
LOG("\n> ");
|
||||
console::set_display(console::user_input);
|
||||
std::string line;
|
||||
console::readline(line, false);
|
||||
console::set_display(console::reset);
|
||||
line = string_strip(line);
|
||||
if (line.empty()) {
|
||||
continue;
|
||||
}
|
||||
if (line == "/quit" || line == "/exit") {
|
||||
break;
|
||||
}
|
||||
if (line == "/clear") {
|
||||
ctx.n_past = 0;
|
||||
llama_kv_self_seq_rm(ctx.lctx, 0, 1, -1); // keep BOS
|
||||
LOG("Chat history cleared\n\n");
|
||||
continue;
|
||||
}
|
||||
g_is_generating = true;
|
||||
if (line.find("/image") == 0) {
|
||||
std::string image = line.substr(7);
|
||||
int res = eval_image(ctx, image);
|
||||
if (res == 2) {
|
||||
continue; // image not found
|
||||
}
|
||||
if (res) {
|
||||
return 1;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if (eval_text(ctx, line + "<end_of_turn><start_of_turn>model\n", true)) {
|
||||
return 1;
|
||||
}
|
||||
if (generate_response(ctx, smpl, n_predict)) {
|
||||
return 1;
|
||||
}
|
||||
if (eval_text(ctx, "<end_of_turn><start_of_turn>user\n")) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
307
examples/llava/gemma3_convert_encoder_to_gguf.py
Normal file
@@ -0,0 +1,307 @@
|
||||
import gguf
|
||||
import argparse
|
||||
import logging
|
||||
import sys
|
||||
import torch
|
||||
import json
|
||||
import os
|
||||
import numpy as np
|
||||
from typing import cast, ContextManager, Any, Iterator
|
||||
from pathlib import Path
|
||||
from torch import Tensor
|
||||
|
||||
logger = logging.getLogger("gemma3-mmproj")
|
||||
|
||||
|
||||
# (copied from convert_hf_to_gguf.py)
|
||||
# tree of lazy tensors
|
||||
class LazyTorchTensor(gguf.LazyBase):
|
||||
_tensor_type = torch.Tensor
|
||||
# to keep the type-checker happy
|
||||
dtype: torch.dtype
|
||||
shape: torch.Size
|
||||
|
||||
# only used when converting a torch.Tensor to a np.ndarray
|
||||
_dtype_map: dict[torch.dtype, type] = {
|
||||
torch.float16: np.float16,
|
||||
torch.float32: np.float32,
|
||||
}
|
||||
|
||||
# used for safetensors slices
|
||||
# ref: https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046
|
||||
# TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734
|
||||
_dtype_str_map: dict[str, torch.dtype] = {
|
||||
"F64": torch.float64,
|
||||
"F32": torch.float32,
|
||||
"BF16": torch.bfloat16,
|
||||
"F16": torch.float16,
|
||||
# "U64": torch.uint64,
|
||||
"I64": torch.int64,
|
||||
# "U32": torch.uint32,
|
||||
"I32": torch.int32,
|
||||
# "U16": torch.uint16,
|
||||
"I16": torch.int16,
|
||||
"U8": torch.uint8,
|
||||
"I8": torch.int8,
|
||||
"BOOL": torch.bool,
|
||||
"F8_E4M3": torch.float8_e4m3fn,
|
||||
"F8_E5M2": torch.float8_e5m2,
|
||||
}
|
||||
|
||||
def numpy(self) -> gguf.LazyNumpyTensor:
|
||||
dtype = self._dtype_map[self.dtype]
|
||||
return gguf.LazyNumpyTensor(
|
||||
meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape),
|
||||
args=(self,),
|
||||
func=(lambda s: s.numpy())
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: tuple[int, ...]) -> Tensor:
|
||||
return torch.empty(size=shape, dtype=dtype, device="meta")
|
||||
|
||||
@classmethod
|
||||
def from_safetensors_slice(cls, st_slice: Any) -> Tensor:
|
||||
dtype = cls._dtype_str_map[st_slice.get_dtype()]
|
||||
shape: tuple[int, ...] = tuple(st_slice.get_shape())
|
||||
lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[:])
|
||||
return cast(torch.Tensor, lazy)
|
||||
|
||||
@classmethod
|
||||
def __torch_function__(cls, func, types, args=(), kwargs=None):
|
||||
del types # unused
|
||||
|
||||
if kwargs is None:
|
||||
kwargs = {}
|
||||
|
||||
if func is torch.Tensor.numpy:
|
||||
return args[0].numpy()
|
||||
|
||||
return cls._wrap_fn(func)(*args, **kwargs)
|
||||
|
||||
|
||||
class Gemma3VisionTower:
|
||||
hparams: dict
|
||||
gguf_writer: gguf.GGUFWriter
|
||||
fname_out: Path
|
||||
ftype: gguf.LlamaFileType
|
||||
|
||||
@staticmethod
|
||||
def load_hparams(dir_model: Path):
|
||||
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
||||
return json.load(f)
|
||||
|
||||
@staticmethod
|
||||
def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]:
|
||||
part_names: list[str] = []
|
||||
for filename in os.listdir(dir_model):
|
||||
if filename.startswith(prefix) and filename.endswith(suffix):
|
||||
part_names.append(filename)
|
||||
part_names.sort()
|
||||
return part_names
|
||||
|
||||
def __init__(self,
|
||||
dir_model: Path,
|
||||
fname_out: Path,
|
||||
ftype: gguf.LlamaFileType,
|
||||
is_big_endian: bool,):
|
||||
hparams = Gemma3VisionTower.load_hparams(dir_model)
|
||||
self.hparams = hparams
|
||||
self.fname_out = fname_out
|
||||
self.ftype = ftype
|
||||
endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
|
||||
self.gguf_writer = gguf.GGUFWriter(path=None, arch="clip", endianess=endianess)
|
||||
|
||||
text_config = hparams["text_config"]
|
||||
vision_config = hparams["vision_config"]
|
||||
|
||||
assert hparams["architectures"][0] == "Gemma3ForConditionalGeneration"
|
||||
assert text_config is not None
|
||||
assert vision_config is not None
|
||||
|
||||
self.gguf_writer.add_string ("clip.projector_type", "gemma3")
|
||||
self.gguf_writer.add_bool ("clip.has_text_encoder", False)
|
||||
self.gguf_writer.add_bool ("clip.has_vision_encoder", True)
|
||||
self.gguf_writer.add_bool ("clip.has_llava_projector", False) # legacy
|
||||
self.gguf_writer.add_uint32 ("clip.vision.image_size", vision_config["image_size"])
|
||||
self.gguf_writer.add_uint32 ("clip.vision.patch_size", vision_config["patch_size"])
|
||||
self.gguf_writer.add_uint32 ("clip.vision.embedding_length", vision_config["hidden_size"])
|
||||
self.gguf_writer.add_uint32 ("clip.vision.feed_forward_length", vision_config["intermediate_size"])
|
||||
self.gguf_writer.add_uint32 ("clip.vision.projection_dim", text_config["hidden_size"])
|
||||
self.gguf_writer.add_uint32 ("clip.vision.block_count", vision_config["num_hidden_layers"])
|
||||
self.gguf_writer.add_uint32 ("clip.vision.attention.head_count", vision_config["num_attention_heads"])
|
||||
self.gguf_writer.add_float32("clip.vision.attention.layer_norm_epsilon", vision_config.get("layer_norm_eps", 1e-6))
|
||||
# default values taken from HF transformers code
|
||||
self.gguf_writer.add_array ("clip.vision.image_mean", [0.5, 0.5, 0.5])
|
||||
self.gguf_writer.add_array ("clip.vision.image_std", [0.5, 0.5, 0.5])
|
||||
self.gguf_writer.add_bool ("clip.use_gelu", True)
|
||||
|
||||
# load tensors
|
||||
for name, data_torch in self.get_tensors(dir_model):
|
||||
# convert any unsupported data types to float32
|
||||
if data_torch.dtype not in (torch.float16, torch.float32):
|
||||
data_torch = data_torch.to(torch.float32)
|
||||
self.add_tensor(name, data_torch)
|
||||
|
||||
def get_tensors(self, dir_model: Path) -> Iterator[tuple[str, Tensor]]:
|
||||
part_names = Gemma3VisionTower.get_model_part_names(dir_model, "model", ".safetensors")
|
||||
tensor_names_from_parts: set[str] = set()
|
||||
for part_name in part_names:
|
||||
logger.info(f"gguf: loading model part '{part_name}'")
|
||||
from safetensors import safe_open
|
||||
ctx = cast(ContextManager[Any], safe_open(dir_model / part_name, framework="pt", device="cpu"))
|
||||
with ctx as model_part:
|
||||
tensor_names_from_parts.update(model_part.keys())
|
||||
|
||||
for name in model_part.keys():
|
||||
data = model_part.get_slice(name)
|
||||
data = LazyTorchTensor.from_safetensors_slice(data)
|
||||
yield name, data
|
||||
|
||||
def add_tensor(self, name: str, data_torch: Tensor):
|
||||
is_1d = len(data_torch.shape) == 1
|
||||
is_embd = ".embeddings." in name
|
||||
old_dtype = data_torch.dtype
|
||||
can_quantize = not is_1d and not is_embd
|
||||
data_qtype = gguf.GGMLQuantizationType.F32
|
||||
|
||||
# this is to support old checkpoint
|
||||
# TODO: remove this when we have the final model
|
||||
name = name.replace("vision_model.vision_model.", "vision_tower.vision_model.")
|
||||
name = name.replace("multimodal_projector.", "multi_modal_projector.")
|
||||
|
||||
# filter only vision tensors
|
||||
if not name.startswith("vision_tower.vision_model.") and not name.startswith("multi_modal_projector."):
|
||||
return
|
||||
# prefix
|
||||
name = name.replace("vision_tower.vision_model.encoder.layers.", "v.blk.")
|
||||
name = name.replace("vision_tower.vision_model.", "v.")
|
||||
# projector and input embd
|
||||
name = name.replace(".embeddings.patch_embedding.", ".patch_embd.")
|
||||
name = name.replace(".embeddings.position_embedding.", ".position_embd.")
|
||||
name = name.replace(
|
||||
"multi_modal_projector.mm_input_projection_weight",
|
||||
"mm.input_projection.weight"
|
||||
)
|
||||
name = name.replace(
|
||||
"multi_modal_projector.mm_soft_emb_norm.weight",
|
||||
"mm.soft_emb_norm.weight"
|
||||
)
|
||||
name = name.replace("post_layernorm.", "post_ln.")
|
||||
# each block
|
||||
name = name.replace(".self_attn.k_proj.", ".attn_k.")
|
||||
name = name.replace(".self_attn.v_proj.", ".attn_v.")
|
||||
name = name.replace(".self_attn.q_proj.", ".attn_q.")
|
||||
name = name.replace(".self_attn.out_proj.", ".attn_out.")
|
||||
name = name.replace(".layer_norm1.", ".ln1.")
|
||||
name = name.replace(".layer_norm2.", ".ln2.")
|
||||
name = name.replace(".mlp.fc1.", ".ffn_down.")
|
||||
name = name.replace(".mlp.fc2.", ".ffn_up.")
|
||||
|
||||
if can_quantize:
|
||||
if self.ftype == gguf.LlamaFileType.ALL_F32:
|
||||
data_qtype = gguf.GGMLQuantizationType.F32
|
||||
elif self.ftype == gguf.LlamaFileType.MOSTLY_F16:
|
||||
data_qtype = gguf.GGMLQuantizationType.F16
|
||||
elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
|
||||
data_qtype = gguf.GGMLQuantizationType.BF16
|
||||
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
|
||||
data_qtype = gguf.GGMLQuantizationType.Q8_0
|
||||
else:
|
||||
raise ValueError(f"Unsupported file type: {self.ftype}")
|
||||
|
||||
# correct the norm value; only this "soft_emb_norm" needs to be corrected as it is part of the Gemma projector
|
||||
# the other norm values are part of SigLIP model, and they are already correct
|
||||
# ref code: Gemma3RMSNorm
|
||||
if "soft_emb_norm.weight" in name:
|
||||
logger.info(f"Correcting norm value for '{name}'")
|
||||
data_torch = data_torch + 1
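# note: Gemma's RMSNorm scales by (1 + w) after normalizing,
#   y = x / sqrt(mean(x^2) + eps) * (1 + w)
# so exporting w' = w + 1 lets the C++ graph builder reuse the plain rms_norm + mul path shown above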
|
||||
|
||||
data = data_torch.numpy()
|
||||
|
||||
try:
|
||||
data = gguf.quants.quantize(data, data_qtype)
|
||||
except Exception as e:
|
||||
logger.error(f"Error quantizing tensor '{name}': {e}, fallback to F16")
|
||||
data_qtype = gguf.GGMLQuantizationType.F16
|
||||
data = gguf.quants.quantize(data, data_qtype)
|
||||
|
||||
# reverse shape to make it similar to the internal ggml dimension order
|
||||
shape_str = f"{{{', '.join(str(n) for n in reversed(data_torch.shape))}}}"
|
||||
logger.info(f"{f'%-32s' % f'{name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
|
||||
|
||||
self.gguf_writer.add_tensor(name, data, raw_dtype=data_qtype)
|
||||
|
||||
def write(self):
|
||||
self.gguf_writer.write_header_to_file(path=self.fname_out)
|
||||
self.gguf_writer.write_kv_data_to_file()
|
||||
self.gguf_writer.write_tensors_to_file(progress=True)
|
||||
self.gguf_writer.close()
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Convert Gemma 3 vision tower safetensors to GGUF format",)
|
||||
parser.add_argument(
|
||||
"--outfile", type=Path, default="mmproj.gguf",
|
||||
help="path to write to",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0"], default="f16",
|
||||
help="output format",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--bigendian", action="store_true",
|
||||
help="model is executed on big endian machine",
|
||||
)
|
||||
parser.add_argument(
|
||||
"model", type=Path,
|
||||
help="directory containing model file",
|
||||
nargs="?",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--verbose", action="store_true",
|
||||
help="increase output verbosity",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
if args.model is None:
|
||||
parser.error("the following arguments are required: model")
|
||||
return args
|
||||
|
||||
|
||||
def main() -> None:
|
||||
args = parse_args()
|
||||
|
||||
if args.verbose:
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
else:
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
dir_model = args.model
|
||||
|
||||
if not dir_model.is_dir():
|
||||
logger.error(f'Error: {args.model} is not a directory')
|
||||
sys.exit(1)
|
||||
|
||||
ftype_map: dict[str, gguf.LlamaFileType] = {
|
||||
"f32": gguf.LlamaFileType.ALL_F32,
|
||||
"f16": gguf.LlamaFileType.MOSTLY_F16,
|
||||
"bf16": gguf.LlamaFileType.MOSTLY_BF16,
|
||||
"q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
|
||||
}
|
||||
|
||||
logger.info(f"Loading model: {dir_model.name}")
|
||||
|
||||
with torch.inference_mode():
|
||||
gemma3_vision_tower = Gemma3VisionTower(
|
||||
dir_model=dir_model,
|
||||
fname_out=args.outfile,
|
||||
ftype=ftype_map[args.outtype],
|
||||
is_big_endian=args.bigendian,
|
||||
)
|
||||
gemma3_vision_tower.write()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
||||
@@ -86,7 +86,11 @@ static struct clip_ctx * clip_init_context(common_params * params) {
|
||||
if (prompt.empty()) {
|
||||
prompt = "describe the image in detail.";
|
||||
}
|
||||
auto * ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
|
||||
struct clip_context_params clip_params = {
|
||||
/* use_gpu */ params->n_gpu_layers != 0,
|
||||
/* verbosity */ params->verbosity,
|
||||
};
|
||||
auto * ctx_clip = clip_init(clip_path, clip_params);
|
||||
return ctx_clip;
|
||||
}
|
||||
|
||||
@@ -148,19 +152,34 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
|
||||
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
|
||||
eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
|
||||
if (num_image_embeds > 1) {
|
||||
size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
|
||||
eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
|
||||
for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
|
||||
for (size_t j = 0; j < num_image_embeds_col; ++j) {
|
||||
eval_string(ctx_llava->ctx_llama, std::string("<image>").c_str(), params->n_batch, &n_past, false);
|
||||
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
|
||||
eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
|
||||
if (j == num_image_embeds_col - 1) {
|
||||
eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
|
||||
if (has_minicpmv_projector == 2) {
|
||||
size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
|
||||
eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
|
||||
for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
|
||||
for (size_t j = 0; j < num_image_embeds_col; ++j) {
|
||||
eval_string(ctx_llava->ctx_llama, std::string("<image>").c_str(), params->n_batch, &n_past, false);
|
||||
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
|
||||
eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
|
||||
if (j == num_image_embeds_col - 1) {
|
||||
eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
|
||||
}
|
||||
}
|
||||
}
|
||||
eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
|
||||
}
|
||||
else if (has_minicpmv_projector == 3 || has_minicpmv_projector == 4) {
|
||||
size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
|
||||
for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
|
||||
for (size_t j = 0; j < num_image_embeds_col; ++j) {
|
||||
eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
|
||||
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
|
||||
eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
|
||||
if (j == num_image_embeds_col - 1) {
|
||||
eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
|
||||
}
|
||||
LOG_INF("%s: image token past: %d\n", __func__, n_past);
|
||||
}
|
||||
|
||||
@@ -597,7 +597,6 @@ elif args.minicpmv_projector is not None:
|
||||
fname_middle = "mmproj-"
|
||||
has_text_encoder = False
|
||||
has_minicpmv_projector = True
|
||||
minicpmv_version = 4
|
||||
elif args.vision_only:
|
||||
fname_middle = "vision-"
|
||||
has_text_encoder = False
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
|
||||
struct ngram_data {
|
||||
bool active = false;
|
||||
@@ -95,7 +96,7 @@ int main(int argc, char ** argv) {
|
||||
llama_decode(ctx, llama_batch_get_one(&inp.back(), 1));
|
||||
|
||||
for (int s = 1; s < W + G + 1; ++s) {
|
||||
llama_kv_cache_seq_cp(ctx, 0, s, -1, -1);
|
||||
llama_kv_self_seq_cp(ctx, 0, s, -1, -1);
|
||||
}
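From this point the example programs apply a mechanical rename of the KV-cache helpers to their llama_kv_self_* counterparts; the argument lists stay identical, for example:

llama_kv_cache_seq_cp(ctx, 0, s, -1, -1);   // old helper name
llama_kv_self_seq_cp (ctx, 0, s, -1, -1);   // new helper name, same arguments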
|
||||
|
||||
const auto t_enc_end = ggml_time_us();
|
||||
@@ -437,17 +438,17 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// KV cache management
|
||||
// if no verification token matched, we simply remove all cells from this batch -> no fragmentation
|
||||
llama_kv_cache_seq_rm(ctx, -1, n_past, -1);
|
||||
llama_kv_self_seq_rm(ctx, -1, n_past, -1);
|
||||
|
||||
if (seq_id_best != 0) {
|
||||
// if a verification token matched, we keep the best sequence and remove the rest
|
||||
// this leads to some KV cache fragmentation
|
||||
llama_kv_cache_seq_keep(ctx, seq_id_best);
|
||||
llama_kv_cache_seq_cp (ctx, seq_id_best, 0, -1, -1);
|
||||
llama_kv_cache_seq_rm (ctx, seq_id_best, -1, -1);
|
||||
llama_kv_self_seq_keep(ctx, seq_id_best);
|
||||
llama_kv_self_seq_cp (ctx, seq_id_best, 0, -1, -1);
|
||||
llama_kv_self_seq_rm (ctx, seq_id_best, -1, -1);
|
||||
|
||||
for (int s = 1; s < W + G + 1; ++s) {
|
||||
llama_kv_cache_seq_cp(ctx, 0, s, -1, -1);
|
||||
llama_kv_self_seq_cp(ctx, 0, s, -1, -1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -192,7 +192,7 @@ int main(int argc, char ** argv){
|
||||
|
||||
// KV cache management
|
||||
// clean the cache of draft tokens that weren't accepted
|
||||
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
|
||||
llama_kv_self_seq_rm(ctx, 0, n_past, -1);
|
||||
|
||||
common_batch_clear(batch_tgt);
|
||||
common_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);
|
||||
|
||||
@@ -45,8 +45,8 @@ static void print_usage(int argc, char ** argv) {
|
||||
(void) argc;
|
||||
|
||||
LOG("\nexample usage:\n");
|
||||
LOG("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]);
|
||||
LOG("\n chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]);
|
||||
LOG("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128 -no-cnv\n", argv[0]);
|
||||
LOG("\n chat (conversation): %s -m your_model.gguf -sys \"You are a helpful assistant\"\n", argv[0]);
|
||||
LOG("\n");
|
||||
}
|
||||
|
||||
@@ -217,8 +217,8 @@ int main(int argc, char ** argv) {
|
||||
// print chat template example in conversation mode
|
||||
if (params.conversation_mode) {
|
||||
if (params.enable_chat_template) {
|
||||
if (!params.prompt.empty()) {
|
||||
LOG_WRN("*** User-specified prompt in conversation mode will be ignored, did you mean to set --system-prompt (-sys) instead?\n");
|
||||
if (!params.prompt.empty() && params.system_prompt.empty()) {
|
||||
LOG_WRN("*** User-specified prompt will pre-start conversation, did you mean to set --system-prompt (-sys) instead?\n");
|
||||
}
|
||||
|
||||
LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(chat_templates.get(), params.use_jinja).c_str());
|
||||
@@ -265,7 +265,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
std::vector<llama_token> embd_inp;
|
||||
|
||||
bool waiting_for_first_input = params.conversation_mode && params.enable_chat_template && params.system_prompt.empty();
|
||||
bool waiting_for_first_input = false;
|
||||
auto chat_add_and_format = [&chat_msgs, &chat_templates](const std::string & role, const std::string & content) {
|
||||
common_chat_msg new_msg;
|
||||
new_msg.role = role;
|
||||
@@ -276,22 +276,34 @@ int main(int argc, char ** argv) {
|
||||
return formatted;
|
||||
};
|
||||
|
||||
std::string prompt;
|
||||
{
|
||||
std::string prompt;
|
||||
|
||||
if (params.conversation_mode && params.enable_chat_template) {
|
||||
// format the system prompt in conversation mode (will use template default if empty)
|
||||
prompt = params.system_prompt;
|
||||
if (!params.system_prompt.empty()) {
|
||||
// format the system prompt (will use template default if empty)
|
||||
chat_add_and_format("system", params.system_prompt);
|
||||
}
|
||||
|
||||
if (!prompt.empty()) {
|
||||
prompt = chat_add_and_format("system", prompt);
|
||||
if (!params.prompt.empty()) {
|
||||
// format and append the user prompt
|
||||
chat_add_and_format("user", params.prompt);
|
||||
} else {
|
||||
waiting_for_first_input = true;
|
||||
}
|
||||
|
||||
if (!params.system_prompt.empty() || !params.prompt.empty()) {
|
||||
common_chat_templates_inputs inputs;
|
||||
inputs.messages = chat_msgs;
|
||||
inputs.add_generation_prompt = !params.prompt.empty();
|
||||
|
||||
prompt = common_chat_templates_apply(chat_templates.get(), inputs).prompt;
|
||||
}
|
||||
} else {
|
||||
// otherwise use the prompt as is
|
||||
prompt = params.prompt;
|
||||
}
|
||||
|
||||
if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
|
||||
if (params.interactive_first || !prompt.empty() || session_tokens.empty()) {
|
||||
LOG_DBG("tokenize the prompt\n");
|
||||
embd_inp = common_tokenize(ctx, prompt, true, true);
|
||||
} else {
|
||||
@@ -304,7 +316,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// Should not run without any tokens
|
||||
if (!params.conversation_mode && embd_inp.empty()) {
|
||||
if (!waiting_for_first_input && embd_inp.empty()) {
|
||||
if (add_bos) {
|
||||
embd_inp.push_back(llama_vocab_bos(vocab));
|
||||
LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
|
||||
@@ -342,7 +354,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// remove any "future" tokens that we might have inherited from the previous session
|
||||
llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1);
|
||||
llama_kv_self_seq_rm(ctx, -1, n_matching_session_tokens, -1);
|
||||
}
|
||||
|
||||
LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n",
|
||||
@@ -364,7 +376,12 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
if (params.conversation_mode) {
|
||||
params.interactive_first = true;
|
||||
if (params.single_turn && !params.prompt.empty()) {
|
||||
params.interactive = false;
|
||||
params.interactive_first = false;
|
||||
} else {
|
||||
params.interactive_first = true;
|
||||
}
|
||||
}
|
||||
|
||||
// enable interactive mode if interactive start is specified
|
||||
@@ -585,8 +602,8 @@ int main(int argc, char ** argv) {
|
||||
LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
|
||||
n_past, n_left, n_ctx, params.n_keep, n_discard);
|
||||
|
||||
llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
|
||||
llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
|
||||
llama_kv_self_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
|
||||
llama_kv_self_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
|
||||
|
||||
n_past -= n_discard;
|
||||
|
||||
@@ -609,9 +626,9 @@ int main(int argc, char ** argv) {
|
||||
LOG_DBG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
|
||||
LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);
|
||||
|
||||
llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd);
|
||||
llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
|
||||
llama_kv_cache_seq_add(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd);
|
||||
llama_kv_self_seq_add(ctx, 0, ga_i, n_past, ib*bd);
|
||||
llama_kv_self_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
|
||||
llama_kv_self_seq_add(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd);
|
||||
|
||||
n_past -= bd;
|
||||
|
||||
@@ -808,6 +825,11 @@ int main(int argc, char ** argv) {
|
||||
if (params.conversation_mode && !waiting_for_first_input) {
|
||||
const auto id = common_sampler_last(smpl);
|
||||
assistant_ss << common_token_to_piece(ctx, id, false);
|
||||
|
||||
if (!prompt.empty()) {
|
||||
prompt.clear();
|
||||
is_interacting = false;
|
||||
}
|
||||
}
|
||||
|
||||
if ((n_past > 0 || waiting_for_first_input) && is_interacting) {
|
||||
@@ -905,6 +927,11 @@ int main(int argc, char ** argv) {
|
||||
common_sampler_reset(smpl);
|
||||
}
|
||||
is_interacting = false;
|
||||
|
||||
if (waiting_for_first_input && params.single_turn) {
|
||||
params.interactive = false;
|
||||
params.interactive_first = false;
|
||||
}
|
||||
waiting_for_first_input = false;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <ctime>
|
||||
#include <algorithm>
|
||||
|
||||
// trim whitespace from the beginning and end of a string
|
||||
static std::string trim(const std::string & str) {
|
||||
@@ -201,7 +202,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// assign the system KV cache to all parallel sequences
|
||||
for (int32_t i = 1; i <= n_clients; ++i) {
|
||||
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
|
||||
llama_kv_self_seq_cp(ctx, 0, i, -1, -1);
|
||||
}
|
||||
|
||||
LOG_INF("\n");
|
||||
@@ -233,9 +234,9 @@ int main(int argc, char ** argv) {
|
||||
if (batch.n_tokens == 0) {
|
||||
// all sequences have ended - clear the entire KV cache
|
||||
for (int i = 1; i <= n_clients; ++i) {
|
||||
llama_kv_cache_seq_rm(ctx, i, -1, -1);
|
||||
llama_kv_self_seq_rm(ctx, i, -1, -1);
|
||||
// but keep the system prompt
|
||||
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
|
||||
llama_kv_self_seq_cp(ctx, 0, i, -1, -1);
|
||||
}
|
||||
|
||||
LOG_INF("%s: clearing the KV cache\n", __func__);
|
||||
@@ -371,8 +372,8 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// delete only the generated part of the sequence, i.e. keep the system prompt in the cache
|
||||
llama_kv_cache_seq_rm(ctx, client.id + 1, -1, -1);
|
||||
llama_kv_cache_seq_cp(ctx, 0, client.id + 1, -1, -1);
|
||||
llama_kv_self_seq_rm(ctx, client.id + 1, -1, -1);
|
||||
llama_kv_self_seq_cp(ctx, 0, client.id + 1, -1, -1);
|
||||
|
||||
const auto t_main_end = ggml_time_us();
|
||||
|
||||
|
||||
@@ -7,6 +7,7 @@
#include <cstdio>
#include <string>
#include <vector>
#include <algorithm>

static void print_usage(int, char ** argv) {
LOG("\nexample usage:\n");
@@ -132,11 +133,11 @@ int main(int argc, char ** argv) {
const int ib = i/n_batch - 1;
const int bd = n_batch_grp*(n_grp - 1);

llama_kv_cache_seq_add (ctx, 0, n_past - n_batch, n_past, ib*bd);
llama_kv_cache_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
llama_kv_cache_update (ctx);
llama_kv_self_seq_add (ctx, 0, n_past - n_batch, n_past, ib*bd);
llama_kv_self_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
llama_kv_self_update (ctx);

n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
}

common_batch_clear(batch);
@@ -166,12 +167,12 @@ int main(int argc, char ** argv) {

LOG_INF("%s: shifting KV cache with %d\n", __func__, n_discard);

llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
//llama_kv_cache_defrag (ctx);
llama_kv_cache_update (ctx);
llama_kv_self_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
//llama_kv_self_defrag (ctx);
llama_kv_self_update (ctx);

n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;

common_batch_clear(batch);

@@ -197,12 +198,12 @@ int main(int argc, char ** argv) {
if (n_discard > 0) {
LOG_INF("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);

llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
//llama_kv_cache_defrag (ctx);
llama_kv_cache_update (ctx);
llama_kv_self_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
//llama_kv_self_defrag (ctx);
llama_kv_self_update (ctx);

n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
}
}

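Note on the hunks above: the passkey example renames the context-shift calls to `llama_kv_self_*`. The following is a minimal sketch of that shift sequence, assuming the caller tracks `n_keep`, `n_discard` and `n_ctx` as the example does; the function signatures are taken from the diff itself and not verified against the headers.

```cpp
// Sketch only: the KV shift performed in the hunks above, consolidated into one helper.
#include "llama.h"

static int shift_kv_cache(llama_context * ctx, int n_keep, int n_discard, int n_ctx) {
    // drop n_discard tokens right after the kept prefix ...
    llama_kv_self_seq_rm (ctx, 0, n_keep, n_keep + n_discard);
    // ... slide the remaining tokens back over the gap ...
    llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
    // ... and apply the pending shift to the cache
    llama_kv_self_update (ctx);

    // new logical end of the sequence
    return llama_kv_self_seq_pos_max(ctx, 0) + 1;
}
```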
@@ -361,7 +361,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params
const auto t_start = std::chrono::high_resolution_clock::now();

// clear the KV cache
llama_kv_cache_clear(ctx);
llama_kv_self_clear(ctx);

llama_batch batch = llama_batch_init(n_batch, 0, 1);

@@ -547,7 +547,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params &
const auto t_start = std::chrono::high_resolution_clock::now();

// clear the KV cache
llama_kv_cache_clear(ctx);
llama_kv_self_clear(ctx);

for (int j = 0; j < num_batches; ++j) {
const int batch_start = start + j * n_batch;
@@ -924,7 +924,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) {
return;
}

llama_kv_cache_clear(ctx);
llama_kv_self_clear(ctx);

// decode all tasks [i0, i1)
if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
@@ -1203,7 +1203,7 @@ static void winogrande_score(llama_context * ctx, const common_params & params)
return;
}

llama_kv_cache_clear(ctx);
llama_kv_self_clear(ctx);

// decode all tasks [i0, i1)
if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
@@ -1575,7 +1575,7 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par
return;
}

llama_kv_cache_clear(ctx);
llama_kv_self_clear(ctx);

// decode all tasks [i0, i1)
if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
@@ -1765,7 +1765,7 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
}

// clear the KV cache
llama_kv_cache_clear(ctx);
llama_kv_self_clear(ctx);

llama_batch batch = llama_batch_init(n_batch, 0, 1);

@@ -1,6 +1,6 @@
#include "ggml.h"
#include "llama.h"
#include "llama-context.h"
#include "llama-model.h"
#include "common.h"

#include <algorithm>
@@ -328,7 +328,7 @@ int main(int argc, char ** argv) {
}
}

const auto & tensors = llama_internal_get_tensor_map(ctx);
const auto & tensors = llama_internal_get_tensor_map(model);

// check layer tensors
int included_layers = 0;

@@ -8,6 +8,7 @@
#include <unordered_map>
#include <fstream>
#include <cmath>
#include <cctype>

struct quant_option {
std::string name;

@@ -83,7 +83,7 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke

static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
// clear previous kv_cache values (irrelevant for embeddings)
llama_kv_cache_clear(ctx);
llama_kv_self_clear(ctx);

// run model
LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);

@@ -79,6 +79,7 @@ class Opt {
|
||||
ctx_params = llama_context_default_params();
|
||||
model_params = llama_model_default_params();
|
||||
context_size_default = ctx_params.n_batch;
|
||||
n_threads_default = ctx_params.n_threads;
|
||||
ngl_default = model_params.n_gpu_layers;
|
||||
common_params_sampling sampling;
|
||||
temperature_default = sampling.temp;
|
||||
@@ -104,6 +105,7 @@ class Opt {
|
||||
|
||||
ctx_params.n_batch = context_size >= 0 ? context_size : context_size_default;
|
||||
ctx_params.n_ctx = ctx_params.n_batch;
|
||||
ctx_params.n_threads = ctx_params.n_threads_batch = n_threads >= 0 ? n_threads : n_threads_default;
|
||||
model_params.n_gpu_layers = ngl >= 0 ? ngl : ngl_default;
|
||||
temperature = temperature >= 0 ? temperature : temperature_default;
|
||||
|
||||
@@ -116,12 +118,12 @@ class Opt {
|
||||
std::string chat_template_file;
|
||||
std::string user;
|
||||
bool use_jinja = false;
|
||||
int context_size = -1, ngl = -1;
|
||||
int context_size = -1, ngl = -1, n_threads = -1;
|
||||
float temperature = -1;
|
||||
bool verbose = false;
|
||||
|
||||
private:
|
||||
int context_size_default = -1, ngl_default = -1;
|
||||
int context_size_default = -1, ngl_default = -1, n_threads_default = -1;
|
||||
float temperature_default = -1;
|
||||
bool help = false;
|
||||
|
||||
@@ -159,53 +161,94 @@ class Opt {
|
||||
return 0;
|
||||
}
|
||||
|
||||
int parse_options_with_value(int argc, const char ** argv, int & i, bool & options_parsing) {
|
||||
if (options_parsing && (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0)) {
|
||||
if (handle_option_with_value(argc, argv, i, context_size) == 1) {
|
||||
return 1;
|
||||
}
|
||||
} else if (options_parsing &&
|
||||
(strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "-ngl") == 0 || strcmp(argv[i], "--ngl") == 0)) {
|
||||
if (handle_option_with_value(argc, argv, i, ngl) == 1) {
|
||||
return 1;
|
||||
}
|
||||
} else if (options_parsing && (strcmp(argv[i], "-t") == 0 || strcmp(argv[i], "--threads") == 0)) {
|
||||
if (handle_option_with_value(argc, argv, i, n_threads) == 1) {
|
||||
return 1;
|
||||
}
|
||||
} else if (options_parsing && strcmp(argv[i], "--temp") == 0) {
|
||||
if (handle_option_with_value(argc, argv, i, temperature) == 1) {
|
||||
return 1;
|
||||
}
|
||||
} else if (options_parsing && strcmp(argv[i], "--chat-template-file") == 0) {
|
||||
if (handle_option_with_value(argc, argv, i, chat_template_file) == 1) {
|
||||
return 1;
|
||||
}
|
||||
use_jinja = true;
|
||||
} else {
|
||||
return 2;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int parse_options(const char ** argv, int & i, bool & options_parsing) {
|
||||
if (options_parsing && (parse_flag(argv, i, "-v", "--verbose") || parse_flag(argv, i, "-v", "--log-verbose"))) {
|
||||
verbose = true;
|
||||
} else if (options_parsing && strcmp(argv[i], "--jinja") == 0) {
|
||||
use_jinja = true;
|
||||
} else if (options_parsing && parse_flag(argv, i, "-h", "--help")) {
|
||||
help = true;
|
||||
return 0;
|
||||
} else if (options_parsing && strcmp(argv[i], "--") == 0) {
|
||||
options_parsing = false;
|
||||
} else {
|
||||
return 2;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int parse_positional_args(const char ** argv, int & i, int & positional_args_i) {
|
||||
if (positional_args_i == 0) {
|
||||
if (!argv[i][0] || argv[i][0] == '-') {
|
||||
return 1;
|
||||
}
|
||||
|
||||
++positional_args_i;
|
||||
model_ = argv[i];
|
||||
} else if (positional_args_i == 1) {
|
||||
++positional_args_i;
|
||||
user = argv[i];
|
||||
} else {
|
||||
user += " " + std::string(argv[i]);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int parse(int argc, const char ** argv) {
|
||||
bool options_parsing = true;
|
||||
for (int i = 1, positional_args_i = 0; i < argc; ++i) {
|
||||
if (options_parsing && (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0)) {
|
||||
if (handle_option_with_value(argc, argv, i, context_size) == 1) {
|
||||
return 1;
|
||||
}
|
||||
} else if (options_parsing &&
|
||||
(strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "-ngl") == 0 || strcmp(argv[i], "--ngl") == 0)) {
|
||||
if (handle_option_with_value(argc, argv, i, ngl) == 1) {
|
||||
return 1;
|
||||
}
|
||||
} else if (options_parsing && strcmp(argv[i], "--temp") == 0) {
|
||||
if (handle_option_with_value(argc, argv, i, temperature) == 1) {
|
||||
return 1;
|
||||
}
|
||||
} else if (options_parsing &&
|
||||
(parse_flag(argv, i, "-v", "--verbose") || parse_flag(argv, i, "-v", "--log-verbose"))) {
|
||||
verbose = true;
|
||||
} else if (options_parsing && strcmp(argv[i], "--jinja") == 0) {
|
||||
use_jinja = true;
|
||||
} else if (options_parsing && strcmp(argv[i], "--chat-template-file") == 0){
|
||||
if (handle_option_with_value(argc, argv, i, chat_template_file) == 1) {
|
||||
return 1;
|
||||
}
|
||||
use_jinja = true;
|
||||
} else if (options_parsing && parse_flag(argv, i, "-h", "--help")) {
|
||||
help = true;
|
||||
return 0;
|
||||
} else if (options_parsing && strcmp(argv[i], "--") == 0) {
|
||||
options_parsing = false;
|
||||
} else if (positional_args_i == 0) {
|
||||
if (!argv[i][0] || argv[i][0] == '-') {
|
||||
return 1;
|
||||
}
|
||||
int ret = parse_options_with_value(argc, argv, i, options_parsing);
|
||||
if (ret == 0) {
|
||||
continue;
|
||||
} else if (ret == 1) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
++positional_args_i;
|
||||
model_ = argv[i];
|
||||
} else if (positional_args_i == 1) {
|
||||
++positional_args_i;
|
||||
user = argv[i];
|
||||
} else {
|
||||
user += " " + std::string(argv[i]);
|
||||
ret = parse_options(argv, i, options_parsing);
|
||||
if (ret == 0) {
|
||||
continue;
|
||||
} else if (ret == 1) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (parse_positional_args(argv, i, positional_args_i)) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
if (model_.empty()){
|
||||
if (model_.empty()) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -232,6 +275,8 @@ class Opt {
|
||||
" Number of GPU layers (default: %d)\n"
|
||||
" --temp <value>\n"
|
||||
" Temperature (default: %.1f)\n"
|
||||
" -t, --threads <value>\n"
|
||||
" Number of threads to use during generation (default: %d)\n"
|
||||
" -v, --verbose, --log-verbose\n"
|
||||
" Set verbosity level to infinity (i.e. log all messages, useful for debugging)\n"
|
||||
" -h, --help\n"
|
||||
@@ -260,7 +305,7 @@ class Opt {
|
||||
" llama-run file://some-file3.gguf\n"
|
||||
" llama-run --ngl 999 some-file4.gguf\n"
|
||||
" llama-run --ngl 999 some-file5.gguf Hello World\n",
|
||||
context_size_default, ngl_default, temperature_default);
|
||||
context_size_default, ngl_default, temperature_default, n_threads_default);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -891,7 +936,7 @@ static int apply_chat_template(const struct common_chat_templates * tmpls, Llama
// Function to tokenize the prompt
static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt,
std::vector<llama_token> & prompt_tokens, const LlamaData & llama_data) {
const bool is_first = llama_get_kv_cache_used_cells(llama_data.context.get()) == 0;
const bool is_first = llama_kv_self_used_cells(llama_data.context.get()) == 0;

const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
prompt_tokens.resize(n_prompt_tokens);
@@ -907,7 +952,7 @@ static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt
// Check if we have enough space in the context to evaluate this batch
static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) {
const int n_ctx = llama_n_ctx(ctx.get());
const int n_ctx_used = llama_get_kv_cache_used_cells(ctx.get());
const int n_ctx_used = llama_kv_self_used_cells(ctx.get());
if (n_ctx_used + batch.n_tokens > n_ctx) {
printf(LOG_COL_DEFAULT "\n");
printe("context size exceeded\n");

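For context, the `tokenize_prompt()` hunk above uses the renamed `llama_kv_self_used_cells()` to decide whether special tokens should be added on the first turn. A minimal sketch of the full two-pass tokenization around that check follows; the second `llama_tokenize()` call is an assumption, since the hunk shows only the sizing pass.

```cpp
// Sketch only: two-pass tokenization gated on whether the KV cache is still empty.
#include "llama.h"
#include <string>
#include <vector>

static int tokenize_for_context(llama_context * ctx, const llama_vocab * vocab,
                                const std::string & prompt, std::vector<llama_token> & out) {
    const bool is_first = llama_kv_self_used_cells(ctx) == 0;

    // first pass returns the negated required token count
    const int n = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
    out.resize(n);

    // second pass fills the buffer (assumed here; not shown in the hunk above)
    if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), out.data(), out.size(), is_first, true) < 0) {
        return -1; // tokenization failed
    }
    return n;
}
```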
@@ -15,7 +15,7 @@ int main(int argc, char ** argv) {
return 1;
}

print_build_info();
common_init();

if (params.n_predict < 0) {
params.n_predict = 16;
@@ -196,7 +196,7 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy);

// erase whole kv
llama_kv_cache_clear(ctx3);
llama_kv_self_clear(ctx3);
fprintf(stderr, "%s : kv cache cleared\n", __func__);

// restore kv into seq 1

@@ -1,5 +1,5 @@
// WARNING: This file was ported from json_schema_to_grammar.py, please fix bugs / add features there first.
const SPACE_RULE = '| " " | "\\n" [ \\t]{0,20}';
const SPACE_RULE = '| " " | "\\n"{1,2} [ \\t]{0,20}';

function _buildRepetition(itemRule, minItems, maxItems, opts={}) {
if (minItems === 0 && maxItems === 1) {

@@ -131,9 +131,9 @@ struct slot_params {
|
||||
lora.push_back({{"id", i}, {"scale", this->lora[i].scale}});
|
||||
}
|
||||
|
||||
std::vector<std::string> grammar_trigger_words;
|
||||
for (const auto & trigger : sampling.grammar_trigger_words) {
|
||||
grammar_trigger_words.push_back(trigger.word);
|
||||
auto grammar_triggers = json::array();
|
||||
for (const auto & trigger : sampling.grammar_triggers) {
|
||||
grammar_triggers.push_back(trigger.to_json<json>());
|
||||
}
|
||||
|
||||
return json {
|
||||
@@ -170,8 +170,8 @@ struct slot_params {
|
||||
{"n_probs", sampling.n_probs},
|
||||
{"min_keep", sampling.min_keep},
|
||||
{"grammar", sampling.grammar},
|
||||
{"grammar_trigger_words", grammar_trigger_words},
|
||||
{"grammar_trigger_tokens", sampling.grammar_trigger_tokens},
|
||||
{"grammar_lazy", sampling.grammar_lazy},
|
||||
{"grammar_triggers", grammar_triggers},
|
||||
{"preserved_tokens", sampling.preserved_tokens},
|
||||
{"chat_format", common_chat_format_name(oaicompat_chat_format)},
|
||||
{"samplers", samplers},
|
||||
@@ -356,24 +356,6 @@ struct server_task {
|
||||
}
|
||||
|
||||
{
|
||||
const auto grammar_triggers = data.find("grammar_triggers");
|
||||
if (grammar_triggers != data.end()) {
|
||||
for (const auto & t : *grammar_triggers) {
|
||||
common_grammar_trigger trigger;
|
||||
trigger.word = t.at("word");
|
||||
trigger.at_start = t.at("at_start");
|
||||
|
||||
auto ids = common_tokenize(vocab, trigger.word, /* add_special= */ false, /* parse_special= */ true);
|
||||
if (ids.size() == 1) {
|
||||
SRV_DBG("Grammar trigger token: %d (`%s`)\n", ids[0], trigger.word.c_str());
|
||||
params.sampling.grammar_trigger_tokens.push_back(ids[0]);
|
||||
params.sampling.preserved_tokens.insert(ids[0]);
|
||||
continue;
|
||||
}
|
||||
SRV_DBG("Grammar trigger word: `%s`\n", trigger.word.c_str());
|
||||
params.sampling.grammar_trigger_words.push_back(trigger);
|
||||
}
|
||||
}
|
||||
const auto preserved_tokens = data.find("preserved_tokens");
|
||||
if (preserved_tokens != data.end()) {
|
||||
for (const auto & t : *preserved_tokens) {
|
||||
@@ -383,12 +365,39 @@ struct server_task {
|
||||
params.sampling.preserved_tokens.insert(ids[0]);
|
||||
} else {
|
||||
// This may happen when using a tool call style meant for a model with special tokens to preserve on a model without said tokens.
|
||||
SRV_WRN("Not preserved because more than 1 token (wrong chat template override?): %s\n", t.get<std::string>().c_str());
|
||||
SRV_DBG("Not preserved because more than 1 token: %s\n", t.get<std::string>().c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
if (params.sampling.grammar_lazy) {
|
||||
GGML_ASSERT(params.sampling.grammar_trigger_tokens.size() > 0 || params.sampling.grammar_trigger_words.size() > 0);
|
||||
const auto grammar_triggers = data.find("grammar_triggers");
|
||||
if (grammar_triggers != data.end()) {
|
||||
for (const auto & t : *grammar_triggers) {
|
||||
auto ct = common_grammar_trigger::from_json(t);
|
||||
if (ct.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) {
|
||||
const auto & word = ct.value;
|
||||
auto ids = common_tokenize(vocab, word, /* add_special= */ false, /* parse_special= */ true);
|
||||
if (ids.size() == 1) {
|
||||
auto token = ids[0];
|
||||
if (std::find(params.sampling.preserved_tokens.begin(), params.sampling.preserved_tokens.end(), (llama_token) token) == params.sampling.preserved_tokens.end()) {
|
||||
throw std::runtime_error("Grammar trigger word should be marked as preserved token: " + word);
|
||||
}
|
||||
SRV_DBG("Grammar trigger token: %d (`%s`)\n", token, word.c_str());
|
||||
common_grammar_trigger trigger;
|
||||
trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN;
|
||||
trigger.value = word;
|
||||
trigger.token = token;
|
||||
params.sampling.grammar_triggers.push_back(std::move(trigger));
|
||||
} else {
|
||||
SRV_DBG("Grammar trigger word: `%s`\n", word.c_str());
|
||||
params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word});
|
||||
}
|
||||
} else {
|
||||
params.sampling.grammar_triggers.push_back(ct);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (params.sampling.grammar_lazy && params.sampling.grammar_triggers.empty()) {
|
||||
throw std::runtime_error("Error: no triggers set for lazy grammar!");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -742,7 +751,10 @@ struct server_task_result_cmpl_final : server_task_result {
|
||||
{"name", tc.name},
|
||||
{"arguments", tc.arguments},
|
||||
}},
|
||||
{"id", tc.id},
|
||||
// Some templates generate and require an id (sometimes in a very specific format, e.g. Mistral Nemo).
|
||||
// We only generate a random id for the ones that don't generate one by themselves
|
||||
// (they also won't get to see it as their template likely doesn't use it, so it's all for the client)
|
||||
{"id", tc.id.empty() ? gen_tool_call_id() : tc.id},
|
||||
});
|
||||
}
|
||||
message["tool_calls"] = tool_calls;
|
||||
@@ -1304,7 +1316,7 @@ struct server_slot {
|
||||
return task_type == SERVER_TASK_TYPE_EMBEDDING || task_type == SERVER_TASK_TYPE_RERANK;
|
||||
}
|
||||
|
||||
bool can_batch_with(server_slot & other_slot) {
|
||||
bool can_batch_with(server_slot & other_slot) const {
|
||||
return is_non_causal() == other_slot.is_non_causal()
|
||||
&& are_lora_equal(lora, other_slot.lora);
|
||||
}
|
||||
@@ -1892,6 +1904,7 @@ struct server_context {
|
||||
try {
|
||||
common_chat_format_example(chat_templates.get(), params.use_jinja);
|
||||
} catch (const std::exception & e) {
|
||||
SRV_WRN("%s: Chat template parsing error: %s\n", __func__, e.what());
|
||||
SRV_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
|
||||
chat_templates = common_chat_templates_init(model, "chatml");
|
||||
}
|
||||
@@ -2027,6 +2040,18 @@ struct server_context {
return ret;
}

bool can_be_detokenized(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
const int32_t n_vocab = llama_vocab_n_tokens(vocab);
for (const auto & token : tokens) {
if (token < 0 || token >= n_vocab) {
return false;
}
}
return true;
}

bool launch_slot_with_task(server_slot & slot, const server_task & task) {
slot.reset();
slot.id_task = task.id;
@@ -2041,11 +2066,16 @@ struct server_context {
slot.lora = task.params.lora;
}

bool can_detokenize = can_be_detokenized(ctx, slot.prompt_tokens);
if (!can_detokenize) {
send_error(task, "Prompt contains invalid tokens", ERROR_TYPE_INVALID_REQUEST);
return false;
}
SLT_DBG(slot, "launching slot : %s\n", safe_json_to_str(slot.to_json()).c_str());

if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
// Might be better to reject the request with a 400 ?
SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d", slot.params.n_predict, slot.n_predict);
SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d\n", slot.params.n_predict, slot.n_predict);
slot.params.n_predict = slot.n_predict;
}

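The `can_be_detokenized()` check added above rejects prompts that carry token ids outside the model's vocabulary before a slot is launched. The same bounds check, extracted into a free function, looks like the sketch below; the name `tokens_in_vocab` is illustrative and not part of the server code.

```cpp
// Sketch only: standalone version of the vocabulary bounds check used by the server above.
#include "llama.h"
#include <vector>

static bool tokens_in_vocab(const llama_vocab * vocab, const std::vector<llama_token> & tokens) {
    const int32_t n_vocab = llama_vocab_n_tokens(vocab);
    for (const llama_token token : tokens) {
        if (token < 0 || token >= n_vocab) {
            return false; // an out-of-range id cannot be detokenized
        }
    }
    return true;
}
```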
@@ -2083,7 +2113,7 @@ struct server_context {
SRV_DBG("%s", "clearing KV cache\n");

// clear the entire KV cache
llama_kv_cache_clear(ctx);
llama_kv_self_clear(ctx);
clean_kv_cache = false;
}

@@ -2148,14 +2178,6 @@ struct server_context {
|
||||
}
|
||||
|
||||
if (slot.has_new_line) {
|
||||
// if we have already seen a new line, we stop after a certain time limit
|
||||
if (slot.params.t_max_predict_ms > 0 && (ggml_time_us() - slot.t_start_generation > 1000.0f*slot.params.t_max_predict_ms)) {
|
||||
slot.stop = STOP_TYPE_LIMIT;
|
||||
slot.has_next_token = false;
|
||||
|
||||
SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.params.t_max_predict_ms);
|
||||
}
|
||||
|
||||
// require that each new line has a whitespace prefix (i.e. indentation) of at least slot.params.n_indent
|
||||
if (slot.params.n_indent > 0) {
|
||||
// check the current indentation
|
||||
@@ -2194,6 +2216,14 @@ struct server_context {
|
||||
// check if there is a new line in the generated text
|
||||
if (result.text_to_send.find('\n') != std::string::npos) {
|
||||
slot.has_new_line = true;
|
||||
|
||||
// if we have seen a new line, we stop after a certain time limit, but only upon another new line
|
||||
if (slot.params.t_max_predict_ms > 0 && (ggml_time_us() - slot.t_start_generation > 1000.0f*slot.params.t_max_predict_ms)) {
|
||||
slot.stop = STOP_TYPE_LIMIT;
|
||||
slot.has_next_token = false;
|
||||
|
||||
SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.params.t_max_predict_ms);
|
||||
}
|
||||
}
|
||||
|
||||
// if context shift is disabled, we stop when it reaches the context limit
|
||||
@@ -2625,8 +2655,8 @@ struct server_context {
|
||||
res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size();
|
||||
res->t_start = metrics.t_start;
|
||||
|
||||
res->kv_cache_tokens_count = llama_get_kv_cache_token_count(ctx);
|
||||
res->kv_cache_used_cells = llama_get_kv_cache_used_cells(ctx);
|
||||
res->kv_cache_tokens_count = llama_kv_self_n_tokens(ctx);
|
||||
res->kv_cache_used_cells = llama_kv_self_used_cells(ctx);
|
||||
|
||||
res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total;
|
||||
res->t_prompt_processing_total = metrics.t_prompt_processing_total;
|
||||
@@ -2742,7 +2772,7 @@ struct server_context {
|
||||
|
||||
// Erase token cache
|
||||
const size_t n_erased = slot->cache_tokens.size();
|
||||
llama_kv_cache_seq_rm(ctx, slot->id, -1, -1);
|
||||
llama_kv_self_seq_rm(ctx, slot->id, -1, -1);
|
||||
slot->cache_tokens.clear();
|
||||
|
||||
auto res = std::make_unique<server_task_result_slot_erase>();
|
||||
@@ -2810,8 +2840,8 @@ struct server_context {
|
||||
|
||||
SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);
|
||||
|
||||
llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
|
||||
llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard);
|
||||
llama_kv_self_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
|
||||
llama_kv_self_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard);
|
||||
|
||||
if (slot.params.cache_prompt) {
|
||||
for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
|
||||
@@ -3002,8 +3032,8 @@ struct server_context {
|
||||
|
||||
const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c;
|
||||
|
||||
llama_kv_cache_seq_rm (ctx, slot.id, head_p, head_c);
|
||||
llama_kv_cache_seq_add(ctx, slot.id, head_c, -1, kv_shift);
|
||||
llama_kv_self_seq_rm (ctx, slot.id, head_p, head_c);
|
||||
llama_kv_self_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift);
|
||||
|
||||
for (size_t i = 0; i < n_match; i++) {
|
||||
slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
|
||||
@@ -3041,9 +3071,9 @@ struct server_context {
|
||||
}
|
||||
|
||||
// keep only the common part
|
||||
if (!llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1)) {
|
||||
if (!llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1)) {
|
||||
// could not partially delete (likely using a non-Transformer model)
|
||||
llama_kv_cache_seq_rm(ctx, slot.id, -1, -1);
|
||||
llama_kv_self_seq_rm(ctx, slot.id, -1, -1);
|
||||
|
||||
// there is no common part left
|
||||
slot.n_past = 0;
|
||||
@@ -3283,7 +3313,7 @@ struct server_context {
|
||||
slot.cache_tokens.push_back(id);
|
||||
slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1);
|
||||
|
||||
llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1);
|
||||
llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1);
|
||||
|
||||
for (size_t i = 0; i < ids.size(); ++i) {
|
||||
completion_token_output result;
|
||||
|
||||
@@ -144,6 +144,7 @@ def test_apply_chat_template():
@pytest.mark.parametrize("response_format,n_predicted,re_content", [
({"type": "json_object", "schema": {"const": "42"}}, 6, "\"42\""),
({"type": "json_object", "schema": {"items": [{"type": "integer"}]}}, 10, "[ -3000 ]"),
({"type": "json_schema", "json_schema": {"schema": {"const": "foooooo"}}}, 10, "\"foooooo\""),
({"type": "json_object"}, 10, "(\\{|John)+"),
({"type": "sound"}, 0, None),
# invalid response format (expected to fail)

examples/server/tests/unit/test_tool_call.py (239 changes, Normal file → Executable file)
@@ -1,4 +1,12 @@
|
||||
#!/usr/bin/env python
|
||||
import pytest
|
||||
|
||||
# ensure grandparent path is in sys.path
|
||||
from pathlib import Path
|
||||
import sys
|
||||
path = Path(__file__).resolve().parents[1]
|
||||
sys.path.insert(0, str(path))
|
||||
|
||||
from utils import *
|
||||
|
||||
server: ServerProcess
|
||||
@@ -66,15 +74,8 @@ WEATHER_TOOL = {
|
||||
}
|
||||
|
||||
|
||||
def do_test_completion_with_required_tool_tiny(template_name: str, tool: dict, argument_key: str | None):
|
||||
global server
|
||||
n_predict = 512
|
||||
# server = ServerPreset.stories15m_moe()
|
||||
server.jinja = True
|
||||
server.n_predict = n_predict
|
||||
server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
res = server.make_request("POST", "/chat/completions", data={
|
||||
def do_test_completion_with_required_tool_tiny(server: ServerProcess, tool: dict, argument_key: str | None, n_predict, **kwargs):
|
||||
res = server.make_request("POST", "/v1/chat/completions", data={
|
||||
"max_tokens": n_predict,
|
||||
"messages": [
|
||||
{"role": "system", "content": "You are a coding assistant."},
|
||||
@@ -83,16 +84,15 @@ def do_test_completion_with_required_tool_tiny(template_name: str, tool: dict, a
|
||||
"tool_choice": "required",
|
||||
"tools": [tool],
|
||||
"parallel_tool_calls": False,
|
||||
"temperature": 0.0,
|
||||
"top_k": 1,
|
||||
"top_p": 1.0,
|
||||
**kwargs,
|
||||
})
|
||||
assert res.status_code == 200, f"Expected status code 200, got {res.status_code}"
|
||||
choice = res.body["choices"][0]
|
||||
tool_calls = choice["message"].get("tool_calls")
|
||||
assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}'
|
||||
tool_call = tool_calls[0]
|
||||
assert choice["message"].get("content") is None, f'Expected no content in {choice["message"]}'
|
||||
assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}'
|
||||
assert len(tool_call.get("id", "")) > 0, f'Expected non empty tool call id in {tool_call}'
|
||||
expected_function_name = "python" if tool["type"] == "code_interpreter" else tool["function"]["name"]
|
||||
assert expected_function_name == tool_call["function"]["name"]
|
||||
actual_arguments = tool_call["function"]["arguments"]
|
||||
@@ -108,7 +108,14 @@ def do_test_completion_with_required_tool_tiny(template_name: str, tool: dict, a
|
||||
("meta-llama-Llama-3.3-70B-Instruct", PYTHON_TOOL, "code"),
|
||||
])
|
||||
def test_completion_with_required_tool_tiny_fast(template_name: str, tool: dict, argument_key: str | None):
|
||||
do_test_completion_with_required_tool_tiny(template_name, tool, argument_key)
|
||||
global server
|
||||
n_predict = 512
|
||||
# server = ServerPreset.stories15m_moe()
|
||||
server.jinja = True
|
||||
server.n_predict = n_predict
|
||||
server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
do_test_completion_with_required_tool_tiny(server, tool, argument_key, n_predict, temperature=0.0, top_k=1, top_p=1.0)
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
@@ -130,10 +137,17 @@ def test_completion_with_required_tool_tiny_fast(template_name: str, tool: dict,
|
||||
("deepseek-ai-DeepSeek-R1-Distill-Llama-8B", TEST_TOOL, "success"),
|
||||
("deepseek-ai-DeepSeek-R1-Distill-Llama-8B", PYTHON_TOOL, "code"),
|
||||
("fireworks-ai-llama-3-firefunction-v2", TEST_TOOL, "success"),
|
||||
("fireworks-ai-llama-3-firefunction-v2", PYTHON_TOOL, "code"),
|
||||
# ("fireworks-ai-llama-3-firefunction-v2", PYTHON_TOOL, "code"),
|
||||
])
|
||||
def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict, argument_key: str | None):
|
||||
do_test_completion_with_required_tool_tiny(template_name, tool, argument_key)
|
||||
global server
|
||||
n_predict = 512
|
||||
# server = ServerPreset.stories15m_moe()
|
||||
server.jinja = True
|
||||
server.n_predict = n_predict
|
||||
server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
do_test_completion_with_required_tool_tiny(server, tool, argument_key, n_predict)
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
@@ -142,25 +156,33 @@ def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict,
|
||||
(PYTHON_TOOL, "code", "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
|
||||
(PYTHON_TOOL, "code", "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),
|
||||
|
||||
# Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it.
|
||||
(TEST_TOOL, "success", "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None),
|
||||
(PYTHON_TOOL, "code", "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None),
|
||||
(PYTHON_TOOL, "code", "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", "chatml"),
|
||||
|
||||
(TEST_TOOL, "success", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
|
||||
(PYTHON_TOOL, "code", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
|
||||
(PYTHON_TOOL, "code", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"),
|
||||
|
||||
(TEST_TOOL, "success", "bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", None),
|
||||
(PYTHON_TOOL, "code", "bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", None),
|
||||
(PYTHON_TOOL, "code", "bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", "chatml"),
|
||||
|
||||
(TEST_TOOL, "success", "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", None),
|
||||
(PYTHON_TOOL, "code", "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", None),
|
||||
(PYTHON_TOOL, "code", "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", "chatml"),
|
||||
|
||||
(TEST_TOOL, "success", "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None),
|
||||
(PYTHON_TOOL, "code", "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None),
|
||||
(PYTHON_TOOL, "code", "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"),
|
||||
|
||||
(TEST_TOOL, "success", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
|
||||
(PYTHON_TOOL, "code", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
|
||||
# (PYTHON_TOOL, "code", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"),
|
||||
(PYTHON_TOOL, "code", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"),
|
||||
|
||||
(TEST_TOOL, "success", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
|
||||
(PYTHON_TOOL, "code", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
|
||||
# (PYTHON_TOOL, "code", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"),
|
||||
(PYTHON_TOOL, "code", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"),
|
||||
|
||||
(TEST_TOOL, "success", "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
|
||||
(PYTHON_TOOL, "code", "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
|
||||
@@ -176,10 +198,10 @@ def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict,
|
||||
|
||||
(TEST_TOOL, "success", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
|
||||
(PYTHON_TOOL, "code", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
|
||||
# (PYTHON_TOOL, "code", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", "chatml"),
|
||||
# TODO: fix these
|
||||
# (TEST_TOOL, "success", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
|
||||
# (PYTHON_TOOL, "code", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
|
||||
(PYTHON_TOOL, "code", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", "chatml"),
|
||||
|
||||
(TEST_TOOL, "success", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
|
||||
(PYTHON_TOOL, "code", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
|
||||
])
|
||||
def test_completion_with_required_tool_real_model(tool: dict, argument_key: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None):
|
||||
global server
|
||||
@@ -197,7 +219,7 @@ def test_completion_with_required_tool_real_model(tool: dict, argument_key: str
|
||||
elif isinstance(template_override, str):
|
||||
server.chat_template = template_override
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
res = server.make_request("POST", "/chat/completions", data={
|
||||
res = server.make_request("POST", "/v1/chat/completions", data={
|
||||
"max_tokens": n_predict,
|
||||
"messages": [
|
||||
{"role": "system", "content": "You are a coding assistant."},
|
||||
@@ -215,7 +237,7 @@ def test_completion_with_required_tool_real_model(tool: dict, argument_key: str
|
||||
tool_calls = choice["message"].get("tool_calls")
|
||||
assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}'
|
||||
tool_call = tool_calls[0]
|
||||
assert choice["message"].get("content") is None, f'Expected no content in {choice["message"]}'
|
||||
# assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}'
|
||||
expected_function_name = "python" if tool["type"] == "code_interpreter" else tool["function"]["name"]
|
||||
assert expected_function_name == tool_call["function"]["name"]
|
||||
actual_arguments = tool_call["function"]["arguments"]
|
||||
@@ -225,13 +247,8 @@ def test_completion_with_required_tool_real_model(tool: dict, argument_key: str
|
||||
assert argument_key in actual_arguments, f"tool arguments: {json.dumps(actual_arguments)}, expected: {argument_key}"
|
||||
|
||||
|
||||
def do_test_completion_without_tool_call(template_name: str, n_predict: int, tools: list[dict], tool_choice: str | None):
|
||||
global server
|
||||
server.jinja = True
|
||||
server.n_predict = n_predict
|
||||
server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
res = server.make_request("POST", "/chat/completions", data={
|
||||
def do_test_completion_without_tool_call(server: ServerProcess, n_predict: int, tools: list[dict], tool_choice: str | None, **kwargs):
|
||||
res = server.make_request("POST", "/v1/chat/completions", data={
|
||||
"max_tokens": n_predict,
|
||||
"messages": [
|
||||
{"role": "system", "content": "You are a coding assistant."},
|
||||
@@ -239,9 +256,7 @@ def do_test_completion_without_tool_call(template_name: str, n_predict: int, too
|
||||
],
|
||||
"tools": tools if tools else None,
|
||||
"tool_choice": tool_choice,
|
||||
"temperature": 0.0,
|
||||
"top_k": 1,
|
||||
"top_p": 1.0,
|
||||
**kwargs,
|
||||
}, timeout=TIMEOUT_HTTP_REQUEST)
|
||||
assert res.status_code == 200, f"Expected status code 200, got {res.status_code}"
|
||||
choice = res.body["choices"][0]
|
||||
@@ -254,7 +269,12 @@ def do_test_completion_without_tool_call(template_name: str, n_predict: int, too
|
||||
("meta-llama-Llama-3.3-70B-Instruct", 128, [PYTHON_TOOL], 'none'),
|
||||
])
|
||||
def test_completion_without_tool_call_fast(template_name: str, n_predict: int, tools: list[dict], tool_choice: str | None):
|
||||
do_test_completion_without_tool_call(template_name, n_predict, tools, tool_choice)
|
||||
global server
|
||||
server.jinja = True
|
||||
server.n_predict = n_predict
|
||||
server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
do_test_completion_without_tool_call(server, n_predict, tools, tool_choice)
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
@@ -270,7 +290,12 @@ def test_completion_without_tool_call_fast(template_name: str, n_predict: int, t
|
||||
("meta-llama-Llama-3.2-3B-Instruct", 256, [PYTHON_TOOL], 'none'),
|
||||
])
|
||||
def test_completion_without_tool_call_slow(template_name: str, n_predict: int, tools: list[dict], tool_choice: str | None):
|
||||
do_test_completion_without_tool_call(template_name, n_predict, tools, tool_choice)
|
||||
global server
|
||||
server.jinja = True
|
||||
server.n_predict = n_predict
|
||||
server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
do_test_completion_without_tool_call(server, n_predict, tools, tool_choice)
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
@@ -281,6 +306,12 @@ def test_completion_without_tool_call_slow(template_name: str, n_predict: int, t
|
||||
("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
|
||||
("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"),
|
||||
|
||||
("bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", None),
|
||||
("bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", "chatml"),
|
||||
|
||||
("bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", None),
|
||||
("bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", "chatml"),
|
||||
|
||||
("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None),
|
||||
("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"),
|
||||
|
||||
@@ -324,48 +355,53 @@ def test_weather(hf_repo: str, template_override: str | Tuple[str, str | None] |
|
||||
elif isinstance(template_override, str):
|
||||
server.chat_template = template_override
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
res = server.make_request("POST", "/chat/completions", data={
|
||||
"max_tokens": n_predict,
|
||||
do_test_weather(server, max_tokens=n_predict)
|
||||
|
||||
|
||||
def do_test_weather(server: ServerProcess, **kwargs):
|
||||
res = server.make_request("POST", "/v1/chat/completions", data={
|
||||
"messages": [
|
||||
{"role": "system", "content": "You are a chatbot that uses tools/functions. Dont overthink things."},
|
||||
{"role": "user", "content": "What is the weather in Istanbul?"},
|
||||
],
|
||||
"tools": [WEATHER_TOOL],
|
||||
**kwargs,
|
||||
}, timeout=TIMEOUT_HTTP_REQUEST)
|
||||
assert res.status_code == 200, f"Expected status code 200, got {res.status_code}"
|
||||
choice = res.body["choices"][0]
|
||||
tool_calls = choice["message"].get("tool_calls")
|
||||
assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}'
|
||||
tool_call = tool_calls[0]
|
||||
assert choice["message"].get("content") is None, f'Expected no content in {choice["message"]}'
|
||||
assert tool_call["function"]["name"] == WEATHER_TOOL["function"]["name"]
|
||||
# assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}'
|
||||
assert tool_call["function"]["name"] == WEATHER_TOOL["function"]["name"], f'Expected weather tool call, got {tool_call["function"]["name"]}'
|
||||
assert len(tool_call.get("id", "")) > 0, f'Expected non empty tool call id in {tool_call}'
|
||||
actual_arguments = json.loads(tool_call["function"]["arguments"])
|
||||
assert 'location' in actual_arguments, f"location not found in {json.dumps(actual_arguments)}"
|
||||
location = actual_arguments["location"]
|
||||
assert isinstance(location, str), f"Expected location to be a string, got {type(location)}: {json.dumps(location)}"
|
||||
assert re.match('^Istanbul(, (TR|Turkey|Türkiye))?$', location), f'Expected Istanbul for location, got {location}'
|
||||
assert re.match('^Istanbul(( |, ?)(TR|Turkey|Türkiye))?$', location), f'Expected Istanbul for location, got {location}'
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.parametrize("result_override,n_predict,hf_repo,template_override", [
|
||||
(None, 128, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"),
|
||||
(None, 128, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None),
|
||||
(None, 128, "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", None),
|
||||
(None, 128, "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", "chatml"),
|
||||
(None, 128, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"),
|
||||
(None, 128, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
|
||||
(None, 128, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
|
||||
(None, 128, "bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai/functionary-medium-v3.2", None)),
|
||||
(None, 128, "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
|
||||
(None, 128, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
|
||||
(None, 128, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),
|
||||
(None, 128, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
|
||||
("[\\s\\S]*?\\*\\*\\s*0.5($|\\*\\*)", 8192, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)),
|
||||
|
||||
# TODO: fix these (wrong results, either didn't respect decimal instruction or got wrong value)
|
||||
("[\\s\\S]*?\\*\\*\\s*0.5($|\\*\\*)", 8192, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
|
||||
# ("[\\s\\S]*?\\*\\*\\s*0.5($|\\*\\*)", 8192, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)),
|
||||
# (None, 128, "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
|
||||
# ("[\\s\\S]*?\\*\\*\\s*0.5($|\\*\\*)", 8192, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
|
||||
])
|
||||
def test_calc_result(result_override: str | None, n_predict: int, hf_repo: str, template_override: str | Tuple[str, str | None] | None):
|
||||
global server
|
||||
# n_predict = 512
|
||||
server.n_slots = 1
|
||||
server.jinja = True
|
||||
server.n_ctx = 8192 * 2
|
||||
@@ -379,10 +415,14 @@ def test_calc_result(result_override: str | None, n_predict: int, hf_repo: str,
|
||||
elif isinstance(template_override, str):
|
||||
server.chat_template = template_override
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
res = server.make_request("POST", "/chat/completions", data={
|
||||
do_test_calc_result(server, result_override, n_predict)
|
||||
|
||||
|
||||
def do_test_calc_result(server: ServerProcess, result_override: str | None, n_predict: int, **kwargs):
|
||||
res = server.make_request("POST", "/v1/chat/completions", data={
|
||||
"max_tokens": n_predict,
|
||||
"messages": [
|
||||
{"role": "system", "content": "You are a chatbot that uses tools/functions. Dont overthink things, and provide very concise answers. Do not explain your reasoning to the user. Provide any numerical values back to the user with at most two decimals."},
|
||||
{"role": "system", "content": "You are a tools-calling assistant. You express numerical values with at most two decimals."},
|
||||
{"role": "user", "content": "What's the y coordinate of a point on the unit sphere at angle 30 degrees?"},
|
||||
{
|
||||
"role": "assistant",
|
||||
@@ -423,7 +463,8 @@ def test_calc_result(result_override: str | None, n_predict: int, hf_repo: str,
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
],
|
||||
**kwargs,
|
||||
}, timeout=TIMEOUT_HTTP_REQUEST)
|
||||
assert res.status_code == 200, f"Expected status code 200, got {res.status_code}"
|
||||
choice = res.body["choices"][0]
|
||||
@@ -434,19 +475,19 @@ def test_calc_result(result_override: str | None, n_predict: int, hf_repo: str,
|
||||
if result_override is not None:
|
||||
assert re.match(result_override, content), f'Expected {result_override}, got {content}'
|
||||
else:
|
||||
assert re.match('^[\\s\\S]*?The (y[ -])?coordinate [\\s\\S]*?is (approximately )?0\\.56\\b|^0\\.56$', content), \
|
||||
assert re.match('^[\\s\\S]*?((That\'s|\\bis) (approximately )?)?\\b0\\.(5\\b|56\\b|556)', content), \
|
||||
f'Expected something like "The y coordinate is 0.56.", got {content}'
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.parametrize("n_predict,reasoning_format,expect_content,expect_reasoning_content,hf_repo,template_override", [
|
||||
(128, 'deepseek', "^The sum of 102 and 7 is 109.*", None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
|
||||
(128, None, "^The sum of 102 and 7 is 109.*", None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
|
||||
(128, 'deepseek', "^The sum of 102 and 7 is 109[\\s\\S]*", None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
|
||||
(128, None, "^The sum of 102 and 7 is 109[\\s\\S]*", None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
|
||||
|
||||
(1024, 'deepseek', "To find the sum of.*", "I need to calculate the sum of 102 and 7.*", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
|
||||
(1024, 'none', "^I need[\\s\\S]*?</think>\n?To find.*", None, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
|
||||
(1024, 'deepseek', "To find the sum of[\\s\\S]*", "I need to calculate the sum of 102 and 7[\\s\\S]*", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
|
||||
(1024, 'none', "^(<think>\\s*)?I need[\\s\\S]*?</think>\\s*To find[\\s\\S]*", None, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
|
||||
|
||||
(1024, 'deepseek', "To find the sum of.*", "First, I [\\s\\S]*", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)),
|
||||
(1024, 'deepseek', "To find the sum of[\\s\\S]*", "First, I [\\s\\S]*", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)),
|
||||
])
|
||||
def test_thoughts(n_predict: int, reasoning_format: Literal['deepseek', 'none'] | None, expect_content: str | None, expect_reasoning_content: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None):
|
||||
global server
|
||||
@@ -464,7 +505,7 @@ def test_thoughts(n_predict: int, reasoning_format: Literal['deepseek', 'none']
|
||||
elif isinstance(template_override, str):
|
||||
server.chat_template = template_override
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
res = server.make_request("POST", "/chat/completions", data={
|
||||
res = server.make_request("POST", "/v1/chat/completions", data={
|
||||
"max_tokens": n_predict,
|
||||
"messages": [
|
||||
{"role": "user", "content": "What's the sum of 102 and 7?"},
|
||||
@@ -476,7 +517,7 @@ def test_thoughts(n_predict: int, reasoning_format: Literal['deepseek', 'none']
|
||||
|
||||
content = choice["message"].get("content")
|
||||
if expect_content is None:
|
||||
assert content is None, f'Expected no content in {choice["message"]}'
|
||||
assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}'
|
||||
else:
|
||||
assert re.match(expect_content, content), f'Expected {expect_content}, got {content}'
|
||||
|
||||
@@ -488,46 +529,46 @@ def test_thoughts(n_predict: int, reasoning_format: Literal['deepseek', 'none']
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.parametrize("expected_arguments_override,hf_repo,template_override", [
|
||||
(None, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
|
||||
# (None, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", "chatml"),
|
||||
@pytest.mark.parametrize("hf_repo,template_override", [
|
||||
("bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
|
||||
|
||||
(None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
|
||||
(None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"),
|
||||
("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
|
||||
("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"),
|
||||
|
||||
(None, "bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai-functionary-medium-v3.2", None)),
|
||||
(None, "bartowski/functionary-small-v3.2-GGUF:Q8_0", "chatml"),
|
||||
("bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai-functionary-medium-v3.2", None)),
|
||||
("bartowski/functionary-small-v3.2-GGUF:Q8_0", "chatml"),
|
||||
|
||||
('{"code":"print("}', "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
|
||||
(None, "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),
|
||||
# ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
|
||||
("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),
|
||||
|
||||
(None, "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)),
|
||||
(None, "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", "chatml"),
|
||||
("bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)),
|
||||
("bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", None),
|
||||
|
||||
('{"code":"print("}', "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)),
|
||||
(None, "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", "chatml"),
|
||||
("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)),
|
||||
("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", None),
|
||||
|
||||
(None, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None),
|
||||
(None, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"),
|
||||
("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None),
|
||||
("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"),
|
||||
|
||||
(None, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
|
||||
(None, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"),
|
||||
("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
|
||||
("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"),
|
||||
|
||||
(None, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch-Hermes-3-Llama-3.1-8B", "tool_use")),
|
||||
(None, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"),
|
||||
("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch-Hermes-3-Llama-3.1-8B", "tool_use")),
|
||||
("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"),
|
||||
|
||||
(None, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
|
||||
(None, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),
|
||||
("bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
|
||||
("bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),
|
||||
|
||||
# Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it.
|
||||
(None, "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None),
|
||||
("bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None),
|
||||
("bartowski/gemma-2-2b-it-GGUF:Q4_K_M", "chatml"),
|
||||
])
|
||||
def test_hello_world(expected_arguments_override: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None):
|
||||
def test_hello_world(hf_repo: str, template_override: str | Tuple[str, str | None] | None):
|
||||
global server
|
||||
n_predict = 512 # High because of DeepSeek R1
|
||||
server.n_slots = 1
|
||||
server.jinja = True
|
||||
server.n_ctx = 8192
|
||||
server.n_predict = 512 # High because of DeepSeek R1
|
||||
server.n_predict = n_predict
|
||||
server.model_hf_repo = hf_repo
|
||||
server.model_hf_file = None
|
||||
if isinstance(template_override, tuple):
|
||||
@@ -537,31 +578,29 @@ def test_hello_world(expected_arguments_override: str | None, hf_repo: str, temp
|
||||
elif isinstance(template_override, str):
|
||||
server.chat_template = template_override
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
res = server.make_request("POST", "/chat/completions", data={
|
||||
"max_tokens": 256,
|
||||
|
||||
do_test_hello_world(server, max_tokens=n_predict)
|
||||
|
||||
|
||||
def do_test_hello_world(server: ServerProcess, **kwargs):
|
||||
res = server.make_request("POST", "/v1/chat/completions", data={
|
||||
"messages": [
|
||||
{"role": "system", "content": "You are a coding assistant."},
|
||||
{"role": "system", "content": "You are a tool-calling agent."},
|
||||
{"role": "user", "content": "say hello world with python"},
|
||||
],
|
||||
"tools": [PYTHON_TOOL],
|
||||
# Note: without these greedy params, Functionary v3.2 writes `def hello_world():\n print("Hello, World!")\nhello_world()` which is correct but a pain to test.
|
||||
"temperature": 0.0,
|
||||
"top_k": 1,
|
||||
"top_p": 1.0,
|
||||
**kwargs,
|
||||
}, timeout=TIMEOUT_HTTP_REQUEST)
|
||||
assert res.status_code == 200, f"Expected status code 200, got {res.status_code}"
|
||||
choice = res.body["choices"][0]
|
||||
tool_calls = choice["message"].get("tool_calls")
|
||||
assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}'
|
||||
tool_call = tool_calls[0]
|
||||
assert choice["message"].get("content") is None, f'Expected no content in {choice["message"]}'
|
||||
# assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}'
|
||||
assert tool_call["function"]["name"] == PYTHON_TOOL["function"]["name"]
|
||||
actual_arguments = tool_call["function"]["arguments"]
|
||||
if expected_arguments_override is not None:
|
||||
assert actual_arguments == expected_arguments_override
|
||||
else:
|
||||
actual_arguments = json.loads(actual_arguments)
|
||||
assert 'code' in actual_arguments, f"code not found in {json.dumps(actual_arguments)}"
|
||||
code = actual_arguments["code"]
|
||||
assert isinstance(code, str), f"Expected code to be a string, got {type(code)}: {json.dumps(code)}"
|
||||
assert re.match(r'''print\(("[Hh]ello,? [Ww]orld!?"|'[Hh]ello,? [Ww]orld!?')\)''', code), f'Expected hello world, got {code}'
|
||||
assert len(tool_call.get("id", "")) > 0, f'Expected non empty tool call id in {tool_call}'
|
||||
actual_arguments = json.loads(tool_call["function"]["arguments"])
|
||||
assert 'code' in actual_arguments, f"code not found in {json.dumps(actual_arguments)}"
|
||||
code = actual_arguments["code"]
|
||||
assert isinstance(code, str), f"Expected code to be a string, got {type(code)}: {json.dumps(code)}"
|
||||
assert re.match(r'''print\(("[Hh]ello,? [Ww]orld!?"|'[Hh]ello,? [Ww]orld!?')\)''', code), f'Expected hello world, got {code}'
|
||||
|
||||
@@ -67,6 +67,9 @@ class ServerProcess:
    id_slot: int | None = None
    cache_prompt: bool | None = None
    n_slots: int | None = None
    ctk: str | None = None
    ctv: str | None = None
    fa: bool | None = None
    server_continuous_batching: bool | None = False
    server_embeddings: bool | None = False
    server_reranking: bool | None = False
@@ -84,6 +87,7 @@ class ServerProcess:
    reasoning_format: Literal['deepseek', 'none'] | None = None
    chat_template: str | None = None
    chat_template_file: str | None = None
    server_path: str | None = None

    # session variables
    process: subprocess.Popen | None = None
@@ -97,7 +101,9 @@ class ServerProcess:
            self.server_port = int(os.environ["PORT"])

    def start(self, timeout_seconds: int | None = DEFAULT_HTTP_TIMEOUT) -> None:
        if "LLAMA_SERVER_BIN_PATH" in os.environ:
        if self.server_path is not None:
            server_path = self.server_path
        elif "LLAMA_SERVER_BIN_PATH" in os.environ:
            server_path = os.environ["LLAMA_SERVER_BIN_PATH"]
        elif os.name == "nt":
            server_path = "../../../build/bin/Release/llama-server.exe"
@@ -151,6 +157,12 @@ class ServerProcess:
            server_args.extend(["--ctx-size", self.n_ctx])
        if self.n_slots:
            server_args.extend(["--parallel", self.n_slots])
        if self.ctk:
            server_args.extend(["-ctk", self.ctk])
        if self.ctv:
            server_args.extend(["-ctv", self.ctv])
        if self.fa is not None:
            server_args.append("-fa")
        if self.n_predict:
            server_args.extend(["--n-predict", self.n_predict])
        if self.slot_save_path:
@@ -184,7 +196,7 @@ class ServerProcess:
            server_args.extend(["--chat-template-file", self.chat_template_file])

        args = [str(arg) for arg in [server_path, *server_args]]
        print(f"bench: starting server with: {' '.join(args)}")
        print(f"tests: starting server with: {' '.join(args)}")

        flags = 0
        if "nt" == os.name:
@@ -215,6 +227,10 @@ class ServerProcess:
                    return  # server is ready
            except Exception as e:
                pass
            # Check if process died
            if self.process.poll() is not None:
                raise RuntimeError(f"Server process died with return code {self.process.returncode}")

            print(f"Waiting for server to start...")
            time.sleep(0.5)
        raise TimeoutError(f"Server did not start within {timeout_seconds} seconds")
@@ -286,7 +302,7 @@ class ServerPreset:
        server.model_hf_repo = "ggml-org/models"
        server.model_hf_file = "tinyllamas/stories260K.gguf"
        server.model_alias = "tinyllama-2"
        server.n_ctx = 256
        server.n_ctx = 512
        server.n_batch = 32
        server.n_slots = 2
        server.n_predict = 64
@@ -435,6 +435,10 @@ static std::string gen_chatcmplid() {
    return "chatcmpl-" + random_string();
}

static std::string gen_tool_call_id() {
    return random_string();
}

//
// other common utils
//
@@ -590,8 +594,8 @@ static json oaicompat_completion_params_parse(
        if (response_type == "json_object") {
            json_schema = json_value(response_format, "schema", json::object());
        } else if (response_type == "json_schema") {
            json json_schema = json_value(response_format, "json_schema", json::object());
            json_schema = json_value(json_schema, "schema", json::object());
            auto schema_wrapper = json_value(response_format, "json_schema", json::object());
            json_schema = json_value(schema_wrapper, "schema", json::object());
        } else if (!response_type.empty() && response_type != "text") {
            throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type);
        }
@@ -607,6 +611,7 @@ static json oaicompat_completion_params_parse(
    inputs.use_jinja = use_jinja;
    inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false);
    inputs.extract_reasoning = reasoning_format != COMMON_REASONING_FORMAT_NONE;
    inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
    if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && body.contains("grammar")) {
        throw std::runtime_error("Cannot use custom grammar constraints with tools.");
    }
@@ -616,14 +621,13 @@ static json oaicompat_completion_params_parse(

    llama_params["chat_format"] = static_cast<int>(chat_params.format);
    llama_params["prompt"] = chat_params.prompt;
    llama_params["grammar"] = chat_params.grammar;
    if (!chat_params.grammar.empty()) {
        llama_params["grammar"] = chat_params.grammar;
    }
    llama_params["grammar_lazy"] = chat_params.grammar_lazy;
    auto grammar_triggers = json::array();
    for (const auto & trigger : chat_params.grammar_triggers) {
        grammar_triggers.push_back({
            {"word", trigger.word},
            {"at_start", trigger.at_start},
        });
        grammar_triggers.push_back(trigger.to_json<json>());
    }
    llama_params["grammar_triggers"] = grammar_triggers;
    llama_params["preserved_tokens"] = chat_params.preserved_tokens;
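For readers unfamiliar with the OpenAI-style response_format payload handled above: the "json_schema" variant wraps the actual schema one level deeper than "json_object", and the old code appears to shadow the outer json_schema variable with a local of the same name, so the new hunk unwraps into a separately named wrapper. A minimal standalone sketch of the same unwrapping, assuming a json_value(object, key, default) helper shaped like the one used above (the names here are illustrative, not the server's API):

    #include <nlohmann/json.hpp>
    #include <string>
    using json = nlohmann::ordered_json;

    // Illustrative helper mirroring the json_value() used in the hunk:
    // returns body[key] if present and non-null, otherwise the default.
    template <typename T>
    static T json_value(const json & body, const std::string & key, const T & def) {
        return body.contains(key) && !body.at(key).is_null() ? body.at(key).get<T>() : def;
    }

    // Sketch: extract the schema from either
    //   {"type": "json_object", "schema": {...}}                     or
    //   {"type": "json_schema", "json_schema": {"schema": {...}}}
    static json extract_schema(const json & response_format) {
        const std::string response_type = json_value(response_format, "type", std::string());
        if (response_type == "json_object") {
            return json_value(response_format, "schema", json::object());
        }
        if (response_type == "json_schema") {
            auto wrapper = json_value(response_format, "json_schema", json::object());
            return json_value(wrapper, "schema", json::object());
        }
        return json::object();
    }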
@@ -98,7 +98,7 @@ int main(int argc, char ** argv) {
    auto generate = [&](const std::string & prompt) {
        std::string response;

        const bool is_first = llama_get_kv_cache_used_cells(ctx) == 0;
        const bool is_first = llama_kv_self_used_cells(ctx) == 0;

        // tokenize the prompt
        const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
@@ -113,7 +113,7 @@ int main(int argc, char ** argv) {
        while (true) {
            // check if we have enough space in the context to evaluate this batch
            int n_ctx = llama_n_ctx(ctx);
            int n_ctx_used = llama_get_kv_cache_used_cells(ctx);
            int n_ctx_used = llama_kv_self_used_cells(ctx);
            if (n_ctx_used + batch.n_tokens > n_ctx) {
                printf("\033[0m\n");
                fprintf(stderr, "context size exceeded\n");
@@ -217,7 +217,7 @@ int main(int argc, char ** argv) {
        {
            LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past);

            llama_kv_cache_seq_rm(ctx_tgt, 0, n_past, -1);
            llama_kv_self_seq_rm(ctx_tgt, 0, n_past, -1);
        }

        if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {

@@ -420,14 +420,14 @@ int main(int argc, char ** argv) {
        {
            LOG_DBG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft);

            llama_kv_cache_seq_keep(ctx_dft, s_keep);
            llama_kv_cache_seq_cp (ctx_dft, s_keep, 0, -1, -1);
            llama_kv_cache_seq_keep(ctx_dft, 0);
            llama_kv_self_seq_keep(ctx_dft, s_keep);
            llama_kv_self_seq_cp (ctx_dft, s_keep, 0, -1, -1);
            llama_kv_self_seq_keep(ctx_dft, 0);

            llama_kv_cache_seq_rm (ctx_tgt, s_keep, n_past_tgt, -1);
            llama_kv_cache_seq_keep(ctx_tgt, s_keep);
            llama_kv_cache_seq_cp (ctx_tgt, s_keep, 0, -1, -1);
            llama_kv_cache_seq_keep(ctx_tgt, 0);
            llama_kv_self_seq_rm (ctx_tgt, s_keep, n_past_tgt, -1);
            llama_kv_self_seq_keep(ctx_tgt, s_keep);
            llama_kv_self_seq_cp (ctx_tgt, s_keep, 0, -1, -1);
            llama_kv_self_seq_keep(ctx_tgt, 0);
        }

        for (int s = 0; s < n_seq_dft; ++s) {
@@ -444,7 +444,7 @@ int main(int argc, char ** argv) {
            common_batch_clear(batch_dft);
            common_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true);

            llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
            llama_kv_self_seq_rm(ctx_dft, 0, n_past_dft, -1);
            // LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
            llama_decode(ctx_dft, batch_dft);

@@ -503,8 +503,8 @@ int main(int argc, char ** argv) {
                if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_draft_split) {
                    LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur);

                    llama_kv_cache_seq_rm(ctx_dft, n_seq_cur, -1, -1);
                    llama_kv_cache_seq_cp(ctx_dft, s, n_seq_cur, -1, -1);
                    llama_kv_self_seq_rm(ctx_dft, n_seq_cur, -1, -1);
                    llama_kv_self_seq_cp(ctx_dft, s, n_seq_cur, -1, -1);

                    // all previous tokens from this branch are now also part of the new branch
                    for (int t = 0; t < batch_tgt.n_tokens; ++t) {
@@ -585,9 +585,9 @@ int main(int argc, char ** argv) {

        // evaluate the target model on the drafted tokens
        {
            llama_kv_cache_seq_keep(ctx_tgt, 0);
            llama_kv_self_seq_keep(ctx_tgt, 0);
            for (int s = 1; s < n_seq_dft; ++s) {
                llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1);
                llama_kv_self_seq_cp(ctx_tgt, 0, s, -1, -1);
            }

            // LOG_DBG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
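The hunks above track a mechanical rename of the per-context KV-cache helpers from llama_kv_cache_* to llama_kv_self_* (and llama_get_kv_cache_used_cells to llama_kv_self_used_cells). For out-of-tree example code that still uses the old names, one stop-gap while porting is a thin alias layer; this is only a sketch and assumes the old identifiers are no longer declared by the llama.h being built against:

    // Migration shim sketch: forward the old helper names to the renamed
    // llama_kv_self_* API. Prefer calling the new names directly once ported.
    #include "llama.h"

    #define llama_get_kv_cache_used_cells(ctx)         llama_kv_self_used_cells(ctx)
    #define llama_kv_cache_seq_rm(ctx, seq, p0, p1)    llama_kv_self_seq_rm(ctx, seq, p0, p1)
    #define llama_kv_cache_seq_cp(ctx, s0, s1, p0, p1) llama_kv_self_seq_cp(ctx, s0, s1, p0, p1)
    #define llama_kv_cache_seq_keep(ctx, seq)          llama_kv_self_seq_keep(ctx, seq)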
@@ -1,3 +1,5 @@
#define _USE_MATH_DEFINES // For M_PI on MSVC

#include "arg.h"
#include "common.h"
#include "sampling.h"
@@ -5,8 +7,6 @@
#include "llama.h"
#include "json.hpp"

#define _USE_MATH_DEFINES // For M_PI on MSVC

#include <algorithm>
#include <cmath>
#include <cstdio>
@@ -87,11 +87,11 @@ struct wav_header {
    uint32_t data_size;
};

static void save_wav16(const std::string & fname, const std::vector<float> & data, int sample_rate) {
static bool save_wav16(const std::string & fname, const std::vector<float> & data, int sample_rate) {
    std::ofstream file(fname, std::ios::binary);
    if (!file) {
        LOG_ERR("%s: Failed to open file '%s' for writing", __func__, fname.c_str());
        return;
        LOG_ERR("%s: Failed to open file '%s' for writing.\n", __func__, fname.c_str());
        return false;
    }

    wav_header header;
@@ -108,7 +108,7 @@ static void save_wav16(const std::string & fname, const std::vector<float> & dat
        file.write(reinterpret_cast<const char*>(&pcm_sample), sizeof(pcm_sample));
    }

    file.close();
    return file.good();
}

static void fill_hann_window(int length, bool periodic, float * output) {
@@ -536,6 +536,7 @@ static std::string audio_data_from_speaker(json speaker, const outetts_version t
int main(int argc, char ** argv) {
    common_params params;

    params.out_file = "output.wav";
    params.prompt = "";

    params.n_predict = 4096;
@@ -1060,8 +1061,6 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
    }
#endif

    const std::string fname = "output.wav";

    const int n_sr = 24000; // sampling rate

    // zero out first 0.25 seconds
@@ -1072,11 +1071,15 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
    LOG_INF("%s: time for spectral ops: %.3f ms\n", __func__, (ggml_time_us() - t_spec_start) / 1000.0f);
    LOG_INF("%s: total time: %.3f ms\n", __func__, (ggml_time_us() - t_main_start) / 1000.0f);

    save_wav16(fname, audio, n_sr);
    int retval = 0;

    LOG_INF("%s: audio written to file '%s'\n", __func__, fname.c_str());
    if (save_wav16(params.out_file, audio, n_sr)) {
        LOG_INF("%s: audio written to file '%s'\n", __func__, params.out_file.c_str());
    } else {
        retval = ENOENT;
    }

    llama_backend_free();

    return 0;
    return retval;
}
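For context on what save_wav16 is producing: a mono 16-bit PCM WAV file is a 44-byte RIFF header followed by little-endian samples, and the bool return above lets the caller map a failed write to a non-zero exit code. The following is an independent sketch of such a writer (it does not reuse the wav_header struct from the hunk; field values assume a little-endian host):

    #include <algorithm>
    #include <cstdint>
    #include <fstream>
    #include <string>
    #include <vector>

    // Minimal mono 16-bit PCM WAV writer sketch; returns false if the file
    // could not be opened or the stream went bad while writing.
    static bool write_wav16_sketch(const std::string & fname, const std::vector<float> & data, int sample_rate) {
        std::ofstream f(fname, std::ios::binary);
        if (!f) {
            return false;
        }
        const uint32_t data_size   = (uint32_t) (data.size() * sizeof(int16_t));
        const uint32_t chunk_size  = 36 + data_size;          // RIFF chunk size
        const uint16_t channels    = 1, bits = 16, fmt_tag = 1;
        const uint16_t block_align = channels * bits / 8;
        const uint32_t byte_rate   = (uint32_t) sample_rate * block_align;
        const uint32_t fmt_size    = 16;
        f.write("RIFF", 4); f.write((const char *) &chunk_size, 4); f.write("WAVE", 4);
        f.write("fmt ", 4); f.write((const char *) &fmt_size, 4);
        f.write((const char *) &fmt_tag, 2);     f.write((const char *) &channels, 2);
        f.write((const char *) &sample_rate, 4); f.write((const char *) &byte_rate, 4);
        f.write((const char *) &block_align, 2); f.write((const char *) &bits, 2);
        f.write("data", 4); f.write((const char *) &data_size, 4);
        for (float s : data) {
            const int16_t pcm = (int16_t) (std::max(-1.0f, std::min(1.0f, s)) * 32767.0f);
            f.write((const char *) &pcm, sizeof(pcm));        // clamp then quantize each sample
        }
        return f.good();
    }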
@@ -106,6 +106,7 @@ option(GGML_CPU_KLEIDIAI "ggml: use KleidiAI optimized kernels if applicable
option(GGML_AVX         "ggml: enable AVX"         ${INS_ENB})
option(GGML_AVX_VNNI    "ggml: enable AVX-VNNI"    OFF)
option(GGML_AVX2        "ggml: enable AVX2"        ${INS_ENB})
option(GGML_BMI2        "ggml: enable BMI2"        ${INS_ENB})
option(GGML_AVX512      "ggml: enable AVX512F"     OFF)
option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF)
option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF)
@@ -194,6 +195,8 @@ option(GGML_OPENCL "ggml: use OpenCL"
option(GGML_OPENCL_PROFILING          "ggml: use OpenCL profiling (increases overhead)" OFF)
option(GGML_OPENCL_EMBED_KERNELS      "ggml: embed kernels"                             ON)
option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adreno"          ON)
set   (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING
                                  "gmml: OpenCL API version to target")

# toolchain for vulkan-shaders-gen
set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")
26  ggml/cmake/common.cmake  (new file)
@@ -0,0 +1,26 @@
function(ggml_get_flags CCID CCVER)
    set(C_FLAGS "")
    set(CXX_FLAGS "")

    if (CCID MATCHES "Clang")
        set(C_FLAGS   -Wunreachable-code-break -Wunreachable-code-return)
        set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi)

        if (
            (CCID STREQUAL "Clang"      AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR
            (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
        )
            list(APPEND C_FLAGS -Wdouble-promotion)
        endif()
    elseif (CCID STREQUAL "GNU")
        set(C_FLAGS   -Wdouble-promotion)
        set(CXX_FLAGS -Wno-array-bounds)

        if (CCVER VERSION_GREATER_EQUAL 8.1.0)
            list(APPEND CXX_FLAGS -Wextra-semi)
        endif()
    endif()

    set(GF_C_FLAGS   ${C_FLAGS}   PARENT_SCOPE)
    set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE)
endfunction()
@@ -80,6 +80,7 @@ extern "C" {
    GGML_BACKEND_API int ggml_cpu_has_avx      (void);
    GGML_BACKEND_API int ggml_cpu_has_avx_vnni (void);
    GGML_BACKEND_API int ggml_cpu_has_avx2     (void);
    GGML_BACKEND_API int ggml_cpu_has_bmi2     (void);
    GGML_BACKEND_API int ggml_cpu_has_f16c     (void);
    GGML_BACKEND_API int ggml_cpu_has_fma      (void);
    GGML_BACKEND_API int ggml_cpu_has_avx512   (void);
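The new ggml_cpu_has_bmi2() probe follows the same convention as the other ggml_cpu_has_* functions declared in this hunk: it returns 1 when the CPU backend was built with the corresponding instruction set. As a usage illustration only (not part of the diff), a program linked against the CPU backend can report them like this:

    #include <cstdio>
    #include "ggml-cpu.h"

    // Print a few of the build-time CPU feature probes declared above.
    int main() {
        std::printf("AVX2   : %d\n", ggml_cpu_has_avx2());
        std::printf("BMI2   : %d\n", ggml_cpu_has_bmi2());
        std::printf("F16C   : %d\n", ggml_cpu_has_f16c());
        std::printf("FMA    : %d\n", ggml_cpu_has_fma());
        std::printf("AVX512 : %d\n", ggml_cpu_has_avx512());
        return 0;
    }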
@@ -2140,7 +2140,11 @@ extern "C" {
#        define GGML_RESTRICT
#    endif
#else
#    define GGML_RESTRICT restrict
#    if defined (_MSC_VER) && (__STDC_VERSION__ < 201112L)
#        define GGML_RESTRICT __restrict
#    else
#        define GGML_RESTRICT restrict
#    endif
#endif
    typedef void (*ggml_to_float_t)  (const void  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
    typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int64_t k);
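The hunk above exists because MSVC's C front end does not accept the C99 restrict keyword when __STDC_VERSION__ is old, but it does accept __restrict, so the qualifier is routed through a macro. A self-contained sketch of the same pattern, using an illustrative macro name rather than GGML_RESTRICT:

    // Portable restrict qualifier: empty in C++, __restrict for MSVC's older C
    // mode, and the standard C99 keyword everywhere else.
    #if defined(__cplusplus)
    #    define MY_RESTRICT                 /* C++ has no standard restrict */
    #elif defined(_MSC_VER) && (!defined(__STDC_VERSION__) || __STDC_VERSION__ < 201112L)
    #    define MY_RESTRICT __restrict
    #else
    #    define MY_RESTRICT restrict
    #endif

    // Example: the qualifier promises x and y do not alias, which lets the
    // compiler vectorize this loop more aggressively.
    static void scale_f32(int n, float * MY_RESTRICT y, const float * MY_RESTRICT x, float v) {
        for (int i = 0; i < n; ++i) {
            y[i] = v * x[i];
        }
    }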
@@ -1,4 +1,5 @@
include(CheckCXXCompilerFlag)
include("../cmake/common.cmake")

add_compile_definitions(GGML_SCHED_MAX_COPIES=${GGML_SCHED_MAX_COPIES})

@@ -24,33 +25,6 @@ if (NOT MSVC)
    endif()
endif()

function(ggml_get_flags CCID CCVER)
    set(C_FLAGS "")
    set(CXX_FLAGS "")

    if (CCID MATCHES "Clang")
        set(C_FLAGS -Wunreachable-code-break -Wunreachable-code-return)
        set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi)

        if (
            (CCID STREQUAL "Clang"      AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR
            (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
        )
            list(APPEND C_FLAGS -Wdouble-promotion)
        endif()
    elseif (CCID STREQUAL "GNU")
        set(C_FLAGS -Wdouble-promotion)
        set(CXX_FLAGS -Wno-array-bounds)

        if (CCVER VERSION_GREATER_EQUAL 8.1.0)
            list(APPEND CXX_FLAGS -Wextra-semi)
        endif()
    endif()

    set(GF_C_FLAGS ${C_FLAGS} PARENT_SCOPE)
    set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE)
endfunction()

if (GGML_FATAL_WARNINGS)
    if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
        list(APPEND C_FLAGS -Werror)
@@ -236,7 +210,7 @@ add_library(ggml
target_link_libraries(ggml PUBLIC ggml-base)

if (CMAKE_SYSTEM_NAME MATCHES "Linux")
    target_link_libraries(ggml PRIVATE dl)
    target_link_libraries(ggml PRIVATE dl stdc++fs)
endif()

function(ggml_add_backend_library backend)
@@ -289,7 +263,7 @@ function(ggml_add_cpu_backend_variant tag_name)
    set(GGML_CPU_TAG_NAME ${tag_name})
    # other: OPENMP LLAMAFILE CPU_HBM
    foreach (feat NATIVE
                  AVX AVX2 AVX_VNNI FMA F16C
                  AVX AVX2 BMI2 AVX_VNNI FMA F16C
                  AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
                  AMX_TILE AMX_INT8 AMX_BF16)
        set(GGML_${feat} OFF)
@@ -309,13 +283,13 @@ if (GGML_CPU_ALL_VARIANTS)
        message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
    endif()
    ggml_add_cpu_backend_variant(sandybridge AVX)
    ggml_add_cpu_backend_variant(haswell     AVX F16C AVX2 FMA)
    ggml_add_cpu_backend_variant(skylakex    AVX F16C AVX2 FMA AVX512)
    ggml_add_cpu_backend_variant(icelake     AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
    ggml_add_cpu_backend_variant(alderlake   AVX F16C AVX2 FMA AVX_VNNI)
    ggml_add_cpu_backend_variant(haswell     AVX F16C AVX2 BMI2 FMA)
    ggml_add_cpu_backend_variant(skylakex    AVX F16C AVX2 BMI2 FMA AVX512)
    ggml_add_cpu_backend_variant(icelake     AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
    ggml_add_cpu_backend_variant(alderlake   AVX F16C AVX2 BMI2 FMA AVX_VNNI)
    if (NOT MSVC)
        # MSVC doesn't support AMX
        ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
        ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
    endif()
elseif (GGML_CPU)
    ggml_add_cpu_backend_variant_impl("")
@@ -8,6 +8,7 @@
#include <string>
#include <type_traits>
#include <vector>
#include <cctype>

#ifdef _WIN32
#    define WIN32_LEAN_AND_MEAN
@@ -75,7 +76,14 @@ namespace fs = std::filesystem;
static std::string path_str(const fs::path & path) {
    std::string u8path;
    try {
#if defined(__cpp_lib_char8_t)
        // C++20 and later: u8string() returns std::u8string
        std::u8string u8str = path.u8string();
        u8path = std::string(reinterpret_cast<const char*>(u8str.c_str()));
#else
        // C++17: u8string() returns std::string
        u8path = path.u8string();
#endif
    } catch (...) {
    }
    return u8path;
@@ -489,7 +497,7 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
        search_paths.push_back(get_executable_path());
        search_paths.push_back(fs::current_path());
    } else {
        search_paths.push_back(user_search_path);
        search_paths.push_back(fs::u8path(user_search_path));
    }

    int best_score = 0;
@@ -503,9 +511,9 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
        fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
        for (const auto & entry : dir_it) {
            if (entry.is_regular_file()) {
                auto filename = entry.path().filename().native();
                auto ext = entry.path().extension().native();
                if (filename.find(file_prefix) == 0 && ext == file_extension) {
                auto filename = entry.path().filename();
                auto ext = entry.path().extension();
                if (filename.native().find(file_prefix) == 0 && ext == file_extension) {
                    dl_handle_ptr handle { dl_load_library(entry) };
                    if (!handle && !silent) {
                        GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path_str(entry.path()).c_str());
@@ -536,7 +544,7 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
    // try to load the base backend
    for (const auto & search_path : search_paths) {
        fs::path filename = backend_filename_prefix().native() + name_path.native() + backend_filename_extension().native();
        fs::path path = search_path.native() + filename.native();
        fs::path path = search_path / filename;
        if (fs::exists(path)) {
            return get_reg().load_backend(path, silent);
        }
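The filesystem hunks above replace concatenation of native path strings with fs::path composition (operator/) and route user-supplied search paths through fs::u8path, so non-ASCII directories behave on Windows where the native encoding is wide. A small standalone sketch of that idiom follows; the "libggml-"/".so" naming is purely illustrative and not the loader's actual scheme:

    #include <filesystem>
    #include <string>

    namespace fs = std::filesystem;

    // Sketch: build "<dir>/<prefix><name><ext>" portably from a UTF-8 directory
    // string; operator/ inserts the platform-appropriate separator.
    static fs::path make_backend_path(const std::string & dir_utf8, const std::string & name) {
        const fs::path dir = fs::u8path(dir_utf8);      // interpret the input as UTF-8
        return dir / ("libggml-" + name + ".so");
    }

    // Convert a path back to a UTF-8 std::string for logging, covering both the
    // C++17 and C++20 return types of u8string().
    static std::string to_utf8(const fs::path & p) {
    #if defined(__cpp_lib_char8_t)
        const std::u8string s = p.u8string();
        return std::string(reinterpret_cast<const char *>(s.c_str()));
    #else
        return p.u8string();
    #endif
    }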
@@ -21,6 +21,7 @@
#include <string.h>
#include <string>
#include <vector>
#include <algorithm>

#ifdef __APPLE__
#include <sys/types.h>
@@ -2790,10 +2790,14 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
            (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
            output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
            output_ne_offset);
        int64_t antiquantGroupSize = 0;
        if (src0->ne[0] > QK8_0) {
            antiquantGroupSize = QK8_0;
        }

        ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
            acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr,
            nullptr, nullptr, nullptr, QK8_0, acl_output_tensor,
            nullptr, nullptr, nullptr, antiquantGroupSize, acl_output_tensor,
            &workspaceSize, &executor));
        if (workspaceAddr == nullptr) {
            workspaceAddr = workspace_allocator.alloc(workspaceSize);
@@ -2833,7 +2837,7 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,

            ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
                acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
                nullptr, nullptr, nullptr, nullptr, QK8_0,
                nullptr, nullptr, nullptr, nullptr, antiquantGroupSize,
                acl_output_tensor, &workspaceSize, &executor));
            ACL_CHECK(aclnnWeightQuantBatchMatmulV2(
                workspaceAddr, workspaceSize, executor, ctx.stream()));
@@ -1689,11 +1689,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
        case GGML_OP_MUL_MAT: {
            switch (op->src[0]->type) {
                case GGML_TYPE_Q8_0:
                    // Current groupsize should not be greater than k-1 in
                    // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize
                    if (op->src[0]->ne[0] <= QK8_0) {
                        return false;
                    }
                case GGML_TYPE_F16:
                case GGML_TYPE_F32:
                case GGML_TYPE_Q4_0:
@@ -219,6 +219,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
            if (GGML_AVX_VNNI)
                list(APPEND ARCH_DEFINITIONS __AVXVNNI__ GGML_AVX_VNNI)
            endif()
            if (GGML_BMI2)
                # MSVC does not define macro __BMI2__
                list(APPEND ARCH_DEFINITIONS __BMI2__ GGML_BMI2)
            endif()
        else ()
            if (GGML_NATIVE)
                list(APPEND ARCH_FLAGS -march=native)
@@ -233,6 +237,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
                list(APPEND ARCH_FLAGS -mfma)
                list(APPEND ARCH_DEFINITIONS GGML_FMA)
            endif()
            if (GGML_BMI2)
                list(APPEND ARCH_FLAGS -mbmi2)
                list(APPEND ARCH_DEFINITIONS GGML_BMI2)
            endif()
            if (GGML_AVX)
                list(APPEND ARCH_FLAGS -mavx)
                list(APPEND ARCH_DEFINITIONS GGML_AVX)
@@ -278,6 +278,10 @@ static int ggml_backend_cpu_x86_score() {
    if (!is.SSE42()) { return 0; }
    score += 1<<2;
#endif
#ifdef GGML_BMI2
    if (!is.BMI2()) { return 0; }
    score += 1<<3;
#endif
#ifdef GGML_AVX
    if (!is.AVX()) { return 0; }
    score += 1<<4;
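The scoring hunk above gives each compiled-in instruction-set requirement its own bit, so a CPU-backend variant built with more features outranks one built with fewer, and is rejected outright if the host CPU lacks any required feature. The following isolated sketch restates that idea; the GGML_* macro names other than GGML_BMI2/GGML_AVX and the stubbed probes are illustrative, not the backend's real ones:

    // Variant-scoring sketch with placeholder feature probes; real code derives
    // these from CPUID.
    static bool cpu_has_sse42() { return true; }  // placeholder
    static bool cpu_has_bmi2()  { return true; }  // placeholder
    static bool cpu_has_avx()   { return true; }  // placeholder

    static int variant_score() {
        int score = 0;
    #ifdef GGML_SSE42
        if (!cpu_has_sse42()) { return 0; }   // required feature missing -> variant unusable
        score += 1 << 2;
    #endif
    #ifdef GGML_BMI2
        if (!cpu_has_bmi2())  { return 0; }
        score += 1 << 3;                      // each feature contributes its own bit
    #endif
    #ifdef GGML_AVX
        if (!cpu_has_avx())   { return 0; }
        score += 1 << 4;
    #endif
        return score;                         // the loader keeps the highest-scoring variant
    }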
File diff suppressed because it is too large
@@ -247,9 +247,9 @@ typedef pthread_t ggml_thread_t;
|
||||
static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
|
||||
|
||||
|
||||
static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc);
|
||||
static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc);
|
||||
static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc);
|
||||
static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc);
|
||||
static void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);
|
||||
static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc);
|
||||
|
||||
static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
|
||||
[GGML_TYPE_F32] = {
|
||||
@@ -1451,7 +1451,7 @@ inline static void ggml_vec_div_f16 (const int n, ggml_fp16_t * z, const ggml_fp
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) {
|
||||
static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc) {
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
UNUSED(bx);
|
||||
@@ -1494,7 +1494,7 @@ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float *
|
||||
*s = sumf;
|
||||
}
|
||||
|
||||
static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc) {
|
||||
static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc) {
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
UNUSED(bx);
|
||||
@@ -1562,7 +1562,7 @@ static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t
|
||||
*s = sumf;
|
||||
}
|
||||
|
||||
static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc) {
|
||||
static void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc) {
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
UNUSED(bx);
|
||||
@@ -1606,10 +1606,10 @@ static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t *
|
||||
|
||||
// compute GGML_VEC_DOT_UNROLL dot products at once
|
||||
// xs - x row stride in bytes
|
||||
inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_fp16_t * restrict y) {
|
||||
inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GGML_RESTRICT s, void * GGML_RESTRICT xv, ggml_fp16_t * GGML_RESTRICT y) {
|
||||
ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 };
|
||||
|
||||
ggml_fp16_t * restrict x[GGML_VEC_DOT_UNROLL];
|
||||
ggml_fp16_t * GGML_RESTRICT x[GGML_VEC_DOT_UNROLL];
|
||||
|
||||
for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
|
||||
x[i] = (ggml_fp16_t *) ((char *) xv + i*xs);
|
||||
@@ -1659,7 +1659,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * re
|
||||
}
|
||||
}
|
||||
|
||||
inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) {
|
||||
inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) {
|
||||
#if defined(GGML_SIMD)
|
||||
const int np = (n & ~(GGML_F32_STEP - 1));
|
||||
|
||||
@@ -1690,7 +1690,7 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float
|
||||
#endif
|
||||
}
|
||||
|
||||
inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, const ggml_fp16_t * restrict x, const float v) {
|
||||
inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
|
||||
#if defined(GGML_SIMD)
|
||||
const int np = (n & ~(GGML_F16_STEP - 1));
|
||||
|
||||
@@ -1722,10 +1722,10 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, const
|
||||
}
|
||||
|
||||
// xs and vs are byte strides of x and v
|
||||
inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * restrict y, const float * restrict xv, const float * restrict vv) {
|
||||
inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * GGML_RESTRICT y, const float * GGML_RESTRICT xv, const float * GGML_RESTRICT vv) {
|
||||
|
||||
const float * restrict x[GGML_VEC_MAD_UNROLL];
|
||||
const float * restrict v[GGML_VEC_MAD_UNROLL];
|
||||
const float * GGML_RESTRICT x[GGML_VEC_MAD_UNROLL];
|
||||
const float * GGML_RESTRICT v[GGML_VEC_MAD_UNROLL];
|
||||
|
||||
for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) {
|
||||
x[i] = (const float *) ((const char *) xv + i*xs);
|
||||
@@ -6648,6 +6648,135 @@ static void ggml_compute_forward_repeat_back(
|
||||
|
||||
// ggml_compute_forward_concat
|
||||
|
||||
static void ggml_compute_forward_concat_any(
|
||||
const struct ggml_compute_params * params,
|
||||
struct ggml_tensor * dst) {
|
||||
|
||||
const struct ggml_tensor * src0 = dst->src[0];
|
||||
const struct ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
const size_t len = ggml_type_size(src0->type);
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
const int32_t dim = ggml_get_op_params_i32(dst, 0);
|
||||
|
||||
GGML_ASSERT(dim >= 0 && dim < 4);
|
||||
|
||||
int64_t o[4] = {0, 0, 0, 0};
|
||||
o[dim] = src0->ne[dim];
|
||||
|
||||
const char * x;
|
||||
|
||||
// TODO: smarter multi-theading
|
||||
for (int i3 = 0; i3 < ne3; i3++) {
|
||||
for (int i2 = ith; i2 < ne2; i2 += nth) {
|
||||
for (int i1 = 0; i1 < ne1; i1++) {
|
||||
for (int i0 = 0; i0 < ne0; i0++) {
|
||||
if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
|
||||
x = (const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03;
|
||||
} else {
|
||||
x = (const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13;
|
||||
}
|
||||
|
||||
char * y = (char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3;
|
||||
|
||||
memcpy(y, x, len);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_concat_i8(
|
||||
const struct ggml_compute_params * params,
|
||||
struct ggml_tensor * dst) {
|
||||
|
||||
const struct ggml_tensor * src0 = dst->src[0];
|
||||
const struct ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
GGML_ASSERT(ggml_type_size(src0->type) == sizeof(int8_t));
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
const int32_t dim = ggml_get_op_params_i32(dst, 0);
|
||||
|
||||
GGML_ASSERT(dim >= 0 && dim < 4);
|
||||
|
||||
int64_t o[4] = {0, 0, 0, 0};
|
||||
o[dim] = src0->ne[dim];
|
||||
|
||||
const int8_t * x;
|
||||
|
||||
// TODO: smarter multi-theading
|
||||
for (int i3 = 0; i3 < ne3; i3++) {
|
||||
for (int i2 = ith; i2 < ne2; i2 += nth) {
|
||||
for (int i1 = 0; i1 < ne1; i1++) {
|
||||
for (int i0 = 0; i0 < ne0; i0++) {
|
||||
if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
|
||||
x = (const int8_t *) ((const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03);
|
||||
} else {
|
||||
x = (const int8_t *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13);
|
||||
}
|
||||
|
||||
int8_t * y = (int8_t *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
|
||||
|
||||
*y = *x;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_concat_f16(
|
||||
const struct ggml_compute_params * params,
|
||||
struct ggml_tensor * dst) {
|
||||
|
||||
const struct ggml_tensor * src0 = dst->src[0];
|
||||
const struct ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
GGML_ASSERT(ggml_type_size(src0->type) == sizeof(ggml_fp16_t));
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
const int32_t dim = ggml_get_op_params_i32(dst, 0);
|
||||
|
||||
GGML_ASSERT(dim >= 0 && dim < 4);
|
||||
|
||||
int64_t o[4] = {0, 0, 0, 0};
|
||||
o[dim] = src0->ne[dim];
|
||||
|
||||
const ggml_fp16_t * x;
|
||||
|
||||
// TODO: smarter multi-theading
|
||||
for (int i3 = 0; i3 < ne3; i3++) {
|
||||
for (int i2 = ith; i2 < ne2; i2 += nth) {
|
||||
for (int i1 = 0; i1 < ne1; i1++) {
|
||||
for (int i0 = 0; i0 < ne0; i0++) {
|
||||
if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
|
||||
x = (const ggml_fp16_t *) ((const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03);
|
||||
} else {
|
||||
x = (const ggml_fp16_t *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13);
|
||||
}
|
||||
|
||||
ggml_fp16_t * y = (ggml_fp16_t *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
|
||||
|
||||
*y = *x;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_concat_f32(
|
||||
const struct ggml_compute_params * params,
|
||||
struct ggml_tensor * dst) {
|
||||
@@ -6655,7 +6784,7 @@ static void ggml_compute_forward_concat_f32(
|
||||
const struct ggml_tensor * src0 = dst->src[0];
|
||||
const struct ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
GGML_ASSERT(src0->nb[0] == sizeof(float));
|
||||
GGML_ASSERT(ggml_type_size(src0->type) == sizeof(float));
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
@@ -6698,6 +6827,16 @@ static void ggml_compute_forward_concat(
|
||||
const struct ggml_tensor * src0 = dst->src[0];
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_BF16:
|
||||
case GGML_TYPE_I16:
|
||||
{
|
||||
ggml_compute_forward_concat_f16(params, dst);
|
||||
} break;
|
||||
case GGML_TYPE_I8:
|
||||
{
|
||||
ggml_compute_forward_concat_i8(params, dst);
|
||||
} break;
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_I32:
|
||||
{
|
||||
@@ -6705,7 +6844,7 @@ static void ggml_compute_forward_concat(
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
GGML_ABORT("fatal error");
|
||||
ggml_compute_forward_concat_any(params, dst);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -15440,6 +15579,14 @@ int ggml_cpu_has_amx_int8(void) {
#endif
}

int ggml_cpu_has_bmi2(void) {
#if defined(__BMI2__)
    return 1;
#else
    return 0;
#endif
}

int ggml_cpu_has_fma(void) {
#if defined(__FMA__)
    return 1;
|
||||
|
||||
@@ -511,6 +511,9 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
    if (ggml_cpu_has_fma()) {
        features.push_back({ "FMA", "1" });
    }
    if (ggml_cpu_has_bmi2()) {
        features.push_back({ "BMI2", "1" });
    }
    if (ggml_cpu_has_avx512()) {
        features.push_back({ "AVX512", "1" });
    }
|
||||
|
||||
@@ -395,11 +395,11 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half

static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2)
#if defined(CDNA) || defined(RDNA2) || defined(__gfx906__)
    c = __builtin_amdgcn_sdot4(a, b, c, false);
#elif defined(RDNA3)
    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
#elif defined(__gfx1010__) || defined(__gfx900__)
#elif defined(RDNA1) || defined(__gfx900__)
    int tmp1;
    int tmp2;
    asm("\n \
|
||||
|
||||
@@ -52,12 +52,11 @@ typedef half (*vec_dot_KQ_f16_t)(
|
||||
typedef float (*vec_dot_KQ_f32_t)(
|
||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8 , const void * __restrict__ Q_ds);
|
||||
|
||||
template<typename T, int D>
|
||||
template<typename T, int D, int warp_size>
|
||||
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
|
||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
|
||||
|
||||
const block_q4_0 * K_q4_0 = (const block_q4_0 *) K_c;
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
GGML_UNUSED(Q_v);
|
||||
|
||||
T sum = 0.0f;
|
||||
@@ -93,12 +92,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
|
||||
return sum;
|
||||
}
|
||||
|
||||
template<typename T, int D>
|
||||
template<typename T, int D, int warp_size>
|
||||
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
|
||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
|
||||
|
||||
const block_q4_1 * K_q4_1 = (const block_q4_1 *) K_c;
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
GGML_UNUSED(Q_v);
|
||||
|
||||
T sum = 0.0f;
|
||||
@@ -138,12 +136,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
|
||||
return sum;
|
||||
}
|
||||
|
||||
template<typename T, int D>
|
||||
template<typename T, int D, int warp_size>
|
||||
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
|
||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
|
||||
|
||||
const block_q5_0 * K_q5_0 = (const block_q5_0 *) K_c;
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
GGML_UNUSED(Q_v);
|
||||
|
||||
T sum = 0.0f;
|
||||
@@ -186,12 +183,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
|
||||
return sum;
|
||||
}
|
||||
|
||||
template<typename T, int D>
|
||||
template<typename T, int D, int warp_size>
|
||||
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
|
||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
|
||||
|
||||
const block_q5_1 * K_q5_1 = (const block_q5_1 *) K_c;
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
GGML_UNUSED(Q_v);
|
||||
|
||||
T sum = 0.0f;
|
||||
@@ -238,12 +234,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
|
||||
return sum;
|
||||
}
|
||||
|
||||
template <typename T, int D>
|
||||
template <typename T, int D, int warp_size>
|
||||
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0(
|
||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
|
||||
|
||||
const block_q8_0 * K_q8_0 = (const block_q8_0 *) K_c;
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
GGML_UNUSED(Q_v);
|
||||
|
||||
T sum = 0.0f;
|
||||
@@ -272,12 +267,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0(
|
||||
return sum;
|
||||
}
|
||||
|
||||
template <typename T, int D>
|
||||
template <typename T, int D, int warp_size>
|
||||
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_f16(
|
||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8 , const void * __restrict__ Q_ds_v) {
|
||||
|
||||
const half2 * K_h2 = (const half2 *) K_c;
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
GGML_UNUSED(Q_q8);
|
||||
GGML_UNUSED(Q_ds_v);
|
||||
|
||||
@@ -480,25 +474,25 @@ static __device__ __forceinline__ T dequantize_1_f16(const void * __restrict__ v
|
||||
return x[i];
|
||||
}
|
||||
|
||||
template <int D>
|
||||
template <int D, int warp_size = WARP_SIZE>
|
||||
constexpr __device__ vec_dot_KQ_f16_t get_vec_dot_KQ_f16(ggml_type type_K) {
|
||||
return type_K == GGML_TYPE_Q4_0 ? vec_dot_fattn_vec_KQ_q4_0<half, D> :
|
||||
type_K == GGML_TYPE_Q4_1 ? vec_dot_fattn_vec_KQ_q4_1<half, D> :
|
||||
type_K == GGML_TYPE_Q5_0 ? vec_dot_fattn_vec_KQ_q5_0<half, D> :
|
||||
type_K == GGML_TYPE_Q5_1 ? vec_dot_fattn_vec_KQ_q5_1<half, D> :
|
||||
type_K == GGML_TYPE_Q8_0 ? vec_dot_fattn_vec_KQ_q8_0<half, D> :
|
||||
type_K == GGML_TYPE_F16 ? vec_dot_fattn_vec_KQ_f16<half, D> :
|
||||
return type_K == GGML_TYPE_Q4_0 ? vec_dot_fattn_vec_KQ_q4_0<half, D, warp_size> :
|
||||
type_K == GGML_TYPE_Q4_1 ? vec_dot_fattn_vec_KQ_q4_1<half, D, warp_size> :
|
||||
type_K == GGML_TYPE_Q5_0 ? vec_dot_fattn_vec_KQ_q5_0<half, D, warp_size> :
|
||||
type_K == GGML_TYPE_Q5_1 ? vec_dot_fattn_vec_KQ_q5_1<half, D, warp_size> :
|
||||
type_K == GGML_TYPE_Q8_0 ? vec_dot_fattn_vec_KQ_q8_0<half, D, warp_size> :
|
||||
type_K == GGML_TYPE_F16 ? vec_dot_fattn_vec_KQ_f16<half, D, warp_size> :
|
||||
nullptr;
|
||||
}
|
||||
|
||||
template <int D>
|
||||
template <int D, int warp_size = WARP_SIZE>
|
||||
constexpr __device__ vec_dot_KQ_f32_t get_vec_dot_KQ_f32(ggml_type type_K) {
|
||||
return type_K == GGML_TYPE_Q4_0 ? vec_dot_fattn_vec_KQ_q4_0<float, D> :
|
||||
type_K == GGML_TYPE_Q4_1 ? vec_dot_fattn_vec_KQ_q4_1<float, D> :
|
||||
type_K == GGML_TYPE_Q5_0 ? vec_dot_fattn_vec_KQ_q5_0<float, D> :
|
||||
type_K == GGML_TYPE_Q5_1 ? vec_dot_fattn_vec_KQ_q5_1<float, D> :
|
||||
type_K == GGML_TYPE_Q8_0 ? vec_dot_fattn_vec_KQ_q8_0<float, D> :
|
||||
type_K == GGML_TYPE_F16 ? vec_dot_fattn_vec_KQ_f16<float, D> :
|
||||
return type_K == GGML_TYPE_Q4_0 ? vec_dot_fattn_vec_KQ_q4_0<float, D, warp_size> :
|
||||
type_K == GGML_TYPE_Q4_1 ? vec_dot_fattn_vec_KQ_q4_1<float, D, warp_size> :
|
||||
type_K == GGML_TYPE_Q5_0 ? vec_dot_fattn_vec_KQ_q5_0<float, D, warp_size> :
|
||||
type_K == GGML_TYPE_Q5_1 ? vec_dot_fattn_vec_KQ_q5_1<float, D, warp_size> :
|
||||
type_K == GGML_TYPE_Q8_0 ? vec_dot_fattn_vec_KQ_q8_0<float, D, warp_size> :
|
||||
type_K == GGML_TYPE_F16 ? vec_dot_fattn_vec_KQ_f16<float, D, warp_size> :
|
||||
nullptr;
|
||||
}
|
||||
|
||||
@@ -681,7 +675,8 @@ static void on_no_fattn_vec_case(const int D) {
|
||||
template <int D, int ncols1, int ncols2, int parallel_blocks, int KQ_stride>
|
||||
void launch_fattn(
|
||||
ggml_backend_cuda_context & ctx, ggml_tensor * dst, fattn_kernel_t fattn_kernel,
|
||||
const int nwarps, const size_t nbytes_shared, const bool need_f16_K, const bool need_f16_V
|
||||
const int nwarps, const size_t nbytes_shared, const bool need_f16_K, const bool need_f16_V,
|
||||
const int warp_size = WARP_SIZE
|
||||
) {
|
||||
constexpr int ncols = ncols1 * ncols2;
|
||||
|
||||
@@ -704,8 +699,6 @@ void launch_fattn(
|
||||
|
||||
GGML_ASSERT(Q->ne[3] == 1);
|
||||
|
||||
const int warp_size = ggml_cuda_info().devices[ctx.device].warp_size;
|
||||
|
||||
ggml_cuda_pool & pool = ctx.pool();
|
||||
cudaStream_t main_stream = ctx.stream();
|
||||
const int id = ggml_cuda_get_device();
|
||||
@@ -805,7 +798,6 @@ void launch_fattn(
|
||||
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
||||
|
||||
GGML_ASSERT(block_dim.x % warp_size == 0);
|
||||
GGML_ASSERT(!GGML_CUDA_CC_IS_AMD(cc) || block_dim.x * block_dim.y <= 4 * (unsigned int)warp_size);
|
||||
fattn_kernel<<<blocks_num, block_dim, nbytes_shared, main_stream>>>(
|
||||
(const char *) Q->data,
|
||||
K_data,
|
||||
|
||||
@@ -469,6 +469,7 @@ void ggml_cuda_flash_attn_ext_wmma_f16_case(ggml_backend_cuda_context & ctx, ggm
|
||||
constexpr int frag_m = cols_per_block == 8 && D % 32 == 0 ? 32 : 16;
|
||||
const int blocks_num_pb1 = ((Q->ne[1] + cols_per_block - 1) / cols_per_block)*Q->ne[2]*Q->ne[3];
|
||||
const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm;
|
||||
const int warp_size = ggml_cuda_info().devices[ggml_cuda_get_device()].warp_size;
|
||||
|
||||
float logit_softcap;
|
||||
memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
|
||||
@@ -485,7 +486,7 @@ void ggml_cuda_flash_attn_ext_wmma_f16_case(ggml_backend_cuda_context & ctx, ggm
|
||||
fattn_kernel = flash_attn_ext_f16<
|
||||
D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
|
||||
}
|
||||
launch_fattn<D, cols_per_block, 1, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, 0, true, true);
|
||||
launch_fattn<D, cols_per_block, 1, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, 0, true, true, warp_size);
|
||||
return;
|
||||
}
|
||||
if (2*blocks_num_pb1 < 2*nsm) {
|
||||
@@ -500,7 +501,7 @@ void ggml_cuda_flash_attn_ext_wmma_f16_case(ggml_backend_cuda_context & ctx, ggm
|
||||
fattn_kernel = flash_attn_ext_f16<
|
||||
D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
|
||||
}
|
||||
launch_fattn<D, cols_per_block, 1, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, 0, true, true);
|
||||
launch_fattn<D, cols_per_block, 1, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, 0, true, true, warp_size);
|
||||
return;
|
||||
}
|
||||
constexpr int parallel_blocks = 1;
|
||||
@@ -514,7 +515,7 @@ void ggml_cuda_flash_attn_ext_wmma_f16_case(ggml_backend_cuda_context & ctx, ggm
|
||||
fattn_kernel = flash_attn_ext_f16<
|
||||
D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
|
||||
}
|
||||
launch_fattn<D, cols_per_block, 1, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, 0, true, true);
|
||||
launch_fattn<D, cols_per_block, 1, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, 0, true, true, warp_size);
|
||||
}
|
||||
|
||||
void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
|
||||
@@ -310,7 +310,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
||||
}
|
||||
|
||||
// The MMA implementation needs Turing or newer, use the old WMMA code for Volta:
|
||||
if (cc == GGML_CUDA_CC_VOLTA) {
|
||||
if (fp16_mma_available(cc) && !new_mma_available(cc)) {
|
||||
ggml_cuda_flash_attn_ext_wmma_f16(ctx, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -2571,7 +2571,7 @@ static void maintain_cuda_graph(ggml_backend_cuda_context * cuda_ctx, std::vecto
    for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
        if(count(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), cuda_ctx->cuda_graph->params[i].func) > 0) {
            char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++);
            cuda_ctx->cuda_graph->params[i].kernelParams[1] = updated_kernel_arg_ptr;
            *(void**)cuda_ctx->cuda_graph->params[i].kernelParams[1] = *(void**)updated_kernel_arg_ptr;
            CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]));
        }
    }
|
||||
|
||||
@@ -47,11 +47,89 @@ static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
|
||||
1;
|
||||
}
|
||||
|
||||
enum mmvq_parameter_table_id {
|
||||
MMVQ_PARAMETERS_GENERIC = 0,
|
||||
MMVQ_PARAMETERS_GCN,
|
||||
MMVQ_PARAMETERS_RDNA2
|
||||
};
|
||||
|
||||
static constexpr __device__ mmvq_parameter_table_id get_device_table_id() {
|
||||
#if defined(RDNA2) || defined(RDNA3)
|
||||
return MMVQ_PARAMETERS_RDNA2;
|
||||
#elif defined(GCN) || defined(CDNA)
|
||||
return MMVQ_PARAMETERS_GCN;
|
||||
#else
|
||||
return MMVQ_PARAMETERS_GENERIC;
|
||||
#endif
|
||||
}
|
||||
|
||||
static __host__ mmvq_parameter_table_id get_device_table_id(int cc) {
|
||||
if (GGML_CUDA_CC_IS_RDNA2(cc) || GGML_CUDA_CC_IS_RDNA3(cc)) {
|
||||
return MMVQ_PARAMETERS_RDNA2;
|
||||
}
|
||||
if (GGML_CUDA_CC_IS_GCN(cc) || GGML_CUDA_CC_IS_CDNA(cc)) {
|
||||
return MMVQ_PARAMETERS_GCN;
|
||||
}
|
||||
return MMVQ_PARAMETERS_GENERIC;
|
||||
}
|
||||
|
||||
static constexpr __host__ __device__ int calc_nwarps(int ncols_y, mmvq_parameter_table_id table_id) {
|
||||
if (table_id == MMVQ_PARAMETERS_GENERIC) {
|
||||
switch (ncols_y) {
|
||||
case 1:
|
||||
case 2:
|
||||
case 3:
|
||||
case 4:
|
||||
return 4;
|
||||
case 5:
|
||||
case 6:
|
||||
case 7:
|
||||
case 8:
|
||||
return 2;
|
||||
default:
|
||||
return 1;
|
||||
}
|
||||
} else if (table_id == MMVQ_PARAMETERS_GCN) {
|
||||
switch (ncols_y) {
|
||||
case 1:
|
||||
case 2:
|
||||
case 3:
|
||||
case 4:
|
||||
return 2;
|
||||
case 5:
|
||||
case 6:
|
||||
case 7:
|
||||
case 8:
|
||||
default:
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
static constexpr __host__ __device__ int calc_rows_per_block(int ncols_y, int table_id) {
|
||||
if (table_id == MMVQ_PARAMETERS_GENERIC || table_id == MMVQ_PARAMETERS_GCN) {
|
||||
switch (ncols_y) {
|
||||
case 1:
|
||||
return 1;
|
||||
case 2:
|
||||
case 3:
|
||||
case 4:
|
||||
case 5:
|
||||
case 6:
|
||||
case 7:
|
||||
case 8:
|
||||
return 2;
|
||||
default:
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
template <ggml_type type, int ncols_y>
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
// tell the compiler to use as many registers as it wants, see nwarps definition below
|
||||
__launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1)
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
__launch_bounds__(calc_nwarps(ncols_y, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1)
|
||||
static __global__ void mul_mat_vec_q(
|
||||
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
||||
const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) {
|
||||
@@ -59,24 +137,20 @@ static __global__ void mul_mat_vec_q(
|
||||
constexpr int qk = ggml_cuda_type_traits<type>::qk;
|
||||
constexpr int qi = ggml_cuda_type_traits<type>::qi;
|
||||
constexpr int vdr = get_vdr_mmvq(type);
|
||||
constexpr mmvq_parameter_table_id table_id = get_device_table_id();
|
||||
constexpr int nwarps = calc_nwarps(ncols_y, table_id);
|
||||
constexpr int rows_per_cuda_block = calc_rows_per_block(ncols_y, table_id);
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
|
||||
constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type);
|
||||
|
||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
|
||||
constexpr int nwarps = 1;
|
||||
constexpr int rows_per_cuda_block = 1;
|
||||
#else
|
||||
constexpr int nwarps = ncols_y <= 4 ? 4 : 2;
|
||||
constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)
|
||||
|
||||
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
|
||||
const int tid = warp_size*threadIdx.y + threadIdx.x;
|
||||
const int row0 = rows_per_cuda_block*blockIdx.x;
|
||||
const int blocks_per_row_x = ncols_x / qk;
|
||||
const int blocks_per_col_y = nrows_y / QK8_1;
|
||||
constexpr int blocks_per_iter = vdr * nwarps*WARP_SIZE / qi;
|
||||
constexpr int blocks_per_iter = vdr * nwarps*warp_size / qi;
|
||||
|
||||
// partial sum for each thread
|
||||
// partial sum for each thread
|
||||
float tmp[ncols_y][rows_per_cuda_block] = {0.0f};
|
||||
|
||||
const block_q8_1 * y = (const block_q8_1 *) vy;
|
||||
@@ -96,7 +170,7 @@ static __global__ void mul_mat_vec_q(
|
||||
}
|
||||
}
|
||||
|
||||
__shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y][rows_per_cuda_block][WARP_SIZE];
|
||||
__shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y][rows_per_cuda_block][warp_size];
|
||||
if (threadIdx.y > 0) {
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols_y; ++j) {
|
||||
@@ -120,7 +194,7 @@ static __global__ void mul_mat_vec_q(
|
||||
for (int l = 0; l < nwarps-1; ++l) {
|
||||
tmp[j][i] += tmp_shared[l][j][i][threadIdx.x];
|
||||
}
|
||||
tmp[j][i] = warp_reduce_sum(tmp[j][i]);
|
||||
tmp[j][i] = warp_reduce_sum<warp_size>(tmp[j][i]);
|
||||
}
|
||||
|
||||
if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + threadIdx.x < nrows_dst)) {
|
||||
@@ -129,6 +203,13 @@ static __global__ void mul_mat_vec_q(
|
||||
}
|
||||
}
|
||||
|
||||
static std::pair<dim3, dim3> calc_launch_params(const int ncols_y, const int nrows_x, const int warp_size, const mmvq_parameter_table_id table_id) {
|
||||
const int64_t nblocks = (nrows_x + calc_rows_per_block(ncols_y, table_id) - 1) / calc_rows_per_block(ncols_y, table_id);
|
||||
const dim3 block_nums(nblocks, 1, 1);
|
||||
const dim3 block_dims(warp_size, calc_nwarps(ncols_y, table_id), 1);
|
||||
return {block_nums, block_dims};
|
||||
}
|
||||
|
||||
template <ggml_type type>
|
||||
static void mul_mat_vec_q_cuda(
|
||||
const void * vx, const void * vy, float * dst,
|
||||
@@ -137,65 +218,67 @@ static void mul_mat_vec_q_cuda(
|
||||
GGML_ASSERT(ncols_x % ggml_blck_size(type) == 0);
|
||||
GGML_ASSERT(ncols_y <= MMVQ_MAX_BATCH_SIZE);
|
||||
|
||||
int id = ggml_cuda_get_device();
|
||||
|
||||
int64_t nwarps = 1;
|
||||
int64_t rows_per_cuda_block = 1;
|
||||
|
||||
if (ggml_cuda_info().devices[id].cc < GGML_CUDA_CC_RDNA2) { // NVIDIA and AMD older than RDNA2
|
||||
switch(ncols_y) {
|
||||
case 1:
|
||||
nwarps = 4;
|
||||
rows_per_cuda_block = 1;
|
||||
break;
|
||||
case 2:
|
||||
case 3:
|
||||
case 4:
|
||||
nwarps = 4;
|
||||
rows_per_cuda_block = 2;
|
||||
break;
|
||||
case 5:
|
||||
case 6:
|
||||
case 7:
|
||||
case 8:
|
||||
nwarps = 2;
|
||||
rows_per_cuda_block = 2;
|
||||
break;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
const int64_t nblocks = (nrows_x + rows_per_cuda_block - 1) / rows_per_cuda_block;
|
||||
const dim3 block_nums(nblocks, 1, 1);
|
||||
const dim3 block_dims(WARP_SIZE, nwarps, 1);
|
||||
const int device = ggml_cuda_get_device();
|
||||
const int warp_size = ggml_cuda_info().devices[device].warp_size;
|
||||
const mmvq_parameter_table_id table_id = get_device_table_id(ggml_cuda_info().devices[device].cc);

switch (ncols_y) {
case 1:
mul_mat_vec_q<type, 1><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
{
constexpr int c_ncols_y = 1;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
break;
}
case 2:
mul_mat_vec_q<type, 2><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
{
constexpr int c_ncols_y = 2;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
break;
}
case 3:
mul_mat_vec_q<type, 3><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
{
constexpr int c_ncols_y = 3;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
break;
}
case 4:
mul_mat_vec_q<type, 4><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
{
constexpr int c_ncols_y = 4;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
break;
}
case 5:
mul_mat_vec_q<type, 5><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
{
constexpr int c_ncols_y = 5;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
break;
}
case 6:
mul_mat_vec_q<type, 6><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
{
constexpr int c_ncols_y = 6;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
break;
}
case 7:
mul_mat_vec_q<type, 7><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
{
constexpr int c_ncols_y = 7;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
break;
}
case 8:
mul_mat_vec_q<type, 8><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
{
constexpr int c_ncols_y = 8;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
break;
}
default:
GGML_ABORT("fatal error");
break;

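The hunk above replaces direct launches that reuse one precomputed grid with per-case blocks that derive launch dimensions from a compile-time column count. Below is a minimal C++ sketch of that runtime-to-constexpr dispatch pattern, not the llama.cpp source; `kernel_stub`, `launch_params` and `calc_params` are illustrative placeholders standing in for the CUDA kernel and `calc_launch_params`.

```cpp
// Sketch only: map a runtime ncols_y onto a compile-time template parameter.
#include <cstdio>

struct launch_params { int blocks; int threads; };

template <int ncols_y>
static void kernel_stub(const launch_params & p) {
    // In the real code this would be a CUDA kernel launch specialized on ncols_y.
    std::printf("ncols_y=%d blocks=%d threads=%d\n", ncols_y, p.blocks, p.threads);
}

static launch_params calc_params(int ncols_y, int nrows_x, int warp_size) {
    // Placeholder for calc_launch_params(): group rows by the column count.
    return { (nrows_x + ncols_y - 1) / ncols_y, warp_size };
}

static void dispatch(int ncols_y, int nrows_x, int warp_size) {
    switch (ncols_y) {
        case 1: { constexpr int c = 1; kernel_stub<c>(calc_params(c, nrows_x, warp_size)); break; }
        case 2: { constexpr int c = 2; kernel_stub<c>(calc_params(c, nrows_x, warp_size)); break; }
        default: std::printf("unsupported ncols_y\n"); break;
    }
}

int main() {
    dispatch(2, 128, 32);
}
```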
@@ -27,12 +27,12 @@ configure_file(../ggml-common.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h
configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
configure_file(ggml-metal-impl.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal-impl.h COPYONLY)

set(METALLIB_COMMON "${CMAKE_CURRENT_SOURCE_DIR}/../ggml-common.h")
if (GGML_METAL_EMBED_LIBRARY)
enable_language(ASM)

add_compile_definitions(GGML_METAL_EMBED_LIBRARY)

set(METALLIB_COMMON "${CMAKE_CURRENT_SOURCE_DIR}/../ggml-common.h")
set(METALLIB_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
set(METALLIB_IMPL "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal-impl.h")

@@ -88,12 +88,11 @@ else()

add_custom_command(
OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
COMMAND xcrun -sdk macosx metallib ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o - |
xcrun -sdk macosx metallib - -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h
COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal
DEPENDS ggml-metal.metal ggml-common.h
DEPENDS ggml-metal.metal ${METALLIB_COMMON}
COMMENT "Compiling Metal kernels"
)

@@ -285,4 +285,239 @@ typedef struct {
float eps;
} ggml_metal_kargs_rms_norm;

typedef struct {
int64_t ne00;
int64_t ne01;
int64_t ne02;
uint64_t nb00;
uint64_t nb01;
uint64_t nb02;
int32_t n_groups;
float eps;
} ggml_metal_kargs_group_norm;

typedef struct {
int32_t IC;
int32_t IL;
int32_t K;
int32_t s0;
uint64_t nb0;
uint64_t nb1;
} ggml_metal_kargs_conv_transpose_1d;

typedef struct {
uint64_t ofs0;
uint64_t ofs1;
int32_t IW;
int32_t IH;
int32_t CHW;
int32_t s0;
int32_t s1;
int32_t p0;
int32_t p1;
int32_t d0;
int32_t d1;
int32_t N;
int32_t KH;
int32_t KW;
int32_t KHW; // KH * KW, pre-computed on CPU to save GPU resources
} ggml_metal_kargs_im2col;

typedef struct {
int64_t ne00;
int64_t ne01;
int64_t ne02;
int64_t ne03;
uint64_t nb00;
uint64_t nb01;
uint64_t nb02;
uint64_t nb03;
int64_t ne10;
int64_t ne11;
int64_t ne12;
int64_t ne13;
uint64_t nb10;
uint64_t nb11;
uint64_t nb12;
uint64_t nb13;
int64_t ne0;
int64_t ne1;
int64_t ne2;
int64_t ne3;
uint64_t nb0;
uint64_t nb1;
uint64_t nb2;
uint64_t nb3;
} ggml_metal_kargs_sum_rows;

typedef struct {
int64_t ne00;
int64_t ne01;
int64_t ne02;
float scale;
float max_bias;
float m0;
float m1;
uint32_t n_head_log2;
} ggml_metal_kargs_soft_max;

typedef struct {
int64_t ne00;
int64_t ne01;
int n_past;
} ggml_metal_kargs_diag_mask_inf;

typedef struct {
int64_t ne00;
int64_t ne01;
int64_t ne02;
uint64_t nb00;
uint64_t nb01;
uint64_t nb02;
int64_t ne10;
int64_t ne11;
uint64_t nb10;
uint64_t nb11;
int64_t ne0;
int64_t ne1;
int64_t ne2;
uint64_t nb0;
uint64_t nb1;
uint64_t nb2;
} ggml_metal_kargs_ssm_conv;

typedef struct {
int64_t d_state;
int64_t d_inner;
int64_t n_seq_tokens;
int64_t n_seqs;
uint64_t nb00;
uint64_t nb01;
uint64_t nb02;
uint64_t nb10;
uint64_t nb11;
uint64_t nb12;
uint64_t nb13;
uint64_t nb20;
uint64_t nb21;
uint64_t nb22;
uint64_t nb30;
uint64_t nb31;
uint64_t nb40;
uint64_t nb41;
uint64_t nb42;
uint64_t nb50;
uint64_t nb51;
uint64_t nb52;
} ggml_metal_kargs_ssm_scan;

typedef struct {
int64_t ne00;
uint64_t nb01;
uint64_t nb02;
int64_t ne10;
uint64_t nb10;
uint64_t nb11;
uint64_t nb1;
uint64_t nb2;
} ggml_metal_kargs_get_rows;

typedef struct {
int64_t ne00;
int64_t ne01;
int64_t ne02;
int64_t ne03;
uint64_t nb00;
uint64_t nb01;
uint64_t nb02;
uint64_t nb03;
int64_t ne0;
int64_t ne1;
int64_t ne2;
int64_t ne3;
uint64_t nb0;
uint64_t nb1;
uint64_t nb2;
uint64_t nb3;
float sf0;
float sf1;
float sf2;
float sf3;
} ggml_metal_kargs_upscale;

typedef struct {
int64_t ne00;
int64_t ne01;
int64_t ne02;
int64_t ne03;
uint64_t nb00;
uint64_t nb01;
uint64_t nb02;
uint64_t nb03;
int64_t ne0;
int64_t ne1;
int64_t ne2;
int64_t ne3;
uint64_t nb0;
uint64_t nb1;
uint64_t nb2;
uint64_t nb3;
} ggml_metal_kargs_pad;

typedef struct {
int64_t ne00;
int64_t ne01;
int64_t ne02;
int64_t ne03;
uint64_t nb00;
uint64_t nb01;
uint64_t nb02;
uint64_t nb03;
int64_t ne0;
int64_t ne1;
int64_t ne2;
int64_t ne3;
uint64_t nb0;
uint64_t nb1;
uint64_t nb2;
uint64_t nb3;
int32_t p0;
int32_t p1;
} ggml_metal_kargs_pad_reflect_1d;

typedef struct {
uint64_t nb1;
int dim;
int max_period;
} ggml_metal_kargs_timestep_embedding;

typedef struct {
float slope;
} ggml_metal_kargs_leaky_relu;

typedef struct {
int64_t ncols;
int64_t ncols_pad;
} ggml_metal_kargs_argsort;

typedef struct {
int64_t ne0;
float start;
float step;
} ggml_metal_kargs_arange;

typedef struct {
int32_t k0;
int32_t k1;
int32_t s0;
int32_t s1;
int32_t p0;
int32_t p1;
int64_t IH;
int64_t IW;
int64_t OH;
int64_t OW;
int64_t parallel_elements;
} ggml_metal_kargs_pool_2d;

#endif // GGML_METAL_IMPL

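These `ggml_metal_kargs_*` structs let the encoder hand each kernel one plain-old-data argument blob instead of many individual scalar bindings. Below is a minimal C++ sketch of that packing pattern, not the llama.cpp source; `fake_set_bytes` is an illustrative placeholder for an API such as Metal's `setBytes:length:atIndex:`.

```cpp
// Sketch only: pack kernel parameters into one POD struct and bind it as a single blob.
#include <cstdint>
#include <cstdio>
#include <vector>

struct kargs_sum_rows_sketch {           // mirrors the shape of ggml_metal_kargs_sum_rows
    int64_t  ne00, ne01, ne02, ne03;
    uint64_t nb00, nb01, nb02, nb03;
};

static std::vector<uint8_t> g_arg_slot;  // pretend argument-table slot

static void fake_set_bytes(const void * data, size_t len) {
    g_arg_slot.assign((const uint8_t *) data, (const uint8_t *) data + len);
}

int main() {
    kargs_sum_rows_sketch args = { 4, 8, 1, 1, 4, 16, 128, 128 };
    fake_set_bytes(&args, sizeof(args));     // one call instead of eight scalar bindings
    std::printf("bound %zu bytes of kernel arguments\n", g_arg_slot.size());
}
```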
@@ -46,6 +46,7 @@ static struct ggml_backend_device g_ggml_backend_metal_device;
static struct ggml_backend_metal_device_context {
id<MTLDevice> mtl_device;
int mtl_device_ref_count;
id<MTLLibrary> mtl_library;

bool has_simdgroup_reduction;
bool has_simdgroup_mm;
@@ -57,6 +58,7 @@ static struct ggml_backend_metal_device_context {
} g_ggml_ctx_dev_main = {
/*.mtl_device =*/ nil,
/*.mtl_device_ref_count =*/ 0,
/*.mtl_library =*/ nil,
/*.has_simdgroup_reduction =*/ false,
/*.has_simdgroup_mm =*/ false,
/*.has_residency_sets =*/ false,
@@ -108,6 +110,11 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
ctx->mtl_device_ref_count--;

if (ctx->mtl_device_ref_count == 0) {
if (ctx->mtl_library) {
[ctx->mtl_library release];
ctx->mtl_library = nil;
}

if (ctx->mtl_device) {
[ctx->mtl_device release];
ctx->mtl_device = nil;
@@ -467,11 +474,13 @@ struct ggml_backend_metal_context {
// for now it is easier to work in a separate file
// static NSString * const msl_library_source = @"see metal.metal";

#if !GGML_METAL_EMBED_LIBRARY
// Here to assist with NSBundle Path Hack
@interface GGMLMetalClass : NSObject
@end
@implementation GGMLMetalClass
@end
#endif

static void * ggml_metal_host_malloc(size_t n) {
void * data = NULL;
@@ -493,6 +502,139 @@ static void * ggml_metal_host_malloc(size_t n) {
return data;
}

// load library
//
// - first check if the library is embedded
// - then check if the library is in the bundle
// - if not found, load the source and compile it
// - if that fails, return NULL
static id<MTLLibrary> ggml_metal_load_library(id<MTLDevice> device, bool use_bfloat) {
id<MTLLibrary> metal_library = nil;
NSError * error = nil;
NSString * src = nil;

#if GGML_METAL_EMBED_LIBRARY
GGML_LOG_INFO("%s: using embedded metal library\n", __func__);

extern const char ggml_metallib_start[];
extern const char ggml_metallib_end[];

src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];

#else

#ifdef SWIFT_PACKAGE
NSBundle * bundle = SWIFTPM_MODULE_BUNDLE;
#else
NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
#endif

NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"];
if (path_lib == nil) {
// Try to find the resource in the directory where the current binary located.
NSString * current_binary = [[NSProcessInfo processInfo] arguments][0];
NSString * bin_dir = [current_binary stringByDeletingLastPathComponent];
NSString * default_metallib_path = [NSString pathWithComponents:@[bin_dir, @"default.metallib"]];
if ([[NSFileManager defaultManager] isReadableFileAtPath:default_metallib_path]) {
GGML_LOG_INFO("%s: found '%s'\n", __func__, [default_metallib_path UTF8String]);
NSDictionary * atts = [[NSFileManager defaultManager] attributesOfItemAtPath:default_metallib_path error:&error];
if (atts && atts[NSFileType] == NSFileTypeSymbolicLink) {
// Optionally, if this is a symlink, try to resolve it.
default_metallib_path = [[NSFileManager defaultManager] destinationOfSymbolicLinkAtPath:default_metallib_path error:&error];
if (default_metallib_path && [default_metallib_path length] > 0 && ![[default_metallib_path substringToIndex:1] isEqualToString:@"/"]) {
// It is a relative path, adding the binary directory as directory prefix.
default_metallib_path = [NSString pathWithComponents:@[bin_dir, default_metallib_path]];
}
if (!default_metallib_path || ![[NSFileManager defaultManager] isReadableFileAtPath:default_metallib_path]) {
// Link to the resource could not be resolved.
default_metallib_path = nil;
} else {
GGML_LOG_INFO("%s: symlink resolved '%s'\n", __func__, [default_metallib_path UTF8String]);
}
}
} else {
// The resource couldn't be found in the binary's directory.
default_metallib_path = nil;
}
path_lib = default_metallib_path;
}

if (path_lib != nil) {
// pre-compiled library found
NSURL * libURL = [NSURL fileURLWithPath:path_lib];
GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]);

metal_library = [device newLibraryWithURL:libURL error:&error];
if (error) {
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
return NULL;
}
} else {
GGML_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);

NSString * path_source;
NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];

GGML_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil");

if (path_resource) {
path_source = [path_resource stringByAppendingPathComponent:@"ggml-metal.metal"];
} else {
path_source = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
}

if (path_source == nil) {
GGML_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__);
path_source = @"ggml-metal.metal";
}

GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]);

src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];
if (error) {
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
return NULL;
}
}
#endif

if (!metal_library) {
@autoreleasepool {
// dictionary of preprocessor macros
NSMutableDictionary * prep = [NSMutableDictionary dictionary];

if (use_bfloat) {
[prep setObject:@"1" forKey:@"GGML_METAL_USE_BF16"];
}

#if GGML_METAL_EMBED_LIBRARY
[prep setObject:@"1" forKey:@"GGML_METAL_EMBED_LIBRARY"];
#endif

MTLCompileOptions * options = [MTLCompileOptions new];
options.preprocessorMacros = prep;

//[options setFastMathEnabled:false];

metal_library = [device newLibraryWithSource:src options:options error:&error];
if (error) {
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
return NULL;
}

#if !__has_feature(objc_arc)
[options release];
#endif
}
}

#if GGML_METAL_EMBED_LIBRARY
[src release];
#endif // GGML_METAL_EMBED_LIBRARY

return metal_library;
}

static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t dev) {
GGML_LOG_INFO("%s: allocating\n", __func__);

@@ -520,137 +662,14 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de

ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);

id<MTLLibrary> metal_library;

// load library
//
// - first check if the library is embedded
// - then check if the library is in the bundle
// - if not found, load the source and compile it
// - if that fails, return NULL
{
NSBundle * bundle = nil;
#ifdef SWIFT_PACKAGE
bundle = SWIFTPM_MODULE_BUNDLE;
#else
bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
#endif

NSError * error = nil;

#if GGML_METAL_EMBED_LIBRARY
const bool try_metallib = false;
#else
const bool try_metallib = true;
#endif

NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"];
if (path_lib == nil) {
// Try to find the resource in the directory where the current binary located.
NSString * current_binary = [[NSProcessInfo processInfo] arguments][0];
NSString * bin_dir = [current_binary stringByDeletingLastPathComponent];
NSString * default_metallib_path = [NSString pathWithComponents:@[bin_dir, @"default.metallib"]];
if ([[NSFileManager defaultManager] isReadableFileAtPath:default_metallib_path]) {
GGML_LOG_INFO("%s: found '%s'\n", __func__, [default_metallib_path UTF8String]);
NSDictionary * atts = [[NSFileManager defaultManager] attributesOfItemAtPath:default_metallib_path error:&error];
if (atts && atts[NSFileType] == NSFileTypeSymbolicLink) {
// Optionally, if this is a symlink, try to resolve it.
default_metallib_path = [[NSFileManager defaultManager] destinationOfSymbolicLinkAtPath:default_metallib_path error:&error];
if (default_metallib_path && [default_metallib_path length] > 0 && ![[default_metallib_path substringToIndex:1] isEqualToString:@"/"]) {
// It is a relative path, adding the binary directory as directory prefix.
default_metallib_path = [NSString pathWithComponents:@[bin_dir, default_metallib_path]];
}
if (!default_metallib_path || ![[NSFileManager defaultManager] isReadableFileAtPath:default_metallib_path]) {
// Link to the resource could not be resolved.
default_metallib_path = nil;
} else {
GGML_LOG_INFO("%s: symlink resolved '%s'\n", __func__, [default_metallib_path UTF8String]);
}
}
} else {
// The resource couldn't be found in the binary's directory.
default_metallib_path = nil;
}
path_lib = default_metallib_path;
}

if (try_metallib && path_lib != nil) {
// pre-compiled library found
NSURL * libURL = [NSURL fileURLWithPath:path_lib];
GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]);

metal_library = [device newLibraryWithURL:libURL error:&error];
if (error) {
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
return NULL;
}
} else {
#if GGML_METAL_EMBED_LIBRARY
GGML_LOG_INFO("%s: using embedded metal library\n", __func__);

extern const char ggml_metallib_start[];
extern const char ggml_metallib_end[];

NSString * src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
#else
GGML_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);

NSString * path_source;
NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];

GGML_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil");

if (path_resource) {
path_source = [path_resource stringByAppendingPathComponent:@"ggml-metal.metal"];
} else {
path_source = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
}

if (path_source == nil) {
GGML_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__);
path_source = @"ggml-metal.metal";
}

GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]);

NSString * src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];
if (error) {
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
return NULL;
}
#endif // GGML_METAL_EMBED_LIBRARY

@autoreleasepool {
// dictionary of preprocessor macros
NSMutableDictionary * prep = [NSMutableDictionary dictionary];

if (ctx_dev->use_bfloat) {
[prep setObject:@"1" forKey:@"GGML_METAL_USE_BF16"];
}

#if GGML_METAL_EMBED_LIBRARY
[prep setObject:@"1" forKey:@"GGML_METAL_EMBED_LIBRARY"];
#endif

MTLCompileOptions * options = [MTLCompileOptions new];
options.preprocessorMacros = prep;

//[options setFastMathEnabled:false];

metal_library = [device newLibraryWithSource:src options:options error:&error];
if (error) {
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
return NULL;
}

#if !__has_feature(objc_arc)
[options release];
#endif
}
#if GGML_METAL_EMBED_LIBRARY
[src release];
#endif // GGML_METAL_EMBED_LIBRARY
}
if (ctx_dev->mtl_library == nil) {
ctx_dev->mtl_library = ggml_metal_load_library(device, ctx_dev->use_bfloat);
}
id<MTLLibrary> metal_library = ctx_dev->mtl_library;
if (metal_library == nil) {
GGML_LOG_ERROR("%s: error: metal library is nil\n", __func__);
return NULL;
}

// print MTL GPU family:
@@ -724,7 +743,6 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
[metal_function release]; \
if (error) { \
GGML_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
[metal_library release]; \
return NULL; \
} \
} else { \
@@ -1043,8 +1061,6 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32, pool_2d_max_f32, true);
}

[metal_library release];

return ctx;
}
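The comments in the hunks above describe a fixed fallback order for obtaining the Metal library: embedded blob, then a precompiled library next to the binary or in the bundle, then compiling the shader source. A minimal C++ sketch of that fallback structure follows; it is not the llama.cpp source, and `library`, `load_embedded`, `load_precompiled` and `compile_from_source` are illustrative placeholders.

```cpp
// Sketch only: try each library source in order and stop at the first success.
#include <cstdio>
#include <optional>
#include <string>

struct library { std::string origin; };

static std::optional<library> load_embedded() { return std::nullopt; }      // e.g. no embedded blob
static std::optional<library> load_precompiled(const std::string & path) {
    if (path.empty()) return std::nullopt;                                  // e.g. default.metallib missing
    return library{ "precompiled: " + path };
}
static std::optional<library> compile_from_source(const std::string & path) {
    return library{ "compiled from: " + path };
}

static std::optional<library> load_library(const std::string & lib_path, const std::string & src_path) {
    if (auto lib = load_embedded())            return lib;   // 1. embedded
    if (auto lib = load_precompiled(lib_path)) return lib;   // 2. precompiled library
    return compile_from_source(src_path);                    // 3. compile the source
}

int main() {
    auto lib = load_library("", "ggml-metal.metal");
    std::printf("%s\n", lib ? lib->origin.c_str() : "failed");
}
```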
@@ -1944,34 +1960,38 @@ static void ggml_metal_encode_node(
|
||||
|
||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUM_ROWS].pipeline;
|
||||
|
||||
// TODO: add ggml_metal_kargs struct
|
||||
|
||||
ggml_metal_kargs_sum_rows args = {
|
||||
/*.ne00 =*/ ne00,
|
||||
/*.ne01 =*/ ne01,
|
||||
/*.ne02 =*/ ne02,
|
||||
/*.ne03 =*/ ne03,
|
||||
/*.nb00 =*/ nb00,
|
||||
/*.nb01 =*/ nb01,
|
||||
/*.nb02 =*/ nb02,
|
||||
/*.nb03 =*/ nb03,
|
||||
/*.ne10 =*/ ne10,
|
||||
/*.ne11 =*/ ne11,
|
||||
/*.ne12 =*/ ne12,
|
||||
/*.ne13 =*/ ne13,
|
||||
/*.nb10 =*/ nb10,
|
||||
/*.nb11 =*/ nb11,
|
||||
/*.nb12 =*/ nb12,
|
||||
/*.nb13 =*/ nb13,
|
||||
/*.ne0 =*/ ne0,
|
||||
/*.ne1 =*/ ne1,
|
||||
/*.ne2 =*/ ne2,
|
||||
/*.ne3 =*/ ne3,
|
||||
/*.nb0 =*/ nb0,
|
||||
/*.nb1 =*/ nb1,
|
||||
/*.nb2 =*/ nb2,
|
||||
/*.nb3 =*/ nb3,
|
||||
};
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
|
||||
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
|
||||
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
|
||||
[encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
|
||||
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
|
||||
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
|
||||
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
|
||||
[encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
|
||||
[encoder setBytes:&ne10 length:sizeof(ne10) atIndex:10];
|
||||
[encoder setBytes:&ne11 length:sizeof(ne11) atIndex:11];
|
||||
[encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12];
|
||||
[encoder setBytes:&ne13 length:sizeof(ne13) atIndex:13];
|
||||
[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14];
|
||||
[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15];
|
||||
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16];
|
||||
[encoder setBytes:&nb13 length:sizeof(nb13) atIndex:17];
|
||||
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:18];
|
||||
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:19];
|
||||
[encoder setBytes:&ne2 length:sizeof(ne2) atIndex:20];
|
||||
[encoder setBytes:&ne3 length:sizeof(ne3) atIndex:21];
|
||||
[encoder setBytes:&nb0 length:sizeof(nb0) atIndex:22];
|
||||
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:23];
|
||||
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:24];
|
||||
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:25];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:2];
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
@@ -2020,8 +2040,17 @@ static void ggml_metal_encode_node(
|
||||
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
||||
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
||||
|
||||
// TODO: add ggml_metal_kargs struct
|
||||
// TODO: optimize (see https://github.com/ggml-org/llama.cpp/pull/10238/commits/7941b6b9ec29a2866fec6fa6c51612515ca509f6)
|
||||
ggml_metal_kargs_soft_max args = {
|
||||
/*.ne00 =*/ ne00,
|
||||
/*.ne01 =*/ ne01,
|
||||
/*.ne02 =*/ ne02,
|
||||
/*.scale =*/ scale,
|
||||
/*.max_bias =*/ max_bias,
|
||||
/*.m0 =*/ m0,
|
||||
/*.m1 =*/ m1,
|
||||
/*.n_head_log2 =*/ n_head_log2,
|
||||
};
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
if (id_src1) {
|
||||
@@ -2030,14 +2059,7 @@ static void ggml_metal_encode_node(
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
|
||||
}
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
|
||||
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
|
||||
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
|
||||
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
|
||||
[encoder setBytes:&scale length:sizeof(scale) atIndex:6];
|
||||
[encoder setBytes:&max_bias length:sizeof(max_bias) atIndex:7];
|
||||
[encoder setBytes:&m0 length:sizeof(m0) atIndex:8];
|
||||
[encoder setBytes:&m1 length:sizeof(m1) atIndex:9];
|
||||
[encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:10];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:3];
|
||||
|
||||
[encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
|
||||
|
||||
@@ -2055,13 +2077,16 @@ static void ggml_metal_encode_node(
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF].pipeline;
|
||||
}
|
||||
|
||||
// TODO: add ggml_metal_kargs struct
|
||||
ggml_metal_kargs_diag_mask_inf args = {
|
||||
/*.ne00 =*/ ne00,
|
||||
/*.ne01 =*/ ne01,
|
||||
/*.n_past =*/ n_past,
|
||||
};
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
|
||||
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
|
||||
[encoder setBytes:&n_past length:sizeof(int) atIndex:4];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:2];
|
||||
|
||||
if (ne00%8 == 0) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne00*ne01*ne02/8, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
@@ -2080,27 +2105,30 @@ static void ggml_metal_encode_node(
|
||||
|
||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SSM_CONV_F32].pipeline;
|
||||
|
||||
// TODO: add ggml_metal_kargs struct
|
||||
ggml_metal_kargs_ssm_conv args = {
|
||||
/*.ne00 =*/ ne00,
|
||||
/*.ne01 =*/ ne01,
|
||||
/*.ne02 =*/ ne02,
|
||||
/*.nb00 =*/ nb00,
|
||||
/*.nb01 =*/ nb01,
|
||||
/*.nb02 =*/ nb02,
|
||||
/*.ne10 =*/ ne10,
|
||||
/*.ne11 =*/ ne11,
|
||||
/*.nb10 =*/ nb10,
|
||||
/*.nb11 =*/ nb11,
|
||||
/*.ne0 =*/ ne0,
|
||||
/*.ne1 =*/ ne1,
|
||||
/*.ne2 =*/ ne2,
|
||||
/*.nb0 =*/ nb0,
|
||||
/*.nb1 =*/ nb1,
|
||||
/*.nb2 =*/ nb2,
|
||||
};
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
|
||||
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
|
||||
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
|
||||
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
|
||||
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
|
||||
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
|
||||
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
|
||||
[encoder setBytes:&ne10 length:sizeof(ne10) atIndex:9];
|
||||
[encoder setBytes:&ne11 length:sizeof(ne11) atIndex:10];
|
||||
[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:11];
|
||||
[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:12];
|
||||
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
|
||||
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];
|
||||
[encoder setBytes:&ne2 length:sizeof(ne2) atIndex:15];
|
||||
[encoder setBytes:&nb0 length:sizeof(nb0) atIndex:16];
|
||||
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:17];
|
||||
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:18];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:3];
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne1, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
@@ -2151,7 +2179,31 @@ static void ggml_metal_encode_node(
|
||||
|
||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32].pipeline;
|
||||
|
||||
// TODO: add ggml_metal_kargs struct
|
||||
ggml_metal_kargs_ssm_scan args = {
|
||||
/*.d_state =*/ d_state,
|
||||
/*.d_inner =*/ d_inner,
|
||||
/*.n_seq_tokens =*/ n_seq_tokens,
|
||||
/*.n_seqs =*/ n_seqs,
|
||||
/*.nb00 =*/ nb00,
|
||||
/*.nb01 =*/ nb01,
|
||||
/*.nb02 =*/ nb02,
|
||||
/*.nb10 =*/ nb10,
|
||||
/*.nb11 =*/ nb11,
|
||||
/*.nb12 =*/ nb12,
|
||||
/*.nb13 =*/ nb13,
|
||||
/*.nb20 =*/ nb20,
|
||||
/*.nb21 =*/ nb21,
|
||||
/*.nb22 =*/ nb22,
|
||||
/*.nb30 =*/ nb30,
|
||||
/*.nb31 =*/ nb31,
|
||||
/*.nb40 =*/ nb40,
|
||||
/*.nb41 =*/ nb41,
|
||||
/*.nb42 =*/ nb42,
|
||||
/*.nb50 =*/ nb50,
|
||||
/*.nb51 =*/ nb51,
|
||||
/*.nb52 =*/ nb52,
|
||||
};
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||
@@ -2160,30 +2212,7 @@ static void ggml_metal_encode_node(
|
||||
[encoder setBuffer:id_src4 offset:offs_src4 atIndex:4];
|
||||
[encoder setBuffer:id_src5 offset:offs_src5 atIndex:5];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:6];
|
||||
|
||||
[encoder setBytes:&d_state length:sizeof(d_state) atIndex:7];
|
||||
[encoder setBytes:&d_inner length:sizeof(d_inner) atIndex:8];
|
||||
[encoder setBytes:&n_seq_tokens length:sizeof(n_seq_tokens) atIndex:9];
|
||||
[encoder setBytes:&n_seqs length:sizeof(n_seqs) atIndex:10];
|
||||
|
||||
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:11];
|
||||
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:12];
|
||||
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:13];
|
||||
[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14];
|
||||
[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15];
|
||||
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16];
|
||||
[encoder setBytes:&nb13 length:sizeof(nb13) atIndex:17];
|
||||
[encoder setBytes:&nb20 length:sizeof(nb20) atIndex:18];
|
||||
[encoder setBytes:&nb21 length:sizeof(nb21) atIndex:19];
|
||||
[encoder setBytes:&nb22 length:sizeof(nb22) atIndex:20];
|
||||
[encoder setBytes:&nb30 length:sizeof(nb30) atIndex:21];
|
||||
[encoder setBytes:&nb31 length:sizeof(nb31) atIndex:22];
|
||||
[encoder setBytes:&nb40 length:sizeof(nb40) atIndex:23];
|
||||
[encoder setBytes:&nb41 length:sizeof(nb41) atIndex:24];
|
||||
[encoder setBytes:&nb42 length:sizeof(nb42) atIndex:25];
|
||||
[encoder setBytes:&nb50 length:sizeof(nb50) atIndex:26];
|
||||
[encoder setBytes:&nb51 length:sizeof(nb51) atIndex:27];
|
||||
[encoder setBytes:&nb52 length:sizeof(nb52) atIndex:28];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:7];
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(d_inner, n_seqs, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
@@ -3040,19 +3069,22 @@ static void ggml_metal_encode_node(
|
||||
default: GGML_ABORT("not implemented");
|
||||
}
|
||||
|
||||
// TODO: add ggml_metal_kargs struct
|
||||
ggml_metal_kargs_get_rows args = {
|
||||
/*.ne00 =*/ ne00,
|
||||
/*.nb01 =*/ nb01,
|
||||
/*.nb02 =*/ nb02,
|
||||
/*.ne10 =*/ ne10,
|
||||
/*.nb10 =*/ nb10,
|
||||
/*.nb11 =*/ nb11,
|
||||
/*.nb1 =*/ nb1,
|
||||
/*.nb2 =*/ nb2,
|
||||
};
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
|
||||
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3];
|
||||
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:4];
|
||||
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:5];
|
||||
[encoder setBytes:&ne10 length:sizeof( int64_t) atIndex:6];
|
||||
[encoder setBytes:&nb10 length:sizeof( int64_t) atIndex:7];
|
||||
[encoder setBytes:&nb11 length:sizeof( int64_t) atIndex:8];
|
||||
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:9];
|
||||
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:10];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:3];
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne10, ne11, 1) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)];
|
||||
} break;
|
||||
@@ -3109,18 +3141,21 @@ static void ggml_metal_encode_node(
|
||||
|
||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GROUP_NORM].pipeline;
|
||||
|
||||
// TODO: add ggml_metal_kargs struct
|
||||
ggml_metal_kargs_group_norm args = {
|
||||
/*.ne00 =*/ ne00,
|
||||
/*.ne01 =*/ ne01,
|
||||
/*.ne02 =*/ ne02,
|
||||
/*.nb00 =*/ nb00,
|
||||
/*.nb01 =*/ nb01,
|
||||
/*.nb02 =*/ nb02,
|
||||
/*.n_groups =*/ n_groups,
|
||||
/*.eps =*/ eps,
|
||||
};
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
|
||||
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
|
||||
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
|
||||
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:5];
|
||||
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:6];
|
||||
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:7];
|
||||
[encoder setBytes:&n_groups length:sizeof( int32_t) atIndex:8];
|
||||
[encoder setBytes:&eps length:sizeof( float) atIndex:9];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:2];
|
||||
[encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n_groups, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||
@@ -3278,8 +3313,8 @@ static void ggml_metal_encode_node(
|
||||
|
||||
const int32_t CHW = IC * KH * KW;
|
||||
|
||||
const int32_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4;
|
||||
const int32_t ofs1 = src1->nb[is_2D ? 2 : 1] / 4;
|
||||
const uint64_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4;
|
||||
const uint64_t ofs1 = src1->nb[is_2D ? 2 : 1] / 4;
|
||||
|
||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F32].pipeline;
|
||||
|
||||
@@ -3301,27 +3336,30 @@ static void ggml_metal_encode_node(
|
||||
default: GGML_ABORT("fatal error");
|
||||
};
|
||||
|
||||
// TODO: add ggml_metal_kargs struct
|
||||
ggml_metal_kargs_im2col args = {
|
||||
/*.ofs0 =*/ ofs0,
|
||||
/*.ofs1 =*/ ofs1,
|
||||
/*.IW =*/ IW,
|
||||
/*.IH =*/ IH,
|
||||
/*.CHW =*/ CHW,
|
||||
/*.s0 =*/ s0,
|
||||
/*.s1 =*/ s1,
|
||||
/*.p0 =*/ p0,
|
||||
/*.p1 =*/ p1,
|
||||
/*.d0 =*/ d0,
|
||||
/*.d1 =*/ d1,
|
||||
/*.N =*/ N,
|
||||
/*.KH =*/ KH,
|
||||
/*.KW =*/ KW,
|
||||
/*.KHW =*/ KH * KW,
|
||||
};
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&ofs0 length:sizeof(int32_t) atIndex:2];
|
||||
[encoder setBytes:&ofs1 length:sizeof(int32_t) atIndex:3];
|
||||
[encoder setBytes:&IW length:sizeof(int32_t) atIndex:4];
|
||||
[encoder setBytes:&IH length:sizeof(int32_t) atIndex:5];
|
||||
[encoder setBytes:&CHW length:sizeof(int32_t) atIndex:6];
|
||||
[encoder setBytes:&s0 length:sizeof(int32_t) atIndex:7];
|
||||
[encoder setBytes:&s1 length:sizeof(int32_t) atIndex:8];
|
||||
[encoder setBytes:&p0 length:sizeof(int32_t) atIndex:9];
|
||||
[encoder setBytes:&p1 length:sizeof(int32_t) atIndex:10];
|
||||
[encoder setBytes:&d0 length:sizeof(int32_t) atIndex:11];
|
||||
[encoder setBytes:&d1 length:sizeof(int32_t) atIndex:12];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:2];
|
||||
|
||||
if (is_gt_mttpt) {
|
||||
[encoder setBytes:&N length:sizeof(int32_t) atIndex:13];
|
||||
[encoder setBytes:&KH length:sizeof(int32_t) atIndex:14];
|
||||
[encoder setBytes:&KW length:sizeof(int32_t) atIndex:15];
|
||||
|
||||
const uint64_t n_threads = MIN(pipeline.maxTotalThreadsPerThreadgroup, (uint64_t)N);
|
||||
|
||||
const int64_t quotient = N / n_threads + (N % n_threads > 0 ? 1 : 0);
|
||||
@@ -3361,16 +3399,20 @@ static void ggml_metal_encode_node(
|
||||
default: GGML_ABORT("fatal error");
|
||||
};
|
||||
|
||||
ggml_metal_kargs_conv_transpose_1d args = {
|
||||
/*.IC =*/ IC,
|
||||
/*.IL =*/ IL,
|
||||
/*.K =*/ K,
|
||||
/*.s0 =*/ s0,
|
||||
/*.nb0 =*/ nb0,
|
||||
/*.nb1 =*/ nb1,
|
||||
};
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
|
||||
[encoder setBytes:&IC length:sizeof( int32_t) atIndex:3];
|
||||
[encoder setBytes:&IL length:sizeof( int32_t) atIndex:4];
|
||||
[encoder setBytes:&K length:sizeof( int32_t) atIndex:5];
|
||||
[encoder setBytes:&s0 length:sizeof( int32_t) atIndex:6];
|
||||
[encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:7];
|
||||
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:8];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:3];
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(OL, OC, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
@@ -3385,30 +3427,33 @@ static void ggml_metal_encode_node(
|
||||
|
||||
const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UPSCALE_F32].pipeline;
|
||||
|
||||
// TODO: add ggml_metal_kargs struct
|
||||
ggml_metal_kargs_upscale args = {
|
||||
/*.ne00 =*/ ne00,
|
||||
/*.ne01 =*/ ne01,
|
||||
/*.ne02 =*/ ne02,
|
||||
/*.ne03 =*/ ne03,
|
||||
/*.nb00 =*/ nb00,
|
||||
/*.nb01 =*/ nb01,
|
||||
/*.nb02 =*/ nb02,
|
||||
/*.nb03 =*/ nb03,
|
||||
/*.ne0 =*/ ne0,
|
||||
/*.ne1 =*/ ne1,
|
||||
/*.ne2 =*/ ne2,
|
||||
/*.ne3 =*/ ne3,
|
||||
/*.nb0 =*/ nb0,
|
||||
/*.nb1 =*/ nb1,
|
||||
/*.nb2 =*/ nb2,
|
||||
/*.nb3 =*/ nb3,
|
||||
/*.sf0 =*/ sf0,
|
||||
/*.sf1 =*/ sf1,
|
||||
/*.sf2 =*/ sf2,
|
||||
/*.sf3 =*/ sf3
|
||||
};
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
|
||||
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
|
||||
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
|
||||
[encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
|
||||
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
|
||||
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
|
||||
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
|
||||
[encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
|
||||
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10];
|
||||
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11];
|
||||
[encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12];
|
||||
[encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13];
|
||||
[encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14];
|
||||
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15];
|
||||
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16];
|
||||
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17];
|
||||
[encoder setBytes:&sf0 length:sizeof(sf0) atIndex:18];
|
||||
[encoder setBytes:&sf1 length:sizeof(sf1) atIndex:19];
|
||||
[encoder setBytes:&sf2 length:sizeof(sf2) atIndex:20];
|
||||
[encoder setBytes:&sf3 length:sizeof(sf3) atIndex:21];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:2];
|
||||
|
||||
const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);
|
||||
|
||||
@@ -3420,26 +3465,29 @@ static void ggml_metal_encode_node(
|
||||
|
||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_PAD_F32].pipeline;
|
||||
|
||||
// TODO: add ggml_metal_kargs struct
|
||||
ggml_metal_kargs_pad args = {
|
||||
/*.ne00 =*/ ne00,
|
||||
/*.ne01 =*/ ne01,
|
||||
/*.ne02 =*/ ne02,
|
||||
/*.ne03 =*/ ne03,
|
||||
/*.nb00 =*/ nb00,
|
||||
/*.nb01 =*/ nb01,
|
||||
/*.nb02 =*/ nb02,
|
||||
/*.nb03 =*/ nb03,
|
||||
/*.ne0 =*/ ne0,
|
||||
/*.ne1 =*/ ne1,
|
||||
/*.ne2 =*/ ne2,
|
||||
/*.ne3 =*/ ne3,
|
||||
/*.nb0 =*/ nb0,
|
||||
/*.nb1 =*/ nb1,
|
||||
/*.nb2 =*/ nb2,
|
||||
/*.nb3 =*/ nb3
|
||||
};
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
|
||||
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
|
||||
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
|
||||
[encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
|
||||
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
|
||||
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
|
||||
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
|
||||
[encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
|
||||
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10];
|
||||
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11];
|
||||
[encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12];
|
||||
[encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13];
|
||||
[encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14];
|
||||
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15];
|
||||
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16];
|
||||
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:2];
|
||||
|
||||
const int nth = MIN(1024, ne0);
|
||||
|
||||
@@ -3454,24 +3502,31 @@ static void ggml_metal_encode_node(
|
||||
|
||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32].pipeline;
|
||||
|
||||
ggml_metal_kargs_pad_reflect_1d args = {
|
||||
/*.ne00 =*/ ne00,
|
||||
/*.ne01 =*/ ne01,
|
||||
/*.ne02 =*/ ne02,
|
||||
/*.ne03 =*/ ne03,
|
||||
/*.nb00 =*/ nb00,
|
||||
/*.nb01 =*/ nb01,
|
||||
/*.nb02 =*/ nb02,
|
||||
/*.nb03 =*/ nb03,
|
||||
/*.ne0 =*/ ne0,
|
||||
/*.ne1 =*/ ne1,
|
||||
/*.ne2 =*/ ne2,
|
||||
/*.ne3 =*/ ne3,
|
||||
/*.nb0 =*/ nb0,
|
||||
/*.nb1 =*/ nb1,
|
||||
/*.nb2 =*/ nb2,
|
||||
/*.nb3 =*/ nb3,
|
||||
/*.p0 =*/ p0,
|
||||
/*.p1 =*/ p1
|
||||
};
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
|
||||
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
|
||||
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
|
||||
[encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
|
||||
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:6];
|
||||
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
|
||||
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8];
|
||||
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9];
|
||||
[encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10];
|
||||
[encoder setBytes:&nb0 length:sizeof(nb0) atIndex:11];
|
||||
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:12];
|
||||
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:13];
|
||||
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:14];
|
||||
[encoder setBytes:&p0 length:sizeof(p0) atIndex:15];
|
||||
[encoder setBytes:&p1 length:sizeof(p1) atIndex:16];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:2];
|
||||
|
||||
const int nth = MIN(1024, ne0);
|
||||
|
||||
@@ -3489,12 +3544,15 @@ static void ggml_metal_encode_node(
|
||||
|
||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARANGE_F32].pipeline;
|
||||
|
||||
// TODO: add ggml_metal_kargs struct
|
||||
ggml_metal_kargs_arange args = {
|
||||
/*.ne0 =*/ ne0,
|
||||
/*.start =*/ start,
|
||||
/*.step =*/ step
|
||||
};
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:0];
|
||||
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:1];
|
||||
[encoder setBytes:&start length:sizeof(start) atIndex:2];
|
||||
[encoder setBytes:&step length:sizeof(step) atIndex:3];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:0];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:1];
|
||||
|
||||
const int nth = MIN(1024, ne0);
|
||||
|
||||
@@ -3511,13 +3569,16 @@ static void ggml_metal_encode_node(
|
||||
|
||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32].pipeline;
|
||||
|
||||
// TODO: add ggml_metal_kargs struct
|
||||
ggml_metal_kargs_timestep_embedding args = {
|
||||
/*.nb1 =*/ nb1,
|
||||
/*.dim =*/ dim,
|
||||
/*.max_period =*/ max_period
|
||||
};
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:2];
|
||||
[encoder setBytes:&dim length:sizeof(dim) atIndex:3];
|
||||
[encoder setBytes:&max_period length:sizeof(max_period) atIndex:4];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:2];
|
||||
|
||||
const int nth = MIN(1024, half);
|
||||
|
||||
@@ -3550,12 +3611,15 @@ static void ggml_metal_encode_node(
|
||||
default: GGML_ABORT("fatal error");
|
||||
};
|
||||
|
||||
// TODO: add ggml_metal_kargs struct
|
||||
ggml_metal_kargs_argsort args = {
|
||||
/*.ncols =*/ ne00,
|
||||
/*.ncols_pad =*/ ne00_padded
|
||||
};
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
|
||||
[encoder setBytes:&ne00_padded length:sizeof( int64_t) atIndex:3];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:2];
|
||||
[encoder setThreadgroupMemoryLength:mem_size atIndex:0];
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(1, nrows, 1) threadsPerThreadgroup:MTLSizeMake(ne00_padded, 1, 1)];
|
||||
@@ -3569,11 +3633,14 @@ static void ggml_metal_encode_node(
|
||||
|
||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32].pipeline;
|
||||
|
||||
// TODO: add ggml_metal_kargs struct
|
||||
ggml_metal_kargs_leaky_relu args = {
|
||||
/*.slope =*/ slope
|
||||
};
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&slope length:sizeof(slope) atIndex:2];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:2];
|
||||
|
||||
const int64_t n = ggml_nelements(dst);
|
||||
|
||||
@@ -4149,21 +4216,24 @@ static void ggml_metal_encode_node(
|
||||
const int64_t n_threads = MIN((int64_t)[pipeline maxTotalThreadsPerThreadgroup], parallel_elements);
|
||||
const int64_t n_tg = (parallel_elements + n_threads - 1) / n_threads;
|
||||
|
||||
// TODO: add ggml_metal_kargs struct
|
||||
ggml_metal_kargs_pool_2d args_pool_2d = {
|
||||
/* .k0 = */ k0,
|
||||
/* .k1 = */ k1,
|
||||
/* .s0 = */ s0,
|
||||
/* .s1 = */ s1,
|
||||
/* .p0 = */ p0,
|
||||
/* .p1 = */ p1,
|
||||
/* .IH = */ IH,
|
||||
/* .IW = */ IW,
|
||||
/* .OH = */ OH,
|
||||
/* .OW = */ OW,
|
||||
/* .parallel_elements = */ parallel_elements
|
||||
};
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&k0 length:sizeof(int32_t) atIndex:2];
|
||||
[encoder setBytes:&k1 length:sizeof(int32_t) atIndex:3];
|
||||
[encoder setBytes:&s0 length:sizeof(int32_t) atIndex:4];
|
||||
[encoder setBytes:&s1 length:sizeof(int32_t) atIndex:5];
|
||||
[encoder setBytes:&p0 length:sizeof(int32_t) atIndex:6];
|
||||
[encoder setBytes:&p1 length:sizeof(int32_t) atIndex:7];
|
||||
[encoder setBytes:&IH length:sizeof(int64_t) atIndex:8];
|
||||
[encoder setBytes:&IW length:sizeof(int64_t) atIndex:9];
|
||||
[encoder setBytes:&OH length:sizeof(int64_t) atIndex:10];
|
||||
[encoder setBytes:&OW length:sizeof(int64_t) atIndex:11];
|
||||
[encoder setBytes:¶llel_elements length:sizeof(int64_t) atIndex:12];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&args_pool_2d length:sizeof(args_pool_2d) atIndex:2];
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n_tg, 1, 1) threadsPerThreadgroup:MTLSizeMake(n_threads, 1, 1)];
|
||||
} break;
|
||||
|
||||
File diff suppressed because it is too large
@@ -21,7 +21,7 @@ if (MUSAToolkit_FOUND)
message(STATUS "MUSA Toolkit found")

if (NOT DEFINED MUSA_ARCHITECTURES)
set(MUSA_ARCHITECTURES "21;22")
set(MUSA_ARCHITECTURES "21;22;31")
endif()
message(STATUS "Using MUSA architectures: ${MUSA_ARCHITECTURES}")

@@ -15,6 +15,7 @@ if (GGML_OPENCL_PROFILING)
endif ()

add_compile_definitions(GGML_OPENCL_SOA_Q)
add_compile_definitions(GGML_OPENCL_TARGET_VERSION=${GGML_OPENCL_TARGET_VERSION})

if (GGML_OPENCL_USE_ADRENO_KERNELS)
message(STATUS "OpenCL will use matmul kernels optimized for Adreno")

@@ -1,4 +1,4 @@
#define CL_TARGET_OPENCL_VERSION 220
#define CL_TARGET_OPENCL_VERSION GGML_OPENCL_TARGET_VERSION
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS

// suppress warnings in CL headers for GCC and Clang
@@ -25,6 +25,8 @@
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <cmath>
|
||||
#include <memory>
|
||||
#include <charconv>
|
||||
|
||||
#undef MIN
|
||||
#undef MAX
|
||||
@@ -62,6 +64,97 @@ enum ADRENO_GPU_GEN {
|
||||
X1E,
|
||||
};
|
||||
|
||||
struct ggml_cl_version {
|
||||
cl_uint major = 0;
|
||||
cl_uint minor = 0;
|
||||
};
|
||||
|
||||
// Parses a version string of form "XX.YY ". On an error returns ggml_cl_version with all zeroes.
|
||||
static ggml_cl_version parse_cl_version(std::string_view str) {
|
||||
size_t major_str_begin = 0;
|
||||
size_t major_str_end = str.find(".", major_str_begin);
|
||||
if (major_str_end == std::string::npos) {
|
||||
return {};
|
||||
}
|
||||
|
||||
size_t minor_str_begin = major_str_end + 1;
|
||||
size_t minor_str_end = str.find(" ", minor_str_begin);
|
||||
if (minor_str_end == std::string::npos) {
|
||||
return {};
|
||||
}
|
||||
|
||||
cl_uint version_major;
|
||||
if (std::from_chars(str.data() + major_str_begin, str.data() + major_str_end, version_major).ec != std::errc{}) {
|
||||
return {};
|
||||
}
|
||||
|
||||
cl_uint version_minor;
|
||||
if (std::from_chars(str.data() + minor_str_begin, str.data() + minor_str_end, version_minor).ec != std::errc{}) {
|
||||
return {};
|
||||
}
|
||||
return { version_major, version_minor };
|
||||
}
|
||||
|
||||
// Returns OpenCL platform's version. On an error returns ggml_cl_version with all zeroes.
|
||||
static ggml_cl_version get_opencl_platform_version(cl_platform_id platform) {
|
||||
size_t param_size;
|
||||
CL_CHECK(clGetPlatformInfo(platform, CL_PLATFORM_VERSION, 0, nullptr, ¶m_size));
|
||||
std::unique_ptr<char[]> param_storage(new char[param_size]);
|
||||
CL_CHECK(clGetPlatformInfo(platform, CL_PLATFORM_VERSION, param_size, param_storage.get(), nullptr));
|
||||
|
||||
auto param_value = std::string_view(param_storage.get(), param_size);
|
||||
const std::string version_prefix = "OpenCL "; // Suffix: "XX.YY <platform-specific-info>"
|
||||
if (param_value.find(version_prefix) != 0) {
|
||||
return {};
|
||||
}
|
||||
param_value.remove_prefix(version_prefix.length());
|
||||
return parse_cl_version(param_value);
|
||||
}
|
||||
|
||||
// Return a version to use in OpenCL C compilation. On an error returns ggml_cl_version with all zeroes.
|
||||
static ggml_cl_version get_opencl_c_version(ggml_cl_version platform_version, cl_device_id device) {
|
||||
size_t param_size;
|
||||
|
||||
#if CL_TARGET_OPENCL_VERSION >= 300
|
||||
if (platform_version.major >= 3) {
|
||||
CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_ALL_VERSIONS, 0, nullptr, ¶m_size));
|
||||
if (!param_size) {
|
||||
return {};
|
||||
}
|
||||
|
||||
std::unique_ptr<cl_name_version[]> versions(new cl_name_version[param_size]);
|
||||
CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_ALL_VERSIONS, param_size, versions.get(), nullptr));
|
||||
unsigned versions_count = param_size / sizeof(cl_name_version);
|
||||
|
||||
cl_version version_max = 0;
|
||||
for (unsigned i = 0; i < versions_count; i++) {
|
||||
version_max = std::max<cl_version>(versions[i].version, version_max);
|
||||
}
|
||||
|
||||
return { CL_VERSION_MAJOR(version_max), CL_VERSION_MINOR(version_max) };
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED(platform_version);
|
||||
#endif // CL_TARGET_OPENCL_VERSION >= 300
|
||||
|
||||
CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, 0, nullptr, ¶m_size));
|
||||
if (!param_size) {
|
||||
return {};
|
||||
}
|
||||
|
||||
std::unique_ptr<char[]> param_storage(new char[param_size]);
|
||||
CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, param_size, param_storage.get(), nullptr));
|
||||
auto param_value = std::string_view(param_storage.get(), param_size);
|
||||
|
||||
const std::string version_prefix = "OpenCL C "; // Suffix: "XX.YY <platform-specific-info>"
|
||||
if (param_value.find(version_prefix) != 0) {
|
||||
return {};
|
||||
}
|
||||
param_value.remove_prefix(version_prefix.length());
|
||||
|
||||
return parse_cl_version(param_value);
|
||||
}
|
||||
|
||||
static ADRENO_GPU_GEN get_adreno_gpu_gen(const char *device_name) {
    if (strstr(device_name, "730") ||
        strstr(device_name, "740") ||
@@ -278,7 +371,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {

    cl_int err;

#ifdef GGML_PROFILE_OPENCL
#ifdef GGML_OPENCL_PROFILING
    GGML_LOG_INFO("ggml_opencl: OpenCL profiling enabled\n");
#endif

@@ -470,16 +563,11 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
    // A local ref of cl_device_id for convenience
    cl_device_id device = backend_ctx->device;

    // Check device OpenCL version, OpenCL 2.0 or above is required
    size_t device_ver_str_size;
    clGetDeviceInfo(device, CL_DEVICE_VERSION, 0, NULL, &device_ver_str_size);
    char *device_ver_buffer = (char *)alloca(device_ver_str_size + 1);
    clGetDeviceInfo(device, CL_DEVICE_VERSION, device_ver_str_size, device_ver_buffer, NULL);
    device_ver_buffer[device_ver_str_size] = '\0';
    GGML_LOG_INFO("ggml_opencl: device OpenCL version: %s\n", device_ver_buffer);
    ggml_cl_version platform_version = get_opencl_platform_version(default_device->platform->id);

    if (strstr(device_ver_buffer, "OpenCL 2") == NULL &&
        strstr(device_ver_buffer, "OpenCL 3") == NULL) {
    // Check device OpenCL version, OpenCL 2.0 or above is required
    ggml_cl_version opencl_c_version = get_opencl_c_version(platform_version, device);
    if (opencl_c_version.major < 2) {
        GGML_LOG_ERROR("ggml_opencl: OpenCL 2.0 or above is required\n");
        return backend_ctx;
    }
@@ -516,15 +604,17 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {

    // If OpenCL 3.0 is supported, then check for cl_khr_subgroups, which becomes
    // optional in OpenCL 3.0 (cl_khr_subgroup is mandatory in OpenCL 2.x)
    if (strstr(device_ver_buffer, "OpenCL 3") &&
        strstr(ext_buffer, "cl_khr_subgroups") == NULL &&
    if (opencl_c_version.major == 3 && strstr(ext_buffer, "cl_khr_subgroups") == NULL &&
        strstr(ext_buffer, "cl_intel_subgroups") == NULL) {
        GGML_LOG_ERROR("ggml_opencl: device does not support subgroups (cl_khr_subgroups or cl_intel_subgroups) "
                       "(note that subgroups is an optional feature in OpenCL 3.0)\n");
        return backend_ctx;
    }

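For context on the subgroup check above, a standalone sketch of testing a device's extension string is shown below; the helper name device_supports_subgroups is hypothetical, but the queries are standard OpenCL API.

// Standalone sketch: check whether a device advertises subgroup support.
// Mirrors the strstr-on-CL_DEVICE_EXTENSIONS pattern used in the hunk above.
#include <CL/cl.h>
#include <cstring>
#include <string>

static bool device_supports_subgroups(cl_device_id device) {
    size_t size = 0;
    if (clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, nullptr, &size) != CL_SUCCESS || size == 0) {
        return false;
    }
    std::string exts(size, '\0');
    if (clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, size, exts.data(), nullptr) != CL_SUCCESS) {
        return false;
    }
    // Either the Khronos or the Intel subgroup extension is acceptable.
    return strstr(exts.c_str(), "cl_khr_subgroups")   != nullptr ||
           strstr(exts.c_str(), "cl_intel_subgroups") != nullptr;
}
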
    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &backend_ctx->alignment, NULL));
    cl_uint base_align_in_bits;
    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &base_align_in_bits, NULL));
    GGML_ASSERT(base_align_in_bits % 8u == 0);
    backend_ctx->alignment = base_align_in_bits / 8u;
    GGML_LOG_INFO("ggml_opencl: mem base addr align: %u\n", backend_ctx->alignment);

    clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &backend_ctx->max_alloc_size, NULL);
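CL_DEVICE_MEM_BASE_ADDR_ALIGN is reported in bits, which is why the hunk above divides by 8 before storing a byte alignment. A minimal sketch of the same conversion, assuming a valid device (the helper name is illustrative):

// CL_DEVICE_MEM_BASE_ADDR_ALIGN is specified in *bits*; buffer offsets are in *bytes*.
#include <CL/cl.h>
#include <cassert>

static cl_uint query_base_alignment_bytes(cl_device_id device) {
    cl_uint align_bits = 0;
    clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(align_bits), &align_bits, nullptr);
    assert(align_bits % 8u == 0);   // the patch asserts this as well
    return align_bits / 8u;         // e.g. 1024 bits -> 128 bytes
}
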
@@ -578,9 +668,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
    const std::string kernel_src = read_file("ggml-opencl.cl");
#endif

    std::string compile_opts =
        "-cl-std=CL2.0 -cl-mad-enable -cl-unsafe-math-optimizations "
        "-cl-finite-math-only -cl-fast-relaxed-math ";
    auto opencl_c_std =
        std::string("CL") + std::to_string(opencl_c_version.major) + "." + std::to_string(opencl_c_version.minor);

    std::string compile_opts = std::string("-cl-std=") + opencl_c_std +
        " -cl-mad-enable -cl-unsafe-math-optimizations"
        " -cl-finite-math-only -cl-fast-relaxed-math";
    backend_ctx->program = build_program_from_source(context, device, kernel_src.c_str(), compile_opts);

    // Non matmul kernels.
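The hunk above replaces the hard-coded "-cl-std=CL2.0" with a string derived from the queried OpenCL C version. A standalone sketch of that idea, assuming an already-created program and device (the helper name is illustrative):

// Build with "-cl-std=CL<major>.<minor>" derived from the detected OpenCL C version.
#include <CL/cl.h>
#include <string>

static cl_int build_with_detected_std(cl_program program, cl_device_id device, int major, int minor) {
    const std::string opts = "-cl-std=CL" + std::to_string(major) + "." + std::to_string(minor) +
                             " -cl-mad-enable -cl-finite-math-only";
    return clBuildProgram(program, 1, &device, opts.c_str(), nullptr, nullptr);
}
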
@@ -690,10 +783,10 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
    CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose_16, "kernel_transpose_16", &err), err));

    // Gemv general
    std::string CL_gemv_compile_opts =
        " -cl-std=CL2.0 "
        " -cl-mad-enable "
        " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
    std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
        " -cl-mad-enable "
        " -DSIMDGROUP_WIDTH=" +
        std::to_string(backend_ctx->adreno_wave_size);
    if (has_vector_subgroup_broadcast) {
        CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
    }
@@ -710,12 +803,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
    CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general = clCreateKernel(backend_ctx->program_CL_gemv_general, "kernel_gemv_noshuffle", &err), err));

    // Gemv 2048, 16384
    CL_gemv_compile_opts =
        " -cl-std=CL2.0 "
        " -cl-mad-enable "
        " -DLINE_STRIDE_A=2048 "
        " -DBLOCK_STRIDE_A=16384 "
        " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
    CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
        " -cl-mad-enable "
        " -DLINE_STRIDE_A=2048 "
        " -DBLOCK_STRIDE_A=16384 "
        " -DSIMDGROUP_WIDTH=" +
        std::to_string(backend_ctx->adreno_wave_size);
    if (has_vector_subgroup_broadcast) {
        CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
    }
@@ -732,12 +825,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
    CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_4096, "kernel_gemv_noshuffle", &err), err));

    // Gemv 2048, 16384
    CL_gemv_compile_opts =
        " -cl-std=CL2.0 "
        " -cl-mad-enable "
        " -DLINE_STRIDE_A=2048 "
        " -DBLOCK_STRIDE_A=16384 "
        " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
    CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
        " -cl-mad-enable "
        " -DLINE_STRIDE_A=2048 "
        " -DBLOCK_STRIDE_A=16384 "
        " -DSIMDGROUP_WIDTH=" +
        std::to_string(backend_ctx->adreno_wave_size);
    if (has_vector_subgroup_broadcast) {
        CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
    }
@@ -747,12 +840,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
    CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_11008, "kernel_gemv_noshuffle", &err), err));

    // Gemv 5504, 44032
    CL_gemv_compile_opts =
        " -cl-std=CL2.0 "
        " -cl-mad-enable "
        " -DLINE_STRIDE_A=5504 "
        " -DBLOCK_STRIDE_A=44032 "
        " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
    CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
        " -cl-mad-enable "
        " -DLINE_STRIDE_A=5504 "
        " -DBLOCK_STRIDE_A=44032 "
        " -DSIMDGROUP_WIDTH=" +
        std::to_string(backend_ctx->adreno_wave_size);
    if (has_vector_subgroup_broadcast) {
        CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
    }
@@ -762,12 +855,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
    CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_11008_1_4096, "kernel_gemv_noshuffle", &err), err));

    // Gemv 16000, 128000
    CL_gemv_compile_opts =
        " -cl-std=CL2.0 "
        " -cl-mad-enable "
        " -DLINE_STRIDE_A=16000 "
        " -DBLOCK_STRIDE_A=128000 "
        " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
    CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
        " -cl-mad-enable "
        " -DLINE_STRIDE_A=16000 "
        " -DBLOCK_STRIDE_A=128000 "
        " -DSIMDGROUP_WIDTH=" +
        std::to_string(backend_ctx->adreno_wave_size);
    if (has_vector_subgroup_broadcast) {
        CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
    }
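The repeated gemv hunks above all follow the same pattern: shape constants are baked into each program variant at build time through -D options, so one kernel source is specialized per matrix shape. A sketch of that pattern, with an illustrative helper name:

// Build-option assembly for a shape-specialized gemv program; identifiers are illustrative.
#include <string>

static std::string make_gemv_opts(const std::string & cl_std, int line_stride, int block_stride, int simd_width) {
    return "-cl-std=" + cl_std +
           " -cl-mad-enable"
           " -DLINE_STRIDE_A="   + std::to_string(line_stride)  +
           " -DBLOCK_STRIDE_A="  + std::to_string(block_stride) +
           " -DSIMDGROUP_WIDTH=" + std::to_string(simd_width);
}
// Hypothetical usage: make_gemv_opts("CL3.0", 2048, 16384, 64)
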
@@ -1004,17 +1097,18 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
        case GGML_OP_ADD:
        case GGML_OP_SCALE:
        case GGML_OP_MUL:
            return true;
            return op->src[0]->type == GGML_TYPE_F32;
        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(op)) {
                case GGML_UNARY_OP_GELU:
                case GGML_UNARY_OP_SILU:
                case GGML_UNARY_OP_RELU:
                    return ggml_is_contiguous(op->src[0]);
                    return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
                default:
                    return false;
            }
        case GGML_OP_CLAMP:
            return op->src[0]->type == GGML_TYPE_F32;
        case GGML_OP_SOFT_MAX:
        case GGML_OP_NORM:
        case GGML_OP_RMS_NORM:
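The hunk above tightens ggml_opencl_supports_op so element-wise ops are only claimed for F32 inputs. A minimal sketch of that gating pattern using the public ggml API (the helper name is illustrative and not part of the patch):

// Illustrative predicate: accept an elementwise op only for a contiguous F32 source.
#include "ggml.h"

static bool supports_f32_elementwise(const struct ggml_tensor * op) {
    const struct ggml_tensor * src0 = op->src[0];
    return src0 != NULL
        && src0->type == GGML_TYPE_F32
        && ggml_is_contiguous(src0);
}
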
@@ -1198,17 +1292,14 @@ struct ggml_backend_opencl_buffer_context {
    std::string name;
};

static void * const cl_ptr_base = (void *)(uintptr_t) 0x1000;

static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
    delete ctx;
}

static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
    return cl_ptr_base;

    GGML_UNUSED(buffer);
    ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer->buft->device);
    return (void *) (uintptr_t) backend_ctx->alignment;
}

static enum ggml_status ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
@@ -1241,7 +1332,7 @@ static enum ggml_status ggml_backend_opencl_buffer_init_tensor(ggml_backend_buff
        tensor->extra = view_extra;
    } else {
        {
            size_t offset = (char *)tensor->data - (char *)cl_ptr_base;
            size_t offset = (char *) tensor->data - (char *) ggml_backend_opencl_buffer_get_base(buffer);

            ggml_tensor_extra_cl * extra = ctx->ggml_opencl_alloc_temp_tensor_extra();
            extra->offset = offset;
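For context, the OpenCL backend hands ggml a fake, non-null "base pointer"; tensor->data is base + offset, and the byte offset into the cl_mem is recovered by subtracting the base again. Returning the device alignment as the base (instead of the fixed 0x1000 sentinel) makes the base itself respect the device's alignment requirement. A sketch of that arithmetic, with illustrative names that are not the backend's API:

// Fake base pointer and offset recovery, as used by the buffer hunks above.
#include <cstddef>
#include <cstdint>

static void * fake_base(std::uintptr_t alignment_bytes) {
    return (void *) alignment_bytes;                 // e.g. 128 on many GPUs
}

static std::size_t recover_offset(const void * tensor_data, const void * base) {
    return (std::size_t) ((const char *) tensor_data - (const char *) base);
}
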
@@ -2573,26 +2664,33 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const
    memcpy(&eps, dst->op_params, sizeof(float));

    const int ne00 = src0 ? src0->ne[0] : 0;
    const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
    const int ne01 = src0 ? src0->ne[1] : 0;
    const int ne02 = src0 ? src0->ne[2] : 0;
    const int ne03 = src0 ? src0->ne[3] : 0;

    GGML_ASSERT(ggml_is_contiguous_1(src0));
    const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
    const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
    const cl_ulong nb03 = src0 ? src0->nb[3] : 0;

    const int nth = MIN(64, ne00);

    cl_kernel kernel = backend_ctx->kernel_norm;

    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb01));
    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(float), &eps));
    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(float)*nth, NULL));
    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
    CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
    CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(float), &eps));
    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth, NULL));

    const int64_t nrows = ggml_nrows(src0);

    size_t global_work_size[] = {(size_t)nrows*nth, 1, 1};
    size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
    size_t local_work_size[] = {(size_t)nth, 1, 1};

#ifdef GGML_OPENCL_PROFILING
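The new NDRange mapping above runs one work-group of nth threads per row (i01), with i02 and i03 carried by the second and third grid dimensions. A standalone sketch of that enqueue pattern, assuming a valid queue and kernel (the helper name is illustrative):

// One work-group per row: global = {ne01*nth, ne02, ne03}, local = {nth, 1, 1}.
#include <CL/cl.h>
#include <algorithm>

static cl_int enqueue_rowwise(cl_command_queue queue, cl_kernel kernel,
                              int ne00, int ne01, int ne02, int ne03) {
    const int nth = std::min(64, ne00);   // threads cooperating on one row
    size_t global[] = { (size_t) ne01 * nth, (size_t) ne02, (size_t) ne03 };
    size_t local[]  = { (size_t) nth, 1, 1 };
    return clEnqueueNDRangeKernel(queue, kernel, 3, nullptr, global, local, 0, nullptr, nullptr);
}
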
@@ -2630,16 +2728,19 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
    memcpy(&eps, dst->op_params, sizeof(float));

    const int ne00 = src0 ? src0->ne[0] : 0;
    const int ne01 = src0 ? src0->ne[1] : 0;
    const int ne02 = src0 ? src0->ne[2] : 0;
    const int ne03 = src0 ? src0->ne[3] : 0;

    const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
    const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
    const cl_ulong nb03 = src0 ? src0->nb[3] : 0;

    GGML_ASSERT(ne00 % 4 == 0);
    GGML_ASSERT(ggml_is_contiguous_1(src0));

    const int nth = MIN(64, ne00);

    const int64_t nrows = ggml_nrows(src0);

    size_t global_work_size[] = {(size_t)nrows*nth, 1, 1};
    size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
    size_t local_work_size[] = {(size_t)nth, 1, 1};

    cl_kernel kernel = backend_ctx->kernel_rms_norm;
@@ -2654,15 +2755,20 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
        sizeof(local_work_size), local_work_size,
        sizeof(size_t), &sgs, NULL));

    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb01));
    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(float), &eps));
    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
    CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
    CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(float), &eps));
    // This is local memory - the size depends on subgroup size.
    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(float)*nth/sgs, NULL));
    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth/sgs, NULL));

#ifdef GGML_OPENCL_PROFILING
    cl_event evt;
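The last argument above reserves kernel local memory whose size depends on the subgroup size. Passing NULL with a non-zero size to clSetKernelArg is the standard way to size a `local float *` parameter; one float per subgroup is enough to hold the partial sums. A sketch with an illustrative helper name:

// Reserve local memory for per-subgroup partial sums (assumes nth % subgroup_size == 0).
#include <CL/cl.h>

static cl_int set_local_sum_arg(cl_kernel kernel, cl_uint arg_index, int nth, size_t subgroup_size) {
    const size_t num_subgroups = (size_t) nth / subgroup_size;
    return clSetKernelArg(kernel, arg_index, sizeof(float) * num_subgroups, NULL);
}
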
@@ -3023,6 +3129,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
    // enqueue kernel with profiling
    // <--------------------------------------------> //
#ifdef GGML_OPENCL_PROFILING
    cl_event evt;
    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));

    g_profiling_info.emplace_back();
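For reference, the GGML_OPENCL_PROFILING path relies on standard OpenCL event profiling: the queue must be created with CL_QUEUE_PROFILING_ENABLE, and the start/end timestamps are read back after the event completes. A standalone sketch (helper name illustrative):

// Read kernel execution time from a profiled event, in milliseconds.
#include <CL/cl.h>

static double kernel_time_ms(cl_event evt) {
    cl_ulong t_start = 0, t_end = 0;
    clWaitForEvents(1, &evt);
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_START, sizeof(t_start), &t_start, NULL);
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_END,   sizeof(t_end),   &t_end,   NULL);
    return (t_end - t_start) * 1e-6;   // nanoseconds -> milliseconds
}
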
@@ -3764,10 +3871,10 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
    const int ne02 = src0 ? src0->ne[2] : 0;
    const int ne03 = src0 ? src0->ne[3] : 0;

    const int nb00 = src0 ? src0->nb[0] : 0;
    const int nb01 = src0 ? src0->nb[1] : 0;
    const int nb02 = src0 ? src0->nb[2] : 0;
    const int nb03 = src0 ? src0->nb[3] : 0;
    const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
    const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
    const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
    const cl_ulong nb03 = src0 ? src0->nb[3] : 0;

    const int ne10 = src1 ? src1->ne[0] : 0;
    const int ne11 = src1 ? src1->ne[1] : 0; UNUSED(ne11);
@@ -3779,10 +3886,10 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
    const int ne2 = dst ? dst->ne[2] : 0;
    const int ne3 = dst ? dst->ne[3] : 0;

    const int nb0 = dst ? dst->nb[0] : 0;
    const int nb1 = dst ? dst->nb[1] : 0;
    const int nb2 = dst ? dst->nb[2] : 0;
    const int nb3 = dst ? dst->nb[3] : 0;
    const cl_ulong nb0 = dst ? dst->nb[0] : 0;
    const cl_ulong nb1 = dst ? dst->nb[1] : 0;
    const cl_ulong nb2 = dst ? dst->nb[2] : 0;
    const cl_ulong nb3 = dst ? dst->nb[3] : 0;

    GGML_ASSERT(ne10 % ne02 == 0);
    GGML_ASSERT(ne10 >= ne02);

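The likely motivation for widening the rope strides from int to cl_ulong above: ggml's nb[] entries are byte strides stored as 64-bit values, and the kernel declares the matching parameters as ulong, so truncating them through int can overflow for very large tensors. A quick arithmetic check (standalone, illustrative sizes):

// Byte strides can exceed INT32_MAX; nb3 below is 4 GiB for an f32 tensor of 8192 x 8192 x 16.
#include <cstdint>
#include <cstdio>

int main() {
    const std::uint64_t ne0 = 8192, ne1 = 8192, ne2 = 16;
    const std::uint64_t nb3 = ne0 * ne1 * ne2 * sizeof(float);   // byte stride of dimension 3
    std::printf("nb3 = %llu bytes (INT32_MAX = %d)\n",
                (unsigned long long) nb3, INT32_MAX);            // 4294967296 > 2147483647
    return 0;
}
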
@@ -506,14 +506,23 @@ kernel void kernel_norm(
        global float * dst,
        ulong offsetd,
        int ne00,
        int ne01,
        int ne02,
        int ne03,
        ulong nb01,
        ulong nb02,
        ulong nb03,
        float eps,
        local float * sum
) {
    src0 = (global void*)((global char*)src0 + offset0);
    dst = (global void*)((global char*)dst + offsetd);

    global float * x = (global float *) ((global char *) src0 + get_group_id(0)*nb01);
    int i03 = get_group_id(2);
    int i02 = get_group_id(1);
    int i01 = get_group_id(0);

    global float * x = (global float *) ((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01);

    // MEAN
    // parallel sum
@@ -533,7 +542,7 @@ kernel void kernel_norm(

    // recenter and VARIANCE
    barrier(CLK_LOCAL_MEM_FENCE);
    global float * y = dst + get_group_id(0)*ne00;
    global float * y = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
    sum[get_local_id(0)] = 0.0f;
    for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
        y[i00] = x[i00] - mean;
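A host-side analogue of the kernel's new row addressing may help when reading the hunks above: each work-group (i01, i02, i03) reads its input row through the byte strides nb01/nb02/nb03 and writes its output row into a densely packed destination. Function names below are illustrative:

// Row addressing used by the generalized norm kernels, expressed in plain C++.
#include <cstddef>

static const float * src_row(const void * src0, std::size_t nb01, std::size_t nb02, std::size_t nb03,
                             int i01, int i02, int i03) {
    return (const float *) ((const char *) src0 +
                            (std::size_t) i03 * nb03 + (std::size_t) i02 * nb02 + (std::size_t) i01 * nb01);
}

static float * dst_row(float * dst, int ne00, int ne01, int ne02, int i01, int i02, int i03) {
    // Matches dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00 in the kernel.
    return dst + ((std::size_t) i03 * ne02 * ne01 + (std::size_t) i02 * ne01 + i01) * ne00;
}
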
@@ -566,14 +575,23 @@ kernel void kernel_rms_norm(
        global float * dst,
        ulong offsetd,
        int ne00,
        int ne01,
        int ne02,
        int ne03,
        ulong nb01,
        ulong nb02,
        ulong nb03,
        float eps,
        local float * sum // Note, the size depends on number of subgroups
) {
    src0 = (global void*)((global char*)src0 + offset0);
    dst = (global float*)((global char*)dst + offsetd);

    global float4 * x = (global float4 *) ((global char *) src0 + get_group_id(0)*nb01);
    int i03 = get_group_id(2);
    int i02 = get_group_id(1);
    int i01 = get_group_id(0);

    global float4 * x = (global float4 *) ((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01);
    global float * x_scalar = (global float *) x;
    float4 sumf = 0;
    float all_sum = 0;
@@ -607,7 +625,7 @@ kernel void kernel_rms_norm(
    const float mean = sum[0];
    const float scale = 1.0f/sqrt(mean + eps);

    global float4 * y = (global float4 *) (dst + get_group_id(0)*ne00);
    global float4 * y = (global float4 *) (dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
    global float * y_scalar = (global float *) y;
    for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
        y[i00] = x[i00] * scale;
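As a cross-check for the vectorized kernel above, a scalar reference of what kernel_rms_norm computes per row is: y[i] = x[i] / sqrt(mean(x^2) + eps). A minimal sketch (function name illustrative):

// Scalar per-row RMS norm reference.
#include <cmath>
#include <cstddef>

static void rms_norm_row_ref(const float * x, float * y, std::size_t ne00, float eps) {
    float sum = 0.0f;
    for (std::size_t i = 0; i < ne00; ++i) {
        sum += x[i] * x[i];
    }
    const float mean  = sum / (float) ne00;
    const float scale = 1.0f / std::sqrt(mean + eps);
    for (std::size_t i = 0; i < ne00; ++i) {
        y[i] = x[i] * scale;
    }
}
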
Some files were not shown because too many files have changed in this diff.