Compare commits

..

1 Commits

Author SHA1 Message Date
Georgi Gerganov
aefa65e442 ci : fix save-load test invokations 2025-03-07 12:17:33 +02:00
233 changed files with 19672 additions and 30985 deletions

View File

@@ -676,35 +676,6 @@ jobs:
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
macOS-latest-cmake-visionos:
runs-on: macos-latest
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update
- name: Build
id: cmake_build
run: |
sysctl -a
cmake -B build -G Xcode \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-DCMAKE_SYSTEM_NAME=visionOS \
-DCMAKE_OSX_DEPLOYMENT_TARGET=1.0 \
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
macOS-latest-swift:
runs-on: macos-latest
@@ -803,7 +774,7 @@ jobs:
env:
OPENBLAS_VERSION: 0.3.23
SDE_VERSION: 9.33.0-2024-01-07
VULKAN_VERSION: 1.4.309.0
VULKAN_VERSION: 1.3.261.1
strategy:
matrix:
@@ -1408,7 +1379,7 @@ jobs:
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
zip -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}

61
AUTHORS
View File

@@ -1,4 +1,4 @@
# date: Sat Mar 8 18:23:52 EET 2025
# date: Tue Feb 4 13:04:05 EET 2025
# this file is auto-generated by scripts/gen-authors.sh
0cc4m <picard12@live.de>
@@ -8,12 +8,10 @@
3ooabkhxtn <31479382+3ooabkhxtn@users.noreply.github.com>
44670 <44670@users.noreply.github.com>
65a <10104049+65a@users.noreply.github.com>
708-145 <40387547+708-145@users.noreply.github.com>
AN Long <aisk@users.noreply.github.com>
AT <manyoso@users.noreply.github.com>
Aarni Koskela <akx@iki.fi>
Aaron Miller <apage43@ninjawhale.com>
Aaron Teo <57927438+taronaeo@users.noreply.github.com>
Aaryaman Vasishta <aaryaman.vasishta@amd.com>
Abheek Gulati <abheekg@hotmail.com>
Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
@@ -22,7 +20,6 @@ Adithya Balaji <adithya.b94@gmail.com>
AdithyanI <adithyan.i4internet@gmail.com>
Adrian <smith.adriane@gmail.com>
Adrian Hesketh <a-h@users.noreply.github.com>
Adrian Kretz <me@akretz.com>
Adrien Gallouët <adrien@gallouet.fr>
Adrien Gallouët <angt@huggingface.co>
Ahmad Tameem <113388789+Tameem-10xE@users.noreply.github.com>
@@ -31,18 +28,15 @@ AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com>
AidanBeltonS <aidan.belton@codeplay.com>
Aisuko <urakiny@gmail.com>
Akarshan Biswas <akarshan.biswas@gmail.com>
Akarshan Biswas <akarshan@menlo.ai>
Akarshan Biswas <akarshanbiswas@fedoraproject.org>
Al Mochkin <14274697+amochkin@users.noreply.github.com>
Albert Jin <albert.jin@gmail.com>
Alberto <57916483+albbus-stack@users.noreply.github.com>
Alberto Cabrera Pérez <alberto.cabrera@codeplay.com>
Alberto Cabrera Pérez <alberto.cabrera@intel.com>
Aleksei Nikiforov <103434461+AlekseiNikiforovIBM@users.noreply.github.com>
Alex <awhill19@icloud.com>
Alex Azarov <alex@azarov.by>
Alex Azarov <alexander.azarov@mapbox.com>
Alex Brooks <alex.brooks@ibm.com>
Alex Klinkhamer <from.github.com.917@grencez.dev>
Alex Klinkhamer <git@grencez.dev>
Alex Nguyen <tiendung@users.noreply.github.com>
@@ -73,7 +67,6 @@ Andrew Minh Nguyen <40281306+amqdn@users.noreply.github.com>
Andy Salerno <andysalerno@gmail.com>
Andy Tai <andy-tai@users.noreply.github.com>
Anthony Van de Gejuchte <anthonyvdgent@gmail.com>
Antoine Viallon <antoine@lesviallon.fr>
Antonis Makropoulos <benuix@gmail.com>
Arik Poznanski <arikpoz@users.noreply.github.com>
Armen Kaleshian <kriation@users.noreply.github.com>
@@ -90,7 +83,6 @@ Atsushi Tatsuma <yoshoku@outlook.com>
Austin <77757836+teleprint-me@users.noreply.github.com>
AustinMroz <austinmroz@utexas.edu>
BADR <contact@pythops.com>
BB-fat <45072480+BB-fat@users.noreply.github.com>
Bach Le <bach@bullno1.com>
Bailey Chittle <39804642+bachittle@users.noreply.github.com>
BarfingLemurs <128182951+BarfingLemurs@users.noreply.github.com>
@@ -109,7 +101,6 @@ Bert Wagner <github@bertwagner.com>
Billel Mokeddem <billel.mokeddem.ml@gmail.com>
Bingan <70050083+binganao@users.noreply.github.com>
Bjarke Viksøe <164612031+bviksoe@users.noreply.github.com>
Bodhi <3882561+BodhiHu@users.noreply.github.com>
Bodo Graumann <mail@bodograumann.de>
Bono Lv <lvscar@users.noreply.github.com>
Borislav Stanimirov <b.stanimirov@abv.bg>
@@ -137,7 +128,6 @@ CentricStorm <CentricStorm@users.noreply.github.com>
Chad Brewbaker <crb002@gmail.com>
Changyeon Kim <cyzero.kim@samsung.com>
Chao Jiang <jc19chaoj@zoho.com>
Charles Duffy <charles@dyfis.net>
Charles Xu <63788048+chaxu01@users.noreply.github.com>
Charles Xu <charles.xu@arm.com>
Chen Xi <xi2.chen@intel.com>
@@ -149,14 +139,12 @@ Chris Kuehl <ckuehl@ckuehl.me>
Christian Demsar <christian@github.email.demsar.us>
Christian Demsar <crasm@git.vczf.us>
Christian Falch <875252+chrfalch@users.noreply.github.com>
Christian Fillion <cfillion@users.noreply.github.com>
Christian Kastner <ckk@kvr.at>
Christian Kögler <ck3d@gmx.de>
Christian Köhnenkamp <cvk5@me.com>
Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com>
Christopher Nielsen <62156882+mascguy@users.noreply.github.com>
Clark Saben <76020733+csaben@users.noreply.github.com>
Clauszy <zhangyub@uniontech.com>
Clint Herron <hanclinto@gmail.com>
Conrad Kramer <conrad@conradkramer.com>
Corentin REGAL <corentin.regal@gmail.com>
@@ -175,7 +163,6 @@ Daniel Hiltgen <dhiltgen@users.noreply.github.com>
Daniel Illescas Romero <illescas.daniel@protonmail.com>
Daniel Kleine <53251018+d-kleine@users.noreply.github.com>
Daniele <57776841+daniandtheweb@users.noreply.github.com>
Danny Milosavljevic <dannym@friendly-machines.com>
DannyDaemonic <DannyDaemonic@gmail.com>
Dat Quoc Nguyen <2412555+datquocnguyen@users.noreply.github.com>
Dave <dave-fl@users.noreply.github.com>
@@ -183,7 +170,6 @@ Dave Airlie <airlied@gmail.com>
Dave Airlie <airlied@redhat.com>
Dave Della Costa <ddellacosta+github@gmail.com>
David Friehs <david@friehs.info>
David Huang <1969802+hjc4869@users.noreply.github.com>
David Kennedy <dakennedyd@gmail.com>
David Pflug <david@pflug.email>
David Renshaw <dwrenshaw@gmail.com>
@@ -250,7 +236,6 @@ Felix <stenbackfelix@gmail.com>
Finn Voorhees <finnvoorhees@gmail.com>
Firat <firatkiral@gmail.com>
FirstTimeEZ <179362031+FirstTimeEZ@users.noreply.github.com>
Florent BENOIT <fbenoit@redhat.com>
Folko-Ven <71110216+Folko-Ven@users.noreply.github.com>
Foul-Tarnished <107711110+Foul-Tarnished@users.noreply.github.com>
Francisco Melo <43780565+francis2tm@users.noreply.github.com>
@@ -269,7 +254,6 @@ Gary Mulder <gjmulder@gmail.com>
Gavin Zhao <gavinzhaojw@protonmail.com>
Genkagaku.GPT <hlhr202@163.com>
Georgi Gerganov <ggerganov@gmail.com>
Gian-Carlo Pascutto <gcp@sjeng.org>
Gilad S <giladgd@users.noreply.github.com>
Gilad S. <7817232+giladgd@users.noreply.github.com>
Giuseppe Scrivano <giuseppe@scrivano.org>
@@ -283,9 +267,7 @@ Guspan Tanadi <36249910+guspan-tanadi@users.noreply.github.com>
Gustavo Rocha Dias <91472747+gustrd@users.noreply.github.com>
Haggai Nuchi <h.nuchi@gmail.com>
Halalaluyafail3 <55773281+Halalaluyafail3@users.noreply.github.com>
Hale Chan <halechan@qq.com>
Hamdoud Hakem <90524568+hamdoudhakem@users.noreply.github.com>
Han Yin <han.yin@arm.com>
HanishKVC <hanishkvc@gmail.com>
Haohui Mai <ricetons@gmail.com>
Haoxiang Fei <tonyfettes@tonyfettes.com>
@@ -296,7 +278,6 @@ Haus1 <haus.xda@gmail.com>
Henk Poley <HenkPoley@gmail.com>
Henri Vasserman <henv@hot.ee>
Henrik Forstén <henrik.forsten@gmail.com>
Henry Linjamäki <henry.linjamaki@gmail.com>
Herman Semenov <GermanAizek@yandex.ru>
Hesen Peng <hesen.peng@gmail.com>
HimariO <dsfhe49854@gmail.com>
@@ -326,7 +307,6 @@ Ivan <nekotekina@gmail.com>
Ivan Filipov <159561759+vanaka11@users.noreply.github.com>
Ivan Komarov <Ivan.Komarov@dfyz.info>
Ivan Stepanov <ivanstepanovftw@gmail.com>
JC <43374599+MrSMlT@users.noreply.github.com>
JFLFY2255 <JFLFY2255@163.com>
JH23X <165871467+JH23X@users.noreply.github.com>
Jack Mousseau <jack@software.inc>
@@ -345,7 +325,6 @@ Jan Ploski <jpl@plosquare.com>
Jannis Schönleber <joennlae@gmail.com>
Jared Van Bortel <cebtenzzre@gmail.com>
Jared Van Bortel <jared@nomic.ai>
Jason C.H <ctrysbita@outlook.com>
Jason McCartney <jmac@theroot.org>
Jason Stillerman <jason.t.stillerman@gmail.com>
Jean-Christophe Hoelt <hoelt@fovea.cc>
@@ -363,7 +342,6 @@ Jiahao Li <liplus17@163.com>
Jian Liao <jianliao@users.noreply.github.com>
JidongZhang-THU <1119708529@qq.com>
Jinwoo Jeong <33892306+williamjeong2@users.noreply.github.com>
Jinyang He <hejinyang@loongson.cn>
Jiří Podivín <66251151+jpodivin@users.noreply.github.com>
Jiří Sejkora <Sejseloid@gmail.com>
Joan Fontanals <jfontanalsmartinez@gmail.com>
@@ -401,7 +379,6 @@ Justine Tunney <jtunney@mozilla.com>
Juuso Alasuutari <juuso.alasuutari@gmail.com>
KASR <karim.asrih@gmail.com>
Kamil Tomšík <info@tomsik.cz>
Kante Yin <kerthcet@gmail.com>
Karol Kontny <82021046+kkontny@users.noreply.github.com>
Karsten Weiss <knweiss@gmail.com>
Karthick <j.karthic2004@gmail.com>
@@ -442,7 +419,6 @@ LoganDark <github@logandark.mozmail.com>
Loïc Carrère <loic.carrere@gmail.com>
LostRuins <39025047+LostRuins@users.noreply.github.com>
LostRuins Concedo <39025047+LostRuins@users.noreply.github.com>
Lucas Moura Belo <lucas.belo@live.com>
Luciano <lucianostrika44@gmail.com>
Luo Tian <lt@basecity.com>
Lyle Dean <dean@lyle.dev>
@@ -487,7 +463,6 @@ Matthew Tejo <matthew.tejo@gmail.com>
Matvey Soloviev <blackhole89@gmail.com>
Max Krasnyansky <max.krasnyansky@gmail.com>
Max Krasnyansky <quic_maxk@quicinc.com>
Maxim Evtush <154841002+maximevtush@users.noreply.github.com>
Maxime <672982+maximegmd@users.noreply.github.com>
Maximilian Winter <maximilian.winter.91@gmail.com>
Meng Zhang <meng@tabbyml.com>
@@ -519,7 +494,6 @@ Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com>
Mohammadreza Hendiani <hendiani.mohammadreza@gmail.com>
Mohammadreza Hendiani <mohammad.r.hendiani@gmail.com>
Molly Sophia <mollysophia379@gmail.com>
MoonRide303 <130458190+MoonRide303@users.noreply.github.com>
MorganRO8 <47795945+MorganRO8@users.noreply.github.com>
Murilo Santana <mvrilo@gmail.com>
Musab Gultekin <musabgultekin@users.noreply.github.com>
@@ -550,7 +524,6 @@ Nikolas <127742645+nneubacher@users.noreply.github.com>
Nindaleth <Nindaleth@users.noreply.github.com>
Nuno <rare-magma@posteo.eu>
OSecret <135510162+OLSecret@users.noreply.github.com>
Oleksandr Kuvshynov <661042+okuvshynov@users.noreply.github.com>
Oleksandr Nikitin <oleksandr@tvori.info>
Oleksii Maryshchenko <oleksii.maryshchenko@gmail.com>
Olivier Chafik <ochafik@users.noreply.github.com>
@@ -560,7 +533,6 @@ PAB <pierreantoine.bannier@gmail.com>
Pablo Duboue <pablo.duboue@gmail.com>
Pascal Patry <ppatry@mtacitlabs.com>
Patrice Ferlet <metal3d@gmail.com>
Patrick Peng <retr0@retr0.blog>
Paul Tsochantaris <ptsochantaris@icloud.com>
Pavel Zloi <github.com@drteam.rocks>
Pavol Rusnak <pavol@rusnak.io>
@@ -577,7 +549,6 @@ Pieter Ouwerkerk <pieter.ouwerkerk@gmail.com>
Plamen Minev <pacominev@gmail.com>
Prashant Vithule <119530321+Vithulep@users.noreply.github.com>
Przemysław Pawełczyk <przemoc@gmail.com>
PureJourney <edward.pong@qq.com>
Qin Yue Chen <71813199+chenqiny@users.noreply.github.com>
Qingyou Meng <meng.qingyou@gmail.com>
Qu Zongfu <43257352+yancaoweidaode@users.noreply.github.com>
@@ -593,17 +564,14 @@ Rand Xie <randxiexyy29@gmail.com>
Randall Fitzgerald <randall@dasaku.net>
Random Fly <renfei8@live.cn>
Reinforce-II <fate@eastal.com>
Rémy O <remyoudompheng@gmail.com>
Rémy Oudompheng <oudomphe@phare.normalesup.org>
Ren Xuancheng <jklj077@users.noreply.github.com>
Rene Leonhardt <65483435+reneleonhardt@users.noreply.github.com>
Reza Kakhki <rezakakhki.de@gmail.com>
Reza Rahemtola <49811529+RezaRahemtola@users.noreply.github.com>
RhinoDevel <RhinoDevel@users.noreply.github.com>
Riccardo Orlando <Riccorl@users.noreply.github.com>
Riceball LEE <snowyu.lee@gmail.com>
Rich Dougherty <rich@rd.nz>
Richard <r-burton@hotmail.co.uk>
Richard Kiss <him@richardkiss.com>
Richard Roberson <richardr1126@gmail.com>
Rick G <26732651+TheFlipbook@users.noreply.github.com>
@@ -620,7 +588,6 @@ Robert Sung-wook Shin <edp1096@users.noreply.github.com>
Robey Holderith <robey@flaminglunchbox.net>
Robyn <robyngraf@users.noreply.github.com>
Roger Meier <r.meier@siemens.com>
Rohanjames1997 <rohan.james4@gmail.com>
Roland <14355895+rbur0425@users.noreply.github.com>
Romain Biessy <romain.biessy@codeplay.com>
Romain D <90720+Artefact2@users.noreply.github.com>
@@ -643,7 +610,6 @@ Ryan Landay <rlanday@gmail.com>
Ryder Wishart <ryderwishart@gmail.com>
Ryuei <louixs@users.noreply.github.com>
Rőczey Barnabás <31726601+An0nie@users.noreply.github.com>
SAMI <samuel.koesnadi@stud.uni-due.de>
SRHMorris <69468379+SRHMorris@users.noreply.github.com>
SXX <sxx1136965276@gmail.com>
SakuraUmi <yukinon244@gmail.com>
@@ -668,8 +634,6 @@ Shane A <shanea@allenai.org>
Shangning Xu <32517059+xushangning@users.noreply.github.com>
Shankar <gshankar.87@gmail.com>
Shanshan Shen <467638484@qq.com>
Shelby Jenkins <47464908+ShelbyJenkins@users.noreply.github.com>
Sheldon Robinson <sheldon.robinson@live.com>
Shijie <821898965@qq.com>
Shintarou Okada <kokuzen@gmail.com>
Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com>
@@ -749,24 +713,18 @@ Victor Nogueira <felladrin@gmail.com>
Victor Z. Peng <ziliangdotme@gmail.com>
Viet-Anh NGUYEN (Andrew) <vietanh.dev@gmail.com>
Vinesh Janarthanan <36610342+VJHack@users.noreply.github.com>
Vitali Lovich <vlovich+github@gmail.com>
Vivian <vynride@gmail.com>
Vlad <spitfireage@gmail.com>
Vladimir <bogdad@gmail.com>
Vladimir Malyutin <first-leon@yandex.ru>
Vladimir Vuksanovic <109677816+vvuksanovic@users.noreply.github.com>
Vladimir Zorin <vladimir@deviant.guru>
VoidIsVoid <343750470@qq.com>
Volodymyr Vitvitskyi <72226+signalpillar@users.noreply.github.com>
Wagner Bruna <wbruna@users.noreply.github.com>
Wang Qin <37098874+wangqin0@users.noreply.github.com>
Wang Ran (汪然) <wangr@smail.nju.edu.cn>
WangHaoranRobin <56047610+WangHaoranRobin@users.noreply.github.com>
Weird Constructor <weirdconstructor@gmail.com>
Weizhao Ouyang <o451686892@gmail.com>
Welby Seely <welbyseely@gmail.com>
Wentai Zhang <rchardx@gmail.com>
Wilken Gottwalt <12194808+wgottwalt@users.noreply.github.com>
WillCorticesAI <150854901+WillCorticesAI@users.noreply.github.com>
William Tambellini <william.tambellini@gmail.com>
William Tambellini <wtambellini@sdl.com>
@@ -858,8 +816,6 @@ chaihahaha <chai836275709@gmail.com>
chiranko <96988916+chiranko@users.noreply.github.com>
clibdev <52199778+clibdev@users.noreply.github.com>
clyang <clyang@clyang.net>
cmdr2 <secondary.cmdr2@gmail.com>
cmdr2 <shashank.shekhar.global@gmail.com>
cocktailpeanut <121128867+cocktailpeanut@users.noreply.github.com>
codezjx <code.zjx@gmail.com>
coezbek <c.oezbek@gmail.com>
@@ -879,7 +835,6 @@ deepdiffuser <112834445+deepdiffuser@users.noreply.github.com>
devojony <61173062+devojony@users.noreply.github.com>
ditsuke <ditsuke@protonmail.com>
divinity76 <divinity76@gmail.com>
dm4 <dm4@secondstate.io>
dm4 <sunrisedm4@gmail.com>
dotpy314 <33351922+dotpy314@users.noreply.github.com>
drbh <david.richard.holtz@gmail.com>
@@ -894,7 +849,6 @@ fairydreaming <166155368+fairydreaming@users.noreply.github.com>
fengerhu1 <2748250768@qq.com>
fj-y-saito <85871716+fj-y-saito@users.noreply.github.com>
fraxy-v <65565042+fraxy-v@users.noreply.github.com>
fxzjshm <11426482+fxzjshm@users.noreply.github.com>
github-actions[bot] <github-actions[bot]@users.noreply.github.com>
gliptic <gliptic@users.noreply.github.com>
gn64 <yukikaze.jp@gmail.com>
@@ -919,7 +873,6 @@ hydai <z54981220@gmail.com>
iSma <ismail.senhaji@gmail.com>
iacore <74560659+iacore@users.noreply.github.com>
icppWorld <124377669+icppWorld@users.noreply.github.com>
igardev <49397134+igardev@users.noreply.github.com>
igarnier <igarnier@protonmail.com>
intelmatt <61025942+intelmatt@users.noreply.github.com>
iohub <rickyang.pro@gmail.com>
@@ -927,7 +880,6 @@ issixx <46835150+issixx@users.noreply.github.com>
jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com>
jaime-m-p <167997752+jaime-m-p@users.noreply.github.com>
jameswu2014 <545426914@qq.com>
jason_w <jason.wang@126.com>
jdomke <28772296+jdomke@users.noreply.github.com>
jiahao su <damow890@gmail.com>
jiez <373447296@qq.com>
@@ -939,7 +891,6 @@ jon-chuang <9093549+jon-chuang@users.noreply.github.com>
jp-x-g <jpxg-dev@protonmail.com>
jukofyork <69222624+jukofyork@users.noreply.github.com>
junchao-loongson <68935141+junchao-loongson@users.noreply.github.com>
junchao-zhao <68935141+junchao-loongson@users.noreply.github.com>
jwj7140 <32943891+jwj7140@users.noreply.github.com>
k.h.lai <adrian.k.h.lai@outlook.com>
kaizau <kaizau@users.noreply.github.com>
@@ -974,7 +925,6 @@ ltoniazzi <61414566+ltoniazzi@users.noreply.github.com>
luoyu-intel <yu.luo@intel.com>
m3ndax <adrian.goessl@outlook.com>
maddes8cht <55592906+maddes8cht@users.noreply.github.com>
magicse <magicse@users.noreply.github.com>
mahorozte <41834471+mahorozte@users.noreply.github.com>
makomk <makosoft@googlemail.com>
manikbhandari <mbbhandarimanik2@gmail.com>
@@ -985,7 +935,6 @@ matt23654 <matthew.webber@protonmail.com>
matteo <matteogeniaccio@yahoo.it>
mdrokz <mohammadmunshi@gmail.com>
mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com>
midnight <midnightmagic@users.noreply.github.com>
minarchist <minarchist@users.noreply.github.com>
mj-shifu <77107165+mj-shifu@users.noreply.github.com>
mmyjona <jonathan.gonse@gmail.com>
@@ -1009,12 +958,10 @@ omahs <73983677+omahs@users.noreply.github.com>
oobabooga <112222186+oobabooga@users.noreply.github.com>
opparco <parco.opaai@gmail.com>
ostix360 <55257054+ostix360@users.noreply.github.com>
pascal-lc <49066376+pascal-lc@users.noreply.github.com>
pculliton <phillipculliton@gmail.com>
peidaqi <peidaqi@gmail.com>
pengxin99 <pengxin.yuan@intel.com>
perserk <perserk@gmail.com>
petterreinholdtsen <pere-github@hungry.com>
piDack <104877312+piDack@users.noreply.github.com>
pmysl <piotr.myslinski@outlook.com>
postmasters <namnguyen@google.com>
@@ -1036,7 +983,6 @@ semidark <me@semidark.net>
serhii-nakon <57632032+serhii-nakon@users.noreply.github.com>
sharpHL <132747147+sharpHL@users.noreply.github.com>
shibe2 <shibe@tuta.io>
simon886212 <37953122+simon886212@users.noreply.github.com>
singularity <12184989+singularity-s0@users.noreply.github.com>
sjinzh <sjinzh@gmail.com>
sjxx <63994076+ylsdamxssjxxdd@users.noreply.github.com>
@@ -1054,12 +1000,10 @@ tarcey <cey.tarik@gmail.com>
tc-mb <157115220+tc-mb@users.noreply.github.com>
texmex76 <40733439+texmex76@users.noreply.github.com>
thement <40525767+thement@users.noreply.github.com>
theraininsky <76763719+theraininsky@users.noreply.github.com>
thewh1teagle <61390950+thewh1teagle@users.noreply.github.com>
tjohnman <tjohnman@users.noreply.github.com>
toyer <2042519524@qq.com>
tslmy <tslmy@users.noreply.github.com>
tv1wnd <55383215+tv1wnd@users.noreply.github.com>
ubik2 <ubik2@users.noreply.github.com>
uint256_t <konndennsa@gmail.com>
uint256_t <maekawatoshiki1017@gmail.com>
@@ -1070,7 +1014,6 @@ valiray <133289098+valiray@users.noreply.github.com>
vb <vaibhavs10@gmail.com>
vik <vikhyatk@gmail.com>
viric <viric@viric.name>
vmobilis <75476228+vmobilis@users.noreply.github.com>
vodkaslime <646329483@qq.com>
vvhg1 <94630311+vvhg1@users.noreply.github.com>
vxiiduu <73044267+vxiiduu@users.noreply.github.com>
@@ -1085,8 +1028,6 @@ wzy <32936898+Freed-Wu@users.noreply.github.com>
xaedes <xaedes@gmail.com>
xaedes <xaedes@googlemail.com>
xctan <axunlei@gmail.com>
xiaobing318 <71554036+xiaobing318@users.noreply.github.com>
xiaofei <hbuxiaofei@gmail.com>
xloem <0xloem@gmail.com>
yangli2 <yangli2@gmail.com>
ymcki <84055651+ymcki@users.noreply.github.com>

View File

@@ -29,8 +29,6 @@ else()
set(LLAMA_STANDALONE OFF)
endif()
option(LLAMA_USE_SYSTEM_GGML "Use system libggml" OFF)
if (EMSCRIPTEN)
set(BUILD_SHARED_LIBS_DEFAULT OFF)
@@ -147,13 +145,7 @@ endif()
# 3rd-party
#
if (LLAMA_USE_SYSTEM_GGML)
message(STATUS "Using system-provided libggml, skipping ggml build")
find_package(ggml REQUIRED)
add_library(ggml ALIAS ggml::ggml)
endif()
if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML)
if (NOT TARGET ggml)
add_subdirectory(ggml)
# ... otherwise assume ggml is added by a parent CMakeLists.txt
endif()

View File

@@ -836,7 +836,7 @@ ifdef GGML_MUSA
else
MUSA_PATH ?= /opt/musa
endif
MUSA_ARCHITECTURES ?= 21;22;31
MUSA_ARCHITECTURES ?= 21;22
MK_CPPFLAGS += -DGGML_USE_MUSA -DGGML_USE_CUDA
MK_LDFLAGS += -L$(MUSA_PATH)/lib -Wl,-rpath=$(MUSA_PATH)/lib

View File

@@ -112,8 +112,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM)
- [x] [QRWKV-6](https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1)
- [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)
- [X] [Trillion-7B-preview](https://huggingface.co/trillionlabs/Trillion-7B-preview)
- [x] [Ling models](https://huggingface.co/collections/inclusionAI/ling-67c51c85b34a7ea0aba94c32)
#### Multimodal
@@ -174,7 +172,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
- [iohub/collama](https://github.com/iohub/coLLaMA) (Apache-2.0)
- [janhq/jan](https://github.com/janhq/jan) (AGPL)
- [johnbean393/Sidekick](https://github.com/johnbean393/Sidekick) (MIT)
- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file) (Apache-2.0)
- [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
- [llama.vim](https://github.com/ggml-org/llama.vim) (MIT)

View File

@@ -432,8 +432,8 @@ cmake -B build-visionos -G Xcode \
-DCMAKE_SYSTEM_NAME=visionOS \
-DCMAKE_OSX_SYSROOT=xros \
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xros \
-DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
-DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_CXX_FLAGS}" \
-S .
cmake --build build-visionos --config Release -- -quiet
@@ -445,8 +445,8 @@ cmake -B build-visionos-sim -G Xcode \
-DCMAKE_SYSTEM_NAME=visionOS \
-DCMAKE_OSX_SYSROOT=xrsimulator \
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xrsimulator \
-DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
-DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_CXX_FLAGS}" \
-S .
cmake --build build-visionos-sim --config Release -- -quiet

View File

@@ -26,43 +26,4 @@ GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
# with SYCL support
source /opt/intel/oneapi/setvars.sh
GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
# with MUSA support
GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
```
## Running MUSA CI in a Docker Container
Assuming `$PWD` is the root of the `llama.cpp` repository, follow these steps to set up and run MUSA CI in a Docker container:
### 1. Create a local directory to store cached models, configuration files and venv:
```bash
mkdir -p $HOME/llama.cpp/ci-cache
```
### 2. Create a local directory to store CI run results:
```bash
mkdir -p $HOME/llama.cpp/ci-results
```
### 3. Start a Docker container and run the CI:
```bash
docker run --privileged -it \
-v $HOME/llama.cpp/ci-cache:/ci-cache \
-v $HOME/llama.cpp/ci-results:/ci-results \
-v $PWD:/ws -w /ws \
mthreads/musa:rc3.1.1-devel-ubuntu22.04
```
Inside the container, execute the following commands:
```bash
apt update -y && apt install -y bc cmake ccache git python3.10-venv time unzip wget
git config --global --add safe.directory /ws
GG_BUILD_MUSA=1 bash ./ci/run.sh /ci-results /ci-cache
```
This setup ensures that the CI runs within an isolated Docker environment while maintaining cached files and results across runs.

View File

@@ -16,9 +16,6 @@
# # with VULKAN support
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# # with MUSA support
# GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
if [ -z "$2" ]; then
echo "usage: $0 <output-dir> <mnt-dir>"
@@ -55,22 +52,13 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then
echo "source /opt/intel/oneapi/setvars.sh"
exit 1
fi
# Use only main GPU
export ONEAPI_DEVICE_SELECTOR="level_zero:0"
# Enable sysman for correct memory reporting
export ZES_ENABLE_SYSMAN=1
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
fi
if [ ! -z ${GG_BUILD_VULKAN} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
fi
if [ ! -z ${GG_BUILD_MUSA} ]; then
# Use qy1 by default (MTT S80)
MUSA_ARCH=${MUSA_ARCH:-21}
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_MUSA=ON -DMUSA_ARCHITECTURES=${MUSA_ARCH}"
fi
## helpers
# download a file if it does not exist or if it is outdated
@@ -820,7 +808,7 @@ export LLAMA_LOG_PREFIX=1
export LLAMA_LOG_TIMESTAMPS=1
if [ -z ${GG_BUILD_LOW_PERF} ]; then
# Create symlink: ./llama.cpp/models-mnt -> $MNT/models
# Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt
rm -rf ${SRC}/models-mnt
mnt_models=${MNT}/models
mkdir -p ${mnt_models}
@@ -838,10 +826,8 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
fi
ret=0
if [ -z ${GG_BUILD_SYCL} ]; then
# SYCL build breaks with debug build flags
test $ret -eq 0 && gg_run ctest_debug
fi
test $ret -eq 0 && gg_run ctest_debug
test $ret -eq 0 && gg_run ctest_release
if [ -z ${GG_BUILD_LOW_PERF} ]; then
@@ -849,9 +835,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
test $ret -eq 0 && gg_run rerank_tiny
if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
if [ -z ${GG_BUILD_SYCL} ]; then
test $ret -eq 0 && gg_run test_scripts_debug
fi
test $ret -eq 0 && gg_run test_scripts_debug
test $ret -eq 0 && gg_run test_scripts_release
fi
@@ -862,9 +846,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
test $ret -eq 0 && gg_run pythia_2_8b
#test $ret -eq 0 && gg_run open_llama_7b_v2
fi
if [ -z ${GG_BUILD_SYCL} ]; then
test $ret -eq 0 && gg_run ctest_with_model_debug
fi
test $ret -eq 0 && gg_run ctest_with_model_debug
test $ret -eq 0 && gg_run ctest_with_model_release
fi
fi

View File

@@ -1,5 +1,3 @@
include("ggml/cmake/common.cmake")
function(llama_add_compile_flags)
if (LLAMA_FATAL_WARNINGS)
if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")

View File

@@ -114,8 +114,8 @@ if (LLAMA_LLGUIDANCE)
ExternalProject_Add(llguidance_ext
GIT_REPOSITORY https://github.com/guidance-ai/llguidance
# v0.7.10:
GIT_TAG 0309d2a6bf40abda35344a362edc71e06d5009f8
# v0.6.12:
GIT_TAG ced1c9023d47ec194fa977932d35ce65c2ebfc09
PREFIX ${CMAKE_BINARY_DIR}/llguidance
SOURCE_DIR ${LLGUIDANCE_SRC}
BUILD_IN_SOURCE TRUE

View File

@@ -764,11 +764,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_env("LLAMA_ARG_CTX_SIZE"));
add_opt(common_arg(
{"-n", "--predict", "--n-predict"}, "N",
string_format(
ex == LLAMA_EXAMPLE_MAIN || ex == LLAMA_EXAMPLE_INFILL
? "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)"
: "number of tokens to predict (default: %d, -1 = infinity)",
params.n_predict),
string_format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict),
[](common_params & params, int value) {
params.n_predict = value;
}
@@ -853,20 +849,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
}
).set_excludes({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"-sysf", "--system-prompt-file"}, "FNAME",
"a file containing the system prompt (default: none)",
[](common_params & params, const std::string & value) {
std::ifstream file(value);
if (!file) {
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
}
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.system_prompt));
if (!params.system_prompt.empty() && params.system_prompt.back() == '\n') {
params.system_prompt.pop_back();
}
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
add_opt(common_arg(
{"--in-file"}, "FNAME",
"an input file (repeat to specify multiple files)",
@@ -1885,11 +1867,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_PASSKEY}));
add_opt(common_arg(
{"-o", "--output", "--output-file"}, "FNAME",
string_format("output file (default: '%s')", params.out_file.c_str()),
string_format("output file (default: '%s')",
ex == LLAMA_EXAMPLE_EXPORT_LORA
? params.lora_outfile.c_str()
: ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR
? params.cvector_outfile.c_str()
: params.out_file.c_str()),
[](common_params & params, const std::string & value) {
params.out_file = value;
params.cvector_outfile = value;
params.lora_outfile = value;
}
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS}));
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA}));
add_opt(common_arg(
{"-ofreq", "--output-frequency"}, "N",
string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
@@ -1979,7 +1968,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
add_opt(common_arg(
{"--host"}, "HOST",
string_format("ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: %s)", params.hostname.c_str()),
string_format("ip address to listen (default: %s)", params.hostname.c_str()),
[](common_params & params, const std::string & value) {
params.hostname = value;
}
@@ -2582,43 +2571,5 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--fim-qwen-7b-spec"},
string_format("use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
[](common_params & params) {
params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
params.speculative.n_gpu_layers = 99;
params.port = 8012;
params.n_gpu_layers = 99;
params.flash_attn = true;
params.n_ubatch = 1024;
params.n_batch = 1024;
params.n_ctx = 0;
params.n_cache_reuse = 256;
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--fim-qwen-14b-spec"},
string_format("use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
[](common_params & params) {
params.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF";
params.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
params.speculative.n_gpu_layers = 99;
params.port = 8012;
params.n_gpu_layers = 99;
params.flash_attn = true;
params.n_ubatch = 1024;
params.n_batch = 1024;
params.n_ctx = 0;
params.n_cache_reuse = 256;
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
return ctx_arg;
}

View File

@@ -60,9 +60,7 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
}
msg.role = message.at("role");
auto has_content = message.contains("content");
auto has_tool_calls = message.contains("tool_calls");
if (has_content) {
if (message.contains("content")) {
const auto & content = message.at("content");
if (content.is_string()) {
msg.content = content;
@@ -83,8 +81,19 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
} else if (!content.is_null()) {
throw std::runtime_error("Invalid 'content' type: expected string or array, got " + content.dump() + " (ref: https://github.com/ggml-org/llama.cpp/issues/8367)");
}
} else {
throw std::runtime_error("Expected 'content' (ref: https://github.com/ggml-org/llama.cpp/issues/8367)");
}
if (has_tool_calls) {
if (message.contains("reasoning_content")) {
msg.reasoning_content = message.at("reasoning_content");
}
if (message.contains("name")) {
msg.tool_name = message.at("name");
}
if (message.contains("tool_call_id")) {
msg.tool_call_id = message.at("tool_call_id");
}
if (message.contains("tool_calls")) {
for (const auto & tool_call : message.at("tool_calls")) {
common_chat_tool_call tc;
if (!tool_call.contains("type")) {
@@ -109,18 +118,6 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
msg.tool_calls.push_back(tc);
}
}
if (!has_content && !has_tool_calls) {
throw std::runtime_error("Expected 'content' or 'tool_calls' (ref: https://github.com/ggml-org/llama.cpp/issues/8367 & https://github.com/ggml-org/llama.cpp/issues/12279)");
}
if (message.contains("reasoning_content")) {
msg.reasoning_content = message.at("reasoning_content");
}
if (message.contains("name")) {
msg.tool_name = message.at("name");
}
if (message.contains("tool_call_id")) {
msg.tool_call_id = message.at("tool_call_id");
}
msgs.push_back(msg);
}
@@ -445,7 +442,6 @@ std::string common_chat_format_name(common_chat_format format) {
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2: return "Functionary v3.2";
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
case COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING: return "Hermes 2 Pro (extract reasoning)";
case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
case COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING: return "Command R7B (extract reasoning)";
default:
@@ -879,9 +875,9 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_
return data;
}
static common_chat_msg common_chat_parse_command_r7b(const std::string & input, bool extract_reasoning) {
static const std::regex thought_regex("(<\\|START_THINKING\\|>([\\s\\S]*?)<\\|END_THINKING\\|>)([\\s\\S]*)");
static const std::regex action_regex("<\\|START_ACTION\\|>([\\s\\S]*?)<\\|END_ACTION\\|>");
static const std::regex response_regex("(?:<\\|START_RESPONSE\\|>)?([\\s\\S]*?)<\\|END_RESPONSE\\|>");
static std::regex thought_regex("(<\\|START_THINKING\\|>([\\s\\S]*?)<\\|END_THINKING\\|>)([\\s\\S]*)");
static std::regex action_regex("<\\|START_ACTION\\|>([\\s\\S]*?)<\\|END_ACTION\\|>");
static std::regex response_regex("(?:<\\|START_RESPONSE\\|>)?([\\s\\S]*?)<\\|END_RESPONSE\\|>");
std::smatch match;
@@ -1013,10 +1009,10 @@ static common_chat_params common_chat_params_init_llama_3_1_tool_calls(const com
}
static common_chat_msg common_chat_parse_llama_3_1(const std::string & input, bool with_builtin_tools = false) {
// TODO: tighten & simplify the parser, don't accept leading text context.
static const std::regex function_regex(
static std::regex function_regex(
"\\s*\\{\\s*(?:\"type\"\\s*:\\s*\"function\"\\s*,\\s*)?\"name\"\\s*:\\s*\"([^\"]+)\"\\s*,\\s*\"parameters\"\\s*: ");
static const std::regex close_regex("\\}\\s*");
static const std::regex builtin_call_regex("<\\|python_tag\\|>\\s*([^.(]+)\\s*\\.\\s*call\\s*\\(\\s*([\\w]+)\\s*=\\s*([\\s\\S]*?)\\)");
static std::regex close_regex("\\}\\s*");
static std::regex builtin_call_regex("<\\|python_tag\\|>\\s*([^.(]+)\\s*\\.\\s*call\\s*\\(\\s*([\\w]+)\\s*=\\s*([\\s\\S]*?)\\)");
if (with_builtin_tools) {
std::smatch match;
@@ -1106,42 +1102,34 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_
data.format = inputs.extract_reasoning ? COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING : COMMON_CHAT_FORMAT_DEEPSEEK_R1;
return data;
}
static common_chat_msg handle_think_tag_prelude(const std::string & input, bool extract_reasoning, const std::function<common_chat_msg(const std::string &)> & rest_parser) {
std::smatch match;
static const std::regex reasoning_content_regex("((?:<think>)?([\\s\\S\\r\\n]*?)</think>)?([\\s\\S\\r\\n]*)");
if (std::regex_match(input, match, reasoning_content_regex)) {
auto rest = match[3].str();
auto msg = rest_parser(rest);
auto reasoning_content = string_strip(match[2].str());
if (extract_reasoning) {
msg.reasoning_content = reasoning_content;
} else if (!reasoning_content.empty()) {
std::ostringstream content;
content << "<think>" << reasoning_content << "</think>" << msg.content;
msg.content = content.str();
}
return msg;
}
return rest_parser(input);
}
static common_chat_msg common_chat_parse_deepseek_r1(const std::string & input, bool extract_reasoning) {
return handle_think_tag_prelude(input, extract_reasoning, [](const std::string & input) {
static const std::regex function_regex("<tool▁call▁begin>function<tool▁sep>([^\n]+)\n```json\n");
static const std::regex close_regex("```[\\s\\r\\n]*<tool▁call▁end>");
static const std::regex tool_calls_regex("[\\s\\r\\n]*(?:<tool▁calls▁begin>|<tool_calls_begin>|<tool calls begin>|<tool\\\\_calls\\\\_begin>)([\\s\\S\\r\\n]*?)<tool▁calls▁end>");
static std::regex function_regex("<tool▁call▁begin>function<tool▁sep>([^\n]+)\n```json\n");
static std::regex close_regex("```[\\s\\r\\n]*<tool▁call▁end>");
static std::regex reasoning_content_regex("((?:<think>)?([\\s\\S\\r\\n]*?)</think>)?([\\s\\S\\r\\n]*)");
static std::regex tool_calls_regex("[\\s\\r\\n]*(?:<tool▁calls▁begin>|<tool_calls_begin>|<tool calls begin>|<tool\\\\_calls\\\\_begin>)([\\s\\S\\r\\n]*?)<tool▁calls▁end>");
common_chat_msg msg;
msg.role = "assistant";
std::smatch match;
if (std::regex_match(input, match, reasoning_content_regex)) {
std::string rest;
if (extract_reasoning) {
msg.reasoning_content = string_strip(match[2].str());
} else {
msg.content = match[1].str();
}
rest = match[3].str();
common_chat_msg msg;
msg.role = "assistant";
std::smatch match;
if (std::regex_search(input, match, tool_calls_regex)) {
if (std::regex_search(rest, match, tool_calls_regex)) {
auto tool_calls = match[1].str();
auto msg2 = parse_json_tool_calls(tool_calls, std::nullopt, function_regex, close_regex);
msg.tool_calls = std::move(msg2.tool_calls);
} else {
msg.content = input;
msg.content += std::string(rest.begin() + rest.find_first_not_of(" \r\n"), rest.end());
}
return msg;
});
} else {
msg.content = input;
}
return msg;
}
static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
@@ -1246,8 +1234,8 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
}
static common_chat_msg common_chat_parse_functionary_v3_2(const std::string & input) {
static const std::regex function_regex(R"((?:>>>)?(?:assistant<|end_header_id|>\n)?(\w+)\n)");
static const std::regex close_regex(R"($|(?=>>>))");
static std::regex function_regex(R"((?:>>>)?(?:assistant<|end_header_id|>\n)?(\w+)\n)");
static std::regex close_regex(R"($|(?=>>>))");
std::string content;
auto it = input.begin();
@@ -1336,7 +1324,7 @@ static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(con
}
static common_chat_msg common_chat_parse_functionary_v3_1_llama_3_1(const std::string & input) {
// This version of Functionary still supports the llama 3.1 tool call format for the python tool.
static const std::regex python_tag_regex(R"(<\|python_tag\|>([\s\S\n]*)$)");
static std::regex python_tag_regex(R"(<\|python_tag\|>([\s\S\n]*)$)");
std::smatch match;
if (std::regex_search(input, match, python_tag_regex)) {
auto code = match[1].str();
@@ -1350,8 +1338,8 @@ static common_chat_msg common_chat_parse_functionary_v3_1_llama_3_1(const std::s
});
return msg;
}
static const std::regex function_regex(R"(<function=(\w+)>)");
static const std::regex close_regex(R"(</function>)");
static std::regex function_regex(R"(<function=(\w+)>)");
static std::regex close_regex(R"(</function>)");
// TODO: tighten & simplify.
return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex);
}
@@ -1418,8 +1406,6 @@ static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat
"(?:```(?:json|xml)?\n\\s*)?(?:<function_call>|<tools>|<xml><json>|<response>)?\\s*\\{\\s*\"", //name\"\\s*:\\s*\"" + escaped_name + "\"",
});
data.preserved_tokens = {
"<think>",
"</think>",
"<tool_call>",
"</tool_call>",
"<function",
@@ -1440,123 +1426,122 @@ static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat
});
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
data.format = inputs.extract_reasoning ? COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING : COMMON_CHAT_FORMAT_HERMES_2_PRO;
data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO;
return data;
}
static common_chat_msg common_chat_parse_hermes_2_pro(const std::string& input, bool extract_reasoning) {
return handle_think_tag_prelude(input, extract_reasoning, [](const std::string & input) {
static const std::regex open_regex(
"(?:"
"(```(?:xml|json)?\\n\\s*)?" // match 1 (block_start)
"(<tool_call>" // match 2 (open_tag)
"|<function_call>"
"|<tool>"
"|<tools>"
"|<response>"
"|<json>"
"|<xml>"
"|<JSON>"
")?"
"(\\s*\\{\\s*\"name\"\\s*:[\\s\\S]*)" // match 3 (named tool call + rest)
")"
"|"
"(?:<function=([^>]+)>" // match 4 (function name)
"|<function name=\"([^\"]+)\">)" // match 5 (function name again)
"([\\s\\S]*)" // match 6 (function arguments + rest)})"
);
static common_chat_msg common_chat_parse_hermes_2_pro(const std::string& input) {
const static std::regex open_regex(
"(?:"
"(```(?:xml|json)?\\n\\s*)?" // match 1 (block_start)
"(<tool_call>" // match 2 (open_tag)
"|<function_call>"
"|<tool>"
"|<tools>"
"|<response>"
"|<json>"
"|<xml>"
"|<JSON>"
")?"
"(\\s*\\{\\s*\"name\"\\s*:[\\s\\S]*)" // match 3 (named tool call + rest)
")"
"|"
"(?:<function=([^>]+)>" // match 4 (function name)
"|<function name=\"([^\"]+)\">)" // match 5 (function name again)
"([\\s\\S]*)" // match 6 (function arguments + rest)})"
);
try {
common_chat_msg msg;
msg.role = "assistant";
try {
std::string::const_iterator it = input.begin();
const std::string::const_iterator end = input.end();
std::smatch match;
common_chat_msg msg;
msg.role = "assistant";
while (it != end) {
if (std::regex_search(it, end, match, open_regex)) {
// Add content before the match
msg.content += std::string(it, match[0].first);
std::string::const_iterator it = input.begin();
const std::string::const_iterator end = input.end();
std::smatch match;
auto block_start = match[1].str();
std::string block_end = block_start.empty() ? "" : "```";
while (it != end) {
if (std::regex_search(it, end, match, open_regex)) {
// Add content before the match
msg.content += std::string(it, match[0].first);
auto open_tag = match[2].str();
std::string close_tag;
auto block_start = match[1].str();
std::string block_end = block_start.empty() ? "" : "```";
if (match[3].matched) {
close_tag = open_tag.empty() ? "" : "</" + open_tag.substr(1);
auto json_it = match[3].first;
json tool_call;
if (parse_json(json_it, end, tool_call) && tool_call.contains("name") && tool_call.contains("arguments")) {
auto open_tag = match[2].str();
std::string close_tag;
msg.tool_calls.emplace_back(process_tool_call(tool_call));
it = json_it; // Move iterator past parsed JSON
if (match[3].matched) {
close_tag = open_tag.empty() ? "" : "</" + open_tag.substr(1);
auto json_it = match[3].first;
json tool_call;
if (parse_json(json_it, end, tool_call) && tool_call.contains("name") && tool_call.contains("arguments")) {
// Handle close tags
consume_spaces(it, end);
if (!close_tag.empty() && !parse_literal(it, end, close_tag)) {
throw std::runtime_error("Failed to parse closing tag");
}
consume_spaces(it, end);
if (!block_end.empty() && !parse_literal(it, end, block_end)) {
throw std::runtime_error("Failed to parse block end");
}
consume_spaces(it, end);
} else {
// Not a valid tool call, treat as content
msg.content += std::string(match[0].first, match[0].second);
it = match[0].second;
msg.tool_calls.emplace_back(process_tool_call(tool_call));
it = json_it; // Move iterator past parsed JSON
// Handle close tags
consume_spaces(it, end);
if (!close_tag.empty() && !parse_literal(it, end, close_tag)) {
throw std::runtime_error("Failed to parse closing tag");
}
consume_spaces(it, end);
if (!block_end.empty() && !parse_literal(it, end, block_end)) {
throw std::runtime_error("Failed to parse block end");
}
consume_spaces(it, end);
} else {
auto function_name = match[4].str();
if (function_name.empty()) {
function_name = match[5].str();
}
GGML_ASSERT(!function_name.empty());
close_tag = "</function>";
// Start parsing from after the opening tags
auto json_it = match[6].first;
json arguments;
if (parse_json(json_it, end, arguments)) {
msg.tool_calls.emplace_back(process_tool_call({
{"name", function_name},
{"arguments", arguments},
}));
it = json_it; // Move iterator past parsed JSON
// Handle close tags
consume_spaces(it, end);
if (!close_tag.empty() && !parse_literal(it, end, close_tag)) {
throw std::runtime_error("Failed to parse closing tag");
}
consume_spaces(it, end);
if (!block_end.empty() && !parse_literal(it, end, block_end)) {
throw std::runtime_error("Failed to parse block end");
}
consume_spaces(it, end);
} else {
// Not a valid tool call, treat as content
msg.content += std::string(match[0].first, match[0].second);
it = match[0].second;
}
// Not a valid tool call, treat as content
msg.content += std::string(match[0].first, match[0].second);
it = match[0].second;
}
} else {
// Add remaining content
msg.content += std::string(it, end);
break;
auto function_name = match[4].str();
if (function_name.empty()) {
function_name = match[5].str();
}
GGML_ASSERT(!function_name.empty());
close_tag = "</function>";
// Start parsing from after the opening tags
auto json_it = match[6].first;
json arguments;
if (parse_json(json_it, end, arguments)) {
msg.tool_calls.emplace_back(process_tool_call({
{"name", function_name},
{"arguments", arguments},
}));
it = json_it; // Move iterator past parsed JSON
// Handle close tags
consume_spaces(it, end);
if (!close_tag.empty() && !parse_literal(it, end, close_tag)) {
throw std::runtime_error("Failed to parse closing tag");
}
consume_spaces(it, end);
if (!block_end.empty() && !parse_literal(it, end, block_end)) {
throw std::runtime_error("Failed to parse block end");
}
consume_spaces(it, end);
} else {
// Not a valid tool call, treat as content
msg.content += std::string(match[0].first, match[0].second);
it = match[0].second;
}
}
} else {
// Add remaining content
msg.content += std::string(it, end);
break;
}
return msg;
} catch (const std::exception & e) {
LOG_ERR("Failed to parse hermes 2 pro input: %s\n", e.what());
common_chat_msg msg;
msg.role = "assistant";
msg.content = input;
return msg;
}
});
return msg;
} catch (const std::exception & e) {
LOG_ERR("Failed to parse hermes 2 pro input: %s\n", e.what());
common_chat_msg msg;
msg.role = "assistant";
msg.content = input;
return msg;
}
}
static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
@@ -1621,11 +1606,6 @@ static common_chat_params common_chat_templates_apply_jinja(
return common_chat_params_init_command_r7b(tmpl, params);
}
// Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null()) {
return common_chat_params_init_hermes_2_pro(tmpl, params);
}
// Use generic handler when mixing tools + JSON schema.
// TODO: support that mix in handlers below.
if ((params.tools.is_array() && params.json_schema.is_object())) {
@@ -1647,6 +1627,11 @@ static common_chat_params common_chat_templates_apply_jinja(
return common_chat_params_init_without_tools(tmpl, params);
}
// Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
if (src.find("<tool_call>") != std::string::npos) {
return common_chat_params_init_hermes_2_pro(tmpl, params);
}
// Functionary v3.1 (w/ tools)
if (src.find("<|start_header_id|>") != std::string::npos
&& src.find("<function=") != std::string::npos) {
@@ -1764,9 +1749,7 @@ common_chat_msg common_chat_parse(const std::string & input, common_chat_format
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1:
return common_chat_parse_functionary_v3_1_llama_3_1(input);
case COMMON_CHAT_FORMAT_HERMES_2_PRO:
return common_chat_parse_hermes_2_pro(input, /* extract_reasoning= */ false);
case COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING:
return common_chat_parse_hermes_2_pro(input, /* extract_reasoning= */ true);
return common_chat_parse_hermes_2_pro(input);
case COMMON_CHAT_FORMAT_FIREFUNCTION_V2:
return common_chat_parse_firefunction_v2(input);
case COMMON_CHAT_FORMAT_COMMAND_R7B:

View File

@@ -53,7 +53,6 @@ enum common_chat_format {
COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
COMMON_CHAT_FORMAT_HERMES_2_PRO,
COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING,
COMMON_CHAT_FORMAT_COMMAND_R7B,
COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING,

View File

@@ -955,8 +955,8 @@ struct common_init_result common_init_from_params(common_params & params) {
return iparams;
}
if (params.ctx_shift && !llama_kv_self_can_shift(lctx)) {
LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__);
params.ctx_shift = false;
}
@@ -1033,8 +1033,6 @@ struct common_init_result common_init_from_params(common_params & params) {
if (params.warmup) {
LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
llama_set_warmup(lctx, true);
std::vector<llama_token> tmp;
llama_token bos = llama_vocab_bos(vocab);
llama_token eos = llama_vocab_eos(vocab);
@@ -1062,10 +1060,9 @@ struct common_init_result common_init_from_params(common_params & params) {
if (llama_model_has_decoder(model)) {
llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
}
llama_kv_self_clear(lctx);
llama_kv_cache_clear(lctx);
llama_synchronize(lctx);
llama_perf_context_reset(lctx);
llama_set_warmup(lctx, false);
}
iparams.model.reset(model);

View File

@@ -407,6 +407,8 @@ struct common_params {
int32_t i_pos = -1; // position of the passkey in the junk text
// imatrix params
std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
int32_t i_chunk = 0; // start processing from this chunk
@@ -418,16 +420,16 @@ struct common_params {
int n_pca_batch = 100;
int n_pca_iterations = 1000;
dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
std::string cvector_outfile = "control_vector.gguf";
std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
bool spm_infill = false; // suffix/prefix/middle pattern for infill
std::string lora_outfile = "ggml-lora-merged-f16.gguf";
// batched-bench params
bool batched_bench_output_jsonl = false;
// common params
std::string out_file; // output filename for all example programs
};
// call once at the start of a program if it uses libcommon

View File

@@ -11,24 +11,25 @@ struct llama_sampler_llg {
std::string grammar_kind;
std::string grammar_data;
LlgTokenizer * tokenizer;
LlgMatcher * grammar;
LlgConstraint * grammar;
LlgMaskResult llg_res;
bool has_llg_res;
};
static LlgMatcher * llama_sampler_llg_new(LlgTokenizer * tokenizer, const char * grammar_kind,
const char * grammar_data) {
static LlgConstraint * llama_sampler_llg_new(LlgTokenizer * tokenizer, const char * grammar_kind,
const char * grammar_data) {
LlgConstraintInit cinit;
llg_constraint_init_set_defaults(&cinit, tokenizer);
const char * log_level = getenv("LLGUIDANCE_LOG_LEVEL");
if (log_level && *log_level) {
cinit.log_stderr_level = atoi(log_level);
}
auto c = llg_new_matcher(&cinit, grammar_kind, grammar_data);
if (llg_matcher_get_error(c)) {
LOG_ERR("llg error: %s\n", llg_matcher_get_error(c));
llg_free_matcher(c);
auto c = llg_new_constraint_any(&cinit, grammar_kind, grammar_data);
if (llg_get_error(c)) {
LOG_ERR("llg error: %s\n", llg_get_error(c));
llg_free_constraint(c);
return nullptr;
}
return c;
}
@@ -39,29 +40,39 @@ static const char * llama_sampler_llg_name(const llama_sampler * /*smpl*/) {
static void llama_sampler_llg_accept_impl(llama_sampler * smpl, llama_token token) {
auto * ctx = (llama_sampler_llg *) smpl->ctx;
if (ctx->grammar) {
llg_matcher_consume_token(ctx->grammar, token);
LlgCommitResult res;
llg_commit_token(ctx->grammar, token, &res);
ctx->has_llg_res = false;
}
}
static void llama_sampler_llg_apply(llama_sampler * smpl, llama_token_data_array * cur_p) {
auto * ctx = (llama_sampler_llg *) smpl->ctx;
if (ctx->grammar) {
const uint32_t * mask = llg_matcher_get_mask(ctx->grammar);
if (mask == nullptr) {
if (llg_matcher_compute_mask(ctx->grammar) == 0) {
mask = llg_matcher_get_mask(ctx->grammar);
if (!ctx->has_llg_res) {
if (llg_compute_mask(ctx->grammar, &ctx->llg_res) == 0) {
ctx->has_llg_res = true;
} else {
LOG_ERR("llg error: %s\n", llg_matcher_get_error(ctx->grammar));
llg_free_matcher(ctx->grammar);
LOG_ERR("llg error: %s\n", llg_get_error(ctx->grammar));
llg_free_constraint(ctx->grammar);
ctx->grammar = nullptr;
return;
}
}
for (size_t i = 0; i < cur_p->size; ++i) {
auto token = cur_p->data[i].id;
if ((mask[token / 32] & (1 << (token % 32))) == 0) {
cur_p->data[i].logit = -INFINITY;
if (ctx->has_llg_res) {
if (ctx->llg_res.is_stop) {
for (size_t i = 0; i < cur_p->size; ++i) {
if (!llama_vocab_is_eog(ctx->vocab, cur_p->data[i].id)) {
cur_p->data[i].logit = -INFINITY;
}
}
} else {
const uint32_t * mask = ctx->llg_res.sample_mask;
for (size_t i = 0; i < cur_p->size; ++i) {
auto token = cur_p->data[i].id;
if ((mask[token / 32] & (1 << (token % 32))) == 0) {
cur_p->data[i].logit = -INFINITY;
}
}
}
}
}
@@ -69,9 +80,14 @@ static void llama_sampler_llg_apply(llama_sampler * smpl, llama_token_data_array
static void llama_sampler_llg_reset(llama_sampler * smpl) {
auto * ctx = (llama_sampler_llg *) smpl->ctx;
if (ctx->grammar) {
llg_matcher_reset(ctx->grammar);
if (!ctx->grammar) {
return;
}
auto * grammar_new = llama_sampler_llg_new(ctx->tokenizer, ctx->grammar_kind.c_str(), ctx->grammar_data.c_str());
llg_free_constraint(ctx->grammar);
ctx->grammar = grammar_new;
ctx->has_llg_res = false;
}
static llama_sampler * llama_sampler_llg_clone(const llama_sampler * smpl) {
@@ -86,7 +102,7 @@ static llama_sampler * llama_sampler_llg_clone(const llama_sampler * smpl) {
if (ctx->grammar) {
result_ctx->grammar_kind = ctx->grammar_kind;
result_ctx->grammar_data = ctx->grammar_data;
result_ctx->grammar = llg_clone_matcher(ctx->grammar);
result_ctx->grammar = llg_clone_constraint(ctx->grammar);
result_ctx->tokenizer = llg_clone_tokenizer(ctx->tokenizer);
}
}
@@ -98,7 +114,7 @@ static void llama_sampler_llg_free(llama_sampler * smpl) {
const auto * ctx = (llama_sampler_llg *) smpl->ctx;
if (ctx->grammar) {
llg_free_matcher(ctx->grammar);
llg_free_constraint(ctx->grammar);
llg_free_tokenizer(ctx->tokenizer);
}
@@ -223,11 +239,9 @@ llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab, const char * g
/* .grammar_data = */ grammar_data,
/* .tokenizer = */ tokenizer,
/* .grammar = */ llama_sampler_llg_new(tokenizer, grammar_kind, grammar_data),
/* .llg_res = */ {},
/* .has_llg_res = */ false,
};
if (ctx->grammar) {
GGML_ASSERT(((size_t) llama_vocab_n_tokens(vocab) + 31) / 32 * 4 ==
llg_matcher_get_mask_byte_size(ctx->grammar));
}
} else {
*ctx = {
/* .vocab = */ vocab,
@@ -235,12 +249,15 @@ llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab, const char * g
/* .grammar_data = */ {},
/* .tokenizer = */ nullptr,
/* .grammar = */ nullptr,
/* .llg_res = */ {},
/* .has_llg_res = */ false,
};
}
return llama_sampler_init(
/* .iface = */ &llama_sampler_llg_i,
/* .ctx = */ ctx);
/* .ctx = */ ctx
);
}
#else

View File

@@ -208,9 +208,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
trigger_patterns_c.data(), trigger_patterns_c.size(),
trigger_tokens.data(), trigger_tokens.size())
: llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
if (!grmr) {
return nullptr;
}
}
auto * result = new common_sampler {

View File

@@ -173,7 +173,7 @@ llama_tokens common_speculative_gen_draft(
result.reserve(params.n_draft);
if (reuse_n == 0) {
llama_kv_self_clear(ctx);
llama_kv_cache_clear(ctx);
prompt.clear();
} else {
@@ -192,14 +192,14 @@ llama_tokens common_speculative_gen_draft(
}
if (reuse_i > 0) {
llama_kv_self_seq_rm (ctx, 0, 0, reuse_i);
llama_kv_self_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
llama_kv_cache_seq_rm (ctx, 0, 0, reuse_i);
llama_kv_cache_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
}
if (reuse_n < (int) prompt.size()) {
llama_kv_self_seq_rm (ctx, 0, reuse_n, -1);
llama_kv_cache_seq_rm (ctx, 0, reuse_n, -1);
prompt.erase(prompt.begin() + reuse_n, prompt.end());
}

View File

@@ -180,8 +180,7 @@ class Model:
extra = sorted(tensor_names_from_parts.difference(self.tensor_names))
missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map))
if len(extra) == 0 and len(missing_files) > 0:
raise ValueError(f"Missing or incomplete model files: {missing_files}\n"
f"Missing tensors: {missing}")
raise ValueError(f"Missing or incomplete model files: {missing_files}")
else:
raise ValueError("Mismatch between weight map and model parts for tensor names:\n"
f"Missing tensors: {missing}\n"
@@ -529,8 +528,6 @@ class Model:
reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
added_vocab = tokenizer.get_added_vocab()
added_tokens_decoder = tokenizer.added_tokens_decoder
for i in range(vocab_size):
if i not in reverse_vocab:
tokens.append(f"[PAD{i}]")
@@ -540,13 +537,13 @@ class Model:
if token in added_vocab:
# The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
# To avoid unexpected issues - we make sure to normalize non-normalized tokens
if not added_tokens_decoder[i].normalized:
if not tokenizer.added_tokens_decoder[i].normalized:
previous_token = token
token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
if previous_token != token:
logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
if added_tokens_decoder[i].special or self.does_token_look_special(token):
if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
toktypes.append(gguf.TokenType.CONTROL)
else:
# NOTE: this was added for Gemma.
@@ -705,15 +702,6 @@ class Model:
if chkhsh == "ccc2ef013c104be7bae2965776d611e1d7a8a2a9c547dd93a682c9a9fc80352e":
# ref: https://huggingface.co/Xenova/gpt-4o
res = "gpt-4o"
if chkhsh == "7dec86086fcc38b66b7bc1575a160ae21cf705be7718b9d5598190d7c12db76f":
# ref: https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k
res = "superbpe"
if chkhsh == "1994ffd01900cfb37395608534236ecd63f2bd5995d6cb1004dda1af50240f15":
# ref: https://huggingface.co/trillionlabs/Trillion-7B-preview
res = "trillion"
if chkhsh == "96a5f08be6259352137b512d4157e333e21df7edd3fcd152990608735a65b224":
# ref: https://huggingface.co/inclusionAI/Ling-lite
res = "bailingmoe"
if res is None:
logger.warning("\n")
@@ -873,9 +861,6 @@ class Model:
for token_id, token_data in added_tokens_decoder.items():
token_id = int(token_id)
token: str = token_data["content"]
if token_id >= vocab_size:
logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
continue
if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
if tokens[token_id] != token.encode("utf-8"):
logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}')
@@ -920,40 +905,6 @@ class Model:
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
special_vocab.add_to_gguf(self.gguf_writer)
def _set_vocab_rwkv_world(self):
assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file()
vocab_size = self.hparams.get("vocab_size", 65536)
tokens: list[bytes] = ['<s>'.encode("utf-8")]
toktypes: list[int] = [gguf.TokenType.CONTROL]
with open(self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8") as f:
lines = f.readlines()
for line in lines:
parts = line.split(' ')
assert len(parts) >= 3
token, token_len = ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1])
token = token.encode("utf-8") if isinstance(token, str) else token
assert isinstance(token, bytes)
assert len(token) == token_len
token_text: str = repr(token)[2:-1] # "b'\xff'" -> "\xff"
tokens.append(token_text.encode("utf-8"))
toktypes.append(gguf.TokenType.NORMAL)
remainder = vocab_size - len(tokens)
assert remainder >= 0
for i in range(len(tokens), vocab_size):
tokens.append(f"[PAD{i}]".encode("utf-8"))
toktypes.append(gguf.TokenType.UNUSED)
self.gguf_writer.add_tokenizer_model("rwkv")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
special_vocab.chat_template = "rwkv-world"
# hack: Add '\n\n' as the EOT token to make it chat normally
special_vocab._set_special_token("eot", 261)
special_vocab.add_to_gguf(self.gguf_writer)
def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int):
tokenizer_path = Path(sys.path[0]) / "models" / f"ggml-vocab-{model_name}.gguf"
logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'")
@@ -1111,6 +1062,13 @@ class BloomModel(Model):
tensors.append((self.map_tensor_name(name), data_torch))
if name == "word_embeddings.weight":
assert self.tensor_names is not None
# TODO: tie them at runtime, don't duplicate in the model file
if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")):
tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
return tensors
@@ -1752,25 +1710,6 @@ class LlamaModel(Model):
raise ValueError(f"Unprocessed experts: {experts}")
@Model.register("Mistral3ForConditionalGeneration")
class Mistral3Model(LlamaModel):
model_arch = gguf.MODEL_ARCH.LLAMA
# we need to merge the text_config into the root level of hparams
def __init__(self, *args, **kwargs):
hparams = kwargs["hparams"] if "hparams" in kwargs else Model.load_hparams(args[0])
if "text_config" in hparams:
hparams = {**hparams, **hparams["text_config"]}
kwargs["hparams"] = hparams
super().__init__(*args, **kwargs)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
name = name.replace("language_model.", "")
if "multi_modal_projector" in name or "vision_tower" in name:
return []
return super().modify_tensors(data_torch, name, bid)
@Model.register("DeciLMForCausalLM")
class DeciModel(Model):
model_arch = gguf.MODEL_ARCH.DECI
@@ -2275,7 +2214,7 @@ class Qwen2Model(Model):
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
@Model.register("Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
@Model.register("Qwen2VLForConditionalGeneration")
class Qwen2VLModel(Model):
model_arch = gguf.MODEL_ARCH.QWEN2VL
@@ -2428,6 +2367,10 @@ class GPT2Model(Model):
tensors.append((new_name, data_torch))
# note: GPT2 output is tied to (same as) wte in original model
if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
return tensors
@@ -2757,26 +2700,21 @@ class CodeShellModel(Model):
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(1.0)
_has_tok_embd = False
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused
output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD)
new_name = self.map_tensor_name(name)
# assuming token_embd.weight is seen before output.weight
if not self._has_tok_embd and new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
# even though the tensor file(s) does not contain the word embeddings they are still in the weight map
if self.tensor_names and "transformer.wte.weight" in self.tensor_names:
logger.debug(f"{tok_embd_name} not found before {output_name}, assuming they are tied")
self.tensor_names.remove("transformer.wte.weight")
elif new_name == tok_embd_name:
self._has_tok_embd = True
tensors: list[tuple[str, Tensor]] = [(new_name, data_torch)]
return [(new_name, data_torch)]
if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
assert self.tensor_names is not None
if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")):
# copy tok_embd.weight to output.weight
tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
return tensors
@Model.register("InternLM2ForCausalLM")
@@ -3384,83 +3322,6 @@ class Gemma2Model(Model):
return [(self.map_tensor_name(name), data_torch)]
@Model.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration")
class Gemma3Model(Model):
model_arch = gguf.MODEL_ARCH.GEMMA3
has_vision: bool = False
# we need to merge the text_config into the root level of hparams
def __init__(self, *args, **kwargs):
hparams = kwargs["hparams"] if "hparams" in kwargs else Model.load_hparams(args[0])
if "text_config" in hparams:
hparams = {**hparams, **hparams["text_config"]}
kwargs["hparams"] = hparams
super().__init__(*args, **kwargs)
if "vision_config" in hparams:
logger.info("Has vision encoder, but it will be ignored")
self.has_vision = True
def write(self):
super().write()
if self.has_vision:
logger.info("NOTE: this script only convert the language model to GGUF")
logger.info(" for the vision model, please use gemma3_convert_encoder_to_gguf.py")
def set_vocab(self):
self._set_vocab_sentencepiece()
self.gguf_writer.add_add_space_prefix(False)
def set_gguf_parameters(self):
hparams = self.hparams
block_count = hparams["num_hidden_layers"]
# some default values are not specified in the hparams
self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 131072))
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
self.gguf_writer.add_block_count(block_count)
self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 8))
self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-6))
self.gguf_writer.add_key_length(hparams.get("head_dim", 256))
self.gguf_writer.add_value_length(hparams.get("head_dim", 256))
self.gguf_writer.add_file_type(self.ftype)
self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1_000_000.0)) # for global layers
# both attn_logit_softcapping and final_logit_softcapping are removed in Gemma3
assert hparams.get("attn_logit_softcapping") is None
assert hparams.get("final_logit_softcapping") is None
self.gguf_writer.add_sliding_window(hparams["sliding_window"])
self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4))
if hparams.get("rope_scaling") is not None:
assert hparams["rope_scaling"]["rope_type"] == "linear"
# important: this rope_scaling is only applied for global layers, and not used by 1B model
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused
if name.startswith("language_model."):
name = name.replace("language_model.", "")
elif name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \
or name.startswith("multimodal_projector.") or name.startswith("vision_model."): # this is for old HF model, should be removed later
# ignore vision tensors
return []
# remove OOV (out-of-vocabulary) rows in token_embd
if "embed_tokens.weight" in name:
vocab = self._create_vocab_sentencepiece()
tokens = vocab[0]
data_torch = data_torch[:len(tokens)]
# ref code in Gemma3RMSNorm
# output = output * (1.0 + self.weight.float())
if name.endswith("norm.weight"):
data_torch = data_torch + 1
return [(self.map_tensor_name(name), data_torch)]
@Model.register("Starcoder2ForCausalLM")
class StarCoder2Model(Model):
model_arch = gguf.MODEL_ARCH.STARCODER2
@@ -3471,7 +3332,38 @@ class Rwkv6Model(Model):
model_arch = gguf.MODEL_ARCH.RWKV6
def set_vocab(self):
self._set_vocab_rwkv_world()
assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file()
vocab_size = self.hparams.get("vocab_size", 65536)
tokens: list[bytes] = ['<s>'.encode("utf-8")]
toktypes: list[int] = [gguf.TokenType.CONTROL]
with open(self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8") as f:
lines = f.readlines()
for line in lines:
parts = line.split(' ')
assert len(parts) >= 3
token, token_len = ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1])
token = token.encode("utf-8") if isinstance(token, str) else token
assert isinstance(token, bytes)
assert len(token) == token_len
token_text: str = repr(token)[2:-1] # "b'\xff'" -> "\xff"
tokens.append(token_text.encode("utf-8"))
toktypes.append(gguf.TokenType.NORMAL)
remainder = vocab_size - len(tokens)
assert remainder >= 0
for i in range(len(tokens), vocab_size):
tokens.append(f"[PAD{i}]".encode("utf-8"))
toktypes.append(gguf.TokenType.UNUSED)
self.gguf_writer.add_tokenizer_model("rwkv")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
special_vocab.chat_template = "rwkv-world"
# hack: Add '\n\n' as the EOT token to make it chat normally
special_vocab._set_special_token("eot", 261)
special_vocab.add_to_gguf(self.gguf_writer)
def set_gguf_parameters(self):
block_count = self.hparams["num_hidden_layers"]
@@ -3557,8 +3449,8 @@ class RWKV6Qwen2Model(Rwkv6Model):
head_size = hidden_size // num_attention_heads
rms_norm_eps = self.hparams["rms_norm_eps"]
intermediate_size = self.hparams["intermediate_size"]
time_mix_extra_dim = self.hparams.get("lora_rank_tokenshift", 64 if hidden_size >= 4096 else 32)
time_decay_extra_dim = self.hparams.get("lora_rank_decay", 128 if hidden_size >= 4096 else 64)
time_mix_extra_dim = 64 if hidden_size >= 4096 else 32
time_decay_extra_dim = 128 if hidden_size >= 4096 else 64
# RWKV isn't context limited
self.gguf_writer.add_context_length(1048576)
@@ -3593,168 +3485,6 @@ class RWKV6Qwen2Model(Rwkv6Model):
yield (new_name, data)
@Model.register("Rwkv7ForCausalLM", "RWKV7ForCausalLM")
class Rwkv7Model(Model):
model_arch = gguf.MODEL_ARCH.RWKV7
def set_vocab(self):
self._set_vocab_rwkv_world()
def calc_lora_rank(self, hidden_size, exponent, multiplier):
return max(1, round(hidden_size ** exponent * multiplier / 32)) * 32
def set_gguf_parameters(self):
block_count = self.hparams["num_hidden_layers"]
try:
head_size = self.hparams["head_size"]
layer_norm_eps = self.hparams["layer_norm_epsilon"]
except KeyError:
head_size = self.hparams["head_dim"]
layer_norm_eps = self.hparams["norm_eps"]
hidden_size = self.hparams["hidden_size"]
intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else (hidden_size * 4)
# ICLR: In-Context-Learning-Rate
try:
lora_rank_decay = self.hparams["lora_rank_decay"] if self.hparams["lora_rank_decay"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8)
lora_rank_iclr = self.hparams["lora_rank_iclr"] if self.hparams["lora_rank_iclr"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8)
lora_rank_value_residual_mix = self.hparams["lora_rank_value_residual_mix"] if self.hparams["lora_rank_value_residual_mix"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.3)
lora_rank_gate = self.hparams["lora_rank_gate"] if self.hparams["lora_rank_gate"] is not None else self.calc_lora_rank(hidden_size, 0.8, 0.6)
except KeyError:
lora_rank_decay = self.hparams["decay_low_rank_dim"] if self.hparams["decay_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8)
lora_rank_iclr = self.hparams["a_low_rank_dim"] if self.hparams["a_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8)
lora_rank_value_residual_mix = self.hparams["v_low_rank_dim"] if self.hparams["v_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.3)
lora_rank_gate = self.hparams["gate_low_rank_dim"] if self.hparams["gate_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.8, 0.6)
# RWKV isn't context limited
self.gguf_writer.add_context_length(1048576)
self.gguf_writer.add_embedding_length(hidden_size)
self.gguf_writer.add_block_count(block_count)
self.gguf_writer.add_layer_norm_eps(layer_norm_eps)
self.gguf_writer.add_wkv_head_size(head_size)
self.gguf_writer.add_decay_lora_rank(lora_rank_decay)
self.gguf_writer.add_iclr_lora_rank(lora_rank_iclr)
self.gguf_writer.add_value_residual_mix_lora_rank(lora_rank_value_residual_mix)
self.gguf_writer.add_gate_lora_rank(lora_rank_gate)
self.gguf_writer.add_feed_forward_length(intermediate_size)
self.gguf_writer.add_file_type(self.ftype)
# required by llama.cpp, unused
self.gguf_writer.add_head_count(0)
lerp_weights: dict[int, dict[str, Tensor]] = {}
lora_needs_transpose: bool = True
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# unify tensor names here to make life easier
name = name.replace("blocks", "layers").replace("ffn", "feed_forward")
name = name.replace("self_attn", "attention").replace("attn", "attention")
name = name.replace("time_mixer.", "")
# lora layer names in fla-hub's impl
if "_lora.lora" in name:
self.lora_needs_transpose = False
name = name.replace("_lora.lora.0.weight", "1.weight")
name = name.replace("_lora.lora.2.weight", "2.weight")
name = name.replace("_lora.lora.2.bias", "0.weight")
name = name.replace("feed_forward_norm", "ln2")
name = name.replace("g_norm", "ln_x")
if "attention.v" in name and "value" not in self.map_tensor_name(name) and bid == 0:
# some models have dummy v0/v1/v2 on first layer while others don't
# ignore them all since they are not used
return
wkv_has_gate = self.hparams.get("wkv_has_gate", True)
lerp_list = ["r", "w", "k", "v", "a", "g"] if wkv_has_gate else ["r", "w", "k", "v", "a"]
if bid is not None and "attention.x_" in name:
if "attention.x_x" in name:
# already concatenated
new_name = f"blk.{bid}.time_mix_lerp_fused.weight"
data = data_torch.reshape(len(lerp_list), 1, 1, -1)
yield (new_name, data)
else:
try:
self.lerp_weights[bid][name] = data_torch
except KeyError:
self.lerp_weights[bid] = {name: data_torch}
if all(f"model.layers.{bid}.attention.x_{i}" in self.lerp_weights[bid].keys() for i in lerp_list):
new_name = f"blk.{bid}.time_mix_lerp_fused.weight"
data = torch.stack([self.lerp_weights[bid][f"model.layers.{bid}.attention.x_{i}"] for i in lerp_list], dim=0)
yield (new_name, data)
return
else:
data_torch = data_torch.squeeze()
new_name = self.map_tensor_name(name)
if not (new_name.endswith(".weight") or new_name.endswith(".bias")):
new_name += ".weight"
if self.lora_needs_transpose and any(
new_name.endswith(t) for t in [
"time_mix_w1.weight", "time_mix_w2.weight",
"time_mix_a1.weight", "time_mix_a2.weight",
"time_mix_v1.weight", "time_mix_v2.weight",
"time_mix_g1.weight", "time_mix_g2.weight",
]
):
data_torch = data_torch.transpose(0, 1)
if 'r_k' in new_name:
data_torch = data_torch.flatten()
if bid == 0 and "time_mix_a" in new_name:
# dummy v0/v1/v2 on first layer
# easist way to make llama happy
yield (new_name.replace("time_mix_a", "time_mix_v"), data_torch)
yield (new_name, data_torch)
@Model.register("RwkvHybridForCausalLM")
class ARwkv7Model(Rwkv7Model):
model_arch = gguf.MODEL_ARCH.ARWKV7
def set_vocab(self):
try:
self._set_vocab_sentencepiece()
except FileNotFoundError:
self._set_vocab_gpt2()
def set_gguf_parameters(self):
block_count = self.hparams["num_hidden_layers"]
hidden_size = self.hparams["hidden_size"]
head_size = self.hparams["head_size"]
rms_norm_eps = self.hparams["rms_norm_eps"]
intermediate_size = self.hparams["intermediate_size"]
wkv_has_gate = self.hparams["wkv_has_gate"]
assert self.hparams["wkv_version"] == 7
# ICLR: In-Context-Learning-Rate
lora_rank_decay = 64
lora_rank_iclr = 64
lora_rank_value_residual_mix = 32
lora_rank_gate = 128 if wkv_has_gate else 0
# RWKV isn't context limited
self.gguf_writer.add_context_length(1048576)
self.gguf_writer.add_embedding_length(hidden_size)
self.gguf_writer.add_block_count(block_count)
self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
self.gguf_writer.add_wkv_head_size(head_size)
self.gguf_writer.add_decay_lora_rank(lora_rank_decay)
self.gguf_writer.add_iclr_lora_rank(lora_rank_iclr)
self.gguf_writer.add_value_residual_mix_lora_rank(lora_rank_value_residual_mix)
self.gguf_writer.add_gate_lora_rank(lora_rank_gate)
self.gguf_writer.add_feed_forward_length(intermediate_size)
self.gguf_writer.add_file_type(self.ftype)
self.gguf_writer.add_token_shift_count(1)
# required by llama.cpp, unused
self.gguf_writer.add_head_count(0)
@Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
class MambaModel(Model):
model_arch = gguf.MODEL_ARCH.MAMBA
@@ -3809,6 +3539,8 @@ class MambaModel(Model):
_tok_embd = None
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused
output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD)
@@ -3818,10 +3550,6 @@ class MambaModel(Model):
logger.debug("A_log --> A ==> " + new_name)
data_torch = -torch.exp(data_torch)
# [4 1 8192 1] -> [4 8192 1 1]
if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid):
data_torch = data_torch.squeeze()
# assuming token_embd.weight is seen before output.weight
if self._tok_embd is not None and new_name == output_name:
if torch.equal(self._tok_embd, data_torch):
@@ -4425,29 +4153,6 @@ class DeepseekV2Model(Model):
raise ValueError(f"Unprocessed experts: {experts}")
@Model.register("PLMForCausalLM")
class PLMModel(Model):
model_arch = gguf.MODEL_ARCH.PLM
def set_vocab(self):
self._set_vocab_gpt2()
def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
self.gguf_writer.add_value_length(hparams["v_head_dim"])
self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
return [(self.map_tensor_name(name), data_torch)]
def prepare_tensors(self):
super().prepare_tensors()
@Model.register("T5WithLMHeadModel")
@Model.register("T5ForConditionalGeneration")
@Model.register("MT5ForConditionalGeneration")
@@ -5136,108 +4841,6 @@ class GraniteMoeModel(GraniteModel):
return super().modify_tensors(data_torch, name, bid)
@Model.register("BailingMoeForCausalLM")
class BailingMoeModel(Model):
model_arch = gguf.MODEL_ARCH.BAILINGMOE
def set_vocab(self):
self._set_vocab_gpt2()
def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
if hparams.get("head_dim"):
rope_dim = hparams["head_dim"]
else:
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
self.gguf_writer.add_rope_dimension_count(rope_dim)
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
self.gguf_writer.add_expert_weights_scale(1.0)
self.gguf_writer.add_expert_count(hparams["num_experts"])
self.gguf_writer.add_expert_shared_count(hparams["num_shared_experts"])
self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
_experts: list[dict[str, Tensor]] | None = None
@staticmethod
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
if n_head_kv is not None and n_head != n_head_kv:
n_head = n_head_kv
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
.swapaxes(1, 2)
.reshape(weights.shape))
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
n_head = self.hparams["num_attention_heads"]
n_kv_head = self.hparams.get("num_key_value_heads")
n_embd = self.hparams["hidden_size"]
head_dim = self.hparams.get("head_dim", n_embd // n_head)
output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
if name.endswith("attention.dense.weight"):
return [(self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid), data_torch)]
elif name.endswith("query_key_value.weight"):
q, k, v = data_torch.split([n_head * head_dim, n_kv_head * head_dim, n_kv_head * head_dim], dim=-2)
return [
(self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), BailingMoeModel.permute(q, n_head, n_head)),
(self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), BailingMoeModel.permute(k, n_head, n_kv_head)),
(self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), v)
]
elif name.find("mlp.experts") != -1:
n_experts = self.hparams["num_experts"]
assert bid is not None
tensors: list[tuple[str, Tensor]] = []
if self._experts is None:
self._experts = [{} for _ in range(self.block_count)]
self._experts[bid][name] = data_torch
if len(self._experts[bid]) >= n_experts * 3:
# merge the experts into a single 3d tensor
for w_name in ["down_proj", "gate_proj", "up_proj"]:
datas: list[Tensor] = []
for xid in range(n_experts):
ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
datas.append(self._experts[bid][ename])
del self._experts[bid][ename]
data_torch = torch.stack(datas, dim=0)
merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
new_name = self.map_tensor_name(merged_name)
tensors.append((new_name, data_torch))
return tensors
new_name = self.map_tensor_name(name)
if new_name == output_name and self.hparams.get("norm_head"):
data_torch = data_torch.float()
data_torch /= torch.norm(data_torch, p=2, dim=0, keepdim=True) + 1e-7
return [(new_name, data_torch)]
def prepare_tensors(self):
super().prepare_tensors()
if self._experts is not None:
# flatten `list[dict[str, Tensor]]` into `list[str]`
experts = [k for d in self._experts for k in d.keys()]
if len(experts) > 0:
raise ValueError(f"Unprocessed experts: {experts}")
@Model.register("ChameleonForConditionalGeneration")
@Model.register("ChameleonForCausalLM") # obsolete
class ChameleonModel(Model):
@@ -5491,7 +5094,7 @@ def main() -> None:
logger.error(f"Model {model_architecture} is not supported")
sys.exit(1)
model_instance = model_class(dir_model, output_type, fname_out,
model_instance = model_class(dir_model=dir_model, ftype=output_type, fname_out=fname_out,
is_big_endian=args.bigendian, use_temp_file=args.use_temp_file,
eager=args.no_lazy,
metadata_override=args.metadata, model_name=args.model_name,

View File

@@ -110,9 +110,6 @@ models = [
{"name": "deepseek-v3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"},
{"name": "deepseek-r1-qwen", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"},
{"name": "gpt-4o", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Xenova/gpt-4o", },
{"name": "superbpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k", },
{"name": "trillion", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/trillionlabs/Trillion-7B-preview", },
{"name": "bailingmoe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-lite", },
]

View File

@@ -20,7 +20,7 @@
**oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include:
- **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers.
- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. Intel oneMKL, oneMath and oneDNN)*.
- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL and oneDNN)*.
- **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs.
- **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.
@@ -227,19 +227,30 @@ Upon a successful installation, SYCL is enabled for the available intel devices,
**oneAPI Plugin**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). User should also make sure the plugin version matches the installed base toolkit one *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup.
**oneDNN**: The current oneDNN releases *(shipped with the oneAPI base-toolkit)* do not include the NVIDIA backend. Therefore, oneDNN must be compiled from source to enable the NVIDIA target:
**oneMKL for cuBlas**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* do not contain the cuBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *cuBLAS* backend enabled is thus required to run it on Nvidia GPUs.
```sh
git clone https://github.com/oneapi-src/oneDNN.git
cd oneDNN
cmake -GNinja -Bbuild-nvidia -DDNNL_CPU_RUNTIME=DPCPP -DDNNL_GPU_RUNTIME=DPCPP -DDNNL_GPU_VENDOR=NVIDIA -DONEDNN_BUILD_GRAPH=OFF -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
cmake --build build-nvidia --config Release
git clone https://github.com/oneapi-src/oneMKL
cd oneMKL
cmake -B buildWithCublas -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_CUBLAS_BACKEND=ON -DTARGET_DOMAINS=blas
cmake --build buildWithCublas --config Release
```
- **Adding support to AMD GPUs**
**oneAPI Plugin**: In order to enable SYCL support on AMD GPUs, please install the [Codeplay oneAPI Plugin for AMD GPUs](https://developer.codeplay.com/products/oneapi/amd/download). As with Nvidia GPUs, the user should also make sure the plugin version matches the installed base toolkit.
**oneMKL for rocBlas**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* doesn't contain the rocBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *rocBLAS* backend enabled is thus required to run it on AMD GPUs.
```sh
git clone https://github.com/oneapi-src/oneMKL
cd oneMKL
# Find your HIPTARGET with rocminfo, under the key 'Name:'
cmake -B buildWithrocBLAS -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_ROCBLAS_BACKEND=ON -DHIPTARGETS=${HIPTARGET} -DTARGET_DOMAINS=blas
cmake --build buildWithrocBLAS --config Release
```
3. **Verify installation and environment**
In order to check the available SYCL devices on the machine, please use the `sycl-ls` command.
@@ -304,19 +315,22 @@ cmake --build build --config Release -j -v
#### Nvidia GPU
The SYCL backend depends on [oneMath](https://github.com/uxlfoundation/oneMath) for Nvidia and AMD devices.
By default it is automatically built along with the project. A specific build can be provided by setting the CMake flag `-DoneMath_DIR=/path/to/oneMath/install/lib/cmake/oneMath`.
```sh
# Export relevant ENV variables
export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LD_LIBRARY_PATH
export LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LIBRARY_PATH
export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithCublas/include:$CPLUS_INCLUDE_DIR
export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
# Build LLAMA with Nvidia BLAS acceleration through SYCL
# Setting GGML_SYCL_DEVICE_ARCH is optional but can improve performance
GGML_SYCL_DEVICE_ARCH=sm_80 # Example architecture
# Option 1: Use FP32 (recommended for better performance in most cases)
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DDNNL_DIR=/path/to/oneDNN/build-nvidia/install/lib/cmake/dnnl
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
# Option 2: Use FP16
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DDNNL_DIR=/path/to/oneDNN/build-nvidia/install/lib/cmake/dnnl
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
# build all binary
cmake --build build --config Release -j -v
@@ -324,10 +338,12 @@ cmake --build build --config Release -j -v
#### AMD GPU
The SYCL backend depends on [oneMath](https://github.com/uxlfoundation/oneMath) for Nvidia and AMD devices.
By default it is automatically built along with the project. A specific build can be provided by setting the CMake flag `-DoneMath_DIR=/path/to/oneMath/install/lib/cmake/oneMath`.
```sh
# Export relevant ENV variables
export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LD_LIBRARY_PATH
export LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LIBRARY_PATH
export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithrocBLAS/include:$CPLUS_INCLUDE_DIR
# Build LLAMA with rocBLAS acceleration through SYCL
## AMD
@@ -644,9 +660,8 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|--------------------|---------------------------------------|---------------------------------------------|
| GGML_SYCL | ON (mandatory) | Enable build with SYCL code path.<br>FP32 path - recommended for better perforemance than FP16 on quantized model|
| GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA \| AMD | Set the SYCL target device type. |
| GGML_SYCL_DEVICE_ARCH | Optional (except for AMD) | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. |
| GGML_SYCL_DEVICE_ARCH | Optional (except for AMD) | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. |
| GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. |
| GGML_SYCL_GRAPH | ON *(default)* \|OFF *(Optional)* | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). |
| CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. |
| CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |
@@ -656,7 +671,6 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
| GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG |
| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features based on Intel GPU type, to compare the performance increase |
| GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because graph performance isn't yet better than non-graph performance. |
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |

View File

@@ -132,14 +132,12 @@ You may find the official downloads here: [NVIDIA developer site](https://develo
#### Compile and run inside a Fedora Toolbox Container
We also have a [guide](./backend/CUDA-FEDORA.md) for setting up CUDA toolkit in a Fedora [toolbox container](https://containertoolbx.org/).
We also have a [guide](./cuda-fedora.md) for setting up CUDA toolkit in a Fedora [toolbox container](https://containertoolbx.org/).
**Recommended for:**
- ***Necessary*** for users of [Atomic Desktops for Fedora](https://fedoraproject.org/atomic-desktops/); such as: [Silverblue](https://fedoraproject.org/atomic-desktops/silverblue/) and [Kinoite](https://fedoraproject.org/atomic-desktops/kinoite/).
- (there are no supported CUDA packages for these systems)
- ***Necessary*** for users that have a host that is not a: [Supported Nvidia CUDA Release Platform](https://developer.nvidia.com/cuda-downloads).
- (for example, you may have [Fedora 42 Beta](https://fedoramagazine.org/announcing-fedora-linux-42-beta/) as your your host operating system)
- ***Convenient*** For those running [Fedora Workstation](https://fedoraproject.org/workstation/) or [Fedora KDE Plasma Desktop](https://fedoraproject.org/spins/kde), and want to keep their host system clean.
- ***Particularly*** *convenient* for users of [Atomic Desktops for Fedora](https://fedoraproject.org/atomic-desktops/); such as: [Silverblue](https://fedoraproject.org/atomic-desktops/silverblue/) and [Kinoite](https://fedoraproject.org/atomic-desktops/kinoite/).
- Toolbox is installed by default: [Fedora Workstation](https://fedoraproject.org/workstation/) or [Fedora KDE Plasma Desktop](https://fedoraproject.org/spins/kde).
- *Optionally* toolbox packages are available: [Arch Linux](https://archlinux.org/), [Red Hat Enterprise Linux >= 8.5](https://www.redhat.com/en/technologies/linux-platforms/enterprise-linux), or [Ubuntu](https://ubuntu.com/download)
@@ -191,7 +189,7 @@ The following compilation options are also available to tweak performance:
| Option | Legal values | Default | Description |
|-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, CDNA and RDNA3+). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. |
| GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. |
| GGML_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models |
| GGML_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
| GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
@@ -199,54 +197,29 @@ The following compilation options are also available to tweak performance:
## MUSA
This provides GPU acceleration using a Moore Threads GPU. Make sure to have the [MUSA SDK](https://developer.mthreads.com/musa/musa-sdk) installed.
This provides GPU acceleration using the MUSA cores of your Moore Threads MTT GPU. Make sure to have the MUSA SDK installed. You can download it from here: [MUSA SDK](https://developer.mthreads.com/sdk/download/musa).
#### Download directly from Moore Threads
- Using `CMake`:
You may find the official downloads here: [Moore Threads developer site](https://developer.mthreads.com/sdk/download/musa).
### Compilation
```bash
cmake -B build -DGGML_MUSA=ON
cmake --build build --config Release
```
#### Override Compute Capability Specifications
By default, all supported compute capabilities are enabled. To customize this behavior, you can specify the `MUSA_ARCHITECTURES` option in the CMake command:
```bash
cmake -B build -DGGML_MUSA=ON -DMUSA_ARCHITECTURES="21"
cmake --build build --config Release
```
This configuration enables only compute capability `2.1` (MTT S80) during compilation, which can help reduce compilation time.
#### Compilation options
Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet.
- For static builds, add `-DBUILD_SHARED_LIBS=OFF` and `-DCMAKE_POSITION_INDEPENDENT_CODE=ON`:
```bash
cmake -B build -DGGML_MUSA=ON
cmake --build build --config Release
```
For static build:
```bash
cmake -B build -DGGML_MUSA=ON \
-DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
cmake --build build --config Release
```
### Runtime MUSA environmental variables
You may set the [musa environmental variables](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) at runtime.
```bash
# Use `MUSA_VISIBLE_DEVICES` to hide the first compute device.
MUSA_VISIBLE_DEVICES="-0" ./build/bin/llama-server --model /srv/models/llama.gguf
```
### Unified Memory
The environment variable [`MUSA_VISIBLE_DEVICES`](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) can be used to specify which GPU(s) will be used.
The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted.
Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet.
## HIP
This provides GPU acceleration on HIP-supported AMD GPUs.
@@ -436,26 +409,6 @@ llama_new_context_with_model: CANN compute buffer size = 1260.81 MiB
For detailed info, such as model/device supports, CANN install, please refer to [llama.cpp for CANN](./backend/CANN.md).
## Arm® KleidiAI™
KleidiAI is a library of optimized microkernels for AI workloads, specifically designed for Arm CPUs. These microkernels enhance performance and can be enabled for use by the CPU backend.
To enable KleidiAI, go to the llama.cpp directory and build using CMake
```bash
cmake -B build -DGGML_CPU_KLEIDIAI=ON
cmake --build build --config Release
```
You can verify that KleidiAI is being used by running
```bash
./build/bin/llama-cli -m PATH_TO_MODEL -p "What is a car?"
```
If KleidiAI is enabled, the ouput will contain a line similar to:
```
load_tensors: CPU_KLEIDIAI model buffer size = 3474.00 MiB
```
KleidiAI's microkernels implement optimized tensor operations using Arm CPU features such as dotprod, int8mm and SME. llama.cpp selects the most efficient kernel based on runtime CPU feature detection. However, on platforms that support SME, you must manually enable SME microkernels by setting the environment variable `GGML_KLEIDIAI_SME=1`.
Depending on your build target, other higher priority backends may be enabled by default. To ensure the CPU backend is used, you must disable the higher priority backends either at compile time, e.g. -DGGML_METAL=OFF, or during run-time using the command line option `--device none`.
## Android
To read documentation for how to build on Android, [click here](./android.md)

View File

@@ -14,7 +14,9 @@ In this guide we setup [Nvidia CUDA](https://docs.nvidia.com/cuda/) in a toolbox
- [Creating a Fedora Toolbox Environment](#creating-a-fedora-toolbox-environment)
- [Installing Essential Development Tools](#installing-essential-development-tools)
- [Adding the CUDA Repository](#adding-the-cuda-repository)
- [Installing Nvidia Driver Libraries](#installing-nvidia-driver-libraries)
- [Installing `nvidia-driver-libs`](#installing-nvidia-driver-libs)
- [Manually Resolving Package Conflicts](#manually-resolving-package-conflicts)
- [Finalizing the Installation of `nvidia-driver-libs`](#finalizing-the-installation-of-nvidia-driver-libs)
- [Installing the CUDA Meta-Package](#installing-the-cuda-meta-package)
- [Configuring the Environment](#configuring-the-environment)
- [Verifying the Installation](#verifying-the-installation)
@@ -65,7 +67,7 @@ This guide focuses on Fedora hosts, but with small adjustments, it can work for
sudo dnf distro-sync
```
2. **Install **Vim** the default text editor (Optional):**
2. **Install the Default Text Editor (Optional):**
```bash
sudo dnf install vim-default-editor --allowerasing
@@ -95,48 +97,36 @@ After adding the repository, synchronize the package manager again:
sudo dnf distro-sync
```
## Installing Nvidia Driver Libraries
## Installing `nvidia-driver-libs` and `nvidia-driver-cuda-libs`
First, we need to detect if the host is supplying the [NVIDIA driver libraries into the toolbox](https://github.com/containers/toolbox/blob/main/src/pkg/nvidia/nvidia.go):
We need to detect if the host is supplying the [NVIDIA driver libraries into the toolbox](https://github.com/containers/toolbox/blob/main/src/pkg/nvidia/nvidia.go).
```bash
ls -la /usr/lib64/libcuda.so.1
```
### If *`libcuda.so.1`* is missing:
```
ls: cannot access '/usr/lib64/libcuda.so.1': No such file or directory
```
**Explanation:**
The host dose not supply the CUDA drivers, **install them now:**
#### Install the Nvidia Driver Libraries on Guest:
- `nvidia-driver-libs` and `nvidia-driver-cuda-libs` contains necessary NVIDIA driver libraries required by CUDA,
on hosts with NVIDIA drivers installed the Fedora Container will supply the host libraries.
### Install Nvidia Driver Libraries on Guest (if `libcuda.so.1` was NOT found).
```bash
sudo dnf install nvidia-driver-cuda nvidia-driver-libs nvidia-driver-cuda-libs nvidia-persistenced
sudo dnf install nvidia-driver-libs nvidia-driver-cuda-libs
```
### If *`libcuda.so.1`* exists:
```
lrwxrwxrwx. 1 root root 21 Mar 24 11:26 /usr/lib64/libcuda.so.1 -> libcuda.so.570.133.07
```
### Manually Updating the RPM database for host-supplied NVIDIA drivers (if `libcuda.so.1` was found).
**Explanation:**
The host is supply the CUDA drivers, **we need to update the guest RPM Database accordingly:**
If the installation fails due to conflicts, we'll manually download and install the required packages, excluding conflicting files.
#### Update the Toolbox RPM Database to include the Host-Supplied Libraries:
Note: we do not actually install the libraries, we just update the DB so that the guest system knows they are supplied by the host.
##### 1. Download `nvidia-` parts that are supplied by the host RPM's (with dependencies)
#### 1. Download `nvidia-driver-libs` and `nvidia-driver-cuda-libs` RPM's (with dependencies)
```bash
sudo dnf download --destdir=/tmp/nvidia-driver-libs --resolve --arch x86_64 nvidia-driver-cuda nvidia-driver-libs nvidia-driver-cuda-libs nvidia-persistenced
sudo dnf download --destdir=/tmp/nvidia-driver-libs --resolve --arch x86_64 nvidia-driver-libs nvidia-driver-cuda-libs
```
##### 2. Update the RPM database to assume the installation of these packages.
#### 2. Update the RPM database to assume the installation of these packages.
```bash
sudo rpm --install --verbose --hash --justdb /tmp/nvidia-driver-libs/*
@@ -144,26 +134,23 @@ sudo rpm --install --verbose --hash --justdb /tmp/nvidia-driver-libs/*
**Note:**
- The `--justdb` option only updates the RPM database, without touching the filesystem elsewhere.
- The `--justdb` option only updates the RPM database, without touching the filesystem.
##### Check that the RPM Database has been correctly updated:
**Note:** This is the same command as in the *"Install the Nvidia Driver Libraries on Guest"* for if *`libcuda.so.1`* was missing.
#### Finalizing the Installation of `nvidia-driver-libs` and `nvidia-driver-cuda-libs`
After manually installing the dependencies, run:
```bash
sudo dnf install nvidia-driver-cuda nvidia-driver-libs nvidia-driver-cuda-libs nvidia-persistenced
sudo dnf install nvidia-driver-libs nvidia-driver-cuda-libs
```
*(this time it will not install anything, as the database things that these packages are already installed)*
You should receive a message indicating the package is already installed:
```
Updating and loading repositories:
Repositories loaded.
Package "nvidia-driver-cuda-3:570.124.06-1.fc41.x86_64" is already installed.
Package "nvidia-driver-libs-3:570.124.06-1.fc41.x86_64" is already installed.
Package "nvidia-driver-cuda-libs-3:570.124.06-1.fc41.x86_64" is already installed.
Package "nvidia-persistenced-3:570.124.06-1.fc41.x86_64" is already installed.
Package "nvidia-driver-libs-3:570.86.10-1.fc41.x86_64" is already installed.
Package "nvidia-driver-cuda-libs-3:570.86.10-1.fc41.x86_64" is already installed.
Nothing to do.
```
@@ -220,9 +207,9 @@ You should see output similar to:
```
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2025 NVIDIA Corporation
Built on Fri_Feb_21_20:23:50_PST_2025
Cuda compilation tools, release 12.8, V12.8.93
Build cuda_12.8.r12.8/compiler.35583870_0
Built on Wed_Jan_15_19:20:09_PST_2025
Cuda compilation tools, release 12.8, V12.8.61
Build cuda_12.8.r12.8/compiler.35404655_0
```
This output confirms that the CUDA compiler is accessible and indicates the installed version.

View File

@@ -9,13 +9,6 @@ brew install llama.cpp
```
The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggml-org/llama.cpp/discussions/7668
## MacPorts
```sh
sudo port install llama.cpp
```
see also: https://ports.macports.org/port/llama.cpp/details/
## Nix
On Mac and Linux, the Nix package manager can be used via

View File

@@ -132,7 +132,7 @@ int main(int argc, char ** argv) {
const auto t_pp_start = ggml_time_us();
llama_kv_self_clear(ctx);
llama_kv_cache_clear(ctx);
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
LOG_ERR("%s: llama_decode() failed\n", __func__);
@@ -141,7 +141,7 @@ int main(int argc, char ** argv) {
if (is_pp_shared) {
for (int32_t i = 1; i < pl; ++i) {
llama_kv_self_seq_cp(ctx, 0, i, -1, -1);
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
}
}

View File

@@ -116,7 +116,7 @@ if llama_decode(context, batch) != 0 {
}
for i in 1 ..< n_parallel {
llama_kv_self_seq_cp(context, 0, Int32(i), 0, batch.n_tokens)
llama_kv_cache_seq_cp(context, 0, Int32(i), 0, batch.n_tokens)
}
if n_parallel > 1 {

View File

@@ -342,7 +342,7 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
}
static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
llama_kv_self_clear(ctx);
llama_kv_cache_clear(ctx);
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return false;
@@ -394,8 +394,6 @@ static int prepare_entries(common_params & params, train_context & ctx_train) {
int main(int argc, char ** argv) {
common_params params;
params.out_file = "control_vector.gguf";
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
return 1;
}
@@ -500,7 +498,7 @@ int main(int argc, char ** argv) {
}
// write output vectors to gguf
export_gguf(ctx_train.v_final, params.out_file, model_hint);
export_gguf(ctx_train.v_final, params.cvector_outfile, model_hint);
llama_backend_free();

View File

@@ -38,7 +38,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
const struct llama_model * model = llama_get_model(ctx);
// clear previous kv_cache values (irrelevant for embeddings)
llama_kv_self_clear(ctx);
llama_kv_cache_clear(ctx);
// run model
LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);

View File

@@ -413,22 +413,20 @@ static void print_usage(int, char ** argv) {
int main(int argc, char ** argv) {
common_params params;
params.out_file = "ggml-lora-merged-f16.gguf";
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) {
return 1;
}
g_verbose = (params.verbosity > 1);
try {
lora_merge_ctx ctx(params.model, params.lora_adapters, params.out_file, params.cpuparams.n_threads);
lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads);
ctx.run_merge();
} catch (const std::exception & err) {
fprintf(stderr, "%s\n", err.what());
exit(EXIT_FAILURE);
}
printf("done, output file is %s\n", params.out_file.c_str());
printf("done, output file is %s\n", params.lora_outfile.c_str());
return 0;
}

View File

@@ -45,7 +45,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
}
// clear previous kv_cache values (irrelevant for embeddings)
llama_kv_self_clear(ctx);
llama_kv_cache_clear(ctx);
llama_set_embeddings(ctx, true);
llama_set_causal_attn(ctx, false);
@@ -102,7 +102,7 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std
llama_token eos_token = llama_vocab_eos(vocab);
llama_kv_self_clear(ctx);
llama_kv_cache_clear(ctx);
llama_set_embeddings(ctx, false);
llama_set_causal_attn(ctx, true);

View File

@@ -206,6 +206,9 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
void IMatrixCollector::save_imatrix(int ncall) const {
auto fname = m_params.out_file;
if (fname.empty()) {
fname = "imatrix.dat";
}
if (ncall > 0) {
fname += ".at_";
@@ -495,7 +498,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
const auto t_start = std::chrono::high_resolution_clock::now();
// clear the KV cache
llama_kv_self_clear(ctx);
llama_kv_cache_clear(ctx);
llama_batch batch = llama_batch_init(n_batch, 0, 1);
@@ -580,8 +583,6 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
int main(int argc, char ** argv) {
common_params params;
params.out_file = "imatrix.dat" ;
params.n_ctx = 512;
params.logits_all = true;
params.escape = false;

View File

@@ -332,8 +332,8 @@ int main(int argc, char ** argv) {
LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
n_past, n_left, n_ctx, params.n_keep, n_discard);
llama_kv_self_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
llama_kv_self_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
llama_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
n_past -= n_discard;

View File

@@ -1578,7 +1578,7 @@ int main(int argc, char ** argv) {
test t(inst, lmodel, ctx);
llama_kv_self_clear(ctx);
llama_kv_cache_clear(ctx);
// cool off before the test
if (params.delay) {
@@ -1618,7 +1618,7 @@ int main(int argc, char ** argv) {
}
for (int i = 0; i < params.reps; i++) {
llama_kv_self_clear(ctx);
llama_kv_cache_clear(ctx);
uint64_t t_start = get_time_ns();

View File

@@ -194,7 +194,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
}
batch->logits[batch->n_tokens - 1] = true;
llama_kv_self_clear(context);
llama_kv_cache_clear(context);
const auto t_pp_start = ggml_time_us();
if (llama_decode(context, *batch) != 0) {
@@ -206,7 +206,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
LOGi("Benchmark text generation (tg)");
llama_kv_self_clear(context);
llama_kv_cache_clear(context);
const auto t_tg_start = ggml_time_us();
for (i = 0; i < tg; i++) {
@@ -223,7 +223,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
const auto t_tg_end = ggml_time_us();
llama_kv_self_clear(context);
llama_kv_cache_clear(context);
const auto t_pp = double(t_pp_end - t_pp_start) / 1000000.0;
const auto t_tg = double(t_tg_end - t_tg_start) / 1000000.0;
@@ -448,5 +448,5 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
extern "C"
JNIEXPORT void JNICALL
Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
llama_kv_self_clear(reinterpret_cast<llama_context *>(context));
llama_kv_cache_clear(reinterpret_cast<llama_context *>(context));
}

View File

@@ -16,7 +16,7 @@ Open `llama.swiftui.xcodeproj` project in Xcode and you should be able to build
a simulator or a real device.
To use the framework with a different project, the XCFramework can be added to the project by
adding `build-apple/llama.xcframework` by dragging and dropping it into the project navigator, or
adding `build-ios/llama.xcframework` by dragging and dropping it into the project navigator, or
by manually selecting the framework in the "Frameworks, Libraries, and Embedded Content" section
of the project settings.

View File

@@ -210,7 +210,7 @@ actor LlamaContext {
}
batch.logits[Int(batch.n_tokens) - 1] = 1 // true
llama_kv_self_clear(context)
llama_kv_cache_clear(context)
let t_pp_start = DispatchTime.now().uptimeNanoseconds / 1000;
@@ -223,7 +223,7 @@ actor LlamaContext {
// bench text generation
llama_kv_self_clear(context)
llama_kv_cache_clear(context)
let t_tg_start = DispatchTime.now().uptimeNanoseconds / 1000;
@@ -242,7 +242,7 @@ actor LlamaContext {
let t_tg_end = DispatchTime.now().uptimeNanoseconds / 1000;
llama_kv_self_clear(context)
llama_kv_cache_clear(context)
let t_pp = Double(t_pp_end - t_pp_start) / 1000000.0
let t_tg = Double(t_tg_end - t_tg_start) / 1000000.0
@@ -292,7 +292,7 @@ actor LlamaContext {
func clear() {
tokens_list.removeAll()
temporary_invalid_cchars.removeAll()
llama_kv_self_clear(context)
llama_kv_cache_clear(context)
}
private func tokenize(text: String, add_bos: Bool) -> [llama_token] {

View File

@@ -51,13 +51,6 @@ install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
set(TARGET llama-gemma3-cli)
add_executable(${TARGET} gemma3-cli.cpp)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-gemma3-cli)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
set(TARGET llama-llava-clip-quantize-cli)
add_executable(${TARGET} clip-quantize-cli.cpp)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-clip-quantize-cli)

View File

@@ -1,30 +0,0 @@
# Gemma 3 vision
> [!IMPORTANT]
>
> This is very experimental, only used for demo purpose.
## How to get mmproj.gguf?
```bash
cd gemma-3-4b-it
python ../llama.cpp/examples/llava/gemma3_convert_encoder_to_gguf.py .
# output file is mmproj.gguf
```
## How to run it?
What you need:
- The text model GGUF, can be converted using `convert_hf_to_gguf.py`
- The mmproj file from step above
- An image file
```bash
# build
cmake -B build
cmake --build build --target llama-gemma3-cli
# run it
./build/bin/llama-gemma3-cli -m {text_model}.gguf --mmproj mmproj.gguf --image your_image.jpg
```

View File

@@ -5,25 +5,13 @@ Currently, this readme only supports minicpm-omni's image capabilities, and we w
Download [MiniCPM-o-2_6](https://huggingface.co/openbmb/MiniCPM-o-2_6) PyTorch model from huggingface to "MiniCPM-o-2_6" folder.
### Build llama.cpp
Readme modification time: 20250206
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
Clone llama.cpp:
```bash
git clone https://github.com/ggerganov/llama.cpp
git clone git@github.com:OpenBMB/llama.cpp.git
cd llama.cpp
git checkout minicpm-omni
```
Build llama.cpp using `CMake`:
```bash
cmake -B build
cmake --build build --config Release
```
### Usage of MiniCPM-o 2.6
Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf) by us)
@@ -34,15 +22,25 @@ python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-
python ./convert_hf_to_gguf.py ../MiniCPM-o-2_6/model
# quantize int4 version
./build/bin/llama-quantize ../MiniCPM-o-2_6/model/ggml-model-f16.gguf ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M
./llama-quantize ../MiniCPM-o-2_6/model/ggml-model-f16.gguf ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M
```
Build llama.cpp using `CMake`:
https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md
```bash
cmake -B build
cmake --build build --config Release
```
Inference on Linux or Mac
```bash
```
# run f16 version
./build/bin/llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
./llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# run quantized int4 version
./build/bin/llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
./llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# or run in interactive mode
./llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
```

View File

@@ -4,26 +4,13 @@
Download [MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5) PyTorch model from huggingface to "MiniCPM-Llama3-V-2_5" folder.
### Build llama.cpp
Readme modification time: 20250206
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
Clone llama.cpp:
```bash
git clone https://github.com/ggml-org/llama.cpp
cd llama.cpp
```
Build llama.cpp using `CMake`:
```bash
cmake -B build
cmake --build build --config Release
```
### Usage of MiniCPM-Llama3-V 2.5
### Usage
Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) by us)
@@ -33,15 +20,80 @@ python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-
python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model
# quantize int4 version
./build/bin/llama-quantize ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf Q4_K_M
./llama-quantize ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf Q4_K_M
```
Build for Linux or Mac
```bash
make
make llama-minicpmv-cli
```
Inference on Linux or Mac
```bash
```
# run f16 version
./build/bin/llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# run quantized int4 version
./build/bin/llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# or run in interactive mode
./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
```
### Android
#### Build on Android device using Termux
We found that build on Android device would bring better runtime performance, so we recommend to build on device.
[Termux](https://github.com/termux/termux-app#installation) is a terminal app on Android device (no root required).
Install tools in Termux:
```
apt update && apt upgrade -y
apt install git make cmake
```
It's recommended to move your model inside the `~/` directory for best performance:
```
cd storage/downloads
mv model.gguf ~/
```
#### Building the Project using Android NDK
Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux:
```bash
mkdir build-android
cd build-android
export NDK=/your_ndk_path
cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
make
```
Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice).
Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission:
(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`)
```
$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/
$cd /data/data/com.termux/files/home/bin
$chmod +x ./*
```
Download models and push them to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/`
```
$mv /sdcard/llama.cpp/ggml-model-Q4_K_M.gguf /data/data/com.termux/files/home/model/
$mv /sdcard/llama.cpp/mmproj-model-f16.gguf /data/data/com.termux/files/home/model/
```
Now, you can start chatting:
```
$cd /data/data/com.termux/files/home/bin
$./llama-minicpmv-cli -m ../model/ggml-model-Q4_K_M.gguf --mmproj ../model/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
```

View File

@@ -4,25 +4,13 @@
Download [MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) PyTorch model from huggingface to "MiniCPM-V-2_6" folder.
### Build llama.cpp
Readme modification time: 20250206
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
Clone llama.cpp:
```bash
git clone https://github.com/ggerganov/llama.cpp
git clone git@github.com:OpenBMB/llama.cpp.git
cd llama.cpp
git checkout minicpmv-main
```
Build llama.cpp using `CMake`:
```bash
cmake -B build
cmake --build build --config Release
```
### Usage of MiniCPM-V 2.6
Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) by us)
@@ -33,15 +21,87 @@ python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-
python ./convert_hf_to_gguf.py ../MiniCPM-V-2_6/model
# quantize int4 version
./build/bin/llama-quantize ../MiniCPM-V-2_6/model/ggml-model-f16.gguf ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M
./llama-quantize ../MiniCPM-V-2_6/model/ggml-model-f16.gguf ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M
```
Build for Linux or Mac
```bash
make
make llama-minicpmv-cli
```
Inference on Linux or Mac
```bash
```
# run f16 version
./build/bin/llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# run quantized int4 version
./build/bin/llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# or run in interactive mode
./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
```
### Video
Install FFmpeg
```
brew install ffmpeg
brew install pkg-config
```
### Android
#### Build on Android device using Termux
We found that build on Android device would bring better runtime performance, so we recommend to build on device.
[Termux](https://github.com/termux/termux-app#installation) is a terminal app on Android device (no root required).
Install tools in Termux:
```
apt update && apt upgrade -y
apt install git make cmake
```
It's recommended to move your model inside the `~/` directory for best performance:
```
cd storage/downloads
mv model.gguf ~/
```
#### Building the Project using Android NDK
Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux:
```bash
mkdir build-android
cd build-android
export NDK=/your_ndk_path
cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
make
```
Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice).
Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission:
(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`)
```
$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/
$cd /data/data/com.termux/files/home/bin
$chmod +x ./*
```
Download models and push them to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/`
```
$mv /sdcard/llama.cpp/ggml-model-Q4_K_M.gguf /data/data/com.termux/files/home/model/
$mv /sdcard/llama.cpp/mmproj-model-f16.gguf /data/data/com.termux/files/home/model/
```
Now, you can start chatting:
```
$cd /data/data/com.termux/files/home/bin
$./llama-minicpmv-cli -m ../model/ggml-model-Q4_K_M.gguf --mmproj ../model/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
```

View File

@@ -4,12 +4,31 @@
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
#include "clip.h"
#include "ggml.h"
#include "ggml-cpp.h"
#include "ggml-cpu.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "gguf.h"
//#ifdef GGML_USE_CUDA
//#include "ggml-cuda.h"
//#endif
//
//#ifdef GGML_USE_SYCL
//#include "ggml-sycl.h"
//#endif
//
//#ifdef GGML_USE_METAL
//#include "ggml-metal.h"
//#endif
//
//#ifdef GGML_USE_CANN
//#include "ggml-cann.h"
//#endif
//
//#ifdef GGML_USE_VULKAN
//#include "ggml-vulkan.h"
//#endif
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
@@ -136,8 +155,6 @@ static std::string format(const char * fmt, ...) {
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
#define TN_IMAGE_NEWLINE "model.image_newline"
#define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3
#define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3
#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
#define TN_MINICPMV_QUERY "resampler.query"
@@ -164,7 +181,6 @@ enum projector_type {
PROJECTOR_TYPE_RESAMPLER,
PROJECTOR_TYPE_GLM_EDGE,
PROJECTOR_TYPE_MERGER,
PROJECTOR_TYPE_GEMMA3,
PROJECTOR_TYPE_UNKNOWN,
};
@@ -175,7 +191,6 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_RESAMPLER, "resampler"},
{ PROJECTOR_TYPE_GLM_EDGE, "adapter"},
{ PROJECTOR_TYPE_MERGER, "qwen2vl_merger"},
{ PROJECTOR_TYPE_GEMMA3, "gemma3"},
};
@@ -302,7 +317,7 @@ static projector_type clip_projector_type_from_string(const std::string & name)
return kv.first;
}
}
throw std::runtime_error(format("Unknown projector type: %s", name.c_str()));
return PROJECTOR_TYPE_UNKNOWN;
}
#ifdef CLIP_DEBUG_FUNCTIONS
@@ -559,10 +574,6 @@ struct clip_vision_model {
struct ggml_tensor * mm_model_ln_kv_b;
struct ggml_tensor * mm_model_ln_post_w;
struct ggml_tensor * mm_model_ln_post_b;
// gemma3
struct ggml_tensor * mm_input_proj_w;
struct ggml_tensor * mm_soft_emb_norm_w;
};
struct clip_ctx {
@@ -577,7 +588,7 @@ struct clip_ctx {
struct clip_vision_model vision_model;
projector_type proj_type = PROJECTOR_TYPE_MLP;
int32_t max_feature_layer; // unused in newer models like gemma3
int32_t max_feature_layer;
float image_mean[3];
float image_std[3];
bool use_gelu = false;
@@ -589,209 +600,21 @@ struct clip_ctx {
bool has_post_norm = false;
bool has_patch_bias = false;
struct gguf_context * ctx_gguf = nullptr;
struct ggml_context * ctx_data = nullptr;
struct gguf_context * ctx_gguf;
struct ggml_context * ctx_data;
std::vector<uint8_t> buf_compute_meta;
std::vector<ggml_backend_t> backend_ptrs;
std::vector<ggml_backend_buffer_type_t> backend_buft;
// memory buffers to evaluate the model
ggml_backend_buffer_t params_buffer = NULL;
ggml_backend_t backend = nullptr;
ggml_backend_t backend_cpu = nullptr;
ggml_backend_buffer_t buf = nullptr;
ggml_backend_t backend = NULL;
ggml_gallocr_t compute_alloc = NULL;
ggml_backend_sched_ptr sched;
struct clip_image_size * load_image_size = nullptr;
clip_ctx(clip_context_params & ctx_params) {
backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
backend = ctx_params.use_gpu
? ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr)
: nullptr;
if (backend) {
LOG_INF("%s: CLIP using %s backend\n", __func__, ggml_backend_name(backend));
backend_ptrs.push_back(backend);
backend_buft.push_back(ggml_backend_get_default_buffer_type(backend));
} else {
backend = backend_cpu;
LOG_INF("%s: CLIP using CPU backend\n", __func__);
}
backend_ptrs.push_back(backend_cpu);
backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu));
sched.reset(
ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false)
);
}
~clip_ctx() {
ggml_free(ctx_data);
gguf_free(ctx_gguf);
ggml_backend_buffer_free(buf);
ggml_backend_free(backend);
if (backend_cpu != backend) {
ggml_backend_free(backend_cpu);
}
}
struct clip_image_size * load_image_size;
};
static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
const auto & model = ctx->vision_model;
const auto & hparams = model.hparams;
const int image_size = hparams.image_size;
int image_size_width = image_size;
int image_size_height = image_size;
const int patch_size = hparams.patch_size;
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
const int hidden_size = hparams.hidden_size;
const int n_head = hparams.n_head;
const int d_head = hidden_size / n_head;
const int n_layer = hparams.n_layer;
const float eps = hparams.eps;
GGML_ASSERT(imgs->size == 1); // batch_size == 1
struct ggml_init_params params = {
/*.mem_size =*/ ctx->buf_compute_meta.size(),
/*.mem_buffer =*/ ctx->buf_compute_meta.data(),
/*.no_alloc =*/ true,
};
struct ggml_context * ctx0 = ggml_init(params);
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
// input raw
struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3);
ggml_set_name(inp_raw, "inp_raw");
ggml_set_input(inp_raw);
struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
inp = ggml_reshape_2d(ctx0, inp, num_patches, hidden_size);
inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
inp = ggml_add(ctx0, inp, model.patch_bias);
// position embeddings
struct ggml_tensor * embeddings = ggml_add(ctx0, inp, model.position_embeddings);
// loop over layers
for (int il = 0; il < n_layer; il++) {
struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
// layernorm1
{
cur = ggml_norm(ctx0, cur, eps);
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_1_w), model.layers[il].ln_1_b);
}
// self-attention
{
struct ggml_tensor * Q =
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b);
Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_patches);
Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
struct ggml_tensor * K =
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b);
K = ggml_reshape_3d(ctx0, K, d_head, n_head, num_patches);
K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
struct ggml_tensor * V =
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].v_w, cur), model.layers[il].v_b);
V = ggml_reshape_3d(ctx0, V, d_head, n_head, num_patches);
V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
KQ = ggml_scale_inplace(ctx0, KQ, 1.0f / sqrtf((float)d_head));
KQ = ggml_soft_max_inplace(ctx0, KQ);
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
KQV = ggml_reshape_3d(ctx0, KQV, d_head, num_patches, n_head);
KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
cur = ggml_cont_2d(ctx0, KQV, hidden_size, num_patches);
}
// attention output
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].o_w, cur), model.layers[il].o_b);
// re-add the layer input, e.g., residual
cur = ggml_add(ctx0, cur, embeddings);
embeddings = cur; // embeddings = residual, cur = hidden_states
// layernorm2
{
cur = ggml_norm(ctx0, cur, eps);
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_2_w), model.layers[il].ln_2_b);
}
cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur);
cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b);
// siglip uses gelu
cur = ggml_gelu(ctx0, cur);
cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur);
cur = ggml_add(ctx0, cur, model.layers[il].ff_o_b);
// residual 2
cur = ggml_add(ctx0, embeddings, cur);
embeddings = cur;
}
// post-layernorm
if (ctx->has_post_norm) {
embeddings = ggml_norm(ctx0, embeddings, eps);
ggml_set_name(embeddings, "post_ln");
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
}
if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
const int batch_size = 1;
const int mm_tokens_per_image = 256; // default value for gemma3
const int tokens_per_side = sqrt(mm_tokens_per_image);
const int patches_per_image = sqrt(num_patches);
const int kernel_size = patches_per_image / tokens_per_side;
embeddings = ggml_cont(ctx0, ggml_transpose(ctx0, embeddings));
embeddings = ggml_reshape_4d(ctx0, embeddings, patches_per_image, patches_per_image, hidden_size, batch_size);
// doing a pool2d to reduce the number of output tokens to 256
embeddings = ggml_pool_2d(ctx0, embeddings, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0);
embeddings = ggml_reshape_3d(ctx0, embeddings, embeddings->ne[0] * embeddings->ne[0], hidden_size, batch_size);
embeddings = ggml_cont(ctx0, ggml_transpose(ctx0, embeddings));
// apply norm before projection
embeddings = ggml_rms_norm(ctx0, embeddings, eps);
embeddings = ggml_mul(ctx0, embeddings, model.mm_soft_emb_norm_w);
// apply projection
embeddings = ggml_mul_mat(ctx0,
ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)),
embeddings);
}
// build the graph
ggml_build_forward_expand(gf, embeddings);
ggml_free(ctx0);
return gf;
}
static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
if (!ctx->has_vision_encoder) {
LOG_ERR("This gguf file seems to have no vision encoder\n");
return nullptr;
@@ -1337,8 +1160,7 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
} else {
GGML_ABORT("fatel error");
}
}
else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
} else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size);
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
@@ -1360,25 +1182,8 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
return gf;
}
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
return clip_image_build_graph_siglip(ctx, imgs);
} else {
// TODO: we should have one build_* function per model
return clip_image_build_graph_legacy(ctx, imgs, load_image_size, is_inf);
}
}
// read and create ggml_context containing the tensors and their data
struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
return clip_init(fname, clip_context_params{
/* use_gpu */ true,
/* verbosity */ verbosity,
});
}
struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params) {
int verbosity = ctx_params.verbosity;
struct ggml_context * meta = NULL;
struct gguf_init_params params = {
@@ -1396,16 +1201,14 @@ struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_p
const int n_kv = gguf_get_n_kv(ctx);
const int ftype = get_u32(ctx, KEY_FTYPE);
const std::string ftype_str = get_ftype(ftype);
const int idx_desc = get_key_idx(ctx, KEY_DESCRIPTION);
const std::string description = gguf_get_val_str(ctx, idx_desc);
const int idx_name = gguf_find_key(ctx, KEY_NAME);
if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug
const std::string name = gguf_get_val_str(ctx, idx_name);
LOG_INF("%s: model name: %s\n", __func__, name.c_str());
}
const int idx_desc = gguf_find_key(ctx, KEY_DESCRIPTION);
if (idx_desc != -1) { // ditto
const std::string description = gguf_get_val_str(ctx, idx_desc);
LOG_INF("%s: description: %s\n", __func__, description.c_str());
}
LOG_INF("%s: description: %s\n", __func__, description.c_str());
LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
LOG_INF("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
LOG_INF("%s: n_tensors: %d\n", __func__, n_tensors);
@@ -1474,7 +1277,7 @@ struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_p
}
}
clip_ctx * new_clip = new clip_ctx(ctx_params);
clip_ctx * new_clip = new clip_ctx{};
// update projector type
{
@@ -1493,6 +1296,36 @@ struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_p
}
}
//#ifdef GGML_USE_CUDA
// new_clip->backend = ggml_backend_cuda_init(0);
// LOG_INF("%s: CLIP using CUDA backend\n", __func__);
//#endif
//
//#ifdef GGML_USE_METAL
// new_clip->backend = ggml_backend_metal_init();
// LOG_INF("%s: CLIP using Metal backend\n", __func__);
//#endif
//
//#ifdef GGML_USE_CANN
// new_clip->backend = ggml_backend_cann_init(0);
// LOG_INF("%s: CLIP using CANN backend\n", __func__);
//#endif
//
//#ifdef GGML_USE_VULKAN
// new_clip->backend = ggml_backend_vk_init(0);
// LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
//#endif
//
//#ifdef GGML_USE_SYCL
// new_clip->backend = ggml_backend_sycl_init(0);
// LOG_INF("%s: CLIP using SYCL backend\n", __func__);
//#endif
if (!new_clip->backend) {
new_clip->backend = ggml_backend_cpu_init();
LOG_INF("%s: CLIP using CPU backend\n", __func__);
}
// model size and capabilities
{
int idx = get_key_idx(ctx, KEY_HAS_TEXT_ENC);
@@ -1530,12 +1363,8 @@ struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_p
GGML_ASSERT(new_clip->has_vision_encoder);
GGML_ASSERT(!new_clip->has_text_encoder);
try {
idx = get_key_idx(ctx, KEY_USE_GELU);
new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
} catch (std::runtime_error & /*e*/) {
new_clip->use_gelu = false;
}
idx = get_key_idx(ctx, KEY_USE_GELU);
new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
try {
idx = get_key_idx(ctx, KEY_USE_SILU);
@@ -1549,7 +1378,6 @@ struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_p
LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
LOG_INF("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
LOG_INF("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector);
LOG_INF("%s: minicpmv_version: %d\n", __func__, new_clip->minicpmv_version);
LOG_INF("%s: glm_projector: %d\n", __func__, new_clip->has_glm_projector);
LOG_INF("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
LOG_INF("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
@@ -1592,9 +1420,7 @@ struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_p
}
// alloc memory and offload data
ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(new_clip->backend);
new_clip->buf = ggml_backend_alloc_ctx_tensors_from_buft(new_clip->ctx_data, buft);
ggml_backend_buffer_set_usage(new_clip->buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
new_clip->params_buffer = ggml_backend_alloc_ctx_tensors(new_clip->ctx_data, new_clip->backend);
for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name(ctx, i);
struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx_data, name);
@@ -1607,7 +1433,7 @@ struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_p
return nullptr;
}
int num_bytes = ggml_nbytes(cur);
if (ggml_backend_buft_is_host(buft)) {
if (ggml_backend_buffer_is_host(new_clip->params_buffer)) {
// for the CPU and Metal backend, we can read directly into the tensor
fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
} else {
@@ -1743,17 +1569,11 @@ struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_p
}
try {
vision_model.patch_embeddings_0 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
} catch(const std::exception& /*e*/) {
vision_model.patch_embeddings_0 = nullptr;
}
try {
vision_model.patch_embeddings_0 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
} catch(const std::exception& /*e*/) {
vision_model.position_embeddings = nullptr;
LOG_ERR("%s: failed to load vision model tensors\n", __func__);
}
try {
vision_model.patch_embeddings_1 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD_1);
} catch(const std::exception& /*e*/) {
@@ -1864,10 +1684,6 @@ struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_p
vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
}
else if (new_clip->proj_type == PROJECTOR_TYPE_GEMMA3) {
vision_model.mm_input_proj_w = get_tensor(new_clip->ctx_data, TN_MM_INP_PROJ);
vision_model.mm_soft_emb_norm_w = get_tensor(new_clip->ctx_data, TN_MM_SOFT_EMB_N);
}
else {
std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
@@ -1903,21 +1719,14 @@ struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_p
// measure mem requirement and allocate
{
new_clip->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
clip_image_f32_batch batch;
batch.size = 1;
batch.data = nullptr;
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
ggml_backend_sched_reserve(new_clip->sched.get(), gf);
for (size_t i = 0; i < new_clip->backend_ptrs.size(); ++i) {
ggml_backend_t backend = new_clip->backend_ptrs[i];
ggml_backend_buffer_type_t buft = new_clip->backend_buft[i];
size_t size = ggml_backend_sched_get_buffer_size(new_clip->sched.get(), backend);
if (size > 1) {
LOG_INF("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
ggml_backend_buft_name(buft),
size / 1024.0 / 1024.0);
}
}
ggml_gallocr_reserve(new_clip->compute_alloc, gf);
size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
LOG_INF("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
}
return new_clip;
@@ -2409,7 +2218,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
return true;
}
if (ctx->has_glm_projector || ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
if (ctx->has_glm_projector) {
res_imgs->size = 1;
res_imgs->data = new clip_image_f32[res_imgs->size];
clip_image_u8 resized_image;
@@ -2598,6 +2407,12 @@ ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
}
void clip_free(clip_ctx * ctx) {
ggml_free(ctx->ctx_data);
gguf_free(ctx->ctx_gguf);
ggml_backend_buffer_free(ctx->params_buffer);
ggml_backend_free(ctx->backend);
ggml_gallocr_free(ctx->compute_alloc);
delete ctx;
}
@@ -2793,9 +2608,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
}
// build the inference graph
ggml_backend_sched_reset(ctx->sched.get());
ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true);
ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
// set inputs
const auto & model = ctx->vision_model;
@@ -2934,9 +2748,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
free(positions_data);
}
else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
// do nothing
}
else {
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
@@ -2963,14 +2774,12 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
}
}
ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads);
auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf);
if (status != GGML_STATUS_SUCCESS) {
LOG_ERR("%s: ggml_backend_sched_graph_compute failed with error %d\n", __func__, status);
return false;
if (ggml_backend_is_cpu(ctx->backend)) {
ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
}
ggml_backend_graph_compute(ctx->backend, gf);
// the last node is the embedding tensor
struct ggml_tensor * embeddings = ggml_graph_node(gf, -1);
@@ -2991,10 +2800,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
assert(itype < GGML_TYPE_COUNT);
ggml_type type = static_cast<ggml_type>(itype);
auto * ctx_clip = clip_init(fname_inp, clip_context_params{
/* use_gpu */ false,
/* verbosity */ 2,
});
auto * ctx_clip = clip_model_load(fname_inp, 2);
const auto & ctx_src = ctx_clip->ctx_gguf;
const auto & ctx_data = ctx_clip->ctx_data;
@@ -3152,9 +2958,6 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
return ctx->vision_model.mm_1_b->ne[0];
}
if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
return ctx->vision_model.mm_input_proj_w->ne[0];
}
std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));

View File

@@ -39,15 +39,8 @@ struct clip_image_f32_batch {
size_t size;
};
struct clip_context_params {
bool use_gpu;
int verbosity;
};
// deprecated, use clip_init
CLIP_API struct clip_ctx * clip_model_load(const char * fname, int verbosity);
CLIP_API struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params);
CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity);
CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity);
CLIP_API void clip_free(struct clip_ctx * ctx);

View File

@@ -1,341 +0,0 @@
#include "arg.h"
#include "log.h"
#include "common.h"
#include "sampling.h"
#include "clip.h"
#include "stb_image.h"
#include "llama.h"
#include "ggml.h"
#include "console.h"
#include <vector>
#include <limits.h>
#include <inttypes.h>
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
#include <unistd.h>
#elif defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#include <signal.h>
#endif
static bool g_is_generating = false;
/**
* Please note that this is NOT a production-ready stuff.
* It is a playground for trying Gemma 3 vision capabilities.
* For contributors: please keep this code simple and easy to understand.
*/
static void show_additional_info(int /*argc*/, char ** argv) {
LOG(
"Experimental CLI for using Gemma 3 vision model\n\n"
"Usage: %s [options] -m <model> --mmproj <mmproj> --image <image> -p <prompt>\n\n"
" -m and --mmproj are required\n"
" --image and -p are optional, if NOT provided, the CLI will run in chat mode\n",
argv[0]
);
}
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
static void sigint_handler(int signo) {
if (signo == SIGINT) {
if (g_is_generating) {
g_is_generating = false;
} else {
console::cleanup();
LOG("\nInterrupted by user\n");
_exit(130);
}
}
}
#endif
struct gemma3_context {
struct clip_ctx * ctx_clip = NULL;
common_init_result llama_init;
llama_model * model;
llama_context * lctx;
const llama_vocab * vocab;
llama_batch batch;
int n_threads = 1;
llama_pos n_past = 0;
gemma3_context(common_params & params) : llama_init(common_init_from_params(params)) {
model = llama_init.model.get();
lctx = llama_init.context.get();
vocab = llama_model_get_vocab(model);
n_threads = params.cpuparams.n_threads;
batch = llama_batch_init(params.n_batch, 0, 1);
init_clip_model(params);
}
void init_clip_model(common_params & params) {
const char * clip_path = params.mmproj.c_str();
ctx_clip = clip_model_load(clip_path, params.verbosity > 1);
}
~gemma3_context() {
clip_free(ctx_clip);
}
};
struct decode_embd_batch {
std::vector<llama_pos> pos;
std::vector<int32_t> n_seq_id;
std::vector<llama_seq_id> seq_id_0;
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
logits .resize(n_tokens);
seq_id_0.resize(1);
seq_id_0[0] = seq_id;
seq_ids [n_tokens] = nullptr;
batch = {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
/*logits =*/ logits.data(),
};
for (int i = 0; i < n_tokens; i++) {
batch.pos [i] = pos_0 + i;
batch.n_seq_id[i] = 1;
batch.seq_id [i] = seq_id_0.data();
batch.logits [i] = false;
}
}
};
static int eval_text(gemma3_context & ctx, std::string input, bool logits_last = false) {
llama_tokens tokens = common_tokenize(ctx.lctx, input, false, true);
common_batch_clear(ctx.batch);
for (llama_token & t : tokens) {
common_batch_add(ctx.batch, t, ctx.n_past++, {0}, false);
}
if (logits_last) {
ctx.batch.logits[ctx.batch.n_tokens - 1] = true;
}
// LOG("eval_text (n_tokens = %d): %s\n", (int)tokens.size(), input.c_str());
if (llama_decode(ctx.lctx, ctx.batch)) {
LOG_ERR("Failed to decode text\n");
return 1;
}
return 0;
}
static int eval_image(gemma3_context & ctx, std::string & fname) {
std::vector<float> image_embd_v;
int n_embd = llama_model_n_embd(ctx.model);
int n_tokens = 256;
image_embd_v.resize(n_tokens * n_embd);
bool ok;
struct clip_image_u8 * img_u8 = clip_image_u8_init();
ok = clip_image_load_from_file(fname.c_str(), img_u8);
if (!ok) {
LOG_ERR("Unable to load image %s\n", fname.c_str());
clip_image_u8_free(img_u8);
return 2; // non-fatal error
}
clip_image_f32_batch batch_f32;
ok = clip_image_preprocess(ctx.ctx_clip, img_u8, &batch_f32);
if (!ok) {
LOG_ERR("Unable to preprocess image\n");
clip_image_f32_batch_free(&batch_f32);
clip_image_u8_free(img_u8);
return 1;
}
int64_t t0 = ggml_time_ms();
LOG("Encoding image %s\n", fname.c_str());
ok = clip_image_batch_encode(ctx.ctx_clip, ctx.n_threads, &batch_f32, image_embd_v.data());
if (!ok) {
LOG_ERR("Unable to encode image\n");
clip_image_f32_batch_free(&batch_f32);
clip_image_u8_free(img_u8);
return 1;
}
LOG("Image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
clip_image_f32_batch_free(&batch_f32);
clip_image_u8_free(img_u8);
// decode image embeddings
int64_t t1 = ggml_time_ms();
eval_text(ctx, "<start_of_image>");
llama_set_causal_attn(ctx.lctx, false);
decode_embd_batch batch_img(image_embd_v.data(), n_tokens, ctx.n_past, 0);
if (llama_decode(ctx.lctx, batch_img.batch)) {
LOG_ERR("failed to decode image\n");
return 1;
}
ctx.n_past += n_tokens;
llama_set_causal_attn(ctx.lctx, true);
eval_text(ctx, "<end_of_image>");
LOG("Image decoded in %" PRId64 " ms\n", ggml_time_ms() - t1);
return 0;
}
static int generate_response(gemma3_context & ctx, common_sampler * smpl, int n_predict) {
for (int i = 0; i < n_predict; i++) {
if (i > n_predict || !g_is_generating) {
printf("\n");
break;
}
llama_token token_id = common_sampler_sample(smpl, ctx.lctx, -1);
common_sampler_accept(smpl, token_id, true);
if (llama_vocab_is_eog(ctx.vocab, token_id)) {
printf("\n");
break; // end of generation
}
printf("%s", common_token_to_piece(ctx.lctx, token_id).c_str());
fflush(stdout);
// eval the token
common_batch_clear(ctx.batch);
common_batch_add(ctx.batch, token_id, ctx.n_past++, {0}, true);
if (llama_decode(ctx.lctx, ctx.batch)) {
LOG_ERR("failed to decode token\n");
return 1;
}
}
return 0;
}
int main(int argc, char ** argv) {
ggml_time_init();
common_params params;
params.sampling.temp = 0.2; // lower temp by default for better quality
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) {
return 1;
}
common_init();
if (params.mmproj.empty()) {
show_additional_info(argc, argv);
return 1;
}
gemma3_context ctx(params);
printf("%s: %s\n", __func__, params.model.c_str());
bool is_single_turn = !params.prompt.empty() && !params.image.empty();
struct common_sampler * smpl = common_sampler_init(ctx.model, params.sampling);
int n_predict = params.n_predict < 0 ? INT_MAX : params.n_predict;
// ctrl+C handling
{
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
struct sigaction sigint_action;
sigint_action.sa_handler = sigint_handler;
sigemptyset (&sigint_action.sa_mask);
sigint_action.sa_flags = 0;
sigaction(SIGINT, &sigint_action, NULL);
#elif defined (_WIN32)
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
};
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif
}
if (eval_text(ctx, "<bos>")) {
return 1;
}
if (is_single_turn) {
g_is_generating = true;
if (eval_text(ctx, "<start_of_turn>user\n")) {
return 1;
}
for (auto & fname : params.image) {
if (eval_image(ctx, fname)) {
return 1;
}
}
if (eval_text(ctx, params.prompt + "<end_of_turn><start_of_turn>model\n", true)) {
return 1;
}
if (generate_response(ctx, smpl, n_predict)) {
return 1;
}
} else {
LOG("\n Running in chat mode, available commands:");
LOG("\n /image <path> load an image");
LOG("\n /clear clear the chat history");
LOG("\n /quit or /exit exit the program");
LOG("\n");
if (eval_text(ctx, "<start_of_turn>user\n")) {
return 1;
}
while (true) {
g_is_generating = false;
LOG("\n> ");
console::set_display(console::user_input);
std::string line;
console::readline(line, false);
console::set_display(console::reset);
line = string_strip(line);
if (line.empty()) {
continue;
}
if (line == "/quit" || line == "/exit") {
break;
}
if (line == "/clear") {
ctx.n_past = 0;
llama_kv_self_seq_rm(ctx.lctx, 0, 1, -1); // keep BOS
LOG("Chat history cleared\n\n");
continue;
}
g_is_generating = true;
if (line.find("/image") == 0) {
std::string image = line.substr(7);
int res = eval_image(ctx, image);
if (res == 2) {
continue; // image not found
}
if (res) {
return 1;
}
continue;
}
if (eval_text(ctx, line + "<end_of_turn><start_of_turn>model\n", true)) {
return 1;
}
if (generate_response(ctx, smpl, n_predict)) {
return 1;
}
if (eval_text(ctx, "<end_of_turn><start_of_turn>user\n")) {
return 1;
}
}
}
return 0;
}

View File

@@ -1,307 +0,0 @@
import gguf
import argparse
import logging
import sys
import torch
import json
import os
import numpy as np
from typing import cast, ContextManager, Any, Iterator
from pathlib import Path
from torch import Tensor
logger = logging.getLogger("gemma3-mmproj")
# (copied from convert_hf_to_gguf.py)
# tree of lazy tensors
class LazyTorchTensor(gguf.LazyBase):
_tensor_type = torch.Tensor
# to keep the type-checker happy
dtype: torch.dtype
shape: torch.Size
# only used when converting a torch.Tensor to a np.ndarray
_dtype_map: dict[torch.dtype, type] = {
torch.float16: np.float16,
torch.float32: np.float32,
}
# used for safetensors slices
# ref: https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046
# TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734
_dtype_str_map: dict[str, torch.dtype] = {
"F64": torch.float64,
"F32": torch.float32,
"BF16": torch.bfloat16,
"F16": torch.float16,
# "U64": torch.uint64,
"I64": torch.int64,
# "U32": torch.uint32,
"I32": torch.int32,
# "U16": torch.uint16,
"I16": torch.int16,
"U8": torch.uint8,
"I8": torch.int8,
"BOOL": torch.bool,
"F8_E4M3": torch.float8_e4m3fn,
"F8_E5M2": torch.float8_e5m2,
}
def numpy(self) -> gguf.LazyNumpyTensor:
dtype = self._dtype_map[self.dtype]
return gguf.LazyNumpyTensor(
meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape),
args=(self,),
func=(lambda s: s.numpy())
)
@classmethod
def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: tuple[int, ...]) -> Tensor:
return torch.empty(size=shape, dtype=dtype, device="meta")
@classmethod
def from_safetensors_slice(cls, st_slice: Any) -> Tensor:
dtype = cls._dtype_str_map[st_slice.get_dtype()]
shape: tuple[int, ...] = tuple(st_slice.get_shape())
lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[:])
return cast(torch.Tensor, lazy)
@classmethod
def __torch_function__(cls, func, types, args=(), kwargs=None):
del types # unused
if kwargs is None:
kwargs = {}
if func is torch.Tensor.numpy:
return args[0].numpy()
return cls._wrap_fn(func)(*args, **kwargs)
class Gemma3VisionTower:
hparams: dict
gguf_writer: gguf.GGUFWriter
fname_out: Path
ftype: gguf.LlamaFileType
@staticmethod
def load_hparams(dir_model: Path):
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
return json.load(f)
@staticmethod
def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]:
part_names: list[str] = []
for filename in os.listdir(dir_model):
if filename.startswith(prefix) and filename.endswith(suffix):
part_names.append(filename)
part_names.sort()
return part_names
def __init__(self,
dir_model: Path,
fname_out: Path,
ftype: gguf.LlamaFileType,
is_big_endian: bool,):
hparams = Gemma3VisionTower.load_hparams(dir_model)
self.hparams = hparams
self.fname_out = fname_out
self.ftype = ftype
endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
self.gguf_writer = gguf.GGUFWriter(path=None, arch="clip", endianess=endianess)
text_config = hparams["text_config"]
vision_config = hparams["vision_config"]
assert hparams["architectures"][0] == "Gemma3ForConditionalGeneration"
assert text_config is not None
assert vision_config is not None
self.gguf_writer.add_string ("clip.projector_type", "gemma3")
self.gguf_writer.add_bool ("clip.has_text_encoder", False)
self.gguf_writer.add_bool ("clip.has_vision_encoder", True)
self.gguf_writer.add_bool ("clip.has_llava_projector", False) # legacy
self.gguf_writer.add_uint32 ("clip.vision.image_size", vision_config["image_size"])
self.gguf_writer.add_uint32 ("clip.vision.patch_size", vision_config["patch_size"])
self.gguf_writer.add_uint32 ("clip.vision.embedding_length", vision_config["hidden_size"])
self.gguf_writer.add_uint32 ("clip.vision.feed_forward_length", vision_config["intermediate_size"])
self.gguf_writer.add_uint32 ("clip.vision.projection_dim", text_config["hidden_size"])
self.gguf_writer.add_uint32 ("clip.vision.block_count", vision_config["num_hidden_layers"])
self.gguf_writer.add_uint32 ("clip.vision.attention.head_count", vision_config["num_attention_heads"])
self.gguf_writer.add_float32("clip.vision.attention.layer_norm_epsilon", vision_config.get("layer_norm_eps", 1e-6))
# default values taken from HF tranformers code
self.gguf_writer.add_array ("clip.vision.image_mean", [0.5, 0.5, 0.5])
self.gguf_writer.add_array ("clip.vision.image_std", [0.5, 0.5, 0.5])
self.gguf_writer.add_bool ("clip.use_gelu", True)
# load tensors
for name, data_torch in self.get_tensors(dir_model):
# convert any unsupported data types to float32
if data_torch.dtype not in (torch.float16, torch.float32):
data_torch = data_torch.to(torch.float32)
self.add_tensor(name, data_torch)
def get_tensors(self, dir_model: Path) -> Iterator[tuple[str, Tensor]]:
part_names = Gemma3VisionTower.get_model_part_names(dir_model, "model", ".safetensors")
tensor_names_from_parts: set[str] = set()
for part_name in part_names:
logger.info(f"gguf: loading model part '{part_name}'")
from safetensors import safe_open
ctx = cast(ContextManager[Any], safe_open(dir_model / part_name, framework="pt", device="cpu"))
with ctx as model_part:
tensor_names_from_parts.update(model_part.keys())
for name in model_part.keys():
data = model_part.get_slice(name)
data = LazyTorchTensor.from_safetensors_slice(data)
yield name, data
def add_tensor(self, name: str, data_torch: Tensor):
is_1d = len(data_torch.shape) == 1
is_embd = ".embeddings." in name
old_dtype = data_torch.dtype
can_quantize = not is_1d and not is_embd
data_qtype = gguf.GGMLQuantizationType.F32
# this is to support old checkpoint
# TODO: remove this when we have the final model
name = name.replace("vision_model.vision_model.", "vision_tower.vision_model.")
name = name.replace("multimodal_projector.", "multi_modal_projector.")
# filter only vision tensors
if not name.startswith("vision_tower.vision_model.") and not name.startswith("multi_modal_projector."):
return
# prefix
name = name.replace("vision_tower.vision_model.encoder.layers.", "v.blk.")
name = name.replace("vision_tower.vision_model.", "v.")
# projector and input embd
name = name.replace(".embeddings.patch_embedding.", ".patch_embd.")
name = name.replace(".embeddings.position_embedding.", ".position_embd.")
name = name.replace(
"multi_modal_projector.mm_input_projection_weight",
"mm.input_projection.weight"
)
name = name.replace(
"multi_modal_projector.mm_soft_emb_norm.weight",
"mm.soft_emb_norm.weight"
)
name = name.replace("post_layernorm.", "post_ln.")
# each block
name = name.replace(".self_attn.k_proj.", ".attn_k.")
name = name.replace(".self_attn.v_proj.", ".attn_v.")
name = name.replace(".self_attn.q_proj.", ".attn_q.")
name = name.replace(".self_attn.out_proj.", ".attn_out.")
name = name.replace(".layer_norm1.", ".ln1.")
name = name.replace(".layer_norm2.", ".ln2.")
name = name.replace(".mlp.fc1.", ".ffn_down.")
name = name.replace(".mlp.fc2.", ".ffn_up.")
if can_quantize:
if self.ftype == gguf.LlamaFileType.ALL_F32:
data_qtype = gguf.GGMLQuantizationType.F32
elif self.ftype == gguf.LlamaFileType.MOSTLY_F16:
data_qtype = gguf.GGMLQuantizationType.F16
elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
data_qtype = gguf.GGMLQuantizationType.BF16
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
data_qtype = gguf.GGMLQuantizationType.Q8_0
else:
raise ValueError(f"Unsupported file type: {self.ftype}")
# corrent norm value ; only this "soft_emb_norm" need to be corrected as it's part of Gemma projector
# the other norm values are part of SigLIP model, and they are already correct
# ref code: Gemma3RMSNorm
if "soft_emb_norm.weight" in name:
logger.info(f"Correcting norm value for '{name}'")
data_torch = data_torch + 1
data = data_torch.numpy()
try:
data = gguf.quants.quantize(data, data_qtype)
except Exception as e:
logger.error(f"Error quantizing tensor '{name}': {e}, fallback to F16")
data_qtype = gguf.GGMLQuantizationType.F16
data = gguf.quants.quantize(data, data_qtype)
# reverse shape to make it similar to the internal ggml dimension order
shape_str = f"{{{', '.join(str(n) for n in reversed(data_torch.shape))}}}"
logger.info(f"{f'%-32s' % f'{name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
self.gguf_writer.add_tensor(name, data, raw_dtype=data_qtype)
def write(self):
self.gguf_writer.write_header_to_file(path=self.fname_out)
self.gguf_writer.write_kv_data_to_file()
self.gguf_writer.write_tensors_to_file(progress=True)
self.gguf_writer.close()
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(
description="Convert Gemma 3 vision tower safetensors to GGUF format",)
parser.add_argument(
"--outfile", type=Path, default="mmproj.gguf",
help="path to write to",
)
parser.add_argument(
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0"], default="f16",
help="output format",
)
parser.add_argument(
"--bigendian", action="store_true",
help="model is executed on big endian machine",
)
parser.add_argument(
"model", type=Path,
help="directory containing model file",
nargs="?",
)
parser.add_argument(
"--verbose", action="store_true",
help="increase output verbosity",
)
args = parser.parse_args()
if args.model is None:
parser.error("the following arguments are required: model")
return args
def main() -> None:
args = parse_args()
if args.verbose:
logging.basicConfig(level=logging.DEBUG)
else:
logging.basicConfig(level=logging.INFO)
dir_model = args.model
if not dir_model.is_dir():
logger.error(f'Error: {args.model} is not a directory')
sys.exit(1)
ftype_map: dict[str, gguf.LlamaFileType] = {
"f32": gguf.LlamaFileType.ALL_F32,
"f16": gguf.LlamaFileType.MOSTLY_F16,
"bf16": gguf.LlamaFileType.MOSTLY_BF16,
"q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
}
logger.info(f"Loading model: {dir_model.name}")
with torch.inference_mode():
gemma3_vision_tower = Gemma3VisionTower(
dir_model=dir_model,
fname_out=args.outfile,
ftype=ftype_map[args.outtype],
is_big_endian=args.bigendian,
)
gemma3_vision_tower.write()
if __name__ == '__main__':
main()

View File

@@ -86,11 +86,7 @@ static struct clip_ctx * clip_init_context(common_params * params) {
if (prompt.empty()) {
prompt = "describe the image in detail.";
}
struct clip_context_params clip_params = {
/* use_gpu */ params->n_gpu_layers != 0,
/* verbosity */ params->verbosity,
};
auto * ctx_clip = clip_init(clip_path, clip_params);
auto * ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
return ctx_clip;
}
@@ -152,34 +148,19 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
if (num_image_embeds > 1) {
if (has_minicpmv_projector == 2) {
size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
for (size_t j = 0; j < num_image_embeds_col; ++j) {
eval_string(ctx_llava->ctx_llama, std::string("<image>").c_str(), params->n_batch, &n_past, false);
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
if (j == num_image_embeds_col - 1) {
eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
}
}
}
eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
}
else if (has_minicpmv_projector == 3 || has_minicpmv_projector == 4) {
size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
for (size_t j = 0; j < num_image_embeds_col; ++j) {
eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
if (j == num_image_embeds_col - 1) {
eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
}
size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
for (size_t j = 0; j < num_image_embeds_col; ++j) {
eval_string(ctx_llava->ctx_llama, std::string("<image>").c_str(), params->n_batch, &n_past, false);
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
if (j == num_image_embeds_col - 1) {
eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
}
}
}
eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
}
LOG_INF("%s: image token past: %d\n", __func__, n_past);
}

View File

@@ -597,6 +597,7 @@ elif args.minicpmv_projector is not None:
fname_middle = "mmproj-"
has_text_encoder = False
has_minicpmv_projector = True
minicpmv_version = 4
elif args.vision_only:
fname_middle = "vision-"
has_text_encoder = False

View File

@@ -96,7 +96,7 @@ int main(int argc, char ** argv) {
llama_decode(ctx, llama_batch_get_one(&inp.back(), 1));
for (int s = 1; s < W + G + 1; ++s) {
llama_kv_self_seq_cp(ctx, 0, s, -1, -1);
llama_kv_cache_seq_cp(ctx, 0, s, -1, -1);
}
const auto t_enc_end = ggml_time_us();
@@ -438,17 +438,17 @@ int main(int argc, char ** argv) {
// KV cache management
// if no verification token matched, we simply remove all cells from this batch -> no fragmentation
llama_kv_self_seq_rm(ctx, -1, n_past, -1);
llama_kv_cache_seq_rm(ctx, -1, n_past, -1);
if (seq_id_best != 0) {
// if a verification token matched, we keep the best sequence and remove the rest
// this leads to some KV cache fragmentation
llama_kv_self_seq_keep(ctx, seq_id_best);
llama_kv_self_seq_cp (ctx, seq_id_best, 0, -1, -1);
llama_kv_self_seq_rm (ctx, seq_id_best, -1, -1);
llama_kv_cache_seq_keep(ctx, seq_id_best);
llama_kv_cache_seq_cp (ctx, seq_id_best, 0, -1, -1);
llama_kv_cache_seq_rm (ctx, seq_id_best, -1, -1);
for (int s = 1; s < W + G + 1; ++s) {
llama_kv_self_seq_cp(ctx, 0, s, -1, -1);
llama_kv_cache_seq_cp(ctx, 0, s, -1, -1);
}
}
}

View File

@@ -192,7 +192,7 @@ int main(int argc, char ** argv){
// KV cache management
// clean the cache of draft tokens that weren't accepted
llama_kv_self_seq_rm(ctx, 0, n_past, -1);
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
common_batch_clear(batch_tgt);
common_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);

View File

@@ -27,24 +27,12 @@ Once downloaded, place your model in the models folder in llama.cpp.
##### Input prompt (One-and-done)
```bash
./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf -no-cnv --prompt "Once upon a time"
./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --prompt "Once upon a time"
```
##### Conversation mode (Allow for continuous interaction with the model)
```bash
./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --chat-template gemma
```
##### Conversation mode using built-in jinja chat template
```bash
./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --jinja
```
##### One-and-done query using jinja with custom system prompt and a starting prompt
```bash
./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --jinja --single-turn -sys "You are a helpful assistant" -p "Hello"
./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf -cnv --chat-template gemma
```
##### Infinite text from a starting prompt (you can use `Ctrl-C` to stop it):
@@ -56,24 +44,12 @@ Once downloaded, place your model in the models folder in llama.cpp.
##### Input prompt (One-and-done)
```powershell
./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf -no-cnv --prompt "Once upon a time"
./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --prompt "Once upon a time"
```
##### Conversation mode (Allow for continuous interaction with the model)
```powershell
./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --chat-template gemma
```
##### Conversation mode using built-in jinja chat template
```powershell
./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --jinja
```
##### One-and-done query using jinja with custom system prompt and a starting prompt
```powershell
./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --jinja --single-turn -sys "You are a helpful assistant" -p "Hello"
./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf -cnv --chat-template gemma
```
#### Infinite text from a starting prompt (you can use `Ctrl-C` to stop it):
@@ -101,8 +77,6 @@ The `llama-cli` program provides several ways to interact with the LLaMA models
- `--prompt PROMPT`: Provide a prompt directly as a command-line option.
- `--file FNAME`: Provide a file containing a prompt or multiple prompts.
- `--system-prompt PROMPT`: Provide a system prompt (will otherwise use the default one in the chat template (if provided)).
- `--system-prompt-file FNAME`: Provide a file containing a system prompt.
- `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.)
## Interaction
@@ -115,10 +89,7 @@ In interactive mode, users can participate in text generation by injecting their
- `-i, --interactive`: Run the program in interactive mode, allowing users to engage in real-time conversations or provide specific instructions to the model.
- `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation.
- `-cnv, --conversation`: Run the program in conversation mode (does not print special tokens and suffix/prefix, use default or provided chat template) (default: true if chat template found)
- `-no-cnv`: Disable conversation mode (default: false)
- `-st, --single-turn`: Only process a single conversation turn (user input) and then exit.
- `--jinja`: Enable jinja chat template parser, will use the model's built-in template or a user-provided one (default: false)
- `-cnv, --conversation`: Run the program in conversation mode (does not print special tokens and suffix/prefix, use default chat template) (default: false)
- `--color`: Enable colorized output to differentiate visually distinguishing between prompts, user input, and generated text.
By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs.
@@ -154,8 +125,6 @@ When --in-prefix or --in-suffix options are enabled the chat template ( --chat-t
Example usage: `--chat-template gemma`
`--chat-template-file FNAME`: Load a custom jinja chat template from an external file, useful if the model contains outdated or incompatible template, some examples can be found in models/templates. Up-to-date chat templates can be downloaded from Hugging Face using scripts/get_chat_template.py
## Context Management
During text generation, LLaMA models have a limited context size, which means they can only consider a certain number of tokens from the input and generated text. When the context fills up, the model resets internally, potentially losing some information from the beginning of the conversation or instructions. Context management options help maintain continuity and coherence in these situations.

View File

@@ -354,7 +354,7 @@ int main(int argc, char ** argv) {
}
// remove any "future" tokens that we might have inherited from the previous session
llama_kv_self_seq_rm(ctx, -1, n_matching_session_tokens, -1);
llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1);
}
LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n",
@@ -602,8 +602,8 @@ int main(int argc, char ** argv) {
LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
n_past, n_left, n_ctx, params.n_keep, n_discard);
llama_kv_self_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
llama_kv_self_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
n_past -= n_discard;
@@ -626,9 +626,9 @@ int main(int argc, char ** argv) {
LOG_DBG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);
llama_kv_self_seq_add(ctx, 0, ga_i, n_past, ib*bd);
llama_kv_self_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
llama_kv_self_seq_add(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd);
llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd);
llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
llama_kv_cache_seq_add(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd);
n_past -= bd;

View File

@@ -202,7 +202,7 @@ int main(int argc, char ** argv) {
// assign the system KV cache to all parallel sequences
for (int32_t i = 1; i <= n_clients; ++i) {
llama_kv_self_seq_cp(ctx, 0, i, -1, -1);
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
}
LOG_INF("\n");
@@ -234,9 +234,9 @@ int main(int argc, char ** argv) {
if (batch.n_tokens == 0) {
// all sequences have ended - clear the entire KV cache
for (int i = 1; i <= n_clients; ++i) {
llama_kv_self_seq_rm(ctx, i, -1, -1);
llama_kv_cache_seq_rm(ctx, i, -1, -1);
// but keep the system prompt
llama_kv_self_seq_cp(ctx, 0, i, -1, -1);
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
}
LOG_INF("%s: clearing the KV cache\n", __func__);
@@ -372,8 +372,8 @@ int main(int argc, char ** argv) {
}
// delete only the generated part of the sequence, i.e. keep the system prompt in the cache
llama_kv_self_seq_rm(ctx, client.id + 1, -1, -1);
llama_kv_self_seq_cp(ctx, 0, client.id + 1, -1, -1);
llama_kv_cache_seq_rm(ctx, client.id + 1, -1, -1);
llama_kv_cache_seq_cp(ctx, 0, client.id + 1, -1, -1);
const auto t_main_end = ggml_time_us();

View File

@@ -133,11 +133,11 @@ int main(int argc, char ** argv) {
const int ib = i/n_batch - 1;
const int bd = n_batch_grp*(n_grp - 1);
llama_kv_self_seq_add (ctx, 0, n_past - n_batch, n_past, ib*bd);
llama_kv_self_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
llama_kv_self_update (ctx);
llama_kv_cache_seq_add (ctx, 0, n_past - n_batch, n_past, ib*bd);
llama_kv_cache_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
llama_kv_cache_update (ctx);
n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
}
common_batch_clear(batch);
@@ -167,12 +167,12 @@ int main(int argc, char ** argv) {
LOG_INF("%s: shifting KV cache with %d\n", __func__, n_discard);
llama_kv_self_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
//llama_kv_self_defrag (ctx);
llama_kv_self_update (ctx);
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
//llama_kv_cache_defrag (ctx);
llama_kv_cache_update (ctx);
n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
common_batch_clear(batch);
@@ -198,12 +198,12 @@ int main(int argc, char ** argv) {
if (n_discard > 0) {
LOG_INF("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);
llama_kv_self_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
//llama_kv_self_defrag (ctx);
llama_kv_self_update (ctx);
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
//llama_kv_cache_defrag (ctx);
llama_kv_cache_update (ctx);
n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
}
}

View File

@@ -361,7 +361,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params
const auto t_start = std::chrono::high_resolution_clock::now();
// clear the KV cache
llama_kv_self_clear(ctx);
llama_kv_cache_clear(ctx);
llama_batch batch = llama_batch_init(n_batch, 0, 1);
@@ -547,7 +547,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params &
const auto t_start = std::chrono::high_resolution_clock::now();
// clear the KV cache
llama_kv_self_clear(ctx);
llama_kv_cache_clear(ctx);
for (int j = 0; j < num_batches; ++j) {
const int batch_start = start + j * n_batch;
@@ -924,7 +924,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) {
return;
}
llama_kv_self_clear(ctx);
llama_kv_cache_clear(ctx);
// decode all tasks [i0, i1)
if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
@@ -1203,7 +1203,7 @@ static void winogrande_score(llama_context * ctx, const common_params & params)
return;
}
llama_kv_self_clear(ctx);
llama_kv_cache_clear(ctx);
// decode all tasks [i0, i1)
if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
@@ -1575,7 +1575,7 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par
return;
}
llama_kv_self_clear(ctx);
llama_kv_cache_clear(ctx);
// decode all tasks [i0, i1)
if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
@@ -1765,7 +1765,7 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
}
// clear the KV cache
llama_kv_self_clear(ctx);
llama_kv_cache_clear(ctx);
llama_batch batch = llama_batch_init(n_batch, 0, 1);

View File

@@ -1,6 +1,6 @@
#include "ggml.h"
#include "llama.h"
#include "llama-model.h"
#include "llama-context.h"
#include "common.h"
#include <algorithm>
@@ -328,7 +328,7 @@ int main(int argc, char ** argv) {
}
}
const auto & tensors = llama_internal_get_tensor_map(model);
const auto & tensors = llama_internal_get_tensor_map(ctx);
// check layer tensors
int included_layers = 0;

View File

@@ -83,7 +83,7 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
// clear previous kv_cache values (irrelevant for embeddings)
llama_kv_self_clear(ctx);
llama_kv_cache_clear(ctx);
// run model
LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);

View File

@@ -1,4 +1,2 @@
set(TARGET rpc-server)
add_executable(${TARGET} rpc-server.cpp)
target_link_libraries(${TARGET} PRIVATE ggml)
target_compile_features(${TARGET} PRIVATE cxx_std_17)
add_executable(rpc-server rpc-server.cpp)
target_link_libraries(rpc-server PRIVATE ggml llama)

View File

@@ -72,14 +72,3 @@ $ bin/llama-cli -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name
This way you can offload model layers to both local and remote devices.
### Local cache
The RPC server can use a local cache to store large tensors and avoid transferring them over the network.
This can speed up model loading significantly, especially when using large models.
To enable the cache, use the `-c` option:
```bash
$ bin/rpc-server -c
```
By default, the cache is stored in the `$HOME/.cache/llama.cpp/rpc` directory and can be controlled via the `LLAMA_CACHE` environment variable.

View File

@@ -1,7 +1,3 @@
#if defined(_MSC_VER)
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
#endif
#include "ggml-cpu.h"
#ifdef GGML_USE_CUDA
@@ -22,142 +18,26 @@
#include "ggml-rpc.h"
#ifdef _WIN32
# define DIRECTORY_SEPARATOR '\\'
# include <locale>
# include <windows.h>
# include <fcntl.h>
# include <io.h>
#else
# define DIRECTORY_SEPARATOR '/'
# include <unistd.h>
# include <sys/stat.h>
#endif
#include <codecvt>
#include <string>
#include <stdio.h>
#include <vector>
#include <filesystem>
namespace fs = std::filesystem;
// NOTE: this is copied from common.cpp to avoid linking with libcommon
// returns true if successful, false otherwise
static bool fs_create_directory_with_parents(const std::string & path) {
#ifdef _WIN32
std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
std::wstring wpath = converter.from_bytes(path);
// if the path already exists, check whether it's a directory
const DWORD attributes = GetFileAttributesW(wpath.c_str());
if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
return true;
}
size_t pos_slash = 0;
// process path from front to back, procedurally creating directories
while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
const std::wstring subpath = wpath.substr(0, pos_slash);
const wchar_t * test = subpath.c_str();
const bool success = CreateDirectoryW(test, NULL);
if (!success) {
const DWORD error = GetLastError();
// if the path already exists, ensure that it's a directory
if (error == ERROR_ALREADY_EXISTS) {
const DWORD attributes = GetFileAttributesW(subpath.c_str());
if (attributes == INVALID_FILE_ATTRIBUTES || !(attributes & FILE_ATTRIBUTE_DIRECTORY)) {
return false;
}
} else {
return false;
}
}
pos_slash += 1;
}
return true;
#else
// if the path already exists, check whether it's a directory
struct stat info;
if (stat(path.c_str(), &info) == 0) {
return S_ISDIR(info.st_mode);
}
size_t pos_slash = 1; // skip leading slashes for directory creation
// process path from front to back, procedurally creating directories
while ((pos_slash = path.find('/', pos_slash)) != std::string::npos) {
const std::string subpath = path.substr(0, pos_slash);
struct stat info;
// if the path already exists, ensure that it's a directory
if (stat(subpath.c_str(), &info) == 0) {
if (!S_ISDIR(info.st_mode)) {
return false;
}
} else {
// create parent directories
const int ret = mkdir(subpath.c_str(), 0755);
if (ret != 0) {
return false;
}
}
pos_slash += 1;
}
return true;
#endif // _WIN32
}
// NOTE: this is copied from common.cpp to avoid linking with libcommon
static std::string fs_get_cache_directory() {
std::string cache_directory = "";
auto ensure_trailing_slash = [](std::string p) {
// Make sure to add trailing slash
if (p.back() != DIRECTORY_SEPARATOR) {
p += DIRECTORY_SEPARATOR;
}
return p;
};
if (getenv("LLAMA_CACHE")) {
cache_directory = std::getenv("LLAMA_CACHE");
} else {
#ifdef __linux__
if (std::getenv("XDG_CACHE_HOME")) {
cache_directory = std::getenv("XDG_CACHE_HOME");
} else {
cache_directory = std::getenv("HOME") + std::string("/.cache/");
}
#elif defined(__APPLE__)
cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
#elif defined(_WIN32)
cache_directory = std::getenv("LOCALAPPDATA");
#endif // __linux__
cache_directory = ensure_trailing_slash(cache_directory);
cache_directory += "llama.cpp";
}
return ensure_trailing_slash(cache_directory);
}
struct rpc_server_params {
std::string host = "127.0.0.1";
int port = 50052;
size_t backend_mem = 0;
bool use_cache = false;
};
static void print_usage(int /*argc*/, char ** argv, rpc_server_params params) {
fprintf(stderr, "Usage: %s [options]\n\n", argv[0]);
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " -H HOST, --host HOST host to bind to (default: %s)\n", params.host.c_str());
fprintf(stderr, " -p PORT, --port PORT port to bind to (default: %d)\n", params.port);
fprintf(stderr, " -m MEM, --mem MEM backend memory size (in MB)\n");
fprintf(stderr, " -c, --cache enable local file cache\n");
fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " -H HOST, --host HOST host to bind to (default: %s)\n", params.host.c_str());
fprintf(stderr, " -p PORT, --port PORT port to bind to (default: %d)\n", params.port);
fprintf(stderr, " -m MEM, --mem MEM backend memory size (in MB)\n");
fprintf(stderr, "\n");
}
@@ -178,8 +58,6 @@ static bool rpc_server_params_parse(int argc, char ** argv, rpc_server_params &
if (params.port <= 0 || params.port > 65535) {
return false;
}
} else if (arg == "-c" || arg == "--cache") {
params.use_cache = true;
} else if (arg == "-m" || arg == "--mem") {
if (++i >= argc) {
return false;
@@ -286,20 +164,8 @@ int main(int argc, char * argv[]) {
} else {
get_backend_memory(&free_mem, &total_mem);
}
const char * cache_dir = nullptr;
std::string cache_dir_str = fs_get_cache_directory() + "rpc/";
if (params.use_cache) {
if (!fs_create_directory_with_parents(cache_dir_str)) {
fprintf(stderr, "Failed to create cache directory: %s\n", cache_dir_str.c_str());
return 1;
}
cache_dir = cache_dir_str.c_str();
}
printf("Starting RPC server\n");
printf(" endpoint : %s\n", endpoint.c_str());
printf(" local cache : %s\n", cache_dir ? cache_dir : "n/a");
printf(" backend memory : %zu MB\n", free_mem / (1024 * 1024));
ggml_backend_rpc_start_server(backend, endpoint.c_str(), cache_dir, free_mem, total_mem);
printf("Starting RPC server on %s, backend memory: %zu MB\n", endpoint.c_str(), free_mem / (1024 * 1024));
ggml_backend_rpc_start_server(backend, endpoint.c_str(), free_mem, total_mem);
ggml_backend_free(backend);
return 0;
}

View File

@@ -38,6 +38,24 @@
}
#endif
GGML_ATTRIBUTE_FORMAT(1, 2)
static std::string fmt(const char * fmt, ...) {
va_list ap;
va_list ap2;
va_start(ap, fmt);
va_copy(ap2, ap);
const int size = vsnprintf(NULL, 0, fmt, ap);
GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
std::string buf;
buf.resize(size);
const int size2 = vsnprintf(const_cast<char *>(buf.data()), buf.size() + 1, fmt, ap2);
GGML_ASSERT(size2 == size);
va_end(ap2);
va_end(ap);
return buf;
}
GGML_ATTRIBUTE_FORMAT(1, 2)
static int printe(const char * fmt, ...) {
va_list args;
@@ -61,7 +79,6 @@ class Opt {
ctx_params = llama_context_default_params();
model_params = llama_model_default_params();
context_size_default = ctx_params.n_batch;
n_threads_default = ctx_params.n_threads;
ngl_default = model_params.n_gpu_layers;
common_params_sampling sampling;
temperature_default = sampling.temp;
@@ -87,7 +104,6 @@ class Opt {
ctx_params.n_batch = context_size >= 0 ? context_size : context_size_default;
ctx_params.n_ctx = ctx_params.n_batch;
ctx_params.n_threads = ctx_params.n_threads_batch = n_threads >= 0 ? n_threads : n_threads_default;
model_params.n_gpu_layers = ngl >= 0 ? ngl : ngl_default;
temperature = temperature >= 0 ? temperature : temperature_default;
@@ -100,12 +116,12 @@ class Opt {
std::string chat_template_file;
std::string user;
bool use_jinja = false;
int context_size = -1, ngl = -1, n_threads = -1;
int context_size = -1, ngl = -1;
float temperature = -1;
bool verbose = false;
private:
int context_size_default = -1, ngl_default = -1, n_threads_default = -1;
int context_size_default = -1, ngl_default = -1;
float temperature_default = -1;
bool help = false;
@@ -143,94 +159,53 @@ class Opt {
return 0;
}
int parse_options_with_value(int argc, const char ** argv, int & i, bool & options_parsing) {
if (options_parsing && (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0)) {
if (handle_option_with_value(argc, argv, i, context_size) == 1) {
return 1;
}
} else if (options_parsing &&
(strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "-ngl") == 0 || strcmp(argv[i], "--ngl") == 0)) {
if (handle_option_with_value(argc, argv, i, ngl) == 1) {
return 1;
}
} else if (options_parsing && (strcmp(argv[i], "-t") == 0 || strcmp(argv[i], "--threads") == 0)) {
if (handle_option_with_value(argc, argv, i, n_threads) == 1) {
return 1;
}
} else if (options_parsing && strcmp(argv[i], "--temp") == 0) {
if (handle_option_with_value(argc, argv, i, temperature) == 1) {
return 1;
}
} else if (options_parsing && strcmp(argv[i], "--chat-template-file") == 0) {
if (handle_option_with_value(argc, argv, i, chat_template_file) == 1) {
return 1;
}
use_jinja = true;
} else {
return 2;
}
return 0;
}
int parse_options(const char ** argv, int & i, bool & options_parsing) {
if (options_parsing && (parse_flag(argv, i, "-v", "--verbose") || parse_flag(argv, i, "-v", "--log-verbose"))) {
verbose = true;
} else if (options_parsing && strcmp(argv[i], "--jinja") == 0) {
use_jinja = true;
} else if (options_parsing && parse_flag(argv, i, "-h", "--help")) {
help = true;
return 0;
} else if (options_parsing && strcmp(argv[i], "--") == 0) {
options_parsing = false;
} else {
return 2;
}
return 0;
}
int parse_positional_args(const char ** argv, int & i, int & positional_args_i) {
if (positional_args_i == 0) {
if (!argv[i][0] || argv[i][0] == '-') {
return 1;
}
++positional_args_i;
model_ = argv[i];
} else if (positional_args_i == 1) {
++positional_args_i;
user = argv[i];
} else {
user += " " + std::string(argv[i]);
}
return 0;
}
int parse(int argc, const char ** argv) {
bool options_parsing = true;
for (int i = 1, positional_args_i = 0; i < argc; ++i) {
int ret = parse_options_with_value(argc, argv, i, options_parsing);
if (ret == 0) {
continue;
} else if (ret == 1) {
return ret;
}
if (options_parsing && (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0)) {
if (handle_option_with_value(argc, argv, i, context_size) == 1) {
return 1;
}
} else if (options_parsing &&
(strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "-ngl") == 0 || strcmp(argv[i], "--ngl") == 0)) {
if (handle_option_with_value(argc, argv, i, ngl) == 1) {
return 1;
}
} else if (options_parsing && strcmp(argv[i], "--temp") == 0) {
if (handle_option_with_value(argc, argv, i, temperature) == 1) {
return 1;
}
} else if (options_parsing &&
(parse_flag(argv, i, "-v", "--verbose") || parse_flag(argv, i, "-v", "--log-verbose"))) {
verbose = true;
} else if (options_parsing && strcmp(argv[i], "--jinja") == 0) {
use_jinja = true;
} else if (options_parsing && strcmp(argv[i], "--chat-template-file") == 0){
if (handle_option_with_value(argc, argv, i, chat_template_file) == 1) {
return 1;
}
use_jinja = true;
} else if (options_parsing && parse_flag(argv, i, "-h", "--help")) {
help = true;
return 0;
} else if (options_parsing && strcmp(argv[i], "--") == 0) {
options_parsing = false;
} else if (positional_args_i == 0) {
if (!argv[i][0] || argv[i][0] == '-') {
return 1;
}
ret = parse_options(argv, i, options_parsing);
if (ret == 0) {
continue;
} else if (ret == 1) {
return ret;
}
if (parse_positional_args(argv, i, positional_args_i)) {
return 1;
++positional_args_i;
model_ = argv[i];
} else if (positional_args_i == 1) {
++positional_args_i;
user = argv[i];
} else {
user += " " + std::string(argv[i]);
}
}
if (model_.empty()) {
if (model_.empty()){
return 1;
}
@@ -257,8 +232,6 @@ class Opt {
" Number of GPU layers (default: %d)\n"
" --temp <value>\n"
" Temperature (default: %.1f)\n"
" -t, --threads <value>\n"
" Number of threads to use during generation (default: %d)\n"
" -v, --verbose, --log-verbose\n"
" Set verbosity level to infinity (i.e. log all messages, useful for debugging)\n"
" -h, --help\n"
@@ -287,7 +260,7 @@ class Opt {
" llama-run file://some-file3.gguf\n"
" llama-run --ngl 999 some-file4.gguf\n"
" llama-run --ngl 999 some-file5.gguf Hello World\n",
context_size_default, ngl_default, temperature_default, n_threads_default);
context_size_default, ngl_default, temperature_default);
}
};
@@ -507,11 +480,11 @@ class HttpClient {
int secs = static_cast<int>(seconds) % 60;
if (hrs > 0) {
return string_format("%dh %02dm %02ds", hrs, mins, secs);
return fmt("%dh %02dm %02ds", hrs, mins, secs);
} else if (mins > 0) {
return string_format("%dm %02ds", mins, secs);
return fmt("%dm %02ds", mins, secs);
} else {
return string_format("%ds", secs);
return fmt("%ds", secs);
}
}
@@ -526,7 +499,7 @@ class HttpClient {
}
}
return string_format("%.2f %s", dbl_size, suffix[i]);
return fmt("%.2f %s", dbl_size, suffix[i]);
}
static int update_progress(void * ptr, curl_off_t total_to_download, curl_off_t now_downloaded, curl_off_t,
@@ -560,9 +533,7 @@ class HttpClient {
return (now_downloaded_plus_file_size * 100) / total_to_download;
}
static std::string generate_progress_prefix(curl_off_t percentage) {
return string_format("%3ld%% |", static_cast<long int>(percentage));
}
static std::string generate_progress_prefix(curl_off_t percentage) { return fmt("%3ld%% |", static_cast<long int>(percentage)); }
static double calculate_speed(curl_off_t now_downloaded, const std::chrono::steady_clock::time_point & start_time) {
const auto now = std::chrono::steady_clock::now();
@@ -573,9 +544,9 @@ class HttpClient {
static std::string generate_progress_suffix(curl_off_t now_downloaded_plus_file_size, curl_off_t total_to_download,
double speed, double estimated_time) {
const int width = 10;
return string_format("%*s/%*s%*s/s%*s", width, human_readable_size(now_downloaded_plus_file_size).c_str(),
width, human_readable_size(total_to_download).c_str(), width,
human_readable_size(speed).c_str(), width, human_readable_time(estimated_time).c_str());
return fmt("%*s/%*s%*s/s%*s", width, human_readable_size(now_downloaded_plus_file_size).c_str(), width,
human_readable_size(total_to_download).c_str(), width, human_readable_size(speed).c_str(), width,
human_readable_time(estimated_time).c_str());
}
static int calculate_progress_bar_width(const std::string & progress_prefix, const std::string & progress_suffix) {
@@ -920,7 +891,7 @@ static int apply_chat_template(const struct common_chat_templates * tmpls, Llama
// Function to tokenize the prompt
static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt,
std::vector<llama_token> & prompt_tokens, const LlamaData & llama_data) {
const bool is_first = llama_kv_self_used_cells(llama_data.context.get()) == 0;
const bool is_first = llama_get_kv_cache_used_cells(llama_data.context.get()) == 0;
const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
prompt_tokens.resize(n_prompt_tokens);
@@ -936,7 +907,7 @@ static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt
// Check if we have enough space in the context to evaluate this batch
static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) {
const int n_ctx = llama_n_ctx(ctx.get());
const int n_ctx_used = llama_kv_self_used_cells(ctx.get());
const int n_ctx_used = llama_get_kv_cache_used_cells(ctx.get());
if (n_ctx_used + batch.n_tokens > n_ctx) {
printf(LOG_COL_DEFAULT "\n");
printe("context size exceeded\n");

View File

@@ -15,7 +15,7 @@ int main(int argc, char ** argv) {
return 1;
}
common_init();
print_build_info();
if (params.n_predict < 0) {
params.n_predict = 16;
@@ -196,7 +196,7 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy);
// erase whole kv
llama_kv_self_clear(ctx3);
llama_kv_cache_clear(ctx3);
fprintf(stderr, "%s : kv cache cleared\n", __func__);
// restore kv into seq 1

File diff suppressed because it is too large Load Diff

Binary file not shown.

View File

@@ -384,9 +384,8 @@ struct server_task {
SRV_DBG("Grammar trigger token: %d (`%s`)\n", token, word.c_str());
common_grammar_trigger trigger;
trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN;
trigger.value = word;
trigger.token = token;
params.sampling.grammar_triggers.push_back(std::move(trigger));
trigger.value = (llama_token) token;
params.sampling.grammar_triggers.push_back(trigger);
} else {
SRV_DBG("Grammar trigger word: `%s`\n", word.c_str());
params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word});
@@ -489,12 +488,8 @@ struct result_timings {
double predicted_per_token_ms;
double predicted_per_second;
// Optional speculative metrics - only included when > 0
int32_t draft_n = 0;
int32_t draft_n_accepted = 0;
json to_json() const {
json base = {
return {
{"prompt_n", prompt_n},
{"prompt_ms", prompt_ms},
{"prompt_per_token_ms", prompt_per_token_ms},
@@ -505,13 +500,6 @@ struct result_timings {
{"predicted_per_token_ms", predicted_per_token_ms},
{"predicted_per_second", predicted_per_second},
};
if (draft_n > 0) {
base["draft_n"] = draft_n;
base["draft_n_accepted"] = draft_n_accepted;
}
return base;
}
};
@@ -762,10 +750,7 @@ struct server_task_result_cmpl_final : server_task_result {
{"name", tc.name},
{"arguments", tc.arguments},
}},
// Some templates generate and require an id (sometimes in a very specific format, e.g. Mistral Nemo).
// We only generate a random id for the ones that don't generate one by themselves
// (they also won't get to see it as their template likely doesn't use it, so it's all for the client)
{"id", tc.id.empty() ? gen_tool_call_id() : tc.id},
{"id", tc.id},
});
}
message["tool_calls"] = tool_calls;
@@ -841,11 +826,6 @@ struct server_task_result_cmpl_final : server_task_result {
ret.push_back({"timings", timings.to_json()});
}
// extra fields for debugging purposes
if (verbose) {
ret["__verbose"] = to_json_non_oaicompat();
}
return ret;
}
};
@@ -1310,10 +1290,6 @@ struct server_slot {
std::function<void(int)> callback_on_release;
// Speculative decoding stats
int32_t n_draft_total = 0; // Total draft tokens generated
int32_t n_draft_accepted = 0; // Draft tokens actually accepted
void reset() {
SLT_DBG(*this, "%s", "\n");
@@ -1330,17 +1306,13 @@ struct server_slot {
generated_tokens.clear();
generated_token_probs.clear();
// clear speculative decoding stats
n_draft_total = 0;
n_draft_accepted = 0;
}
bool is_non_causal() const {
return task_type == SERVER_TASK_TYPE_EMBEDDING || task_type == SERVER_TASK_TYPE_RERANK;
}
bool can_batch_with(server_slot & other_slot) const {
bool can_batch_with(server_slot & other_slot) {
return is_non_causal() == other_slot.is_non_causal()
&& are_lora_equal(lora, other_slot.lora);
}
@@ -1400,12 +1372,6 @@ struct server_slot {
timings.predicted_per_token_ms = t_token_generation / n_decoded;
timings.predicted_per_second = 1e3 / t_token_generation * n_decoded;
// Add speculative metrics
if (n_draft_total > 0) {
timings.draft_n = n_draft_total;
timings.draft_n_accepted = n_draft_accepted;
}
return timings;
}
@@ -1453,15 +1419,6 @@ struct server_slot {
t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second,
t_token_generation, n_decoded, t_gen, n_gen_second,
t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded);
if (n_draft_total > 0) {
const float draft_ratio = (float) n_draft_accepted / n_draft_total;
SLT_INF(*this,
"\n"
"draft acceptance rate = %0.5f (%5d accepted / %5d generated)\n",
draft_ratio, n_draft_accepted, n_draft_total
);
}
}
json to_json() const {
@@ -1911,10 +1868,6 @@ struct server_context {
params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
params_dft.n_parallel = 1;
// force F16 KV cache for the draft model for extra performance
params_dft.cache_type_k = GGML_TYPE_F16;
params_dft.cache_type_v = GGML_TYPE_F16;
llama_init_dft = common_init_from_params(params_dft);
model_dft = llama_init_dft.model.get();
@@ -1935,6 +1888,10 @@ struct server_context {
cparams_dft = common_context_params_to_llama(params_dft);
cparams_dft.n_batch = n_ctx_dft;
// force F16 KV cache for the draft model for extra performance
cparams_dft.type_k = GGML_TYPE_F16;
cparams_dft.type_v = GGML_TYPE_F16;
// the context is not needed - we will create one for each slot
llama_init_dft.context.reset();
}
@@ -2079,18 +2036,6 @@ struct server_context {
return ret;
}
bool can_be_detokenized(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
const int32_t n_vocab = llama_vocab_n_tokens(vocab);
for (const auto & token : tokens) {
if (token < 0 || token >= n_vocab) {
return false;
}
}
return true;
}
bool launch_slot_with_task(server_slot & slot, const server_task & task) {
slot.reset();
slot.id_task = task.id;
@@ -2105,11 +2050,6 @@ struct server_context {
slot.lora = task.params.lora;
}
bool can_detokenize = can_be_detokenized(ctx, slot.prompt_tokens);
if (!can_detokenize) {
send_error(task, "Prompt contains invalid tokens", ERROR_TYPE_INVALID_REQUEST);
return false;
}
SLT_DBG(slot, "launching slot : %s\n", safe_json_to_str(slot.to_json()).c_str());
if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
@@ -2152,7 +2092,7 @@ struct server_context {
SRV_DBG("%s", "clearing KV cache\n");
// clear the entire KV cache
llama_kv_self_clear(ctx);
llama_kv_cache_clear(ctx);
clean_kv_cache = false;
}
@@ -2217,6 +2157,14 @@ struct server_context {
}
if (slot.has_new_line) {
// if we have already seen a new line, we stop after a certain time limit
if (slot.params.t_max_predict_ms > 0 && (ggml_time_us() - slot.t_start_generation > 1000.0f*slot.params.t_max_predict_ms)) {
slot.stop = STOP_TYPE_LIMIT;
slot.has_next_token = false;
SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.params.t_max_predict_ms);
}
// require that each new line has a whitespace prefix (i.e. indentation) of at least slot.params.n_indent
if (slot.params.n_indent > 0) {
// check the current indentation
@@ -2255,14 +2203,6 @@ struct server_context {
// check if there is a new line in the generated text
if (result.text_to_send.find('\n') != std::string::npos) {
slot.has_new_line = true;
// if we have seen a new line, we stop after a certain time limit, but only upon another new line
if (slot.params.t_max_predict_ms > 0 && (ggml_time_us() - slot.t_start_generation > 1000.0f*slot.params.t_max_predict_ms)) {
slot.stop = STOP_TYPE_LIMIT;
slot.has_next_token = false;
SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.params.t_max_predict_ms);
}
}
// if context shift is disabled, we stop when it reaches the context limit
@@ -2694,8 +2634,8 @@ struct server_context {
res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size();
res->t_start = metrics.t_start;
res->kv_cache_tokens_count = llama_kv_self_n_tokens(ctx);
res->kv_cache_used_cells = llama_kv_self_used_cells(ctx);
res->kv_cache_tokens_count = llama_get_kv_cache_token_count(ctx);
res->kv_cache_used_cells = llama_get_kv_cache_used_cells(ctx);
res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total;
res->t_prompt_processing_total = metrics.t_prompt_processing_total;
@@ -2811,7 +2751,7 @@ struct server_context {
// Erase token cache
const size_t n_erased = slot->cache_tokens.size();
llama_kv_self_seq_rm(ctx, slot->id, -1, -1);
llama_kv_cache_seq_rm(ctx, slot->id, -1, -1);
slot->cache_tokens.clear();
auto res = std::make_unique<server_task_result_slot_erase>();
@@ -2879,8 +2819,8 @@ struct server_context {
SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);
llama_kv_self_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
llama_kv_self_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard);
llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard);
if (slot.params.cache_prompt) {
for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
@@ -3071,8 +3011,8 @@ struct server_context {
const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c;
llama_kv_self_seq_rm (ctx, slot.id, head_p, head_c);
llama_kv_self_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift);
llama_kv_cache_seq_rm (ctx, slot.id, head_p, head_c);
llama_kv_cache_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift);
for (size_t i = 0; i < n_match; i++) {
slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
@@ -3110,9 +3050,9 @@ struct server_context {
}
// keep only the common part
if (!llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1)) {
if (!llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1)) {
// could not partially delete (likely using a non-Transformer model)
llama_kv_self_seq_rm(ctx, slot.id, -1, -1);
llama_kv_cache_seq_rm(ctx, slot.id, -1, -1);
// there is no common part left
slot.n_past = 0;
@@ -3324,9 +3264,6 @@ struct server_context {
llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, slot.cache_tokens, id);
// keep track of total number of tokens generated in the draft
slot.n_draft_total += draft.size();
// ignore small drafts
if (slot.params.speculative.n_min > (int) draft.size()) {
SLT_DBG(slot, "ignoring small draft: %d < %d\n", (int) draft.size(), slot.params.speculative.n_min);
@@ -3352,13 +3289,10 @@ struct server_context {
slot.n_past += ids.size();
slot.n_decoded += ids.size();
// update how many tokens out of draft was accepted
slot.n_draft_accepted += ids.size() - 1;
slot.cache_tokens.push_back(id);
slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1);
llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1);
llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1);
for (size_t i = 0; i < ids.size(); ++i) {
completion_token_output result;
@@ -4499,24 +4433,15 @@ int main(int argc, char ** argv) {
llama_backend_free();
};
// bind HTTP listen port
bool was_bound = false;
if (string_ends_with(std::string(params.hostname), ".sock")) {
LOG_INF("%s: setting address family to AF_UNIX\n", __func__);
svr->set_address_family(AF_UNIX);
// bind_to_port requires a second arg, any value other than 0 should
// simply get ignored
was_bound = svr->bind_to_port(params.hostname, 8080);
} else {
LOG_INF("%s: binding port with default address family\n", __func__);
// bind HTTP listen port
if (params.port == 0) {
int bound_port = svr->bind_to_any_port(params.hostname);
if ((was_bound = (bound_port >= 0))) {
params.port = bound_port;
}
} else {
was_bound = svr->bind_to_port(params.hostname, params.port);
if (params.port == 0) {
int bound_port = svr->bind_to_any_port(params.hostname);
if ((was_bound = (bound_port >= 0))) {
params.port = bound_port;
}
} else {
was_bound = svr->bind_to_port(params.hostname, params.port);
}
if (!was_bound) {

View File

@@ -92,7 +92,6 @@ def do_test_completion_with_required_tool_tiny(server: ServerProcess, tool: dict
assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}'
tool_call = tool_calls[0]
assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}'
assert len(tool_call.get("id", "")) > 0, f'Expected non empty tool call id in {tool_call}'
expected_function_name = "python" if tool["type"] == "code_interpreter" else tool["function"]["name"]
assert expected_function_name == tool_call["function"]["name"]
actual_arguments = tool_call["function"]["arguments"]
@@ -374,7 +373,6 @@ def do_test_weather(server: ServerProcess, **kwargs):
tool_call = tool_calls[0]
# assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}'
assert tool_call["function"]["name"] == WEATHER_TOOL["function"]["name"], f'Expected weather tool call, got {tool_call["function"]["name"]}'
assert len(tool_call.get("id", "")) > 0, f'Expected non empty tool call id in {tool_call}'
actual_arguments = json.loads(tool_call["function"]["arguments"])
assert 'location' in actual_arguments, f"location not found in {json.dumps(actual_arguments)}"
location = actual_arguments["location"]
@@ -598,7 +596,6 @@ def do_test_hello_world(server: ServerProcess, **kwargs):
tool_call = tool_calls[0]
# assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}'
assert tool_call["function"]["name"] == PYTHON_TOOL["function"]["name"]
assert len(tool_call.get("id", "")) > 0, f'Expected non empty tool call id in {tool_call}'
actual_arguments = json.loads(tool_call["function"]["arguments"])
assert 'code' in actual_arguments, f"code not found in {json.dumps(actual_arguments)}"
code = actual_arguments["code"]

View File

@@ -302,7 +302,7 @@ class ServerPreset:
server.model_hf_repo = "ggml-org/models"
server.model_hf_file = "tinyllamas/stories260K.gguf"
server.model_alias = "tinyllama-2"
server.n_ctx = 512
server.n_ctx = 256
server.n_batch = 32
server.n_slots = 2
server.n_predict = 64

View File

@@ -435,10 +435,6 @@ static std::string gen_chatcmplid() {
return "chatcmpl-" + random_string();
}
static std::string gen_tool_call_id() {
return random_string();
}
//
// other common utils
//
@@ -621,9 +617,7 @@ static json oaicompat_completion_params_parse(
llama_params["chat_format"] = static_cast<int>(chat_params.format);
llama_params["prompt"] = chat_params.prompt;
if (!chat_params.grammar.empty()) {
llama_params["grammar"] = chat_params.grammar;
}
llama_params["grammar"] = chat_params.grammar;
llama_params["grammar_lazy"] = chat_params.grammar_lazy;
auto grammar_triggers = json::array();
for (const auto & trigger : chat_params.grammar_triggers) {

View File

@@ -99,9 +99,13 @@ export default function ChatScreen() {
canvasData,
replaceMessageAndGenerate,
} = useAppContext();
const textarea = useOptimizedTextarea(prefilledMsg.content());
const [inputMsg, setInputMsg] = useState(prefilledMsg.content());
const inputRef = useRef<HTMLTextAreaElement>(null);
const { extraContext, clearExtraContext } = useVSCodeContext(textarea);
const { extraContext, clearExtraContext } = useVSCodeContext(
inputRef,
setInputMsg
);
// TODO: improve this when we have "upload file" feature
const currExtra: Message['extra'] = extraContext ? [extraContext] : undefined;
@@ -131,10 +135,9 @@ export default function ChatScreen() {
};
const sendNewMessage = async () => {
const lastInpMsg = textarea.value();
if (lastInpMsg.trim().length === 0 || isGenerating(currConvId ?? ''))
return;
textarea.setValue('');
if (inputMsg.trim().length === 0 || isGenerating(currConvId ?? '')) return;
const lastInpMsg = inputMsg;
setInputMsg('');
scrollToBottom(false);
setCurrNodeId(-1);
// get the last message node
@@ -143,13 +146,13 @@ export default function ChatScreen() {
!(await sendMessage(
currConvId,
lastMsgNodeId,
lastInpMsg,
inputMsg,
currExtra,
onChunk
))
) {
// restore the input message if failed
textarea.setValue(lastInpMsg);
setInputMsg(lastInpMsg);
}
// OK
clearExtraContext();
@@ -192,13 +195,16 @@ export default function ChatScreen() {
// send the prefilled message if needed
sendNewMessage();
} else {
// otherwise, focus on the input
textarea.focus();
// otherwise, focus on the input and move the cursor to the end
if (inputRef.current) {
inputRef.current.focus();
inputRef.current.selectionStart = inputRef.current.value.length;
}
}
prefilledMsg.clear();
// no need to keep track of sendNewMessage
// eslint-disable-next-line react-hooks/exhaustive-deps
}, [textarea.ref]);
}, [inputRef]);
// due to some timing issues of StorageUtils.appendMsg(), we need to make sure the pendingMsg is not duplicated upon rendering (i.e. appears once in the saved conversation and once in the pendingMsg)
const pendingMsgDisplay: MessageDisplay[] =
@@ -252,7 +258,9 @@ export default function ChatScreen() {
<textarea
className="textarea textarea-bordered w-full"
placeholder="Type a message (Shift+Enter to add a new line)"
ref={textarea.ref}
ref={inputRef}
value={inputMsg}
onChange={(e) => setInputMsg(e.target.value)}
onKeyDown={(e) => {
if (e.nativeEvent.isComposing || e.keyCode === 229) return;
if (e.key === 'Enter' && e.shiftKey) return;
@@ -272,7 +280,11 @@ export default function ChatScreen() {
Stop
</button>
) : (
<button className="btn btn-primary ml-2" onClick={sendNewMessage}>
<button
className="btn btn-primary ml-2"
onClick={sendNewMessage}
disabled={inputMsg.trim().length === 0}
>
Send
</button>
)}
@@ -286,43 +298,3 @@ export default function ChatScreen() {
</div>
);
}
export interface OptimizedTextareaValue {
value: () => string;
setValue: (value: string) => void;
focus: () => void;
ref: React.RefObject<HTMLTextAreaElement>;
}
// This is a workaround to prevent the textarea from re-rendering when the inner content changes
// See https://github.com/ggml-org/llama.cpp/pull/12299
function useOptimizedTextarea(initValue: string): OptimizedTextareaValue {
const [savedInitValue, setSavedInitValue] = useState<string>(initValue);
const textareaRef = useRef<HTMLTextAreaElement>(null);
useEffect(() => {
if (textareaRef.current && savedInitValue) {
textareaRef.current.value = savedInitValue;
setSavedInitValue('');
}
}, [textareaRef, savedInitValue, setSavedInitValue]);
return {
value: () => {
return textareaRef.current?.value ?? savedInitValue;
},
setValue: (value: string) => {
if (textareaRef.current) {
textareaRef.current.value = value;
}
},
focus: () => {
if (textareaRef.current) {
// focus and move the cursor to the end
textareaRef.current.focus();
textareaRef.current.selectionStart = textareaRef.current.value.length;
}
},
ref: textareaRef,
};
}

View File

@@ -1,6 +1,5 @@
import { useEffect, useState } from 'react';
import { MessageExtraContext } from './types';
import { OptimizedTextareaValue } from '../components/ChatScreen';
// Extra context when using llama.cpp WebUI from llama-vscode, inside an iframe
// Ref: https://github.com/ggml-org/llama.cpp/pull/11940
@@ -15,7 +14,10 @@ interface SetTextEvData {
* window.postMessage({ command: 'setText', text: 'Spot the syntax error', context: 'def test()\n return 123' }, '*');
*/
export const useVSCodeContext = (textarea: OptimizedTextareaValue) => {
export const useVSCodeContext = (
inputRef: React.RefObject<HTMLTextAreaElement>,
setInputMsg: (text: string) => void
) => {
const [extraContext, setExtraContext] = useState<MessageExtraContext | null>(
null
);
@@ -25,20 +27,20 @@ export const useVSCodeContext = (textarea: OptimizedTextareaValue) => {
const handleMessage = (event: MessageEvent) => {
if (event.data?.command === 'setText') {
const data: SetTextEvData = event.data;
textarea.setValue(data?.text);
setInputMsg(data?.text);
if (data?.context && data.context.length > 0) {
setExtraContext({
type: 'context',
content: data.context,
});
}
textarea.focus();
inputRef.current?.focus();
}
};
window.addEventListener('message', handleMessage);
return () => window.removeEventListener('message', handleMessage);
}, [textarea]);
}, [inputRef, setInputMsg]);
// Add a keydown listener that sends the "escapePressed" message to the parent window
useEffect(() => {

View File

@@ -98,7 +98,7 @@ int main(int argc, char ** argv) {
auto generate = [&](const std::string & prompt) {
std::string response;
const bool is_first = llama_kv_self_used_cells(ctx) == 0;
const bool is_first = llama_get_kv_cache_used_cells(ctx) == 0;
// tokenize the prompt
const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
@@ -113,7 +113,7 @@ int main(int argc, char ** argv) {
while (true) {
// check if we have enough space in the context to evaluate this batch
int n_ctx = llama_n_ctx(ctx);
int n_ctx_used = llama_kv_self_used_cells(ctx);
int n_ctx_used = llama_get_kv_cache_used_cells(ctx);
if (n_ctx_used + batch.n_tokens > n_ctx) {
printf("\033[0m\n");
fprintf(stderr, "context size exceeded\n");

View File

@@ -217,7 +217,7 @@ int main(int argc, char ** argv) {
{
LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past);
llama_kv_self_seq_rm(ctx_tgt, 0, n_past, -1);
llama_kv_cache_seq_rm(ctx_tgt, 0, n_past, -1);
}
if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {

View File

@@ -331,11 +331,11 @@ int main(int argc, char ** argv) {
}
active_seqs.erase(s);
for (int i = 0; i < n_seq_dft; i++) {
for(int i = 0; i < n_seq_dft; i++) {
if (i == s) {
continue;
}
if (drafts[i].active && drafts[i].tokens[i_dft] == drafts[s].tokens[i_dft]) {
if (drafts[i].tokens[i_dft] == drafts[s].tokens[i_dft]) {
// synchronize active status for sequences with the same drafted token
drafts[i].active = drafts[i].active && accept;
if (!drafts[i].active) {
@@ -420,14 +420,14 @@ int main(int argc, char ** argv) {
{
LOG_DBG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft);
llama_kv_self_seq_keep(ctx_dft, s_keep);
llama_kv_self_seq_cp (ctx_dft, s_keep, 0, -1, -1);
llama_kv_self_seq_keep(ctx_dft, 0);
llama_kv_cache_seq_keep(ctx_dft, s_keep);
llama_kv_cache_seq_cp (ctx_dft, s_keep, 0, -1, -1);
llama_kv_cache_seq_keep(ctx_dft, 0);
llama_kv_self_seq_rm (ctx_tgt, s_keep, n_past_tgt, -1);
llama_kv_self_seq_keep(ctx_tgt, s_keep);
llama_kv_self_seq_cp (ctx_tgt, s_keep, 0, -1, -1);
llama_kv_self_seq_keep(ctx_tgt, 0);
llama_kv_cache_seq_rm (ctx_tgt, s_keep, n_past_tgt, -1);
llama_kv_cache_seq_keep(ctx_tgt, s_keep);
llama_kv_cache_seq_cp (ctx_tgt, s_keep, 0, -1, -1);
llama_kv_cache_seq_keep(ctx_tgt, 0);
}
for (int s = 0; s < n_seq_dft; ++s) {
@@ -444,7 +444,7 @@ int main(int argc, char ** argv) {
common_batch_clear(batch_dft);
common_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true);
llama_kv_self_seq_rm(ctx_dft, 0, n_past_dft, -1);
llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
// LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
llama_decode(ctx_dft, batch_dft);
@@ -503,8 +503,8 @@ int main(int argc, char ** argv) {
if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_draft_split) {
LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur);
llama_kv_self_seq_rm(ctx_dft, n_seq_cur, -1, -1);
llama_kv_self_seq_cp(ctx_dft, s, n_seq_cur, -1, -1);
llama_kv_cache_seq_rm(ctx_dft, n_seq_cur, -1, -1);
llama_kv_cache_seq_cp(ctx_dft, s, n_seq_cur, -1, -1);
// all previous tokens from this branch are now also part of the new branch
for (int t = 0; t < batch_tgt.n_tokens; ++t) {
@@ -585,9 +585,9 @@ int main(int argc, char ** argv) {
// evaluate the target model on the drafted tokens
{
llama_kv_self_seq_keep(ctx_tgt, 0);
llama_kv_cache_seq_keep(ctx_tgt, 0);
for (int s = 1; s < n_seq_dft; ++s) {
llama_kv_self_seq_cp(ctx_tgt, 0, s, -1, -1);
llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1);
}
// LOG_DBG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());

View File

@@ -87,11 +87,11 @@ struct wav_header {
uint32_t data_size;
};
static bool save_wav16(const std::string & fname, const std::vector<float> & data, int sample_rate) {
static void save_wav16(const std::string & fname, const std::vector<float> & data, int sample_rate) {
std::ofstream file(fname, std::ios::binary);
if (!file) {
LOG_ERR("%s: Failed to open file '%s' for writing.\n", __func__, fname.c_str());
return false;
LOG_ERR("%s: Failed to open file '%s' for writing", __func__, fname.c_str());
return;
}
wav_header header;
@@ -108,7 +108,7 @@ static bool save_wav16(const std::string & fname, const std::vector<float> & dat
file.write(reinterpret_cast<const char*>(&pcm_sample), sizeof(pcm_sample));
}
return file.good();
file.close();
}
static void fill_hann_window(int length, bool periodic, float * output) {
@@ -536,7 +536,6 @@ static std::string audio_data_from_speaker(json speaker, const outetts_version t
int main(int argc, char ** argv) {
common_params params;
params.out_file = "output.wav";
params.prompt = "";
params.n_predict = 4096;
@@ -571,10 +570,6 @@ int main(int argc, char ** argv) {
model_ttc = llama_init_ttc.model.get();
ctx_ttc = llama_init_ttc.context.get();
if (model_ttc == nullptr || ctx_ttc == nullptr) {
return ENOENT;
}
const llama_vocab * vocab = llama_model_get_vocab(model_ttc);
// TODO: refactor in a common struct
@@ -590,10 +585,6 @@ int main(int argc, char ** argv) {
model_cts = llama_init_cts.model.get();
ctx_cts = llama_init_cts.context.get();
if (model_cts == nullptr || ctx_cts == nullptr) {
return ENOENT;
}
std::vector<common_sampler *> smpl(n_parallel);
for (int i = 0; i < n_parallel; ++i) {
params.sampling.no_perf = (i != 0);
@@ -699,13 +690,11 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
const std::string voice_data = audio_data;
auto tmp = common_tokenize(vocab, voice_data, false, true);
std::ostringstream tokens_oss;
printf("\n\n");
for (size_t i = 0; i < tmp.size(); ++i) {
tokens_oss << tmp[i] << ", ";
printf("%d, ", tmp[i]);
}
LOG_INF("\n\n%s: llama tokens: %s\n\n", __func__, tokens_oss.str().c_str());
printf("\n\n");
prompt_add(prompt_inp, tmp);
#else
prompt_add(prompt_inp, llama_tokens {
@@ -1071,6 +1060,8 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
}
#endif
const std::string fname = "output.wav";
const int n_sr = 24000; // sampling rate
// zero out first 0.25 seconds
@@ -1081,15 +1072,11 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
LOG_INF("%s: time for spectral ops: %.3f ms\n", __func__, (ggml_time_us() - t_spec_start) / 1000.0f);
LOG_INF("%s: total time: %.3f ms\n", __func__, (ggml_time_us() - t_main_start) / 1000.0f);
int retval = 0;
save_wav16(fname, audio, n_sr);
if (save_wav16(params.out_file, audio, n_sr)) {
LOG_INF("%s: audio written to file '%s'\n", __func__, params.out_file.c_str());
} else {
retval = ENOENT;
}
LOG_INF("%s: audio written to file '%s'\n", __func__, fname.c_str());
llama_backend_free();
return retval;
return 0;
}

View File

@@ -100,10 +100,6 @@ else()
set(INS_ENB ON)
endif()
message(DEBUG "GGML_NATIVE : ${GGML_NATIVE}")
message(DEBUG "GGML_NATIVE_DEFAULT : ${GGML_NATIVE_DEFAULT}")
message(DEBUG "INS_ENB : ${INS_ENB}")
option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
option(GGML_CPU_KLEIDIAI "ggml: use KleidiAI optimized kernels if applicable" OFF)
@@ -127,12 +123,10 @@ endif()
option(GGML_LASX "ggml: enable lasx" ON)
option(GGML_LSX "ggml: enable lsx" ON)
option(GGML_RVV "ggml: enable rvv" ON)
option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
option(GGML_VXE "ggml: enable vxe" ON)
option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
set(GGML_CPU_POWERPC_CPUTYPE "" CACHE STRING "ggml: CPU type for PowerPC")
set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
if (WIN32)
@@ -192,7 +186,6 @@ option(GGML_OPENMP "ggml: use OpenMP"
option(GGML_RPC "ggml: use RPC" OFF)
option(GGML_SYCL "ggml: use SYCL" OFF)
option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
option(GGML_SYCL_GRAPH "ggml: enable graphs in the SYCL backend" ON)
set (GGML_SYCL_TARGET "INTEL" CACHE STRING
"ggml: sycl target device")
set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
@@ -202,8 +195,6 @@ option(GGML_OPENCL "ggml: use OpenCL"
option(GGML_OPENCL_PROFILING "ggml: use OpenCL profiling (increases overhead)" OFF)
option(GGML_OPENCL_EMBED_KERNELS "ggml: embed kernels" ON)
option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adreno" ON)
set (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING
"gmml: OpenCL API version to target")
# toolchain for vulkan-shaders-gen
set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")

View File

@@ -1,22 +0,0 @@
find_package(Git)
# the commit's SHA1
execute_process(COMMAND
"${GIT_EXECUTABLE}" describe --match=NeVeRmAtCh --always --abbrev=8
WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
OUTPUT_VARIABLE GIT_SHA1
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
# the date of the commit
execute_process(COMMAND
"${GIT_EXECUTABLE}" log -1 --format=%ad --date=local
WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
OUTPUT_VARIABLE GIT_DATE
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
# the subject of the commit
execute_process(COMMAND
"${GIT_EXECUTABLE}" log -1 --format=%s
WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
OUTPUT_VARIABLE GIT_COMMIT_SUBJECT
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)

View File

@@ -1,26 +0,0 @@
function(ggml_get_flags CCID CCVER)
set(C_FLAGS "")
set(CXX_FLAGS "")
if (CCID MATCHES "Clang")
set(C_FLAGS -Wunreachable-code-break -Wunreachable-code-return)
set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi)
if (
(CCID STREQUAL "Clang" AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR
(CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
)
list(APPEND C_FLAGS -Wdouble-promotion)
endif()
elseif (CCID STREQUAL "GNU")
set(C_FLAGS -Wdouble-promotion)
set(CXX_FLAGS -Wno-array-bounds)
if (CCVER VERSION_GREATER_EQUAL 8.1.0)
list(APPEND CXX_FLAGS -Wextra-semi)
endif()
endif()
set(GF_C_FLAGS ${C_FLAGS} PARENT_SCOPE)
set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE)
endfunction()

View File

@@ -5,7 +5,7 @@
set_and_check(GGML_INCLUDE_DIR "@PACKAGE_GGML_INCLUDE_INSTALL_DIR@")
set_and_check(GGML_LIB_DIR "@PACKAGE_GGML_LIB_INSTALL_DIR@")
#set_and_check(GGML_BIN_DIR "@PACKAGE_GGML_BIN_INSTALL_DIR@")
set_and_check(GGML_BIN_DIR "@PACKAGE_GGML_BIN_INSTALL_DIR@")
find_package(Threads REQUIRED)

View File

@@ -17,9 +17,7 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const c
GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint,
const char * cache_dir,
size_t free_mem, size_t total_mem);
GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);

View File

@@ -454,7 +454,6 @@ extern "C" {
GGML_OP_RMS_NORM,
GGML_OP_RMS_NORM_BACK,
GGML_OP_GROUP_NORM,
GGML_OP_L2_NORM,
GGML_OP_MUL_MAT,
GGML_OP_MUL_MAT_ID,
@@ -503,7 +502,6 @@ extern "C" {
GGML_OP_ADD_REL_POS,
GGML_OP_RWKV_WKV6,
GGML_OP_GATED_LINEAR_ATTN,
GGML_OP_RWKV_WKV7,
GGML_OP_UNARY,
@@ -1097,18 +1095,6 @@ extern "C" {
int n_groups,
float eps);
// l2 normalize along rows
// used in rwkv v7
GGML_API struct ggml_tensor * ggml_l2_norm(
struct ggml_context * ctx,
struct ggml_tensor * a,
float eps);
GGML_API struct ggml_tensor * ggml_l2_norm_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
float eps);
// a - x
// b - dy
GGML_API struct ggml_tensor * ggml_rms_norm_back(
@@ -1791,11 +1777,11 @@ extern "C" {
#define GGML_KQ_MASK_PAD 64
// q: [n_embd_k, n_batch, n_head, 1]
// k: [n_embd_k, n_kv, n_head_kv, 1]
// v: [n_embd_v, n_kv, n_head_kv, 1] !! not transposed !!
// mask: [n_kv, n_batch_pad, 1, 1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
// res: [n_embd_v, n_head, n_batch, 1] !! permuted !!
// q: [n_embd, n_batch, n_head, 1]
// k: [n_embd, n_kv, n_head_kv, 1]
// v: [n_embd, n_kv, n_head_kv, 1] !! not transposed !!
// mask: [n_kv, n_batch_pad, 1, 1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
// res: [n_embd, n_head, n_batch, 1] !! permuted !!
GGML_API struct ggml_tensor * ggml_flash_attn_ext(
struct ggml_context * ctx,
struct ggml_tensor * q,
@@ -1904,16 +1890,6 @@ extern "C" {
struct ggml_tensor * state,
float scale);
GGML_API struct ggml_tensor * ggml_rwkv_wkv7(
struct ggml_context * ctx,
struct ggml_tensor * r,
struct ggml_tensor * w,
struct ggml_tensor * k,
struct ggml_tensor * v,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * state);
// custom operators
typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);

View File

@@ -1,5 +1,4 @@
include(CheckCXXCompilerFlag)
include("../cmake/common.cmake")
add_compile_definitions(GGML_SCHED_MAX_COPIES=${GGML_SCHED_MAX_COPIES})
@@ -25,6 +24,33 @@ if (NOT MSVC)
endif()
endif()
function(ggml_get_flags CCID CCVER)
set(C_FLAGS "")
set(CXX_FLAGS "")
if (CCID MATCHES "Clang")
set(C_FLAGS -Wunreachable-code-break -Wunreachable-code-return)
set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi)
if (
(CCID STREQUAL "Clang" AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR
(CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
)
list(APPEND C_FLAGS -Wdouble-promotion)
endif()
elseif (CCID STREQUAL "GNU")
set(C_FLAGS -Wdouble-promotion)
set(CXX_FLAGS -Wno-array-bounds)
if (CCVER VERSION_GREATER_EQUAL 8.1.0)
list(APPEND CXX_FLAGS -Wextra-semi)
endif()
endif()
set(GF_C_FLAGS ${C_FLAGS} PARENT_SCOPE)
set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE)
endfunction()
if (GGML_FATAL_WARNINGS)
if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
list(APPEND C_FLAGS -Werror)
@@ -65,7 +91,7 @@ if (GGML_LTO)
endif()
endif()
if (GGML_CCACHE AND NOT CMAKE_C_COMPILER_LAUNCHER AND NOT CMAKE_CXX_COMPILER_LAUNCHER)
if (GGML_CCACHE)
find_program(GGML_CCACHE_FOUND ccache)
find_program(GGML_SCCACHE_FOUND sccache)
@@ -76,11 +102,7 @@ if (GGML_CCACHE AND NOT CMAKE_C_COMPILER_LAUNCHER AND NOT CMAKE_CXX_COMPILER_LAU
set(GGML_CCACHE_VARIANT sccache)
endif()
# TODO: should not be set globally
if (GGML_SYCL AND GGML_CCACHE_FOUND AND WIN32)
set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "ccache compiler_type=icl")
else ()
set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${GGML_CCACHE_VARIANT}")
endif ()
set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${GGML_CCACHE_VARIANT}")
set(ENV{CCACHE_SLOPPINESS} time_macros)
message(STATUS "${GGML_CCACHE_VARIANT} found, compilation results will be cached. Disable with GGML_CCACHE=OFF.")
else()
@@ -329,10 +351,6 @@ if (CMAKE_SYSTEM_NAME MATCHES "Android")
target_link_libraries(ggml-base PRIVATE dl)
endif()
if(CMAKE_SYSTEM_NAME MATCHES "visionOS")
target_compile_definitions(ggml-base PUBLIC _DARWIN_C_SOURCE)
endif()
if (BUILD_SHARED_LIBS)
foreach (target ggml-base ggml)
set_target_properties(${target} PROPERTIES POSITION_INDEPENDENT_CODE ON)

View File

@@ -76,14 +76,7 @@ namespace fs = std::filesystem;
static std::string path_str(const fs::path & path) {
std::string u8path;
try {
#if defined(__cpp_lib_char8_t)
// C++20 and later: u8string() returns std::u8string
std::u8string u8str = path.u8string();
u8path = std::string(reinterpret_cast<const char*>(u8str.c_str()));
#else
// C++17: u8string() returns std::string
u8path = path.u8string();
#endif
} catch (...) {
}
return u8path;
@@ -497,7 +490,7 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
search_paths.push_back(get_executable_path());
search_paths.push_back(fs::current_path());
} else {
search_paths.push_back(fs::u8path(user_search_path));
search_paths.push_back(user_search_path);
}
int best_score = 0;
@@ -511,9 +504,9 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
for (const auto & entry : dir_it) {
if (entry.is_regular_file()) {
auto filename = entry.path().filename();
auto ext = entry.path().extension();
if (filename.native().find(file_prefix) == 0 && ext == file_extension) {
auto filename = entry.path().filename().native();
auto ext = entry.path().extension().native();
if (filename.find(file_prefix) == 0 && ext == file_extension) {
dl_handle_ptr handle { dl_load_library(entry) };
if (!handle && !silent) {
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path_str(entry.path()).c_str());
@@ -544,7 +537,7 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
// try to load the base backend
for (const auto & search_path : search_paths) {
fs::path filename = backend_filename_prefix().native() + name_path.native() + backend_filename_extension().native();
fs::path path = search_path / filename;
fs::path path = search_path.native() + filename.native();
if (fs::exists(path)) {
return get_reg().load_backend(path, silent);
}

View File

@@ -0,0 +1,168 @@
---
Language: Cpp
# BasedOnStyle: Google
AccessModifierOffset: -1
AlignAfterOpenBracket: Align
AlignConsecutiveMacros: false
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
AlignEscapedNewlines: Left
AlignOperands: true
AlignTrailingComments: true
AllowAllArgumentsOnNextLine: true
AllowAllConstructorInitializersOnNextLine: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: Never
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: All
AllowShortLambdasOnASingleLine: All
AllowShortIfStatementsOnASingleLine: WithoutElse
AllowShortLoopsOnASingleLine: true
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: Yes
BinPackArguments: true
BinPackParameters: true
BraceWrapping:
AfterCaseLabel: false
AfterClass: false
AfterControlStatement: false
AfterEnum: false
AfterFunction: false
AfterNamespace: false
AfterObjCDeclaration: false
AfterStruct: false
AfterUnion: false
AfterExternBlock: false
BeforeCatch: false
BeforeElse: false
IndentBraces: false
SplitEmptyFunction: true
SplitEmptyRecord: true
SplitEmptyNamespace: true
BreakBeforeBinaryOperators: None
BreakBeforeBraces: Attach
BreakBeforeInheritanceComma: false
BreakInheritanceList: BeforeColon
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
BreakConstructorInitializers: BeforeColon
BreakAfterJavaFieldAnnotations: false
BreakStringLiterals: true
ColumnLimit: 80
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerAllOnOneLineOrOnePerLine: true
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DeriveLineEnding: true
DerivePointerAlignment: true
DisableFormat: false
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
ForEachMacros:
- foreach
- Q_FOREACH
- BOOST_FOREACH
IncludeBlocks: Regroup
IncludeCategories:
- Regex: '^<ext/.*\.h>'
Priority: 2
SortPriority: 0
- Regex: '^<.*\.h>'
Priority: 1
SortPriority: 0
- Regex: '^<.*'
Priority: 2
SortPriority: 0
- Regex: '.*'
Priority: 3
SortPriority: 0
IncludeIsMainRegex: '([-_](test|unittest))?$'
IncludeIsMainSourceRegex: ''
IndentCaseLabels: true
IndentGotoLabels: true
IndentPPDirectives: None
IndentWidth: 4
IndentWrappedFunctionNames: false
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: false
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBinPackProtocolList: Never
ObjCBlockIndentWidth: 2
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: true
PenaltyBreakAssignment: 2
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyBreakTemplateDeclaration: 10
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Left
RawStringFormats:
- Language: Cpp
Delimiters:
- cc
- CC
- cpp
- Cpp
- CPP
- 'c++'
- 'C++'
CanonicalDelimiter: ''
BasedOnStyle: google
- Language: TextProto
Delimiters:
- pb
- PB
- proto
- PROTO
EnclosingFunctions:
- EqualsProto
- EquivToProto
- PARSE_PARTIAL_TEXT_PROTO
- PARSE_TEST_PROTO
- PARSE_TEXT_PROTO
- ParseTextOrDie
- ParseTextProtoOrDie
CanonicalDelimiter: ''
BasedOnStyle: google
ReflowComments: true
SortIncludes: true
SortUsingDeclarations: true
SpaceAfterCStyleCast: false
SpaceAfterLogicalNot: false
SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true
SpaceBeforeCpp11BracedList: false
SpaceBeforeCtorInitializerColon: true
SpaceBeforeInheritanceColon: true
SpaceBeforeParens: ControlStatements
SpaceBeforeRangeBasedForLoopColon: true
SpaceInEmptyBlock: false
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 2
SpacesInAngles: false
SpacesInConditionalStatement: false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
SpaceBeforeSquareBrackets: false
Standard: Auto
StatementMacros:
- Q_UNUSED
- QT_REQUIRE_VERSION
TabWidth: 8
UseCRLF: false
UseTab: Never
...

View File

@@ -2790,14 +2790,10 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
(char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
output_ne_offset);
int64_t antiquantGroupSize = 0;
if (src0->ne[0] > QK8_0) {
antiquantGroupSize = QK8_0;
}
ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr,
nullptr, nullptr, nullptr, antiquantGroupSize, acl_output_tensor,
nullptr, nullptr, nullptr, QK8_0, acl_output_tensor,
&workspaceSize, &executor));
if (workspaceAddr == nullptr) {
workspaceAddr = workspace_allocator.alloc(workspaceSize);
@@ -2837,7 +2833,7 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
nullptr, nullptr, nullptr, nullptr, antiquantGroupSize,
nullptr, nullptr, nullptr, nullptr, QK8_0,
acl_output_tensor, &workspaceSize, &executor));
ACL_CHECK(aclnnWeightQuantBatchMatmulV2(
workspaceAddr, workspaceSize, executor, ctx.stream()));

View File

@@ -1689,6 +1689,11 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
case GGML_OP_MUL_MAT: {
switch (op->src[0]->type) {
case GGML_TYPE_Q8_0:
// Current groupsize should not be greater than k-1 in
// aclnnWeightQuantBatchMatmulV2GetWorkspaceSize
if (op->src[0]->ne[0] <= QK8_0) {
return false;
}
case GGML_TYPE_F16:
case GGML_TYPE_F32:
case GGML_TYPE_Q4_0:

View File

@@ -158,12 +158,6 @@ typedef sycl::half2 ggml_half2;
#endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
#ifdef _MSC_VER
#define GGML_EXTENSION
#else // _MSC_VER
#define GGML_EXTENSION __extension__
#endif // _MSC_VER
#define QK4_0 32
typedef struct {
ggml_half d; // delta
@@ -173,7 +167,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_half) + QK4_0 / 2, "wrong q4_0 b
#define QK4_1 32
typedef struct {
GGML_EXTENSION union {
union {
struct {
ggml_half d; // delta
ggml_half m; // min
@@ -194,7 +188,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_half) + sizeof(uint32_t) + QK5_0
#define QK5_1 32
typedef struct {
GGML_EXTENSION union {
union {
struct {
ggml_half d; // delta
ggml_half m; // min
@@ -215,7 +209,7 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_half) + QK8_0, "wrong q8_0 block
#define QK8_1 32
typedef struct {
GGML_EXTENSION union {
union {
struct {
ggml_half d; // delta
ggml_half s; // d * sum(qs[i])
@@ -256,7 +250,7 @@ static_assert(sizeof(block_tq2_0) == sizeof(ggml_half) + QK_K / 4, "wrong tq2_0
typedef struct {
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
uint8_t qs[QK_K/4]; // quants
GGML_EXTENSION union {
union {
struct {
ggml_half d; // super-block scale for quantized scales
ggml_half dmin; // super-block scale for quantized mins
@@ -283,7 +277,7 @@ static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 12
// weight is represented as x = a * q + b
// Effectively 4.5 bits per weight
typedef struct {
GGML_EXTENSION union {
union {
struct {
ggml_half d; // super-block scale for quantized scales
ggml_half dmin; // super-block scale for quantized mins
@@ -300,7 +294,7 @@ static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2,
// weight is represented as x = a * q + b
// Effectively 5.5 bits per weight
typedef struct {
GGML_EXTENSION union {
union {
struct {
ggml_half d; // super-block scale for quantized scales
ggml_half dmin; // super-block scale for quantized mins

View File

@@ -23,11 +23,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
ggml-cpu/amx/mmq.cpp
ggml-cpu/amx/mmq.h
ggml-cpu/ggml-cpu-impl.h
ggml-cpu/common.h
ggml-cpu/binary-ops.h
ggml-cpu/binary-ops.cpp
ggml-cpu/unary-ops.h
ggml-cpu/unary-ops.cpp
)
target_compile_features(${GGML_CPU_NAME} PRIVATE c_std_11 cxx_std_17)
@@ -292,31 +287,17 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
endif()
endif()
endif()
elseif ("${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "ppc64le " OR "${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "powerpc ")
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
message(STATUS "PowerPC detected")
if (GGML_NATIVE)
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
file(READ "/proc/cpuinfo" POWER10_M)
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "powerpc")
execute_process(COMMAND bash -c "prtconf |grep 'Implementation' | head -n 1" OUTPUT_VARIABLE POWER10_M)
endif()
string(REGEX MATCHALL "POWER *([0-9]+)" MATCHED_STRING "${POWER10_M}")
string(REGEX REPLACE "POWER *([0-9]+)" "\\1" EXTRACTED_NUMBER "${MATCHED_STRING}")
if (EXTRACTED_NUMBER GREATER_EQUAL 10)
list(APPEND ARCH_FLAGS -mcpu=power10 -mpowerpc64)
elseif (EXTRACTED_NUMBER EQUAL 9)
list(APPEND ARCH_FLAGS -mcpu=power9 -mpowerpc64)
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
list(APPEND ARCH_FLAGS -mcpu=powerpc64le -mtune=native)
else()
list(APPEND ARCH_FLAGS -mcpu=native -mtune=native -mpowerpc64)
endif()
execute_process(COMMAND bash -c "grep POWER /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER_M)
if (${POWER_M} MATCHES "POWER10")
list(APPEND ARCH_FLAGS -mcpu=power10)
elseif (${POWER_M} MATCHES "POWER9")
list(APPEND ARCH_FLAGS -mcpu=power9)
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
list(APPEND ARCH_FLAGS -mcpu=powerpc64le -mtune=native)
else()
if (GGML_CPU_POWERPC_CPUTYPE)
list(APPEND ARCH_FLAGS -mcpu=${GGML_CPU_POWERPC_CPUTYPE})
endif()
list(APPEND ARCH_FLAGS -mcpu=powerpc64 -mtune=native)
endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
message(STATUS "loongarch64 detected")
@@ -331,11 +312,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64")
message(STATUS "RISC-V detected")
if (GGML_RVV)
if (GGML_RV_ZFH)
list(APPEND ARCH_FLAGS -march=rv64gcv_zfhmin -DGGML_RV_ZFH -mabi=lp64d)
else()
list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
endif()
list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
message(STATUS "s390x detected")
@@ -374,9 +351,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
# Fetch KleidiAI sources:
include(FetchContent)
set(KLEIDIAI_COMMIT_TAG "v1.5.0")
set(KLEIDIAI_COMMIT_TAG "v1.3.0")
set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
set(KLEIDIAI_ARCHIVE_MD5 "ea22e1aefb800e9bc8c74d91633cc58e")
set(KLEIDIAI_ARCHIVE_MD5 "060bd2dc64642b091f461cc8dd7426d9")
if (POLICY CMP0135)
cmake_policy(SET CMP0135 NEW)

View File

@@ -1,158 +0,0 @@
#include "binary-ops.h"
#if defined(GGML_USE_ACCELERATE)
#include <Accelerate/Accelerate.h>
using vDSP_fn_t = void (*)(const float *, vDSP_Stride, const float *, vDSP_Stride, float *, vDSP_Stride, vDSP_Length);
#endif
static inline float op_add(float a, float b) {
return a + b;
}
static inline float op_sub(float a, float b) {
return a - b;
}
static inline float op_mul(float a, float b) {
return a * b;
}
static inline float op_div(float a, float b) {
return a / b;
}
template <float (*op)(float, float), typename src0_t, typename src1_t, typename dst_t>
static inline void vec_binary_op_contiguous(const int64_t n, dst_t * z, const src0_t * x, const src1_t * y) {
constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
constexpr auto src1_to_f32 = type_conversion_table<src1_t>::to_f32;
constexpr auto f32_to_dst = type_conversion_table<dst_t >::from_f32;
for (int i = 0; i < n; i++) {
z[i] = f32_to_dst(op(src0_to_f32(x[i]), src1_to_f32(y[i])));
}
}
template <float (*op)(float, float), typename src0_t, typename src1_t, typename dst_t>
static inline void vec_binary_op_non_contiguous(const int64_t n, const int64_t ne10, const int64_t nb10, dst_t * z, const src0_t * x, const src1_t * y) {
constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
constexpr auto src1_to_f32 = type_conversion_table<src1_t>::to_f32;
constexpr auto f32_to_dst = type_conversion_table<dst_t >::from_f32;
for (int i = 0; i < n; i++) {
int i10 = i % ne10;
const src1_t * y_ptr = (const src1_t *)((const char *)y + i10*nb10);
z[i] = f32_to_dst(op(src0_to_f32(x[i]), src1_to_f32(*y_ptr)));
}
}
template <float (*op)(float, float), typename src0_t, typename src1_t, typename dst_t>
static void apply_binary_op(const ggml_compute_params * params, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
GGML_TENSOR_BINARY_OP_LOCALS
GGML_ASSERT( nb0 == sizeof(dst_t));
GGML_ASSERT(nb00 == sizeof(src0_t));
const auto [ir0, ir1] = get_thread_range(params, src0);
const bool is_src1_contiguous = (nb10 == sizeof(src1_t));
if (!is_src1_contiguous) { // broadcast not implemented yet for non-contiguous
GGML_ASSERT(ggml_are_same_shape(src0, src1));
}
#ifdef GGML_USE_ACCELERATE
vDSP_fn_t vDSP_op = nullptr;
// TODO - avoid the f32-only check using type 'trait' lookup tables and row-based src-to-float conversion functions
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
if (op == op_add) {
vDSP_op = vDSP_vadd;
} else if (op == op_sub) {
vDSP_op = vDSP_vsub;
} else if (op == op_mul) {
vDSP_op = vDSP_vmul;
} else if (op == op_div) {
vDSP_op = vDSP_vdiv;
}
}
#endif
for (int64_t ir = ir0; ir < ir1; ++ir) {
const int64_t i03 = ir/(ne02*ne01);
const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
const int64_t i13 = i03 % ne13;
const int64_t i12 = i02 % ne12;
const int64_t i11 = i01 % ne11;
dst_t * dst_ptr = (dst_t *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
const src0_t * src0_ptr = (const src0_t *) ((const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
const src1_t * src1_ptr = (const src1_t *) ((const char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
if (is_src1_contiguous) {
// src1 is broadcastable across src0 and dst in i1, i2, i3
const int64_t nr0 = ne00 / ne10;
for (int64_t r = 0; r < nr0; ++r) {
#ifdef GGML_USE_ACCELERATE
if constexpr (std::is_same_v<src0_t, float> && std::is_same_v<src1_t, float> && std::is_same_v<dst_t, float>) {
if (vDSP_op != nullptr) {
vDSP_op(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10);
continue;
}
}
#endif
vec_binary_op_contiguous<op>(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
}
} else {
vec_binary_op_non_contiguous<op>(ne0, ne10, nb10, dst_ptr, src0_ptr, src1_ptr);
}
}
}
// TODO: Use the 'traits' lookup table (for type conversion fns), instead of a mass of 'if' conditions with long templates
template <float (*op)(float, float)>
static void binary_op(const ggml_compute_params * params, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
/* */ if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { // all f32
apply_binary_op<op, float, float, float>(params, dst);
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { // all f16
apply_binary_op<op, ggml_fp16_t, ggml_fp16_t, ggml_fp16_t>(params, dst);
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_BF16) { // all bf16
apply_binary_op<op, ggml_bf16_t, ggml_bf16_t, ggml_bf16_t>(params, dst);
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_BF16) {
apply_binary_op<op, ggml_bf16_t, float, ggml_bf16_t>(params, dst);
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
apply_binary_op<op, ggml_bf16_t, float, float>(params, dst);
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) {
apply_binary_op<op, ggml_fp16_t, float, ggml_fp16_t>(params, dst);
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
apply_binary_op<op, ggml_fp16_t, float, float>(params, dst);
} else {
GGML_ABORT("%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
}
}
void ggml_compute_forward_add_non_quantized(const ggml_compute_params * params, ggml_tensor * dst) {
binary_op<op_add>(params, dst);
}
void ggml_compute_forward_sub(const ggml_compute_params * params, ggml_tensor * dst) {
binary_op<op_sub>(params, dst);
}
void ggml_compute_forward_mul(const ggml_compute_params * params, ggml_tensor * dst) {
binary_op<op_mul>(params, dst);
}
void ggml_compute_forward_div(const ggml_compute_params * params, ggml_tensor * dst) {
binary_op<op_div>(params, dst);
}

View File

@@ -1,16 +0,0 @@
#pragma once
#include "common.h"
#ifdef __cplusplus
extern "C" {
#endif
void ggml_compute_forward_add_non_quantized(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_sub(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_mul(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_div(const struct ggml_compute_params * params, struct ggml_tensor * dst);
#ifdef __cplusplus
}
#endif

View File

@@ -1,72 +0,0 @@
#pragma once
#include "ggml.h"
#include "ggml-cpu-traits.h"
#include "ggml-cpu-impl.h"
#include "ggml-impl.h"
#ifdef __cplusplus
#include <utility>
// convenience functions/macros for use in template calls
// note: these won't be required after the 'traits' lookup table is used.
static inline ggml_fp16_t f32_to_f16(float x) {
return GGML_FP32_TO_FP16(x);
}
static inline float f16_to_f32(ggml_fp16_t x) {
return GGML_FP16_TO_FP32(x);
}
static inline ggml_bf16_t f32_to_bf16(float x) {
return GGML_FP32_TO_BF16(x);
}
static inline float bf16_to_f32(ggml_bf16_t x) {
return GGML_BF16_TO_FP32(x);
}
static inline float f32_to_f32(float x) {
return x;
}
// TODO - merge this into the traits table, after using row-based conversions
template <class T>
struct type_conversion_table;
template <>
struct type_conversion_table<ggml_fp16_t> {
static constexpr float (*to_f32)(ggml_fp16_t) = f16_to_f32;
static constexpr ggml_fp16_t (*from_f32)(float) = f32_to_f16;
};
template <>
struct type_conversion_table<float> {
static constexpr float (*to_f32)(float) = f32_to_f32;
static constexpr float (*from_f32)(float) = f32_to_f32;
};
template <>
struct type_conversion_table<ggml_bf16_t> {
static constexpr float (*to_f32)(ggml_bf16_t) = bf16_to_f32;
static constexpr ggml_bf16_t (*from_f32)(float) = f32_to_bf16;
};
static std::pair<int64_t, int64_t> get_thread_range(const struct ggml_compute_params * params, const struct ggml_tensor * src0) {
const int64_t ith = params->ith;
const int64_t nth = params->nth;
const int64_t nr = ggml_nrows(src0);
// rows per thread
const int64_t dr = (nr + nth - 1)/nth;
// row range for this thread
const int64_t ir0 = dr*ith;
const int64_t ir1 = MIN(ir0 + dr, nr);
return {ir0, ir1};
}
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -51,10 +51,11 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
/* .run_kernel = */ kai_run_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot,
},
/* .lhs_info = */ {
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32_neon,
/* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32_neon,
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
/* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
/* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32_neon,
/* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32_neon,
/* .require_aligned_m_idx = */ true,
},
/* .rhs_info = */ {
/* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon,
@@ -99,6 +100,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
/* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
/* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,
/* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
/* .require_aligned_m_idx = */ false,
},
/* .rhs_info = */ {
/* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
@@ -142,6 +144,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
/* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
/* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,
/* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
/* .require_aligned_m_idx = */ false,
},
/* .rhs_info = */ {
/* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
@@ -186,6 +189,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
/* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
/* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,
/* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
/* .require_aligned_m_idx = */ false,
},
/* .rhs_info = */ {
/* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
@@ -229,6 +233,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
/* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
/* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,
/* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
/* .require_aligned_m_idx = */ false,
},
/* .rhs_info = */ {
/* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,

View File

@@ -40,6 +40,7 @@ struct lhs_packing_info {
size_t (*packed_size)(size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr);
void (*pack_func)(size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr, size_t m_idx_start, const float* lhs,
size_t lhs_stride, void* lhs_packed);
bool require_aligned_m_idx;
};
struct rhs_packing_info {

View File

@@ -124,7 +124,8 @@ class tensor_traits : public ggml::cpu::tensor_traits {
size_t sr = kernel->get_sr();
// Calculate number of columns to be processed per thread
const size_t num_m_per_thread = kai_roundup(m, mr * nth) / nth;
const bool use_multithread = lhs_info->require_aligned_m_idx && m <= mr ? false : true;
const size_t num_m_per_thread = use_multithread ? kai_roundup(m, nth) / nth : m;
const size_t m_start = ith * num_m_per_thread;
size_t m_to_process = num_m_per_thread;
if ((m_start + m_to_process) > m) {
@@ -134,11 +135,11 @@ class tensor_traits : public ggml::cpu::tensor_traits {
if(m_start < m) {
// Transform LHS
const size_t src_stride = src1->nb[1];
const float * src_ptr = reinterpret_cast<const float *>(lhs + lhs_info->get_offset(m_start, dst->src[1]->nb[1]));
const float * src_ptr = reinterpret_cast<const float *>(lhs + lhs_info->get_offset(0, dst->src[1]->nb[1]));
const size_t lhs_packed_offset = lhs_info->get_packed_offset(m_start, k, QK4_0, mr, kr, sr);
void * lhs_packed_ptr = static_cast<void *>(lhs_packed + lhs_packed_offset);
lhs_info->pack_func(m_to_process, k, QK4_0, mr, kr, sr, 0, src_ptr, src_stride, lhs_packed_ptr);
lhs_info->pack_func(m_to_process, k, QK4_0, mr, kr, sr, m_start, src_ptr, src_stride, lhs_packed_ptr);
}
ggml_barrier(params->threadpool);

View File

@@ -55,7 +55,6 @@
#include <atomic>
#include <array>
#include <type_traits>
#ifdef _MSC_VER
#define NOINLINE __declspec(noinline)
@@ -1093,403 +1092,13 @@ class tinyBLAS_Q0_PPC {
}
}
template<typename VA, typename VB, int size>
void packNormalInt4(const TA* a, int64_t lda, int rows, int cols, VA* vec, std::array<int, size>& comparray) {
template<typename VA, typename VB>
void packNormal(const TA* a, int64_t lda, int rows, int cols, VA* vec, bool flip) {
int64_t i, j;
TA *aoffset = NULL;
VA *vecOffset = NULL;
TA *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL;
TA *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL;
VB c1[2] = {0}, c2[2] = {0}, c3[2] = {0}, c4[2] = {0};
VB c5[2] = {0}, c6[2] = {0}, c7[2] = {0}, c8[2] = {0};
VB t1, t2, t3, t4, t5, t6, t7, t8;
const vector signed char lowMask = vec_splats((signed char)0xF);
const vector unsigned char v4 = vec_splats((unsigned char)0x4);
const vector signed char v8 = vec_splats((signed char)0x8);
aoffset = const_cast<TA*>(a);
vecOffset = vec;
vector unsigned char swiz1 = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
vector unsigned char swiz2 = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
vector unsigned char swiz3 = {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27};
vector unsigned char swiz4 = {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};
vector signed int vsum = {0};
vector signed int vsum2 = {0};
j = (rows >> 3);
if (j > 0) {
do {
aoffset1 = aoffset;
aoffset2 = aoffset1 + lda;
aoffset3 = aoffset2 + lda;
aoffset4 = aoffset3 + lda;
aoffset5 = aoffset4 + lda;
aoffset6 = aoffset5 + lda;
aoffset7 = aoffset6 + lda;
aoffset8 = aoffset7 + lda;
aoffset += 8 * lda;
i = (cols >> 2);
if (i > 0) {
do {
c1[1] = reinterpret_cast<VB>(vec_xl(0, aoffset1->qs));
c2[1] = reinterpret_cast<VB>(vec_xl(0, aoffset2->qs));
c3[1] = reinterpret_cast<VB>(vec_xl(0, aoffset3->qs));
c4[1] = reinterpret_cast<VB>(vec_xl(0, aoffset4->qs));
c5[1] = reinterpret_cast<VB>(vec_xl(0, aoffset5->qs));
c6[1] = reinterpret_cast<VB>(vec_xl(0, aoffset6->qs));
c7[1] = reinterpret_cast<VB>(vec_xl(0, aoffset7->qs));
c8[1] = reinterpret_cast<VB>(vec_xl(0, aoffset8->qs));
c1[0] = vec_and(c1[1], lowMask);
c1[1] = vec_sr(c1[1], v4);
c1[0] = vec_sub(c1[0], v8);
c1[1] = vec_sub(c1[1], v8);
vsum = vec_sum4s(c1[0], vsum);
vsum2 = vec_sum4s(c1[1], vsum2);
vsum = vec_add(vsum, vsum2);
comparray[0] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
vsum = vec_splats(0);
vsum2 = vec_splats(0);
c2[0] = vec_and(c2[1], lowMask);
c2[1] = vec_sr(c2[1], v4);
c2[0] = vec_sub(c2[0], v8);
c2[1] = vec_sub(c2[1], v8);
vsum = vec_sum4s(c2[0], vsum);
vsum2 = vec_sum4s(c2[1], vsum2);
vsum = vec_add(vsum, vsum2);
comparray[1] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
vsum = vec_splats(0);
vsum2 = vec_splats(0);
c3[0] = vec_and(c3[1], lowMask);
c3[1] = vec_sr(c3[1], v4);
c3[0] = vec_sub(c3[0], v8);
c3[1] = vec_sub(c3[1], v8);
vsum = vec_sum4s(c3[0], vsum);
vsum2 = vec_sum4s(c3[1], vsum2);
vsum = vec_add(vsum, vsum2);
comparray[2] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
vsum = vec_splats(0);
vsum2 = vec_splats(0);
c4[0] = vec_and(c4[1], lowMask);
c4[1] = vec_sr(c4[1], v4);
c4[0] = vec_sub(c4[0], v8);
c4[1] = vec_sub(c4[1], v8);
vsum = vec_sum4s(c4[0], vsum);
vsum2 = vec_sum4s(c4[1], vsum2);
vsum = vec_add(vsum, vsum2);
comparray[3] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
vsum = vec_splats(0);
vsum2 = vec_splats(0);
c5[0] = vec_and(c5[1], lowMask);
c5[1] = vec_sr(c5[1], v4);
c5[0] = vec_sub(c5[0], v8);
c5[1] = vec_sub(c5[1], v8);
vsum = vec_sum4s(c5[0], vsum);
vsum2 = vec_sum4s(c5[1], vsum2);
vsum = vec_add(vsum, vsum2);
comparray[4] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
vsum = vec_splats(0);
vsum2 = vec_splats(0);
c6[0] = vec_and(c6[1], lowMask);
c6[1] = vec_sr(c6[1], v4);
c6[0] = vec_sub(c6[0], v8);
c6[1] = vec_sub(c6[1], v8);
vsum = vec_sum4s(c6[0], vsum);
vsum2 = vec_sum4s(c6[1], vsum2);
vsum = vec_add(vsum, vsum2);
comparray[5] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
vsum = vec_splats(0);
vsum2 = vec_splats(0);
c7[0] = vec_and(c7[1], lowMask);
c7[1] = vec_sr(c7[1], v4);
c7[0] = vec_sub(c7[0], v8);
c7[1] = vec_sub(c7[1], v8);
vsum = vec_sum4s(c7[0], vsum);
vsum2 = vec_sum4s(c7[1], vsum2);
vsum = vec_add(vsum, vsum2);
comparray[6] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
vsum = vec_splats(0);
vsum2 = vec_splats(0);
c8[0] = vec_and(c8[1], lowMask);
c8[1] = vec_sr(c8[1], v4);
c8[0] = vec_sub(c8[0], v8);
c8[1] = vec_sub(c8[1], v8);
vsum = vec_sum4s(c8[0], vsum);
vsum2 = vec_sum4s(c8[1], vsum2);
vsum = vec_add(vsum, vsum2);
comparray[7] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
vsum = vec_splats(0);
vsum2 = vec_splats(0);
t1 = vec_perm(c1[0], c2[0], swiz1);
t2 = vec_perm(c1[0], c2[0], swiz2);
t3 = vec_perm(c3[0], c4[0], swiz1);
t4 = vec_perm(c3[0], c4[0], swiz2);
t5 = vec_perm(t1, t3, swiz3);
t6 = vec_perm(t1, t3, swiz4);
t7 = vec_perm(t2, t4, swiz3);
t8 = vec_perm(t2, t4, swiz4);
vec_xst(t5, 0, vecOffset);
vec_xst(t6, 0, vecOffset+16);
vec_xst(t7, 0, vecOffset+32);
vec_xst(t8, 0, vecOffset+48);
t1 = vec_perm(c1[1], c2[1], swiz1);
t2 = vec_perm(c1[1], c2[1], swiz2);
t3 = vec_perm(c3[1], c4[1], swiz1);
t4 = vec_perm(c3[1], c4[1], swiz2);
t5 = vec_perm(t1, t3, swiz3);
t6 = vec_perm(t1, t3, swiz4);
t7 = vec_perm(t2, t4, swiz3);
t8 = vec_perm(t2, t4, swiz4);
vec_xst(t5, 0, vecOffset+64);
vec_xst(t6, 0, vecOffset+80);
vec_xst(t7, 0, vecOffset+96);
vec_xst(t8, 0, vecOffset+112);
t1 = vec_perm(c5[0], c6[0], swiz1);
t2 = vec_perm(c5[0], c6[0], swiz2);
t3 = vec_perm(c7[0], c8[0], swiz1);
t4 = vec_perm(c7[0], c8[0], swiz2);
t5 = vec_perm(t1, t3, swiz3);
t6 = vec_perm(t1, t3, swiz4);
t7 = vec_perm(t2, t4, swiz3);
t8 = vec_perm(t2, t4, swiz4);
vec_xst(t5, 0, vecOffset+128);
vec_xst(t6, 0, vecOffset+144);
vec_xst(t7, 0, vecOffset+160);
vec_xst(t8, 0, vecOffset+176);
t1 = vec_perm(c5[1], c6[1], swiz1);
t2 = vec_perm(c5[1], c6[1], swiz2);
t3 = vec_perm(c7[1], c8[1], swiz1);
t4 = vec_perm(c7[1], c8[1], swiz2);
t5 = vec_perm(t1, t3, swiz3);
t6 = vec_perm(t1, t3, swiz4);
t7 = vec_perm(t2, t4, swiz3);
t8 = vec_perm(t2, t4, swiz4);
vec_xst(t5, 0, vecOffset+192);
vec_xst(t6, 0, vecOffset+208);
vec_xst(t7, 0, vecOffset+224);
vec_xst(t8, 0, vecOffset+240);
aoffset1 += lda;
aoffset2 += lda;
aoffset3 += lda;
aoffset4 += lda;
aoffset5 += lda;
aoffset6 += lda;
aoffset7 += lda;
aoffset8 += lda;
vecOffset += 256;
i--;
} while (i > 0);
}
j--;
} while (j > 0);
}
if (rows & 4) {
aoffset1 = aoffset;
aoffset2 = aoffset1 + lda;
aoffset3 = aoffset2 + lda;
aoffset4 = aoffset3 + lda;
aoffset += 4 * lda;
i = (cols >> 2);
if (i > 0) {
do {
c1[1] = reinterpret_cast<VB>(vec_xl(0, aoffset1->qs));
c2[1] = reinterpret_cast<VB>(vec_xl(0, aoffset2->qs));
c3[1] = reinterpret_cast<VB>(vec_xl(0, aoffset3->qs));
c4[1] = reinterpret_cast<VB>(vec_xl(0, aoffset4->qs));
c1[0] = vec_and(c1[1], lowMask);
c1[1] = vec_sr(c1[1], v4);
c1[0] = vec_sub(c1[0], v8);
c1[1] = vec_sub(c1[1], v8);
vsum = vec_sum4s(c1[0], vsum);
vsum2 = vec_sum4s(c1[1], vsum2);
vsum = vec_add(vsum, vsum2);
comparray[0] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
vsum = vec_splats(0);
vsum2 = vec_splats(0);
c2[0] = vec_and(c2[1], lowMask);
c2[1] = vec_sr(c2[1], v4);
c2[0] = vec_sub(c2[0], v8);
c2[1] = vec_sub(c2[1], v8);
vsum = vec_sum4s(c2[0], vsum);
vsum2 = vec_sum4s(c2[1], vsum2);
vsum = vec_add(vsum, vsum2);
comparray[1] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
vsum = vec_splats(0);
vsum2 = vec_splats(0);
c3[0] = vec_and(c3[1], lowMask);
c3[1] = vec_sr(c3[1], v4);
c3[0] = vec_sub(c3[0], v8);
c3[1] = vec_sub(c3[1], v8);
vsum = vec_sum4s(c3[0], vsum);
vsum2 = vec_sum4s(c3[1], vsum2);
vsum = vec_add(vsum, vsum2);
comparray[2] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
vsum = vec_splats(0);
vsum2 = vec_splats(0);
c4[0] = vec_and(c4[1], lowMask);
c4[1] = vec_sr(c4[1], v4);
c4[0] = vec_sub(c4[0], v8);
c4[1] = vec_sub(c4[1], v8);
vsum = vec_sum4s(c4[0], vsum);
vsum2 = vec_sum4s(c4[1], vsum2);
vsum = vec_add(vsum, vsum2);
comparray[3] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
vsum = vec_splats(0);
vsum2 = vec_splats( 0);
t1 = vec_perm(c1[0], c2[0], swiz1);
t2 = vec_perm(c1[0], c2[0], swiz2);
t3 = vec_perm(c3[0], c4[0], swiz1);
t4 = vec_perm(c3[0], c4[0], swiz2);
t5 = vec_perm(t1, t3, swiz3);
t6 = vec_perm(t1, t3, swiz4);
t7 = vec_perm(t2, t4, swiz3);
t8 = vec_perm(t2, t4, swiz4);
vec_xst(t5, 0, vecOffset);
vec_xst(t6, 0, vecOffset+16);
vec_xst(t7, 0, vecOffset+32);
vec_xst(t8, 0, vecOffset+48);
t1 = vec_perm(c1[1], c2[1], swiz1);
t2 = vec_perm(c1[1], c2[1], swiz2);
t3 = vec_perm(c3[1], c4[1], swiz1);
t4 = vec_perm(c3[1], c4[1], swiz2);
t5 = vec_perm(t1, t3, swiz3);
t6 = vec_perm(t1, t3, swiz4);
t7 = vec_perm(t2, t4, swiz3);
t8 = vec_perm(t2, t4, swiz4);
vec_xst(t5, 0, vecOffset+64);
vec_xst(t6, 0, vecOffset+80);
vec_xst(t7, 0, vecOffset+96);
vec_xst(t8, 0, vecOffset+112);
aoffset1 += lda;
aoffset2 += lda;
aoffset3 += lda;
aoffset4 += lda;
vecOffset += 128;
i--;
} while (i > 0);
}
}
if (rows & 3) {
aoffset1 = aoffset;
aoffset2 = aoffset1 + lda;
aoffset3 = aoffset2 + lda;
i = (cols >> 2);
if (i > 0) {
do {
switch(rows) {
case 3: c3[1] = reinterpret_cast<VB>(vec_xl(0, aoffset3->qs));
case 2: c2[1] = reinterpret_cast<VB>(vec_xl(0, aoffset2->qs));
case 1: c1[1] = reinterpret_cast<VB>(vec_xl(0, aoffset1->qs));
break;
}
c1[0] = vec_and(c1[1], lowMask);
c1[1] = vec_sr(c1[1], v4);
c1[0] = vec_sub(c1[0], v8);
c1[1] = vec_sub(c1[1], v8);
vsum = vec_sum4s(c1[0], vsum);
vsum2 = vec_sum4s(c1[1], vsum2);
vsum = vec_add(vsum, vsum2);
comparray[0] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
vsum = vec_splats(0);
vsum2 = vec_splats(0);
c2[0] = vec_and(c2[1], lowMask);
c2[1] = vec_sr(c2[1], v4);
c2[0] = vec_sub(c2[0], v8);
c2[1] = vec_sub(c2[1], v8);
vsum = vec_sum4s(c2[0], vsum);
vsum2 = vec_sum4s(c2[1], vsum2);
vsum = vec_add(vsum, vsum2);
comparray[1] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
vsum = vec_splats(0);
vsum2 = vec_splats(0);
c3[0] = vec_and(c3[1], lowMask);
c3[1] = vec_sr(c3[1], v4);
c3[0] = vec_sub(c3[0], v8);
c3[1] = vec_sub(c3[1], v8);
vsum = vec_sum4s(c3[0], vsum);
vsum2 = vec_sum4s(c3[1], vsum2);
vsum = vec_add(vsum, vsum2);
comparray[2] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
vsum = vec_splats(0);
vsum2 = vec_splats(0);
c4[0] = vec_and(c4[1], lowMask);
c4[1] = vec_sr(c4[1], v4);
c4[0] = vec_sub(c4[0], v8);
c4[1] = vec_sub(c4[1], v8);
vsum = vec_sum4s(c4[0], vsum);
vsum2 = vec_sum4s(c4[1], vsum2);
vsum = vec_add(vsum, vsum2);
comparray[3] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
vsum = vec_splats(0);
vsum2 = vec_splats(0);
t1 = vec_perm(c1[0], c2[0], swiz1);
t2 = vec_perm(c1[0], c2[0], swiz2);
t3 = vec_perm(c3[0], c4[0], swiz1);
t4 = vec_perm(c3[0], c4[0], swiz2);
t5 = vec_perm(t1, t3, swiz3);
t6 = vec_perm(t1, t3, swiz4);
t7 = vec_perm(t2, t4, swiz3);
t8 = vec_perm(t2, t4, swiz4);
vec_xst(t5, 0, vecOffset);
vec_xst(t6, 0, vecOffset+16);
vec_xst(t7, 0, vecOffset+32);
vec_xst(t8, 0, vecOffset+48);
t1 = vec_perm(c1[1], c2[1], swiz1);
t2 = vec_perm(c1[1], c2[1], swiz2);
t3 = vec_perm(c3[1], c4[1], swiz1);
t4 = vec_perm(c3[1], c4[1], swiz2);
t5 = vec_perm(t1, t3, swiz3);
t6 = vec_perm(t1, t3, swiz4);
t7 = vec_perm(t2, t4, swiz3);
t8 = vec_perm(t2, t4, swiz4);
vec_xst(t5, 0, vecOffset+64);
vec_xst(t6, 0, vecOffset+80);
vec_xst(t7, 0, vecOffset+96);
vec_xst(t8, 0, vecOffset+112);
aoffset1 += lda;
aoffset2 += lda;
aoffset3 += lda;
vecOffset += 128;
i--;
} while(i > 0);
}
}
}
template<typename VA, typename VB>
void packNormal(const TB* a, int64_t lda, int rows, int cols, VA* vec, bool flip) {
int64_t i, j;
TB *aoffset = NULL;
VA *vecOffset = NULL;
TB *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL;
TB *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL;
__vector_pair C1, C2, C3, C4, C5, C6, C7, C8;
VB c1[2] = {0}, c2[2] = {0}, c3[2] = {0}, c4[2]={0};
VB c5[2] = {0}, c6[2] = {0}, c7[2] = {0}, c8[2]={0};
@@ -1502,24 +1111,24 @@ class tinyBLAS_Q0_PPC {
vector unsigned char swiz3 = {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27};
vector unsigned char swiz4 = {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};
aoffset = const_cast<TB*>(a);
aoffset = const_cast<TA*>(a);
vecOffset = vec;
j = (rows >> 3);
if (j > 0) {
do {
aoffset1 = aoffset;
aoffset2 = aoffset1 + lda;
aoffset3 = aoffset2 + lda;
aoffset4 = aoffset3 + lda;
aoffset5 = aoffset4 + lda;
aoffset6 = aoffset5 + lda;
aoffset7 = aoffset6 + lda;
aoffset8 = aoffset7 + lda;
aoffset += 8 * lda;
aoffset1 = aoffset;
aoffset2 = aoffset1 + lda;
aoffset3 = aoffset2 + lda;
aoffset4 = aoffset3 + lda;
aoffset5 = aoffset4 + lda;
aoffset6 = aoffset5 + lda;
aoffset7 = aoffset6 + lda;
aoffset8 = aoffset7 + lda;
aoffset += 8 * lda;
i = (cols >> 3);
if (i > 0) {
do {
i = (cols >> 3);
if (i > 0) {
do {
C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1->qs);
C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2->qs);
C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3->qs);
@@ -1547,10 +1156,10 @@ class tinyBLAS_Q0_PPC {
t7 = vec_perm(t2, t4, swiz3);
t8 = vec_perm(t2, t4, swiz4);
if (flip == true) {
t5 = vec_xor(t5, xor_vector);
t6 = vec_xor(t6, xor_vector);
t7 = vec_xor(t7, xor_vector);
t8 = vec_xor(t8, xor_vector);
t5 = vec_xor(t5, xor_vector);
t6 = vec_xor(t6, xor_vector);
t7 = vec_xor(t7, xor_vector);
t8 = vec_xor(t8, xor_vector);
}
vec_xst(t5, 0, vecOffset);
vec_xst(t6, 0, vecOffset+16);
@@ -1566,10 +1175,10 @@ class tinyBLAS_Q0_PPC {
t7 = vec_perm(t2, t4, swiz3);
t8 = vec_perm(t2, t4, swiz4);
if (flip == true) {
t5 = vec_xor(t5, xor_vector);
t6 = vec_xor(t6, xor_vector);
t7 = vec_xor(t7, xor_vector);
t8 = vec_xor(t8, xor_vector);
t5 = vec_xor(t5, xor_vector);
t6 = vec_xor(t6, xor_vector);
t7 = vec_xor(t7, xor_vector);
t8 = vec_xor(t8, xor_vector);
}
vec_xst(t5, 0, vecOffset+64);
vec_xst(t6, 0, vecOffset+80);
@@ -1585,10 +1194,10 @@ class tinyBLAS_Q0_PPC {
t7 = vec_perm(t2, t4, swiz3);
t8 = vec_perm(t2, t4, swiz4);
if (flip == true) {
t5 = vec_xor(t5, xor_vector);
t6 = vec_xor(t6, xor_vector);
t7 = vec_xor(t7, xor_vector);
t8 = vec_xor(t8, xor_vector);
t5 = vec_xor(t5, xor_vector);
t6 = vec_xor(t6, xor_vector);
t7 = vec_xor(t7, xor_vector);
t8 = vec_xor(t8, xor_vector);
}
vec_xst(t5, 0, vecOffset+128);
vec_xst(t6, 0, vecOffset+144);
@@ -1604,10 +1213,10 @@ class tinyBLAS_Q0_PPC {
t7 = vec_perm(t2, t4, swiz3);
t8 = vec_perm(t2, t4, swiz4);
if (flip == true) {
t5 = vec_xor(t5, xor_vector);
t6 = vec_xor(t6, xor_vector);
t7 = vec_xor(t7, xor_vector);
t8 = vec_xor(t8, xor_vector);
t5 = vec_xor(t5, xor_vector);
t6 = vec_xor(t6, xor_vector);
t7 = vec_xor(t7, xor_vector);
t8 = vec_xor(t8, xor_vector);
}
vec_xst(t5, 0, vecOffset+192);
vec_xst(t6, 0, vecOffset+208);
@@ -1631,11 +1240,11 @@ class tinyBLAS_Q0_PPC {
}
if (rows & 4) {
aoffset1 = aoffset;
aoffset2 = aoffset1 + lda;
aoffset3 = aoffset2 + lda;
aoffset4 = aoffset3 + lda;
aoffset += 4 * lda;
aoffset1 = aoffset;
aoffset2 = aoffset1 + lda;
aoffset3 = aoffset2 + lda;
aoffset4 = aoffset3 + lda;
aoffset += 4 * lda;
i = (cols >> 3);
if (i > 0) {
@@ -1702,7 +1311,7 @@ class tinyBLAS_Q0_PPC {
aoffset2 = aoffset1 + lda;
aoffset3 = aoffset2 + lda;
i = (cols >> 3);
if (i > 0) {
if (i > 0) {
do {
switch(rows) {
case 3: C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3->qs);
@@ -1918,18 +1527,13 @@ class tinyBLAS_Q0_PPC {
void KERNEL_4x8(int64_t ii, int64_t jj) {
vec_t vec_A[8], vec_B[16] = {0};
acc_t acc_0, acc_1;
std::array<int, 4> comparray {};
std::array<int, 4> comparray;
vector float fin_res[8] = {0};
vector float vs[8] = {0};
bool isAblock_q4 = std::is_same_v<TA, block_q4_0>;
for (int l = 0; l < k; l++) {
__builtin_mma_xxsetaccz(&acc_0);
__builtin_mma_xxsetaccz(&acc_1);
if (std::is_same_v<TA, block_q4_0>) {
packNormalInt4<int8_t, vector signed char, 4>((A+(ii*lda)+l), lda, 4, 4, (int8_t*)vec_A, comparray);
} else {
packNormal<int8_t, vector signed char>((const TB*)(A+(ii*lda)+l), lda, 4, 8, (int8_t*)vec_A, false);
}
packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, 4, 8, (int8_t*)vec_A, false);
packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B, true);
for(int x = 0; x < 8; x++) {
__builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
@@ -1941,17 +1545,15 @@ class tinyBLAS_Q0_PPC {
*((float*)&vs[I+4]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J+4)*ldb)+l)->d));
}
}
if (!isAblock_q4) {
auto aoffset = A+(ii*lda)+l;
for (int i = 0; i < 4; i++) {
comparray[i] = 0;
int ca = 0;
auto *at = aoffset->qs;
for (int j = 0; j < 32; j++)
ca += (int)*at++;
comparray[i] = ca;
aoffset += lda;
}
auto aoffset = A+(ii*lda)+l;
for (int i = 0; i < 4; i++) {
comparray[i] = 0;
int ca = 0;
const int8_t *at = aoffset->qs;
for (int j = 0; j < 32; j++)
ca += (int)*at++;
comparray[i] = ca;
aoffset += lda;
}
compute<4>(&acc_0, 0, 0, comparray, vs, fin_res);
compute<4>(&acc_1, 0, 4, comparray, vs, fin_res);
@@ -1963,18 +1565,13 @@ class tinyBLAS_Q0_PPC {
void KERNEL_8x4(int64_t ii, int64_t jj) {
vec_t vec_A[16], vec_B[8] = {0};
acc_t acc_0, acc_1;
std::array<int, 8> comparray {};
std::array<int, 8> comparray;
vector float fin_res[8] = {0};
vector float vs[8] = {0};
bool isAblock_q4 = std::is_same_v<TA, block_q4_0>;
for (int l = 0; l < k; l++) {
__builtin_mma_xxsetaccz(&acc_0);
__builtin_mma_xxsetaccz(&acc_1);
if (std::is_same_v<TA, block_q4_0>) {
packNormalInt4<int8_t, vector signed char, 8>((A+(ii*lda)+l), lda, 8, 4, (int8_t*)vec_A, comparray);
} else {
packNormal<int8_t, vector signed char>((const TB*)(A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false);
}
packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false);
packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 4, 8, (uint8_t*)vec_B, true);
for(int x = 0; x < 8; x++) {
__builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
@@ -1985,17 +1582,15 @@ class tinyBLAS_Q0_PPC {
*((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d));
}
}
if (!isAblock_q4) {
auto aoffset = A+(ii*lda)+l;
for (int i = 0; i < 8; i++) {
comparray[i] = 0;
int ca = 0;
auto *at = aoffset->qs;
for (int j = 0; j < 32; j++)
ca += (int)*at++;
comparray[i] = ca;
aoffset += lda;
}
auto aoffset = A+(ii*lda)+l;
for (int i = 0; i < 8; i++) {
comparray[i] = 0;
int ca = 0;
const int8_t *at = aoffset->qs;
for (int j = 0; j < 32; j++)
ca += (int)*at++;
comparray[i] = ca;
aoffset += lda;
}
compute<8>(&acc_0, 0, 0, comparray, vs, fin_res);
compute<8>(&acc_1, 4, 4, comparray, vs, fin_res);
@@ -2007,20 +1602,15 @@ class tinyBLAS_Q0_PPC {
void KERNEL_8x8(int64_t ii, int64_t jj) {
vec_t vec_A[16], vec_B[16] = {0};
acc_t acc_0, acc_1, acc_2, acc_3;
std::array<int, 8> comparray {};
std::array<int, 8> comparray;
vector float fin_res[16] = {0};
vector float vs[16] = {0};
bool isAblock_q4 = std::is_same_v<TA, block_q4_0>;
for (int l = 0; l < k; l++) {
__builtin_mma_xxsetaccz(&acc_0);
__builtin_mma_xxsetaccz(&acc_1);
__builtin_mma_xxsetaccz(&acc_2);
__builtin_mma_xxsetaccz(&acc_3);
if (std::is_same_v<TA, block_q4_0>) {
packNormalInt4<int8_t, vector signed char, 8>((A+(ii*lda)+l), lda, 8, 4, (int8_t*)vec_A, comparray);
} else {
packNormal<int8_t, vector signed char>((const TB*)(A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false);
}
packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false);
packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B, true);
for(int x = 0; x < 8; x++) {
__builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
@@ -2034,17 +1624,15 @@ class tinyBLAS_Q0_PPC {
*((float*)&vs[I+8]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J+4)*ldb)+l)->d));
}
}
if (!isAblock_q4) {
auto aoffset = A+(ii*lda)+l;
for (int i = 0; i < 8; i++) {
comparray[i] = 0;
int ca = 0;
auto *at = aoffset->qs;
for (int j = 0; j < 32; j++)
ca += (int)*at++;
comparray[i] = ca;
aoffset += lda;
}
auto aoffset = A+(ii*lda)+l;
for (int i = 0; i < 8; i++) {
comparray[i] = 0;
int ca = 0;
const int8_t *at = aoffset->qs;
for (int j = 0; j < 32; j++)
ca += (int)*at++;
comparray[i] = ca;
aoffset += lda;
}
compute<8>(&acc_0, 0, 0, comparray, vs, fin_res);
compute<8>(&acc_1, 4, 4, comparray, vs, fin_res);
@@ -2065,17 +1653,16 @@ class tinyBLAS_Q0_PPC {
int64_t duty = (tiles + nth - 1) / nth;
int64_t start = duty * ith;
int64_t end = start + duty;
vec_t vec_A[8] = {0}, vec_B[8] = {0};
vec_t vec_A[8], vec_B[8] = {0};
vector signed int vec_C[4];
acc_t acc_0;
bool isAblock_q4 = std::is_same_v<TA, block_q4_0>;
if (end > tiles)
end = tiles;
for (int64_t job = start; job < end; ++job) {
int64_t ii = m0 + job / xtiles * RM;
int64_t jj = n0 + job % xtiles * RN;
std::array<int, 4> comparray{};
std::array<int, RM> comparray;
vector float res[4] = {0};
vector float fin_res[4] = {0};
vector float vs[4] = {0};
@@ -2086,11 +1673,7 @@ class tinyBLAS_Q0_PPC {
__builtin_prefetch((A+(ii*lda)+(l+1))->qs, 0, 1); // prefetch one loop ahead
__builtin_prefetch((B+(jj*ldb)+(l+1))->qs, 0, 1); // prefetch one loop ahead
__builtin_mma_xxsetaccz(&acc_0);
if (isAblock_q4) {
packNormalInt4<int8_t, vector signed char, 4>((A+(ii*lda)+l), lda, RM, 4, (int8_t*)vec_A, comparray);
} else {
packNormal<int8_t, vector signed char>((const TB*)(A+(ii*lda)+l), lda, RM, 8, (int8_t*)vec_A, false);
}
packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, RM, 8, (int8_t*)vec_A, false);
packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, RN, 8, (uint8_t*)vec_B, true);
for(int x = 0; x < 8; x+=4) {
__builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
@@ -2104,18 +1687,17 @@ class tinyBLAS_Q0_PPC {
}
}
__builtin_mma_disassemble_acc(vec_C, &acc_0);
if (!isAblock_q4) {
auto aoffset = A+(ii*lda)+l;
for (int i = 0; i < RM; i++) {
comparray[i] = 0;
int ca = 0;
auto *at = aoffset->qs;
for (int j = 0; j < 32; j++)
ca += (int)*at++;
comparray[i] = ca;
aoffset += lda;
}
auto aoffset = A+(ii*lda)+l;
for (int i = 0; i < RM; i++) {
comparray[i] = 0;
int ca = 0;
const int8_t *at = aoffset->qs;
for (int j = 0; j < 32; j++)
ca += (int)*at++;
comparray[i] = ca;
aoffset += lda;
}
for (int i = 0; i < RM; i++) {
CA[i] = vec_splats((float)(((double)comparray[i]) * -128.0));
res[i] = vec_add(vec_ctf(vec_C[i], 0), CA[i]);
@@ -2431,7 +2013,6 @@ class tinyBLAS_PPC {
}
}
}
void KERNEL_4x4(int64_t ii, int64_t jj) {
vec_t vec_A[4], vec_B[4], vec_C[4];
acc_t acc_0;
@@ -2678,27 +2259,15 @@ class tinyBLAS_PPC {
vec_t vec_C[4];
acc_t acc_0;
__builtin_mma_xxsetaccz(&acc_0);
vec_t vec_A[4] {0}, vec_B[4] = {0};
vec_t vec_A[4], vec_B[4];
for (int l=0; l<k; l+=4) {
/* 'GEMV Forwarding' concept is used in first two conditional loops.
* when one of the matrix has a single row/column, the elements are
* broadcasted, instead of using packing routine to prepack the
* matrix elements.
*/
if (RM == 1) {
if (RN >= 4 && RM == 1) {
TA* a = const_cast<TA*>(A+(ii)*lda+l);
packTranspose<vector float>(B+(jj*ldb)+l, ldb, RN, 4, (TA*)vec_B);
packTranspose<vector float>(B+(jj*ldb)+l, ldb, 4, 4, (TA*)vec_B);
vec_A[0] = (vec_t)vec_xl(0,a);
vec_A[1] = (vec_t)vec_splats(*((TA*)&vec_A+1));
vec_A[2] = (vec_t)vec_splats(*((TA*)&vec_A+2));
vec_A[3] = (vec_t)vec_splats(*((TA*)&vec_A+3));
} else if (RN == 1) {
packTranspose<vector float>(A+(ii*lda)+l, lda, RM, 4, (TA*)vec_A);
TB* b = const_cast<TB*>(B+(jj)*ldb+l);
vec_B[0] = (vec_t)vec_xl(0,b);
vec_B[1] = (vec_t)vec_splats(*((TB*)&vec_B+1));
vec_B[2] = (vec_t)vec_splats(*((TB*)&vec_B+2));
vec_B[3] = (vec_t)vec_splats(*((TB*)&vec_B+3));
} else {
packTranspose<vector float>(A+(ii*lda)+l, lda, RM, 4, (TA*)vec_A);
packTranspose<vector float>(B+(jj*ldb)+l, ldb, RN, 4, (TA*)vec_B);
@@ -2802,10 +2371,8 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
assert(params->ith < params->nth);
// only enable sgemm for prompt processing
#if !defined(__MMA__)
if (n < 2)
return false;
#endif
if (Ctype != GGML_TYPE_F32)
return false;
@@ -2936,8 +2503,8 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
params->ith, params->nth};
tb.matmul(m, n);
return true;
#elif defined(__MMA__)
//TO-DO: Remove this condition once gemv forwarding is enabled.
if (n < 8 && n != 4)
return false;
if (m < 8 && m != 4)
@@ -2949,6 +2516,7 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
params->ith, params->nth};
tb.matmul(m, n);
return true;
#else
return false;
#endif
@@ -2973,19 +2541,6 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
params->ith, params->nth};
tb.matmul(m, n);
return true;
#elif defined(__MMA__)
//TO-DO: Remove this condition once gemv forwarding is enabled.
if (n < 8 && n != 4)
return false;
if (m < 8 && m != 4)
return false;
tinyBLAS_Q0_PPC<block_q4_0, block_q8_0, float> tb{
k, (const block_q4_0 *)A, lda,
(const block_q8_0 *)B, ldb,
(float *)C, ldc,
params->ith, params->nth};
tb.matmul(m, n);
return true;
#else
return false;
#endif

View File

@@ -1,186 +0,0 @@
#include "unary-ops.h"
static inline float op_abs(float x) {
return fabsf(x);
}
static inline float op_sgn(float x) {
return (x > 0.f) ? 1.f : ((x < 0.f) ? -1.f : 0.f);
}
static inline float op_neg(float x) {
return -x;
}
static inline float op_step(float x) {
return (x > 0.f) ? 1.f : 0.f;
}
static inline float op_tanh(float x) {
return tanhf(x);
}
static inline float op_elu(float x) {
return (x > 0.f) ? x : expm1f(x);
}
static inline float op_relu(float x) {
return (x > 0.f) ? x : 0.f;
}
static inline float op_sigmoid(float x) {
return 1.f / (1.f + expf(-x));
}
static inline float op_hardsigmoid(float x) {
return fminf(1.0f, fmaxf(0.0f, (x + 3.0f) / 6.0f));
}
static inline float op_exp(float x) {
return expf(x);
}
static inline float op_hardswish(float x) {
return x * fminf(1.0f, fmaxf(0.0f, (x + 3.0f) / 6.0f));
}
static inline float op_sqr(float x) {
return x * x;
}
static inline float op_sqrt(float x) {
return sqrtf(x);
}
static inline float op_sin(float x) {
return sinf(x);
}
static inline float op_cos(float x) {
return cosf(x);
}
static inline float op_log(float x) {
return logf(x);
}
template <float (*op)(float), typename src0_t, typename dst_t>
static inline void vec_unary_op(int64_t n, dst_t * y, const src0_t * x) {
constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
constexpr auto f32_to_dst = type_conversion_table<dst_t >::from_f32;
for (int i = 0; i < n; i++) {
y[i] = f32_to_dst(op(src0_to_f32(x[i])));
}
}
template <float (*op)(float), typename src0_t, typename dst_t>
static void apply_unary_op(const ggml_compute_params * params, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(ggml_is_contiguous_1(src0) && ggml_is_contiguous_1(dst) && ggml_are_same_shape(src0, dst));
GGML_TENSOR_UNARY_OP_LOCALS
GGML_ASSERT( nb0 == sizeof(dst_t));
GGML_ASSERT(nb00 == sizeof(src0_t));
const auto [ir0, ir1] = get_thread_range(params, src0);
for (int64_t ir = ir0; ir < ir1; ++ir) {
const int64_t i03 = ir/(ne02*ne01);
const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
dst_t * dst_ptr = (dst_t *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
const src0_t * src0_ptr = (const src0_t *) ((const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
vec_unary_op<op>(ne0, dst_ptr, src0_ptr);
}
}
// TODO: Use the 'traits' lookup table (for type conversion fns), instead of a mass of 'if' conditions with long templates
template <float (*op)(float)>
static void unary_op(const ggml_compute_params * params, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
/* */ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { // all f32
apply_unary_op<op, float, float>(params, dst);
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { // all f16
apply_unary_op<op, ggml_fp16_t, ggml_fp16_t>(params, dst);
} else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_BF16) { // all bf16
apply_unary_op<op, ggml_bf16_t, ggml_bf16_t>(params, dst);
} else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_F32) {
apply_unary_op<op, ggml_bf16_t, float>(params, dst);
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
apply_unary_op<op, ggml_fp16_t, float>(params, dst);
} else {
fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s\n", __func__,
ggml_type_name(dst->type), ggml_type_name(src0->type));
GGML_ABORT("fatal error");
}
}
void ggml_compute_forward_abs(const ggml_compute_params * params, ggml_tensor * dst) {
unary_op<op_abs>(params, dst);
}
void ggml_compute_forward_sgn(const ggml_compute_params * params, ggml_tensor * dst) {
unary_op<op_sgn>(params, dst);
}
void ggml_compute_forward_neg(const ggml_compute_params * params, ggml_tensor * dst) {
unary_op<op_neg>(params, dst);
}
void ggml_compute_forward_step(const ggml_compute_params * params, ggml_tensor * dst) {
unary_op<op_step>(params, dst);
}
void ggml_compute_forward_tanh(const ggml_compute_params * params, ggml_tensor * dst) {
unary_op<op_tanh>(params, dst);
}
void ggml_compute_forward_elu(const ggml_compute_params * params, ggml_tensor * dst) {
unary_op<op_elu>(params, dst);
}
void ggml_compute_forward_relu(const ggml_compute_params * params, ggml_tensor * dst) {
unary_op<op_relu>(params, dst);
}
void ggml_compute_forward_sigmoid(const ggml_compute_params * params, ggml_tensor * dst) {
unary_op<op_sigmoid>(params, dst);
}
void ggml_compute_forward_hardsigmoid(const ggml_compute_params * params, ggml_tensor * dst) {
unary_op<op_hardsigmoid>(params, dst);
}
void ggml_compute_forward_exp(const ggml_compute_params * params, ggml_tensor * dst) {
unary_op<op_exp>(params, dst);
}
void ggml_compute_forward_hardswish(const ggml_compute_params * params, ggml_tensor * dst) {
unary_op<op_hardswish>(params, dst);
}
void ggml_compute_forward_sqr(const ggml_compute_params * params, ggml_tensor * dst) {
unary_op<op_sqr>(params, dst);
}
void ggml_compute_forward_sqrt(const ggml_compute_params * params, ggml_tensor * dst) {
unary_op<op_sqrt>(params, dst);
}
void ggml_compute_forward_sin(const ggml_compute_params * params, ggml_tensor * dst) {
unary_op<op_sin>(params, dst);
}
void ggml_compute_forward_cos(const ggml_compute_params * params, ggml_tensor * dst) {
unary_op<op_cos>(params, dst);
}
void ggml_compute_forward_log(const ggml_compute_params * params, ggml_tensor * dst) {
unary_op<op_log>(params, dst);
}

View File

@@ -1,28 +0,0 @@
#pragma once
#include "common.h"
#ifdef __cplusplus
extern "C" {
#endif
void ggml_compute_forward_abs(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_sgn(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_neg(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_step(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_tanh(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_elu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_relu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_sigmoid(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_hardsigmoid(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_exp(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_hardswish(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_sqr(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_sqrt(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_sin(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_cos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_log(const struct ggml_compute_params * params, struct ggml_tensor * dst);
#ifdef __cplusplus
}
#endif

View File

@@ -41,18 +41,15 @@
#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
#define CUDART_HMASK 12000 // CUDA 12.0, min. ver. for half2 -> uint mask comparisons
#define GGML_CUDA_CC_PASCAL 600
#define GGML_CUDA_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
#define GGML_CUDA_CC_VOLTA 700
#define GGML_CUDA_CC_TURING 750
#define GGML_CUDA_CC_AMPERE 800
#define GGML_CUDA_CC_ADA_LOVELACE 890
#define GGML_CUDA_CC_OFFSET_AMD 0x1000000
#define GGML_CUDA_CC_OFFSET_MTHREADS 0x0100000
#define GGML_CUDA_CC_IS_NVIDIA(cc) (cc < GGML_CUDA_CC_OFFSET_MTHREADS)
#define GGML_CUDA_CC_PASCAL 600
#define GGML_CUDA_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
#define GGML_CUDA_CC_VOLTA 700
#define GGML_CUDA_CC_TURING 750
#define GGML_CUDA_CC_AMPERE 800
#define GGML_CUDA_CC_ADA_LOVELACE 890
#define GGML_CUDA_CC_OFFSET_AMD 0x1000000
// AMD
// GCN/CDNA, wave size is 64
// GCN/CNDA, wave size is 64
#define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 0x803) // Tonga, Fiji, Polaris, minimum for fast fp16
#define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 0x900) // Vega56/64, minimum for fp16 dual issue
#define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 0x906) // MI50/Radeon VII, minimum for dp4a
@@ -60,32 +57,21 @@
#define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x910) // MI210, minimum acc register renameing
#define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x942) // MI300
// RDNA removes MFMA, dp4a, xnack, acc registers, wave size is 32
// RNDA removes MFMA, dp4a, xnack, acc registers, wave size is 32
#define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000
#define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a
#define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA
#define GGML_CUDA_CC_RDNA4 (GGML_CUDA_CC_OFFSET_AMD + 0x1200) // RX 9000
#define GGML_CUDA_CC_IS_AMD(cc) (cc >= GGML_CUDA_CC_OFFSET_AMD)
#define GGML_CUDA_CC_IS_RDNA(cc) (cc >= GGML_CUDA_CC_RDNA1)
#define GGML_CUDA_CC_IS_RDNA1(cc) (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2)
#define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA4)
#define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4)
#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3)
#define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA)
#define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1)
// Moore Threads
#define GGML_CUDA_MUSA_ARCH_IS_QY1 (__MUSA_ARCH__ <= 210)
#define GGML_CUDA_CC_QY1 (GGML_MUSA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
#define GGML_CUDA_CC_QY2 (GGML_MUSA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
#define GGML_CUDA_CC_NG (GGML_MUSA_CC_OFFSET_MTHREADS + 0x310) // TBD
#define GGML_CUDA_CC_IS_MTHREADS(cc) (cc >= GGML_CUDA_CC_OFFSET_MTHREADS && cc < GGML_CUDA_CC_OFFSET_AMD)
#define GGML_CUDA_CC_IS_QY1(cc) (cc >= GGML_CUDA_CC_QY1 && cc < GGML_CUDA_CC_QY2)
#define GGML_CUDA_CC_IS_QY2(cc) (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_NEXT)
#define GGML_CUDA_CC_IS_NG(cc) (cc >= GGML_CUDA_CC_NG)
#define GGML_CUDA_CC_QY1 210
#define GGML_CUDA_CC_QY2 220
#ifdef __CUDA_ARCH_LIST__
constexpr bool ggml_cuda_has_arch_impl(int) {
@@ -211,9 +197,9 @@ typedef float2 dfloat2;
#define FP16_MMA_AVAILABLE
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
#if defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || defined(RDNA4))
#if defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3))
#define FP16_MMA_AVAILABLE
#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || defined(RDNA4))
#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3))
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
#define NEW_MMA_AVAILABLE
@@ -223,21 +209,21 @@ typedef float2 dfloat2;
#define CP_ASYNC_AVAILABLE
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && GGML_CUDA_MUSA_ARCH_IS_QY1)
#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
#define FLASH_ATTN_AVAILABLE
#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && GGML_CUDA_MUSA_ARCH_IS_QY1)
#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
static bool fp16_available(const int cc) {
return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL;
}
static bool fast_fp16_available(const int cc) {
return (GGML_CUDA_CC_IS_NVIDIA(cc) && fp16_available(cc) && cc != 610) || GGML_CUDA_CC_IS_AMD(cc);
return fp16_available(cc) && cc != 610;
}
// To be used for feature selection of external libraries, e.g. cuBLAS.
static bool fast_fp16_hardware_available(const int cc) {
return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610) || GGML_CUDA_CC_IS_AMD(cc);
return cc >= GGML_CUDA_CC_PASCAL && cc != 610;
}
// Any FP16 tensor core instructions are available for ggml code.
@@ -245,20 +231,20 @@ static bool fp16_mma_available(const int cc) {
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
return false;
#else
return (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ||
GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc);
return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA ||
GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3;
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
}
// To be used for feature selection of external libraries, e.g. cuBLAS.
static bool fp16_mma_hardware_available(const int cc) {
return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) ||
GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc);
return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_VOLTA ||
GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3;
}
// Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
static bool new_mma_available(const int cc) {
return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
}
static bool cp_async_available(const int cc) {
@@ -288,10 +274,6 @@ static __device__ void no_device_code(
__trap();
GGML_UNUSED(no_device_code); // suppress unused function warning
#if defined(GGML_USE_MUSA)
__builtin_unreachable();
#endif // defined(GGML_USE_MUSA)
}
#ifdef __CUDA_ARCH__
@@ -413,11 +395,11 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#if defined(CDNA) || defined(RDNA2) || defined(__gfx906__)
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2)
c = __builtin_amdgcn_sdot4(a, b, c, false);
#elif defined(RDNA3) || defined(RDNA4)
#elif defined(RDNA3)
c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
#elif defined(RDNA1) || defined(__gfx900__)
#elif defined(__gfx1010__) || defined(__gfx900__)
int tmp1;
int tmp2;
asm("\n \
@@ -696,7 +678,7 @@ struct ggml_tensor_extra_gpu {
};
#if (defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS))
#if ((CUDART_VERSION >= 12000) && defined(GGML_CUDA_USE_GRAPHS)) || defined(GGML_HIP_GRAPHS)
#define USE_CUDA_GRAPH
#endif

View File

@@ -38,7 +38,7 @@ static __global__ void concat_f32_dim1(const float * x, const float * y, float *
blockIdx.y * ne0 +
blockIdx.z * ne0 * gridDim.y;
if (blockIdx.y < (unsigned)ne01) { // src0
if (blockIdx.y < ne01) { // src0
int offset_src =
nidx +
blockIdx.y * ne0 +
@@ -64,7 +64,7 @@ static __global__ void concat_f32_dim2(const float * x, const float * y, float *
blockIdx.y * ne0 +
blockIdx.z * ne0 * gridDim.y;
if (blockIdx.z < (unsigned)ne02) { // src0
if (blockIdx.z < ne02) { // src0
int offset_src =
nidx +
blockIdx.y * ne0 +

Some files were not shown because too many files have changed in this diff Show More