Compare commits
232 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8e14e3ddb3 | ||
|
|
f4d7e54974 | ||
|
|
2256f36b79 | ||
|
|
7359016c7c | ||
|
|
813416991a | ||
|
|
5589921ef8 | ||
|
|
49f44b5c55 | ||
|
|
6685cc41c2 | ||
|
|
ceebbb5b21 | ||
|
|
6daa69ee81 | ||
|
|
fbf1ddec69 | ||
|
|
2aed77eb06 | ||
|
|
c82d18e863 | ||
|
|
14fef85e2d | ||
|
|
e76627bcce | ||
|
|
fbe7dfa53c | ||
|
|
172ac82629 | ||
|
|
d2f650cb5b | ||
|
|
35dec26cc2 | ||
|
|
d460510c72 | ||
|
|
2307523d32 | ||
|
|
0f648573dd | ||
|
|
b764b8f1d0 | ||
|
|
9241c3a2ac | ||
|
|
b2b2bf988c | ||
|
|
af4980bfed | ||
|
|
f2e69d28c0 | ||
|
|
39baaf55a1 | ||
|
|
6db2b41a76 | ||
|
|
753eafed0e | ||
|
|
e976423005 | ||
|
|
35a2ee9143 | ||
|
|
ec903c0341 | ||
|
|
a1d6df129b | ||
|
|
bbe7c56c99 | ||
|
|
62fead3ea0 | ||
|
|
15b4538ff2 | ||
|
|
7032f4f634 | ||
|
|
5f1925a8ce | ||
|
|
3b7c914de2 | ||
|
|
48c857aa10 | ||
|
|
413e7b0559 | ||
|
|
6dd3c28c9c | ||
|
|
38b431de23 | ||
|
|
aad0b01d73 | ||
|
|
1182cf4d4f | ||
|
|
fe54033b69 | ||
|
|
5eaf9964fc | ||
|
|
d292f4f204 | ||
|
|
256d1bb0dd | ||
|
|
faa3526a1e | ||
|
|
ddc5a5033f | ||
|
|
cd4fddb29f | ||
|
|
c9b316c78f | ||
|
|
bf63d695b8 | ||
|
|
1387ea2117 | ||
|
|
26d607608d | ||
|
|
44879ee885 | ||
|
|
9ecdd12e95 | ||
|
|
89758723c7 | ||
|
|
2bed4aa3f3 | ||
|
|
125d03a503 | ||
|
|
011e8ec577 | ||
|
|
6f9939d119 | ||
|
|
780e24a22e | ||
|
|
3ce7e8f8e7 | ||
|
|
b2d80e105a | ||
|
|
28603cd283 | ||
|
|
5e97ec91ae | ||
|
|
7251870780 | ||
|
|
fe8b3c0d4b | ||
|
|
f4dd059259 | ||
|
|
f7276f7500 | ||
|
|
15bceec2d7 | ||
|
|
d6bd4d46dd | ||
|
|
152d9d05e0 | ||
|
|
66d575c45c | ||
|
|
57744932c6 | ||
|
|
3466c6ebcf | ||
|
|
504dc37be8 | ||
|
|
05490fad7f | ||
|
|
6c5629d4d2 | ||
|
|
7dcbe39d36 | ||
|
|
726c0fa9a2 | ||
|
|
942c0107a7 | ||
|
|
b43ebde3b0 | ||
|
|
97c1549808 | ||
|
|
6df465a91d | ||
|
|
77bc1bbd05 | ||
|
|
48e2b13372 | ||
|
|
cca894f16a | ||
|
|
381ee19572 | ||
|
|
a5cacb22b2 | ||
|
|
9b75cb2b3c | ||
|
|
de9a147df1 | ||
|
|
7051aacfac | ||
|
|
2b3b999cac | ||
|
|
993fba8180 | ||
|
|
8b20858e5e | ||
|
|
57e2a7a52a | ||
|
|
9b6ea4263a | ||
|
|
821f0a271e | ||
|
|
96d7f56d29 | ||
|
|
2d5419d08a | ||
|
|
d391ae9b49 | ||
|
|
e9240cdfa0 | ||
|
|
b46757735d | ||
|
|
3e945cc1e9 | ||
|
|
ad19812cda | ||
|
|
682986a08e | ||
|
|
dcad445d0c | ||
|
|
1e605f4102 | ||
|
|
6b6916b215 | ||
|
|
38566680cd | ||
|
|
ba69bbc84c | ||
|
|
44a1a4a41a | ||
|
|
c918fe8dca | ||
|
|
0f83e727af | ||
|
|
4f4bf35f46 | ||
|
|
2b3a665d39 | ||
|
|
7563293665 | ||
|
|
f46c0c1b0e | ||
|
|
5c99960901 | ||
|
|
bee938da74 | ||
|
|
cec8a48470 | ||
|
|
334a835a1c | ||
|
|
4feb4b33ee | ||
|
|
959ef0c0df | ||
|
|
c37b3474e6 | ||
|
|
158f8c9e21 | ||
|
|
862f5e41ab | ||
|
|
3a48d558a6 | ||
|
|
7c8d3abd1a | ||
|
|
122ed4840c | ||
|
|
a0b3ac8c48 | ||
|
|
d75c232e1d | ||
|
|
e0324285a5 | ||
|
|
3e5ca7931c | ||
|
|
4483396751 | ||
|
|
d9aa4ffa6e | ||
|
|
ddb008d845 | ||
|
|
2faaef3979 | ||
|
|
4a3156de2f | ||
|
|
a836c8f534 | ||
|
|
467a882fd2 | ||
|
|
bb0c139247 | ||
|
|
9408cfdad6 | ||
|
|
03c5267490 | ||
|
|
a128c38de8 | ||
|
|
5f5fe1bd60 | ||
|
|
ac32902a87 | ||
|
|
147b17ac94 | ||
|
|
807179ec58 | ||
|
|
76484fbfd3 | ||
|
|
c71d608ce7 | ||
|
|
4be5ef556d | ||
|
|
0ea069b87b | ||
|
|
f172de03f1 | ||
|
|
2d57de5255 | ||
|
|
df845cc982 | ||
|
|
6b48ed0893 | ||
|
|
722d33f34e | ||
|
|
c30b1ef39a | ||
|
|
b38b5e93ae | ||
|
|
7dc78764e2 | ||
|
|
356327feb3 | ||
|
|
ee8243adaa | ||
|
|
15ebe59210 | ||
|
|
de473f5f8e | ||
|
|
f238461236 | ||
|
|
fa5c1fb44a | ||
|
|
52ee4540c0 | ||
|
|
3fe81781e3 | ||
|
|
e7e4df031b | ||
|
|
584d674be6 | ||
|
|
930f907d3e | ||
|
|
e790eef21c | ||
|
|
5537d9d36b | ||
|
|
1b280c9fff | ||
|
|
3cabe80630 | ||
|
|
4315a94366 | ||
|
|
2d00741e12 | ||
|
|
f445c0e68c | ||
|
|
326b418b59 | ||
|
|
1d118386fe | ||
|
|
7edefbd79c | ||
|
|
3ca63b4538 | ||
|
|
b037787548 | ||
|
|
469e75d0a3 | ||
|
|
49662cbed3 | ||
|
|
3ba5b8ca8e | ||
|
|
4330bd83fe | ||
|
|
27379455c3 | ||
|
|
eab6795006 | ||
|
|
d8d90aa343 | ||
|
|
43f76bf1c3 | ||
|
|
2f043328e3 | ||
|
|
2a7c94db5f | ||
|
|
64802ec00d | ||
|
|
3267c2abc7 | ||
|
|
f85a973aa1 | ||
|
|
5362e43962 | ||
|
|
e739de7909 | ||
|
|
c910e3c28a | ||
|
|
f34432ca1e | ||
|
|
7a9f75c38b | ||
|
|
5c1980d8d4 | ||
|
|
cd108e641d | ||
|
|
57d016ba2d | ||
|
|
329ff61569 | ||
|
|
d34633d8db | ||
|
|
4f56458d34 | ||
|
|
6efb8eb30e | ||
|
|
36e5a08b20 | ||
|
|
4dccb38d9a | ||
|
|
9a818f7c42 | ||
|
|
18adb4e9bb | ||
|
|
d9653894df | ||
|
|
128de3585b | ||
|
|
8c58330318 | ||
|
|
18c2e1752c | ||
|
|
8f900abfc0 | ||
|
|
1fc2f265ff | ||
|
|
a9a8c5de3d | ||
|
|
dd5ae06405 | ||
|
|
668b31fc7d | ||
|
|
42ea63c5a3 | ||
|
|
52531fdff8 | ||
|
|
b0034d93ce | ||
|
|
b7e7982953 | ||
|
|
226460cc0d | ||
|
|
d5a410e855 |
26
.devops/main-intel.Dockerfile
Normal file
@@ -0,0 +1,26 @@
|
||||
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
|
||||
ARG UBUNTU_VERSION=22.04
|
||||
|
||||
FROM intel/hpckit:$ONEAPI_VERSION as build
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y git
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
# for some reasons, "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DLLAMA_NATIVE=ON" give worse performance
|
||||
RUN mkdir build && \
|
||||
cd build && \
|
||||
cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx && \
|
||||
cmake --build . --config Release --target main server
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION as runtime
|
||||
|
||||
COPY --from=build /app/build/bin/main /main
|
||||
COPY --from=build /app/build/bin/server /server
|
||||
|
||||
ENV LC_ALL=C.utf8
|
||||
|
||||
ENTRYPOINT [ "/main" ]
|
||||
@@ -7,6 +7,18 @@
|
||||
{ system, ... }:
|
||||
{
|
||||
_module.args = {
|
||||
# Note: bringing up https://zimbatm.com/notes/1000-instances-of-nixpkgs
|
||||
# again, the below creates several nixpkgs instances which the
|
||||
# flake-centric CLI will be forced to evaluate e.g. on `nix flake show`.
|
||||
#
|
||||
# This is currently "slow" and "expensive", on a certain scale.
|
||||
# This also isn't "right" in that this hinders dependency injection at
|
||||
# the level of flake inputs. This might get removed in the foreseeable
|
||||
# future.
|
||||
#
|
||||
# Note that you can use these expressions without Nix
|
||||
# (`pkgs.callPackage ./devops/nix/scope.nix { }` is the entry point).
|
||||
|
||||
pkgsCuda = import inputs.nixpkgs {
|
||||
inherit system;
|
||||
# Ensure dependencies use CUDA consistently (e.g. that openmpi, ucc,
|
||||
|
||||
@@ -73,6 +73,7 @@ let
|
||||
ps: [
|
||||
ps.numpy
|
||||
ps.sentencepiece
|
||||
ps.tiktoken
|
||||
ps.torchWithoutCuda
|
||||
ps.transformers
|
||||
]
|
||||
@@ -114,14 +115,22 @@ effectiveStdenv.mkDerivation (
|
||||
pname = "llama-cpp${pnameSuffix}";
|
||||
version = llamaVersion;
|
||||
|
||||
# Note: none of the files discarded here are visible in the sandbox or
|
||||
# affect the output hash. This also means they can be modified without
|
||||
# triggering a rebuild.
|
||||
src = lib.cleanSourceWith {
|
||||
filter =
|
||||
name: type:
|
||||
!(builtins.any (_: _) [
|
||||
let
|
||||
noneOf = builtins.all (x: !x);
|
||||
baseName = baseNameOf name;
|
||||
in
|
||||
noneOf [
|
||||
(lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths
|
||||
(name == "README.md") # Ignore *.md changes whe computing outPaths
|
||||
(lib.hasPrefix "." name) # Skip hidden files and directories
|
||||
]);
|
||||
(lib.hasSuffix ".md" name) # Ignore *.md changes whe computing outPaths
|
||||
(lib.hasPrefix "." baseName) # Skip hidden files and directories
|
||||
(baseName == "flake.lock")
|
||||
];
|
||||
src = lib.cleanSource ../../.;
|
||||
};
|
||||
|
||||
@@ -159,7 +168,7 @@ effectiveStdenv.mkDerivation (
|
||||
|
||||
cmakeFlags =
|
||||
[
|
||||
(cmakeBool "LLAMA_NATIVE" true)
|
||||
(cmakeBool "LLAMA_NATIVE" false)
|
||||
(cmakeBool "LLAMA_BUILD_SERVER" true)
|
||||
(cmakeBool "BUILD_SHARED_LIBS" true)
|
||||
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
|
||||
@@ -216,6 +225,9 @@ effectiveStdenv.mkDerivation (
|
||||
description = "contains numpy and sentencepiece";
|
||||
buildInputs = [ llama-python ];
|
||||
inputsFrom = [ finalAttrs.finalPackage ];
|
||||
shellHook = ''
|
||||
addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib effectiveStdenv.cc.cc}/lib"
|
||||
'';
|
||||
};
|
||||
|
||||
shell-extra = mkShell {
|
||||
|
||||
@@ -4,6 +4,10 @@
|
||||
llamaVersion ? "0.0.0",
|
||||
}:
|
||||
|
||||
# We're using `makeScope` instead of just writing out an attrset
|
||||
# because it allows users to apply overlays later using `overrideScope'`.
|
||||
# Cf. https://noogle.dev/f/lib/makeScope
|
||||
|
||||
lib.makeScope newScope (
|
||||
self: {
|
||||
inherit llamaVersion;
|
||||
|
||||
32
.devops/server-cuda.Dockerfile
Normal file
@@ -0,0 +1,32 @@
|
||||
ARG UBUNTU_VERSION=22.04
|
||||
# This needs to generally match the container host's environment.
|
||||
ARG CUDA_VERSION=11.7.1
|
||||
# Target the CUDA build image
|
||||
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
|
||||
# Target the CUDA runtime image
|
||||
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
|
||||
|
||||
FROM ${BASE_CUDA_DEV_CONTAINER} as build
|
||||
|
||||
# Unless otherwise specified, we make a fat build.
|
||||
ARG CUDA_DOCKER_ARCH=all
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential git
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
# Set nvcc architecture
|
||||
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
|
||||
# Enable cuBLAS
|
||||
ENV LLAMA_CUBLAS=1
|
||||
|
||||
RUN make
|
||||
|
||||
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
|
||||
|
||||
COPY --from=build /app/server /server
|
||||
|
||||
ENTRYPOINT [ "/server" ]
|
||||
25
.devops/server-intel.Dockerfile
Normal file
@@ -0,0 +1,25 @@
|
||||
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
|
||||
ARG UBUNTU_VERSION=22.04
|
||||
|
||||
FROM intel/hpckit:$ONEAPI_VERSION as build
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y git
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
# for some reasons, "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DLLAMA_NATIVE=ON" give worse performance
|
||||
RUN mkdir build && \
|
||||
cd build && \
|
||||
cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx && \
|
||||
cmake --build . --config Release --target main server
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION as runtime
|
||||
|
||||
COPY --from=build /app/build/bin/server /server
|
||||
|
||||
ENV LC_ALL=C.utf8
|
||||
|
||||
ENTRYPOINT [ "/server" ]
|
||||
45
.devops/server-rocm.Dockerfile
Normal file
@@ -0,0 +1,45 @@
|
||||
ARG UBUNTU_VERSION=22.04
|
||||
|
||||
# This needs to generally match the container host's environment.
|
||||
ARG ROCM_VERSION=5.6
|
||||
|
||||
# Target the CUDA build image
|
||||
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
|
||||
|
||||
FROM ${BASE_ROCM_DEV_CONTAINER} as build
|
||||
|
||||
# Unless otherwise specified, we make a fat build.
|
||||
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
|
||||
# This is mostly tied to rocBLAS supported archs.
|
||||
ARG ROCM_DOCKER_ARCH=\
|
||||
gfx803 \
|
||||
gfx900 \
|
||||
gfx906 \
|
||||
gfx908 \
|
||||
gfx90a \
|
||||
gfx1010 \
|
||||
gfx1030 \
|
||||
gfx1100 \
|
||||
gfx1101 \
|
||||
gfx1102
|
||||
|
||||
COPY requirements.txt requirements.txt
|
||||
COPY requirements requirements
|
||||
|
||||
RUN pip install --upgrade pip setuptools wheel \
|
||||
&& pip install -r requirements.txt
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
# Set nvcc architecture
|
||||
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
|
||||
# Enable ROCm
|
||||
ENV LLAMA_HIPBLAS=1
|
||||
ENV CC=/opt/rocm/llvm/bin/clang
|
||||
ENV CXX=/opt/rocm/llvm/bin/clang++
|
||||
|
||||
RUN make
|
||||
|
||||
ENTRYPOINT [ "/app/server" ]
|
||||
20
.devops/server.Dockerfile
Normal file
@@ -0,0 +1,20 @@
|
||||
ARG UBUNTU_VERSION=22.04
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION as build
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential git
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN make
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION as runtime
|
||||
|
||||
COPY --from=build /app/server /server
|
||||
|
||||
ENV LC_ALL=C.utf8
|
||||
|
||||
ENTRYPOINT [ "/server" ]
|
||||
1
.ecrc
@@ -1,4 +1,5 @@
|
||||
{
|
||||
"Exclude": ["^\\.gitmodules$"],
|
||||
"Disable": {
|
||||
"IndentSize": true
|
||||
}
|
||||
|
||||
103
.github/workflows/build.yml
vendored
@@ -72,7 +72,7 @@ jobs:
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest --verbose --timeout 900
|
||||
ctest -L main --verbose --timeout 900
|
||||
|
||||
ubuntu-latest-cmake-sanitizer:
|
||||
runs-on: ubuntu-latest
|
||||
@@ -107,7 +107,7 @@ jobs:
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest --verbose --timeout 900
|
||||
ctest -L main --verbose --timeout 900
|
||||
|
||||
ubuntu-latest-cmake-mpi:
|
||||
runs-on: ubuntu-latest
|
||||
@@ -141,7 +141,48 @@ jobs:
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest --verbose
|
||||
ctest -L main --verbose
|
||||
|
||||
ubuntu-22-cmake-sycl:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
continue-on-error: true
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
|
||||
- name: add oneAPI to apt
|
||||
shell: bash
|
||||
run: |
|
||||
cd /tmp
|
||||
wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
|
||||
sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
|
||||
rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
|
||||
sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
|
||||
|
||||
- name: install oneAPI dpcpp compiler
|
||||
shell: bash
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install intel-oneapi-compiler-dpcpp-cpp
|
||||
|
||||
- name: install oneAPI MKL library
|
||||
shell: bash
|
||||
run: |
|
||||
sudo apt install intel-oneapi-mkl-devel
|
||||
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
|
||||
cmake --build . --config Release -j $(nproc)
|
||||
|
||||
# TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
|
||||
# how to debug it.
|
||||
@@ -202,7 +243,7 @@ jobs:
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest --verbose --timeout 900
|
||||
ctest -L main --verbose --timeout 900
|
||||
|
||||
macOS-latest-cmake-ios:
|
||||
runs-on: macos-latest
|
||||
@@ -295,7 +336,8 @@ jobs:
|
||||
OPENBLAS_VERSION: 0.3.23
|
||||
OPENCL_VERSION: 2023.04.17
|
||||
CLBLAST_VERSION: 1.6.0
|
||||
SDE_VERSION: 9.21.1-2023-04-24
|
||||
SDE_VERSION: 9.33.0-2024-01-07
|
||||
VULKAN_VERSION: 1.3.261.1
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
@@ -312,6 +354,8 @@ jobs:
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
|
||||
- build: 'openblas'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
|
||||
- build: 'kompute'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@@ -320,6 +364,12 @@ jobs:
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Clone Kompute submodule
|
||||
id: clone_kompute
|
||||
if: ${{ matrix.build == 'kompute' }}
|
||||
run: |
|
||||
git submodule update --init kompute
|
||||
|
||||
- name: Download OpenCL SDK
|
||||
id: get_opencl
|
||||
if: ${{ matrix.build == 'clblast' }}
|
||||
@@ -354,6 +404,15 @@ jobs:
|
||||
$lib = $(join-path $msvc 'bin\Hostx64\x64\lib.exe')
|
||||
& $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll
|
||||
|
||||
- name: Install Vulkan SDK
|
||||
id: get_vulkan
|
||||
if: ${{ matrix.build == 'kompute' }}
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
|
||||
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
|
||||
Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
|
||||
Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
@@ -391,22 +450,23 @@ jobs:
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # not all machines have native AVX-512
|
||||
# not all machines have native AVX-512
|
||||
if: ${{ matrix.build != 'clblast' && matrix.build != 'kompute' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }}
|
||||
run: |
|
||||
cd build
|
||||
ctest -C Release --verbose --timeout 900
|
||||
ctest -L main -C Release --verbose --timeout 900
|
||||
|
||||
- name: Test (Intel SDE)
|
||||
id: cmake_test_sde
|
||||
if: ${{ matrix.build == 'avx512' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/777395/sde-external-${env:SDE_VERSION}-win.tar.xz"
|
||||
curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz"
|
||||
# for some weird reason windows tar doesn't like sde tar.xz
|
||||
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz
|
||||
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
|
||||
$sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
|
||||
cd build
|
||||
& $sde -future -- ctest -C Release --verbose --timeout 900
|
||||
& $sde -future -- ctest -L main -C Release --verbose --timeout 900
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
@@ -515,6 +575,31 @@ jobs:
|
||||
- name: Build Xcode project
|
||||
run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
|
||||
|
||||
android-build:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Set up JDK
|
||||
uses: actions/setup-java@v3
|
||||
with:
|
||||
java-version: 17
|
||||
distribution: zulu
|
||||
|
||||
- name: Setup Android SDK
|
||||
uses: android-actions/setup-android@v3
|
||||
with:
|
||||
log-accepted-android-sdk-licenses: false
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
cd examples/llama.android
|
||||
|
||||
# Skip armeabi-v7a for now (https://github.com/llvm/llvm-project/issues/65820).
|
||||
./gradlew build --no-daemon -Pskip-armeabi-v7a
|
||||
|
||||
# freeBSD-latest:
|
||||
# runs-on: macos-12
|
||||
# steps:
|
||||
|
||||
5
.github/workflows/docker.yml
vendored
@@ -28,13 +28,18 @@ jobs:
|
||||
config:
|
||||
- { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
- { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
- { tag: "server", dockerfile: ".devops/server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
# NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
|
||||
# have disabled them for now until the reason why
|
||||
# is understood.
|
||||
- { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" }
|
||||
- { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
|
||||
- { tag: "server-cuda", dockerfile: ".devops/server-cuda.Dockerfile", platforms: "linux/amd64" }
|
||||
- { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
- { tag: "server-rocm", dockerfile: ".devops/server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
- { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
|
||||
- { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
|
||||
steps:
|
||||
- name: Check out the repo
|
||||
uses: actions/checkout@v3
|
||||
|
||||
62
.github/workflows/nix-ci-aarch64.yml
vendored
Normal file
@@ -0,0 +1,62 @@
|
||||
name: Nix aarch64 builds
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
schedule:
|
||||
# Rebuild daily rather than on every push because QEMU is expensive (e.g.
|
||||
# 1.5h instead of minutes with the cold cache).
|
||||
#
|
||||
# randint(0, 59), randint(0, 23)
|
||||
- cron: '26 12 * * *'
|
||||
# But also rebuild if we touched any of the Nix expressions:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: ['**/*.nix', 'flake.lock']
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: ['**/*.nix', 'flake.lock']
|
||||
|
||||
jobs:
|
||||
nix-build-aarch64:
|
||||
if: ${{ vars.CACHIX_NAME != '' }}
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
- name: Install QEMU
|
||||
# Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y qemu-user-static qemu-system-aarch64
|
||||
sudo usermod -a -G kvm $USER
|
||||
- name: Install Nix
|
||||
uses: DeterminateSystems/nix-installer-action@v9
|
||||
with:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
extra-conf: |
|
||||
extra-platforms = aarch64-linux
|
||||
extra-system-features = nixos-test kvm
|
||||
extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
|
||||
extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
|
||||
- uses: DeterminateSystems/magic-nix-cache-action@v2
|
||||
with:
|
||||
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
|
||||
- name: Set-up cachix to push the results to
|
||||
uses: cachix/cachix-action@v13
|
||||
with:
|
||||
authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
|
||||
name: ${{ vars.CACHIX_NAME }}
|
||||
- name: Show all output paths
|
||||
run: >
|
||||
nix run github:nix-community/nix-eval-jobs
|
||||
-- --gc-roots-dir gcroot
|
||||
--flake
|
||||
".#packages.aarch64-linux"
|
||||
- name: Build
|
||||
run: >
|
||||
nix run github:Mic92/nix-fast-build
|
||||
-- --skip-cached --no-nom
|
||||
--systems aarch64-linux
|
||||
--flake
|
||||
".#checks.aarch64-linux"
|
||||
43
.github/workflows/nix-ci.yml
vendored
@@ -5,10 +5,8 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
|
||||
|
||||
jobs:
|
||||
nix-eval:
|
||||
@@ -69,44 +67,3 @@ jobs:
|
||||
-- --skip-cached --no-nom
|
||||
--flake
|
||||
".#checks.$(nix eval --raw --impure --expr builtins.currentSystem)"
|
||||
nix-build-aarch64:
|
||||
if: ${{ vars.CACHIX_NAME != '' }}
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
- name: Install QEMU
|
||||
# Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654
|
||||
run: |
|
||||
sudo apt-get install -y qemu-user-static qemu-system-aarch64
|
||||
sudo usermod -a -G kvm $USER
|
||||
- name: Install Nix
|
||||
uses: DeterminateSystems/nix-installer-action@v9
|
||||
with:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
extra-conf: |
|
||||
extra-platforms = aarch64-linux
|
||||
extra-system-features = nixos-test kvm
|
||||
extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
|
||||
extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
|
||||
- uses: DeterminateSystems/magic-nix-cache-action@v2
|
||||
with:
|
||||
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
|
||||
- name: Set-up cachix to push the results to
|
||||
uses: cachix/cachix-action@v13
|
||||
with:
|
||||
authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
|
||||
name: ${{ vars.CACHIX_NAME }}
|
||||
- name: Show all output paths
|
||||
run: >
|
||||
nix run github:nix-community/nix-eval-jobs
|
||||
-- --gc-roots-dir gcroot
|
||||
--flake
|
||||
".#packages.aarch64-linux"
|
||||
- name: Build
|
||||
run: >
|
||||
nix run github:Mic92/nix-fast-build
|
||||
-- --skip-cached --no-nom
|
||||
--systems aarch64-linux
|
||||
--flake
|
||||
".#checks.aarch64-linux"
|
||||
|
||||
2
.github/workflows/nix-flake-update.yml
vendored
@@ -19,4 +19,4 @@ jobs:
|
||||
pr-labels: |
|
||||
nix
|
||||
pr-reviewers: philiptaron,SomeoneSerge
|
||||
token: ${{ secrets.GITHUB_TOKEN }}
|
||||
token: ${{ secrets.FLAKE_TOKEN }}
|
||||
|
||||
19
.gitignore
vendored
@@ -27,7 +27,7 @@
|
||||
lcov-report/
|
||||
gcovr-report/
|
||||
|
||||
build*/
|
||||
build*
|
||||
out/
|
||||
tmp/
|
||||
|
||||
@@ -43,6 +43,7 @@ models-mnt
|
||||
/embedding
|
||||
/gguf
|
||||
/gguf-llama-simple
|
||||
/imatrix
|
||||
/infill
|
||||
/libllama.so
|
||||
/llama-bench
|
||||
@@ -88,19 +89,3 @@ examples/jeopardy/results.txt
|
||||
|
||||
poetry.lock
|
||||
poetry.toml
|
||||
|
||||
# Test binaries
|
||||
/tests/test-grammar-parser
|
||||
/tests/test-llama-grammar
|
||||
/tests/test-double-float
|
||||
/tests/test-grad0
|
||||
/tests/test-opt
|
||||
/tests/test-quantize-fns
|
||||
/tests/test-quantize-perf
|
||||
/tests/test-sampling
|
||||
/tests/test-tokenizer-0-llama
|
||||
/tests/test-tokenizer-0-falcon
|
||||
/tests/test-tokenizer-1-llama
|
||||
/tests/test-tokenizer-1-bpe
|
||||
/tests/test-rope
|
||||
/tests/test-backend-ops
|
||||
|
||||
3
.gitmodules
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
[submodule "kompute"]
|
||||
path = kompute
|
||||
url = https://github.com/nomic-ai/kompute.git
|
||||
335
CMakeLists.txt
@@ -1,5 +1,6 @@
|
||||
cmake_minimum_required(VERSION 3.13) # for add_link_options
|
||||
cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
|
||||
project("llama.cpp" C CXX)
|
||||
include(CheckIncludeFileCXX)
|
||||
|
||||
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
|
||||
|
||||
@@ -47,6 +48,7 @@ option(BUILD_SHARED_LIBS "build shared libraries"
|
||||
option(LLAMA_STATIC "llama: static link libraries" OFF)
|
||||
option(LLAMA_NATIVE "llama: enable -march=native flag" ON)
|
||||
option(LLAMA_LTO "llama: enable link time optimization" OFF)
|
||||
option(LLAMA_CCACHE "llama: use ccache if available" ON)
|
||||
|
||||
# debug
|
||||
option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON)
|
||||
@@ -76,6 +78,10 @@ if (NOT MSVC)
|
||||
option(LLAMA_F16C "llama: enable F16C" ${INS_ENB})
|
||||
endif()
|
||||
|
||||
if (WIN32)
|
||||
option(LLAMA_WIN_VER "llama: Windows Version" 0x602)
|
||||
endif()
|
||||
|
||||
# 3rd party libs
|
||||
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
|
||||
option(LLAMA_BLAS "llama: use BLAS" OFF)
|
||||
@@ -93,24 +99,39 @@ set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
|
||||
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
|
||||
option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
|
||||
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
|
||||
option(LLAMA_VULKAN "llama: use Vulkan" OFF)
|
||||
option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
|
||||
option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
|
||||
option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF)
|
||||
option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
|
||||
option(LLAMA_MPI "llama: use MPI" OFF)
|
||||
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
|
||||
option(LLAMA_SYCL "llama: use SYCL" OFF)
|
||||
option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
|
||||
|
||||
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_SERVER "llama: build server example" ON)
|
||||
|
||||
|
||||
# add perf arguments
|
||||
option(LLAMA_PERF "llama: enable perf" OFF)
|
||||
if (LLAMA_PERF)
|
||||
add_definitions(-DGGML_PERF)
|
||||
endif()
|
||||
|
||||
# Required for relocatable CMake package
|
||||
include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
|
||||
|
||||
#
|
||||
# Compile flags
|
||||
#
|
||||
if (LLAMA_SYCL)
|
||||
set(CMAKE_CXX_STANDARD 17)
|
||||
else()
|
||||
set(CMAKE_CXX_STANDARD 11)
|
||||
endif()
|
||||
|
||||
set(CMAKE_CXX_STANDARD 11)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED true)
|
||||
set(CMAKE_C_STANDARD 11)
|
||||
set(CMAKE_C_STANDARD_REQUIRED true)
|
||||
@@ -397,6 +418,28 @@ if (LLAMA_CLBLAST)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (LLAMA_VULKAN)
|
||||
find_package(Vulkan)
|
||||
if (Vulkan_FOUND)
|
||||
message(STATUS "Vulkan found")
|
||||
|
||||
set(GGML_HEADERS_VULKAN ggml-vulkan.h)
|
||||
set(GGML_SOURCES_VULKAN ggml-vulkan.cpp)
|
||||
|
||||
add_library(ggml-vulkan STATIC ggml-vulkan.cpp ggml-vulkan.h)
|
||||
if (BUILD_SHARED_LIBS)
|
||||
set_target_properties(ggml-vulkan PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
endif()
|
||||
target_link_libraries(ggml-vulkan PRIVATE Vulkan::Vulkan)
|
||||
|
||||
add_compile_definitions(GGML_USE_VULKAN)
|
||||
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ggml-vulkan)
|
||||
else()
|
||||
message(WARNING "Vulkan not found")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (LLAMA_HIPBLAS)
|
||||
list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
|
||||
|
||||
@@ -442,6 +485,185 @@ if (LLAMA_HIPBLAS)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (LLAMA_SYCL)
|
||||
if ( NOT DEFINED ENV{ONEAPI_ROOT})
|
||||
message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh")
|
||||
endif()
|
||||
#todo: AOT
|
||||
|
||||
find_package(IntelSYCL REQUIRED)
|
||||
if (LLAMA_SYCL_F16)
|
||||
add_compile_definitions(GGML_SYCL_F16)
|
||||
endif()
|
||||
add_compile_definitions(GGML_USE_SYCL)
|
||||
|
||||
add_compile_options(-I./) #include DPCT
|
||||
add_compile_options(-I/${SYCL_INCLUDE_DIR})
|
||||
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
|
||||
|
||||
set(GGML_HEADERS_SYCL ggml.h ggml-sycl.h)
|
||||
set(GGML_SOURCES_SYCL ggml-sycl.cpp)
|
||||
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
|
||||
endif()
|
||||
|
||||
if (LLAMA_KOMPUTE)
|
||||
add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1)
|
||||
find_package(Vulkan COMPONENTS glslc REQUIRED)
|
||||
find_program(glslc_executable NAMES glslc HINTS Vulkan::glslc)
|
||||
if (NOT glslc_executable)
|
||||
message(FATAL_ERROR "glslc not found")
|
||||
endif()
|
||||
|
||||
function(compile_shader)
|
||||
set(options)
|
||||
set(oneValueArgs)
|
||||
set(multiValueArgs SOURCES)
|
||||
cmake_parse_arguments(compile_shader "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
|
||||
foreach(source ${compile_shader_SOURCES})
|
||||
get_filename_component(filename ${source} NAME)
|
||||
set(spv_file ${filename}.spv)
|
||||
add_custom_command(
|
||||
OUTPUT ${spv_file}
|
||||
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source}
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/common.comp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_getrows.comp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n_pre.comp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n.comp
|
||||
COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${source}
|
||||
COMMENT "Compiling ${source} to ${spv_file}"
|
||||
)
|
||||
|
||||
get_filename_component(RAW_FILE_NAME ${spv_file} NAME)
|
||||
set(FILE_NAME "shader${RAW_FILE_NAME}")
|
||||
string(REPLACE ".comp.spv" ".h" HEADER_FILE ${FILE_NAME})
|
||||
string(TOUPPER ${HEADER_FILE} HEADER_FILE_DEFINE)
|
||||
string(REPLACE "." "_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}")
|
||||
set(OUTPUT_HEADER_FILE "${HEADER_FILE}")
|
||||
message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}")
|
||||
if(CMAKE_GENERATOR MATCHES "Visual Studio")
|
||||
add_custom_command(
|
||||
OUTPUT ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
|
||||
DEPENDS ${spv_file} xxd
|
||||
COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd"
|
||||
)
|
||||
else()
|
||||
add_custom_command(
|
||||
OUTPUT ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_BINARY_DIR}/bin/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
|
||||
DEPENDS ${spv_file} xxd
|
||||
COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/xxd"
|
||||
)
|
||||
endif()
|
||||
endforeach()
|
||||
endfunction()
|
||||
|
||||
if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt")
|
||||
message(STATUS "Kompute found")
|
||||
set(KOMPUTE_OPT_LOG_LEVEL Error CACHE STRING "Kompute log level")
|
||||
add_subdirectory(kompute)
|
||||
|
||||
# Compile our shaders
|
||||
compile_shader(SOURCES
|
||||
kompute-shaders/op_scale.comp
|
||||
kompute-shaders/op_scale_8.comp
|
||||
kompute-shaders/op_add.comp
|
||||
kompute-shaders/op_addrow.comp
|
||||
kompute-shaders/op_mul.comp
|
||||
kompute-shaders/op_silu.comp
|
||||
kompute-shaders/op_relu.comp
|
||||
kompute-shaders/op_gelu.comp
|
||||
kompute-shaders/op_softmax.comp
|
||||
kompute-shaders/op_norm.comp
|
||||
kompute-shaders/op_rmsnorm.comp
|
||||
kompute-shaders/op_diagmask.comp
|
||||
kompute-shaders/op_mul_mat_mat_f32.comp
|
||||
kompute-shaders/op_mul_mat_f16.comp
|
||||
kompute-shaders/op_mul_mat_q8_0.comp
|
||||
kompute-shaders/op_mul_mat_q4_0.comp
|
||||
kompute-shaders/op_mul_mat_q4_1.comp
|
||||
kompute-shaders/op_mul_mat_q6_k.comp
|
||||
kompute-shaders/op_getrows_f16.comp
|
||||
kompute-shaders/op_getrows_q4_0.comp
|
||||
kompute-shaders/op_getrows_q4_1.comp
|
||||
kompute-shaders/op_getrows_q6_k.comp
|
||||
kompute-shaders/op_rope_f16.comp
|
||||
kompute-shaders/op_rope_f32.comp
|
||||
kompute-shaders/op_cpy_f16_f16.comp
|
||||
kompute-shaders/op_cpy_f16_f32.comp
|
||||
kompute-shaders/op_cpy_f32_f16.comp
|
||||
kompute-shaders/op_cpy_f32_f32.comp
|
||||
)
|
||||
|
||||
# Create a custom target for our generated shaders
|
||||
add_custom_target(generated_shaders DEPENDS
|
||||
shaderop_scale.h
|
||||
shaderop_scale_8.h
|
||||
shaderop_add.h
|
||||
shaderop_addrow.h
|
||||
shaderop_mul.h
|
||||
shaderop_silu.h
|
||||
shaderop_relu.h
|
||||
shaderop_gelu.h
|
||||
shaderop_softmax.h
|
||||
shaderop_norm.h
|
||||
shaderop_rmsnorm.h
|
||||
shaderop_diagmask.h
|
||||
shaderop_mul_mat_mat_f32.h
|
||||
shaderop_mul_mat_f16.h
|
||||
shaderop_mul_mat_q8_0.h
|
||||
shaderop_mul_mat_q4_0.h
|
||||
shaderop_mul_mat_q4_1.h
|
||||
shaderop_mul_mat_q6_k.h
|
||||
shaderop_getrows_f16.h
|
||||
shaderop_getrows_q4_0.h
|
||||
shaderop_getrows_q4_1.h
|
||||
shaderop_getrows_q6_k.h
|
||||
shaderop_rope_f16.h
|
||||
shaderop_rope_f32.h
|
||||
shaderop_cpy_f16_f16.h
|
||||
shaderop_cpy_f16_f32.h
|
||||
shaderop_cpy_f32_f16.h
|
||||
shaderop_cpy_f32_f32.h
|
||||
)
|
||||
|
||||
# Create a custom command that depends on the generated_shaders
|
||||
add_custom_command(
|
||||
OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp
|
||||
COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp
|
||||
DEPENDS generated_shaders
|
||||
COMMENT "Ensuring shaders are generated before compiling ggml-kompute.cpp"
|
||||
)
|
||||
|
||||
# Add the stamp to the main sources to ensure dependency tracking
|
||||
set(GGML_SOURCES_KOMPUTE ggml-kompute.cpp ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
|
||||
set(GGML_HEADERS_KOMPUTE ggml-kompute.h ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
|
||||
add_compile_definitions(GGML_USE_KOMPUTE)
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} kompute)
|
||||
set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${CMAKE_BINARY_DIR})
|
||||
else()
|
||||
message(WARNING "Kompute not found")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
function(get_flags CCID CCVER)
|
||||
set(C_FLAGS "")
|
||||
set(CXX_FLAGS "")
|
||||
@@ -454,17 +676,24 @@ function(get_flags CCID CCVER)
|
||||
(CCID STREQUAL "Clang" AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR
|
||||
(CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
|
||||
)
|
||||
set(C_FLAGS ${C_FLAGS} -Wdouble-promotion)
|
||||
list(APPEND C_FLAGS -Wdouble-promotion)
|
||||
endif()
|
||||
elseif (CCID STREQUAL "GNU")
|
||||
set(C_FLAGS -Wdouble-promotion)
|
||||
set(CXX_FLAGS -Wno-array-bounds)
|
||||
|
||||
if (CCVER VERSION_GREATER_EQUAL 7.1.0)
|
||||
set(CXX_FLAGS ${CXX_FLAGS} -Wno-format-truncation)
|
||||
list(APPEND CXX_FLAGS -Wno-format-truncation)
|
||||
endif()
|
||||
if (CCVER VERSION_GREATER_EQUAL 8.1.0)
|
||||
set(CXX_FLAGS ${CXX_FLAGS} -Wextra-semi)
|
||||
list(APPEND CXX_FLAGS -Wextra-semi)
|
||||
endif()
|
||||
elseif (CCID MATCHES "Intel")
|
||||
if (NOT LLAMA_SYCL)
|
||||
# enable max optimization level when using Intel compiler
|
||||
set(C_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
|
||||
set(CXX_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
|
||||
add_link_options(-fuse-ld=lld -static-intel)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
@@ -493,16 +722,18 @@ if (LLAMA_ALL_WARNINGS)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
set(CUDA_CXX_FLAGS "")
|
||||
|
||||
if (LLAMA_CUBLAS)
|
||||
set(CUDA_FLAGS ${CXX_FLAGS} -use_fast_math)
|
||||
if (NOT MSVC)
|
||||
set(CUDA_FLAGS ${CUDA_FLAGS} -Wno-pedantic)
|
||||
list(APPEND CUDA_FLAGS -Wno-pedantic)
|
||||
endif()
|
||||
|
||||
if (LLAMA_ALL_WARNINGS AND NOT MSVC)
|
||||
set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c)
|
||||
if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "")
|
||||
set(NVCC_CMD ${NVCC_CMD} -ccbin ${CMAKE_CUDA_HOST_COMPILER})
|
||||
list(APPEND NVCC_CMD -ccbin ${CMAKE_CUDA_HOST_COMPILER})
|
||||
endif()
|
||||
|
||||
execute_process(
|
||||
@@ -530,13 +761,8 @@ if (LLAMA_CUBLAS)
|
||||
message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")
|
||||
|
||||
get_flags(${CUDA_CCID} ${CUDA_CCVER})
|
||||
list(JOIN GF_CXX_FLAGS " " CUDA_CXX_FLAGS) # pass host compiler flags as a single argument
|
||||
if (NOT CUDA_CXX_FLAGS STREQUAL "")
|
||||
set(CUDA_FLAGS ${CUDA_FLAGS} -Xcompiler ${CUDA_CXX_FLAGS})
|
||||
endif()
|
||||
list(APPEND CUDA_CXX_FLAGS ${GF_CXX_FLAGS}) # This is passed to -Xcompiler later
|
||||
endif()
|
||||
|
||||
add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")
|
||||
endif()
|
||||
|
||||
if (WIN32)
|
||||
@@ -557,6 +783,17 @@ if (LLAMA_LTO)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (LLAMA_CCACHE)
|
||||
find_program(LLAMA_CCACHE_FOUND ccache)
|
||||
if (LLAMA_CCACHE_FOUND)
|
||||
set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache)
|
||||
set(ENV{CCACHE_SLOPPINESS} time_macros)
|
||||
message(STATUS "Using ccache")
|
||||
else()
|
||||
message(STATUS "Warning: ccache not found - consider installing it or use LLAMA_CCACHE=OFF")
|
||||
endif ()
|
||||
endif()
|
||||
|
||||
# this version of Apple ld64 is buggy
|
||||
execute_process(
|
||||
COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v
|
||||
@@ -590,6 +827,8 @@ if (NOT MSVC)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
set(ARCH_FLAGS "")
|
||||
|
||||
if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64"))
|
||||
message(STATUS "ARM detected")
|
||||
if (MSVC)
|
||||
@@ -601,19 +840,19 @@ if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATC
|
||||
else()
|
||||
check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
|
||||
if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
|
||||
add_compile_options(-mfp16-format=ieee)
|
||||
list(APPEND ARCH_FLAGS -mfp16-format=ieee)
|
||||
endif()
|
||||
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
|
||||
# Raspberry Pi 1, Zero
|
||||
add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access)
|
||||
list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
|
||||
endif()
|
||||
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
|
||||
# Raspberry Pi 2
|
||||
add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
|
||||
list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
|
||||
endif()
|
||||
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
|
||||
# Raspberry Pi 3, 4, Zero 2 (32-bit)
|
||||
add_compile_options(-mno-unaligned-access)
|
||||
list(APPEND ARCH_FLAGS -mno-unaligned-access)
|
||||
endif()
|
||||
endif()
|
||||
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "^(x86_64|i686|amd64|x64)$" )
|
||||
@@ -624,8 +863,7 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE
|
||||
include(cmake/FindSIMD.cmake)
|
||||
endif ()
|
||||
if (LLAMA_AVX512)
|
||||
add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX512>)
|
||||
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX512>)
|
||||
list(APPEND ARCH_FLAGS /arch:AVX512)
|
||||
# MSVC has no compile-time flags enabling specific
|
||||
# AVX512 extensions, neither it defines the
|
||||
# macros corresponding to the extensions.
|
||||
@@ -639,54 +877,64 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE
|
||||
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
|
||||
endif()
|
||||
elseif (LLAMA_AVX2)
|
||||
add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX2>)
|
||||
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX2>)
|
||||
list(APPEND ARCH_FLAGS /arch:AVX2)
|
||||
elseif (LLAMA_AVX)
|
||||
add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX>)
|
||||
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX>)
|
||||
list(APPEND ARCH_FLAGS /arch:AVX)
|
||||
endif()
|
||||
else()
|
||||
if (LLAMA_NATIVE)
|
||||
add_compile_options(-march=native)
|
||||
list(APPEND ARCH_FLAGS -march=native)
|
||||
endif()
|
||||
if (LLAMA_F16C)
|
||||
add_compile_options(-mf16c)
|
||||
list(APPEND ARCH_FLAGS -mf16c)
|
||||
endif()
|
||||
if (LLAMA_FMA)
|
||||
add_compile_options(-mfma)
|
||||
list(APPEND ARCH_FLAGS -mfma)
|
||||
endif()
|
||||
if (LLAMA_AVX)
|
||||
add_compile_options(-mavx)
|
||||
list(APPEND ARCH_FLAGS -mavx)
|
||||
endif()
|
||||
if (LLAMA_AVX2)
|
||||
add_compile_options(-mavx2)
|
||||
list(APPEND ARCH_FLAGS -mavx2)
|
||||
endif()
|
||||
if (LLAMA_AVX512)
|
||||
add_compile_options(-mavx512f)
|
||||
add_compile_options(-mavx512bw)
|
||||
list(APPEND ARCH_FLAGS -mavx512f)
|
||||
list(APPEND ARCH_FLAGS -mavx512bw)
|
||||
endif()
|
||||
if (LLAMA_AVX512_VBMI)
|
||||
add_compile_options(-mavx512vbmi)
|
||||
list(APPEND ARCH_FLAGS -mavx512vbmi)
|
||||
endif()
|
||||
if (LLAMA_AVX512_VNNI)
|
||||
add_compile_options(-mavx512vnni)
|
||||
list(APPEND ARCH_FLAGS -mavx512vnni)
|
||||
endif()
|
||||
endif()
|
||||
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
|
||||
message(STATUS "PowerPC detected")
|
||||
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
|
||||
add_compile_options(-mcpu=powerpc64le)
|
||||
list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
|
||||
else()
|
||||
add_compile_options(-mcpu=native -mtune=native)
|
||||
list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
|
||||
#TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
|
||||
endif()
|
||||
else()
|
||||
message(STATUS "Unknown architecture")
|
||||
endif()
|
||||
|
||||
add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
|
||||
add_compile_options("$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
|
||||
|
||||
if (LLAMA_CUBLAS)
|
||||
list(APPEND CUDA_CXX_FLAGS ${ARCH_FLAGS})
|
||||
list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED) # pass host compiler flags as a single argument
|
||||
if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")
|
||||
list(APPEND CUDA_FLAGS -Xcompiler ${CUDA_CXX_FLAGS_JOINED})
|
||||
endif()
|
||||
add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")
|
||||
endif()
|
||||
|
||||
if (MINGW)
|
||||
# Target Windows 8 for PrefetchVirtualMemory
|
||||
add_compile_definitions(_WIN32_WINNT=0x602)
|
||||
add_compile_definitions(_WIN32_WINNT=${LLAMA_WIN_VER})
|
||||
endif()
|
||||
|
||||
#
|
||||
@@ -758,11 +1006,14 @@ add_library(ggml OBJECT
|
||||
ggml-backend.h
|
||||
ggml-quants.c
|
||||
ggml-quants.h
|
||||
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
|
||||
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
|
||||
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
|
||||
${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
|
||||
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
|
||||
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
|
||||
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
|
||||
${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN}
|
||||
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
|
||||
${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
|
||||
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
|
||||
${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}
|
||||
${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
|
||||
)
|
||||
|
||||
target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
|
||||
@@ -838,8 +1089,8 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
|
||||
${CMAKE_CURRENT_BINARY_DIR}/LlamaConfigVersion.cmake
|
||||
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama)
|
||||
|
||||
set(GGML_PUBLIC_HEADERS "ggml.h"
|
||||
"${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}"
|
||||
set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h"
|
||||
"${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}" "${GGML_HEADERS_VULKAN}"
|
||||
"${GGML_HEADERS_METAL}" "${GGML_HEADERS_MPI}" "${GGML_HEADERS_EXTRA}")
|
||||
|
||||
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
|
||||
|
||||
37
Makefile
@@ -1,6 +1,6 @@
|
||||
# Define the default target now so that it is always the first target
|
||||
BUILD_TARGETS = \
|
||||
main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
|
||||
main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
|
||||
simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
|
||||
speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o
|
||||
|
||||
@@ -9,7 +9,7 @@ TEST_TARGETS = \
|
||||
tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
|
||||
tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
|
||||
tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
|
||||
tests/test-backend-ops
|
||||
tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease
|
||||
|
||||
# Code coverage output files
|
||||
COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
|
||||
@@ -43,10 +43,6 @@ ifeq ($(UNAME_S),Darwin)
|
||||
endif
|
||||
endif
|
||||
|
||||
ifneq '' '$(or $(filter clean,$(MAKECMDGOALS)),$(LLAMA_METAL))'
|
||||
BUILD_TARGETS += metal
|
||||
endif
|
||||
|
||||
default: $(BUILD_TARGETS)
|
||||
|
||||
test: $(TEST_TARGETS)
|
||||
@@ -452,6 +448,19 @@ ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
endif # LLAMA_CLBLAST
|
||||
|
||||
ifdef LLAMA_VULKAN
|
||||
MK_CPPFLAGS += -DGGML_USE_VULKAN
|
||||
MK_LDFLAGS += -lvulkan
|
||||
OBJS += ggml-vulkan.o
|
||||
|
||||
ifdef LLAMA_VULKAN_CHECK_RESULTS
|
||||
MK_CPPFLAGS += -DGGML_VULKAN_CHECK_RESULTS
|
||||
endif
|
||||
|
||||
ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
endif # LLAMA_VULKAN
|
||||
|
||||
ifdef LLAMA_HIPBLAS
|
||||
|
||||
ifeq ($(wildcard /opt/rocm),)
|
||||
@@ -614,13 +623,16 @@ quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.
|
||||
perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
imatrix: examples/imatrix/imatrix.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||
server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual
|
||||
|
||||
gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
|
||||
@@ -668,11 +680,6 @@ lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
ifdef LLAMA_METAL
|
||||
metal: examples/metal/metal.cpp ggml.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
endif
|
||||
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
swift: examples/batched.swift
|
||||
(cd examples/batched.swift; make build)
|
||||
@@ -753,3 +760,9 @@ tests/test-c.o: tests/test-c.c llama.h
|
||||
|
||||
tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
@@ -14,14 +14,14 @@ let package = Package(
|
||||
.library(name: "llama", targets: ["llama"]),
|
||||
],
|
||||
dependencies: [
|
||||
.package(url: "https://github.com/ggerganov/ggml.git", .branch("master"))
|
||||
.package(url: "https://github.com/ggerganov/ggml.git", .branch("release"))
|
||||
],
|
||||
targets: [
|
||||
.target(
|
||||
name: "llama",
|
||||
dependencies: ["ggml"],
|
||||
path: ".",
|
||||
exclude: [],
|
||||
exclude: ["ggml-metal.metal"],
|
||||
sources: [
|
||||
"llama.cpp",
|
||||
],
|
||||
|
||||
46
README.md
@@ -10,10 +10,10 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
|
||||
|
||||
### Hot topics
|
||||
|
||||
- New SOTA quantized models, including pure 2-bits: https://huggingface.co/ikawrakow
|
||||
- Collecting Apple Silicon performance stats:
|
||||
- M-series: https://github.com/ggerganov/llama.cpp/discussions/4167
|
||||
- A-series: https://github.com/ggerganov/llama.cpp/discussions/4508
|
||||
- Added Mixtral support: https://github.com/ggerganov/llama.cpp/pull/4406
|
||||
- Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216
|
||||
|
||||
----
|
||||
@@ -62,7 +62,7 @@ The main goal of `llama.cpp` is to run the LLaMA model using 4-bit integer quant
|
||||
- AVX, AVX2 and AVX512 support for x86 architectures
|
||||
- Mixed F16 / F32 precision
|
||||
- 2-bit, 3-bit, 4-bit, 5-bit, 6-bit and 8-bit integer quantization support
|
||||
- CUDA, Metal and OpenCL GPU backend support
|
||||
- CUDA, Metal, OpenCL, SYCL GPU backend support
|
||||
|
||||
The original implementation of `llama.cpp` was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022).
|
||||
Since then, the project has improved significantly thanks to many contributions. This project is mainly for educational purposes and serves
|
||||
@@ -111,6 +111,7 @@ as the main playground for developing new features for the [ggml](https://github
|
||||
- [x] [Bakllava](https://huggingface.co/models?search=SkunkworksAI/Bakllava)
|
||||
- [x] [Obsidian](https://huggingface.co/NousResearch/Obsidian-3B-V0.5)
|
||||
- [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V)
|
||||
- [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)
|
||||
|
||||
|
||||
**Bindings:**
|
||||
@@ -118,14 +119,17 @@ as the main playground for developing new features for the [ggml](https://github
|
||||
- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
|
||||
- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
|
||||
- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
|
||||
- JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp)
|
||||
- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
|
||||
- Rust: [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
|
||||
- Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
|
||||
- Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
|
||||
- C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
|
||||
- Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
|
||||
- Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
|
||||
- React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
|
||||
- Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
|
||||
- Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
|
||||
- Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
|
||||
|
||||
**UI:**
|
||||
|
||||
@@ -135,6 +139,7 @@ as the main playground for developing new features for the [ggml](https://github
|
||||
- [semperai/amica](https://github.com/semperai/amica)
|
||||
- [psugihara/FreeChat](https://github.com/psugihara/FreeChat)
|
||||
- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
|
||||
- [iohub/collama](https://github.com/iohub/coLLaMA)
|
||||
|
||||
---
|
||||
|
||||
@@ -285,7 +290,7 @@ In order to build llama.cpp you have three different options.
|
||||
sudo pkg install gmake automake autoconf pkgconf llvm15 clinfo clover \
|
||||
opencl clblast openblas
|
||||
|
||||
gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4
|
||||
gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4
|
||||
```
|
||||
|
||||
**Notes:** With this packages you can build llama.cpp with OPENBLAS and
|
||||
@@ -593,15 +598,24 @@ Building the program with BLAS support may lead to some performance improvements
|
||||
|
||||
You can get a list of platforms and devices from the `clinfo -l` command, etc.
|
||||
|
||||
- #### SYCL
|
||||
|
||||
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
|
||||
|
||||
llama.cpp based on SYCL is used to support Intel GPU (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
|
||||
|
||||
For detailed info, please refer to [llama.cpp for SYCL](README_sycl.md).
|
||||
|
||||
|
||||
### Prepare Data & Run
|
||||
|
||||
```bash
|
||||
# obtain the original LLaMA model weights and place them in ./models
|
||||
ls ./models
|
||||
65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model
|
||||
# [Optional] for models using BPE tokenizers
|
||||
ls ./models
|
||||
65B 30B 13B 7B vocab.json
|
||||
# [Optional] for models using BPE tokenizers
|
||||
ls ./models
|
||||
65B 30B 13B 7B vocab.json
|
||||
|
||||
# install Python dependencies
|
||||
python3 -m pip install -r requirements.txt
|
||||
@@ -609,8 +623,8 @@ python3 -m pip install -r requirements.txt
|
||||
# convert the 7B model to ggml FP16 format
|
||||
python3 convert.py models/7B/
|
||||
|
||||
# [Optional] for models using BPE tokenizers
|
||||
python convert.py models/7B/ --vocabtype bpe
|
||||
# [Optional] for models using BPE tokenizers
|
||||
python convert.py models/7B/ --vocabtype bpe
|
||||
|
||||
# quantize the model to 4-bits (using q4_0 method)
|
||||
./quantize ./models/7B/ggml-model-f16.gguf ./models/7B/ggml-model-q4_0.gguf q4_0
|
||||
@@ -926,17 +940,20 @@ Place your desired model into the `~/llama.cpp/models/` directory and execute th
|
||||
* Create a folder to store big models & intermediate files (ex. /llama/models)
|
||||
|
||||
#### Images
|
||||
We have two Docker images available for this project:
|
||||
We have three Docker images available for this project:
|
||||
|
||||
1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executabhle file. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
|
||||
Additionally, there the following images, similar to the above:
|
||||
|
||||
- `ghcr.io/ggerganov/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggerganov/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggerganov/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
- `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
|
||||
The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now).
|
||||
|
||||
@@ -962,6 +979,12 @@ or with a light image:
|
||||
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
|
||||
```
|
||||
|
||||
or with a server image:
|
||||
|
||||
```bash
|
||||
docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggerganov/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512
|
||||
```
|
||||
|
||||
### Docker With CUDA
|
||||
|
||||
Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) properly installed on Linux, or is using a GPU enabled cloud, `cuBLAS` should be accessible inside the container.
|
||||
@@ -971,6 +994,7 @@ Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia
|
||||
```bash
|
||||
docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
|
||||
docker build -t local/llama.cpp:light-cuda -f .devops/main-cuda.Dockerfile .
|
||||
docker build -t local/llama.cpp:server-cuda -f .devops/server-cuda.Dockerfile .
|
||||
```
|
||||
|
||||
You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture.
|
||||
@@ -984,6 +1008,7 @@ The resulting images, are essentially the same as the non-CUDA images:
|
||||
|
||||
1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
|
||||
2. `local/llama.cpp:light-cuda`: This image only includes the main executable file.
|
||||
3. `local/llama.cpp:server-cuda`: This image only includes the server executable file.
|
||||
|
||||
#### Usage
|
||||
|
||||
@@ -992,6 +1017,7 @@ After building locally, Usage is similar to the non-CUDA examples, but you'll ne
|
||||
```bash
|
||||
docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
|
||||
docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
|
||||
docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
|
||||
```
|
||||
|
||||
### Contributing
|
||||
|
||||
252
README_sycl.md
Normal file
@@ -0,0 +1,252 @@
|
||||
# llama.cpp for SYCL
|
||||
|
||||
[Background](#background)
|
||||
|
||||
[OS](#os)
|
||||
|
||||
[Intel GPU](#intel-gpu)
|
||||
|
||||
[Linux](#linux)
|
||||
|
||||
[Environment Variable](#environment-variable)
|
||||
|
||||
[Known Issue](#known-issue)
|
||||
|
||||
[Todo](#todo)
|
||||
|
||||
## Background
|
||||
|
||||
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators—such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17.
|
||||
|
||||
oneAPI is a specification that is open and standards-based, supporting multiple architecture types including but not limited to GPU, CPU, and FPGA. The spec has both direct programming and API-based programming paradigms.
|
||||
|
||||
Intel uses the SYCL as direct programming language to support CPU, GPUs and FPGAs.
|
||||
|
||||
To avoid to re-invent the wheel, this code refer other code paths in llama.cpp (like OpenBLAS, cuBLAS, CLBlast). We use a open-source tool [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) migrate to SYCL.
|
||||
|
||||
The llama.cpp for SYCL is used to support Intel GPUs.
|
||||
|
||||
For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
|
||||
|
||||
## OS
|
||||
|
||||
|OS|Status|Verified|
|
||||
|-|-|-|
|
||||
|Linux|Support|Ubuntu 22.04|
|
||||
|Windows|Ongoing| |
|
||||
|
||||
|
||||
## Intel GPU
|
||||
|
||||
|Intel GPU| Status | Verified Model|
|
||||
|-|-|-|
|
||||
|Intel Data Center Max Series| Support| Max 1550|
|
||||
|Intel Data Center Flex Series| Support| Flex 170|
|
||||
|Intel Arc Series| Support| Arc 770|
|
||||
|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
|
||||
|Intel iGPU| Support| iGPU in i5-1250P, i7-1165G7|
|
||||
|
||||
|
||||
## Linux
|
||||
|
||||
### Setup Environment
|
||||
|
||||
1. Install Intel GPU driver.
|
||||
|
||||
a. Please install Intel GPU driver by official guide: [Install GPU Drivers](https://dgpu-docs.intel.com/driver/installation.html).
|
||||
|
||||
Note: for iGPU, please install the client GPU driver.
|
||||
|
||||
b. Add user to group: video, render.
|
||||
|
||||
```
|
||||
sudo usermod -aG render username
|
||||
sudo usermod -aG video username
|
||||
```
|
||||
|
||||
Note: re-login to enable it.
|
||||
|
||||
c. Check
|
||||
|
||||
```
|
||||
sudo apt install clinfo
|
||||
sudo clinfo -l
|
||||
```
|
||||
|
||||
Output (example):
|
||||
|
||||
```
|
||||
Platform #0: Intel(R) OpenCL Graphics
|
||||
`-- Device #0: Intel(R) Arc(TM) A770 Graphics
|
||||
|
||||
|
||||
Platform #0: Intel(R) OpenCL HD Graphics
|
||||
`-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
|
||||
```
|
||||
|
||||
2. Install Intel® oneAPI Base toolkit.
|
||||
|
||||
|
||||
a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
|
||||
|
||||
Recommend to install to default folder: **/opt/intel/oneapi**.
|
||||
|
||||
Following guide use the default folder as example. If you use other folder, please modify the following guide info with your folder.
|
||||
|
||||
b. Check
|
||||
|
||||
```
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
sycl-ls
|
||||
```
|
||||
|
||||
There should be one or more level-zero devices. Like **[ext_oneapi_level_zero:gpu:0]**.
|
||||
|
||||
Output (example):
|
||||
```
|
||||
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
|
||||
[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
|
||||
[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50]
|
||||
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
|
||||
|
||||
```
|
||||
|
||||
2. Build locally:
|
||||
|
||||
```
|
||||
mkdir -p build
|
||||
cd build
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
#for FP16
|
||||
#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference
|
||||
|
||||
#for FP32
|
||||
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
|
||||
#build example/main only
|
||||
#cmake --build . --config Release --target main
|
||||
|
||||
#build all binary
|
||||
cmake --build . --config Release -v
|
||||
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
```
|
||||
./examples/sycl/build.sh
|
||||
```
|
||||
|
||||
Note:
|
||||
|
||||
- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only.
|
||||
|
||||
### Run
|
||||
|
||||
1. Put model file to folder **models**
|
||||
|
||||
2. Enable oneAPI running environment
|
||||
|
||||
```
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
```
|
||||
|
||||
3. List device ID
|
||||
|
||||
Run without parameter:
|
||||
|
||||
```
|
||||
./build/bin/ls-sycl-device
|
||||
|
||||
or
|
||||
|
||||
./build/bin/main
|
||||
```
|
||||
|
||||
Check the ID in startup log, like:
|
||||
|
||||
```
|
||||
found 4 SYCL devices:
|
||||
Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
|
||||
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
|
||||
Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
|
||||
max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
|
||||
Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
|
||||
max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
|
||||
Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
|
||||
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
|
||||
|
||||
```
|
||||
|
||||
|Attribute|Note|
|
||||
|-|-|
|
||||
|compute capability 1.3|Level-zero running time, recommended |
|
||||
|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
|
||||
|
||||
4. Set device ID and execute llama.cpp
|
||||
|
||||
Set device ID = 0 by **GGML_SYCL_DEVICE=0**
|
||||
|
||||
```
|
||||
GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
|
||||
```
|
||||
or run by script:
|
||||
|
||||
```
|
||||
./examples/sycl/run_llama2.sh
|
||||
```
|
||||
|
||||
Note:
|
||||
|
||||
- By default, mmap is used to read model file. In some cases, it leads to the hang issue. Recommend to use parameter **--no-mmap** to disable mmap() to skip this issue.
|
||||
|
||||
|
||||
5. Check the device ID in output
|
||||
|
||||
Like:
|
||||
```
|
||||
Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
|
||||
```
|
||||
|
||||
|
||||
## Environment Variable
|
||||
|
||||
#### Build
|
||||
|
||||
|Name|Value|Function|
|
||||
|-|-|-|
|
||||
|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path. <br>For FP32/FP16, LLAMA_SYCL=ON is mandatory.|
|
||||
|LLAMA_SYCL_F16|ON (optional)|Enable FP16 build with SYCL code path. Faster for long-prompt inference. <br>For FP32, not set it.|
|
||||
|CMAKE_C_COMPILER|icx|Use icx compiler for SYCL code path|
|
||||
|CMAKE_CXX_COMPILER|icpx|use icpx for SYCL code path|
|
||||
|
||||
#### Running
|
||||
|
||||
|
||||
|Name|Value|Function|
|
||||
|-|-|-|
|
||||
|GGML_SYCL_DEVICE|0 (default) or 1|Set the device id used. Check the device ids by default running output|
|
||||
|GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
|
||||
|
||||
## Known Issue
|
||||
|
||||
- Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
|
||||
|
||||
Miss to enable oneAPI running environment.
|
||||
|
||||
Install oneAPI base toolkit and enable it by: `source /opt/intel/oneapi/setvars.sh`.
|
||||
|
||||
|
||||
- Hang during startup
|
||||
|
||||
llama.cpp use mmap as default way to read model file and copy to GPU. In some system, memcpy will be abnormal and block.
|
||||
|
||||
Solution: add **--no-mmap**.
|
||||
|
||||
## Todo
|
||||
|
||||
- Support to build in Windows.
|
||||
|
||||
- Support multiple cards.
|
||||
@@ -43,7 +43,7 @@ Example for llama model
|
||||
# For llama7b and llama2 models
|
||||
python convert.py models/llama-7b/ --awq-path awq_cache/llama-7b-w4-g128.pt --outfile models/llama_7b_fp16.gguf
|
||||
# For mistral and mpt models
|
||||
python convert-hf-to-gguf.py models/mpt-7b/ --awq-path awq_cache/llama-7b-w4-g128.pt --outfile models/mpt_7b_fp16.gguf
|
||||
python convert-hf-to-gguf.py models/mpt-7b/ --awq-path awq_cache/mpt-7b-w4-g128.pt --outfile models/mpt_7b_fp16.gguf
|
||||
```
|
||||
|
||||
## Quantize
|
||||
|
||||
@@ -22,4 +22,8 @@ bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
|
||||
# with CUDA support
|
||||
GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
|
||||
# with SYCL support
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
```
|
||||
|
||||
113
ci/run.sh
@@ -10,6 +10,9 @@
|
||||
# # with CUDA support
|
||||
# GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
#
|
||||
# # with SYCL support
|
||||
# GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
#
|
||||
|
||||
if [ -z "$2" ]; then
|
||||
echo "usage: $0 <output-dir> <mnt-dir>"
|
||||
@@ -22,9 +25,9 @@ mkdir -p "$2"
|
||||
OUT=$(realpath "$1")
|
||||
MNT=$(realpath "$2")
|
||||
|
||||
rm -v $OUT/*.log
|
||||
rm -v $OUT/*.exit
|
||||
rm -v $OUT/*.md
|
||||
rm -f "$OUT/*.log"
|
||||
rm -f "$OUT/*.exit"
|
||||
rm -f "$OUT/*.md"
|
||||
|
||||
sd=`dirname $0`
|
||||
cd $sd/../
|
||||
@@ -36,6 +39,18 @@ if [ ! -z ${GG_BUILD_METAL} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_METAL_SHADER_DEBUG=ON"
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_CUDA} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1"
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_SYCL} ]; then
|
||||
if [ -z ${ONEAPI_ROOT} ]; then
|
||||
echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:\n source /opt/intel/oneapi/setvars.sh"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON"
|
||||
fi
|
||||
## helpers
|
||||
|
||||
# download a file if it does not exist or if it is outdated
|
||||
@@ -90,7 +105,7 @@ function gg_run_ctest_debug {
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
(time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
(time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
|
||||
set +e
|
||||
}
|
||||
@@ -119,9 +134,9 @@ function gg_run_ctest_release {
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||
(time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
(time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
else
|
||||
(time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
(time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
fi
|
||||
|
||||
set +e
|
||||
@@ -137,6 +152,61 @@ function gg_sum_ctest_release {
|
||||
gg_printf '```\n'
|
||||
}
|
||||
|
||||
function gg_get_model {
|
||||
local gguf_3b="$MNT/models/open-llama/3B-v2/ggml-model-f16.gguf"
|
||||
local gguf_7b="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
|
||||
if [[ -s $gguf_3b ]]; then
|
||||
echo -n "$gguf_3b"
|
||||
elif [[ -s $gguf_7b ]]; then
|
||||
echo -n "$gguf_7b"
|
||||
else
|
||||
echo >&2 "No model found. Can't run gg_run_ctest_with_model."
|
||||
exit 1
|
||||
fi
|
||||
}
|
||||
|
||||
function gg_run_ctest_with_model_debug {
|
||||
cd ${SRC}
|
||||
|
||||
local model; model=$(gg_get_model)
|
||||
cd build-ci-debug
|
||||
set -e
|
||||
(LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
set +e
|
||||
cd ..
|
||||
}
|
||||
|
||||
function gg_run_ctest_with_model_release {
|
||||
cd ${SRC}
|
||||
|
||||
local model; model=$(gg_get_model)
|
||||
cd build-ci-release
|
||||
set -e
|
||||
(LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
set +e
|
||||
cd ..
|
||||
}
|
||||
|
||||
function gg_sum_ctest_with_model_debug {
|
||||
gg_printf '### %s\n\n' "${ci}"
|
||||
|
||||
gg_printf 'Runs ctest with model files in debug mode\n'
|
||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||
gg_printf '```\n'
|
||||
gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
|
||||
gg_printf '```\n'
|
||||
}
|
||||
|
||||
function gg_sum_ctest_with_model_release {
|
||||
gg_printf '### %s\n\n' "${ci}"
|
||||
|
||||
gg_printf 'Runs ctest with model files in release mode\n'
|
||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||
gg_printf '```\n'
|
||||
gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
|
||||
gg_printf '```\n'
|
||||
}
|
||||
|
||||
# open_llama_3b_v2
|
||||
|
||||
function gg_run_open_llama_3b_v2 {
|
||||
@@ -160,8 +230,8 @@ function gg_run_open_llama_3b_v2 {
|
||||
|
||||
set -e
|
||||
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
python3 ../convert.py ${path_models}
|
||||
|
||||
@@ -214,6 +284,8 @@ function gg_run_open_llama_3b_v2 {
|
||||
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
|
||||
(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
|
||||
function check_ppl {
|
||||
@@ -241,6 +313,8 @@ function gg_run_open_llama_3b_v2 {
|
||||
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
|
||||
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
|
||||
|
||||
# lora
|
||||
function compare_ppl {
|
||||
qnt="$1"
|
||||
@@ -282,7 +356,6 @@ function gg_run_open_llama_3b_v2 {
|
||||
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
|
||||
compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
|
||||
|
||||
|
||||
set +e
|
||||
}
|
||||
|
||||
@@ -292,6 +365,7 @@ function gg_sum_open_llama_3b_v2 {
|
||||
gg_printf 'OpenLLaMA 3B-v2:\n'
|
||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
|
||||
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
|
||||
gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
|
||||
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
|
||||
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
||||
@@ -337,8 +411,8 @@ function gg_run_open_llama_7b_v2 {
|
||||
|
||||
set -e
|
||||
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_CUBLAS=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUBLAS=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
python3 ../convert.py ${path_models}
|
||||
|
||||
@@ -391,6 +465,8 @@ function gg_run_open_llama_7b_v2 {
|
||||
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
|
||||
(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
|
||||
function check_ppl {
|
||||
@@ -418,6 +494,8 @@ function gg_run_open_llama_7b_v2 {
|
||||
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
|
||||
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
|
||||
|
||||
# lora
|
||||
function compare_ppl {
|
||||
qnt="$1"
|
||||
@@ -469,6 +547,7 @@ function gg_sum_open_llama_7b_v2 {
|
||||
gg_printf 'OpenLLaMA 7B-v2:\n'
|
||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
|
||||
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
|
||||
gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
|
||||
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
|
||||
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
||||
@@ -492,14 +571,18 @@ function gg_sum_open_llama_7b_v2 {
|
||||
## main
|
||||
|
||||
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||
# Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt
|
||||
rm -rf ${SRC}/models-mnt
|
||||
|
||||
mnt_models=${MNT}/models
|
||||
mkdir -p ${mnt_models}
|
||||
ln -sfn ${mnt_models} ${SRC}/models-mnt
|
||||
|
||||
python3 -m pip install -r ${SRC}/requirements.txt
|
||||
python3 -m pip install --editable gguf-py
|
||||
# Create a fresh python3 venv and enter it
|
||||
python3 -m venv "$MNT/venv"
|
||||
source "$MNT/venv/bin/activate"
|
||||
|
||||
pip install -r ${SRC}/requirements.txt --disable-pip-version-check
|
||||
pip install --editable gguf-py --disable-pip-version-check
|
||||
fi
|
||||
|
||||
ret=0
|
||||
@@ -514,6 +597,8 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||
else
|
||||
test $ret -eq 0 && gg_run open_llama_7b_v2
|
||||
fi
|
||||
test $ret -eq 0 && gg_run ctest_with_model_debug
|
||||
test $ret -eq 0 && gg_run ctest_with_model_release
|
||||
fi
|
||||
fi
|
||||
|
||||
|
||||
@@ -42,6 +42,10 @@
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL))
|
||||
#define GGML_USE_CUBLAS_SYCL
|
||||
#endif
|
||||
|
||||
int32_t get_num_physical_cores() {
|
||||
#ifdef __linux__
|
||||
// enumerate the set of thread siblings, num entries is num cores
|
||||
@@ -167,6 +171,24 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
if (params.n_threads_batch <= 0) {
|
||||
params.n_threads_batch = std::thread::hardware_concurrency();
|
||||
}
|
||||
} else if (arg == "-td" || arg == "--threads-draft") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_threads_draft = std::stoi(argv[i]);
|
||||
if (params.n_threads_draft <= 0) {
|
||||
params.n_threads_draft = std::thread::hardware_concurrency();
|
||||
}
|
||||
} else if (arg == "-tbd" || arg == "--threads-batch-draft") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_threads_batch_draft = std::stoi(argv[i]);
|
||||
if (params.n_threads_batch_draft <= 0) {
|
||||
params.n_threads_batch_draft = std::thread::hardware_concurrency();
|
||||
}
|
||||
} else if (arg == "-p" || arg == "--prompt") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -185,6 +207,23 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
params.prompt_cache_all = true;
|
||||
} else if (arg == "--prompt-cache-ro") {
|
||||
params.prompt_cache_ro = true;
|
||||
} else if (arg == "-bf" || arg == "--binary-file") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
std::ifstream file(argv[i], std::ios::binary);
|
||||
if (!file) {
|
||||
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
// store the external file name in params
|
||||
params.prompt_file = argv[i];
|
||||
std::ostringstream ss;
|
||||
ss << file.rdbuf();
|
||||
params.prompt = ss.str();
|
||||
fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), argv[i]);
|
||||
} else if (arg == "-f" || arg == "--file") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -220,6 +259,20 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
break;
|
||||
}
|
||||
params.n_ctx = std::stoi(argv[i]);
|
||||
} else if (arg == "--grp-attn-n" || arg == "-gan") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
|
||||
params.grp_attn_n = std::stoi(argv[i]);
|
||||
} else if (arg == "--grp-attn-w" || arg == "-gaw") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
|
||||
params.grp_attn_w = std::stoi(argv[i]);
|
||||
} else if (arg == "--rope-freq-base") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -529,9 +582,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
params.n_gpu_layers = std::stoi(argv[i]);
|
||||
#else
|
||||
#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
|
||||
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
||||
#endif
|
||||
@@ -540,9 +592,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
params.n_gpu_layers_draft = std::stoi(argv[i]);
|
||||
#else
|
||||
#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
|
||||
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
||||
#endif
|
||||
@@ -551,25 +602,45 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
params.main_gpu = std::stoi(argv[i]);
|
||||
#else
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
|
||||
#endif
|
||||
#ifndef GGML_USE_CUBLAS_SYCL
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n");
|
||||
#endif // GGML_USE_CUBLAS_SYCL
|
||||
} else if (arg == "--split-mode" || arg == "-sm") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
std::string arg_next = argv[i];
|
||||
if (arg_next == "none") {
|
||||
params.split_mode = LLAMA_SPLIT_NONE;
|
||||
} else if (arg_next == "layer") {
|
||||
params.split_mode = LLAMA_SPLIT_LAYER;
|
||||
} else if (arg_next == "row") {
|
||||
params.split_mode = LLAMA_SPLIT_ROW;
|
||||
} else {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
#ifndef GGML_USE_CUBLAS_SYCL
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n");
|
||||
#endif // GGML_USE_CUBLAS_SYCL
|
||||
|
||||
} else if (arg == "--tensor-split" || arg == "-ts") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
std::string arg_next = argv[i];
|
||||
|
||||
// split string by , and /
|
||||
const std::regex regex{R"([,/]+)"};
|
||||
std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
|
||||
std::vector<std::string> split_arg{it, {}};
|
||||
GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
|
||||
|
||||
if (split_arg.size() >= LLAMA_MAX_DEVICES) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
|
||||
if (i < split_arg.size()) {
|
||||
params.tensor_split[i] = std::stof(split_arg[i]);
|
||||
@@ -577,21 +648,17 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
params.tensor_split[i] = 0.0f;
|
||||
}
|
||||
}
|
||||
#else
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
|
||||
#endif // GGML_USE_CUBLAS
|
||||
} else if (arg == "--no-mul-mat-q" || arg == "-nommq") {
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
params.mul_mat_q = false;
|
||||
#else
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n");
|
||||
#endif // GGML_USE_CUBLAS
|
||||
#ifndef GGML_USE_CUBLAS_SYCL
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting a tensor split has no effect.\n");
|
||||
#endif // GGML_USE_CUBLAS_SYCL
|
||||
} else if (arg == "--no-mmap") {
|
||||
params.use_mmap = false;
|
||||
} else if (arg == "--numa") {
|
||||
params.numa = true;
|
||||
} else if (arg == "--verbose-prompt") {
|
||||
params.verbose_prompt = true;
|
||||
} else if (arg == "--no-display-prompt") {
|
||||
params.display_prompt = false;
|
||||
} else if (arg == "-r" || arg == "--reverse-prompt") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -608,6 +675,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
if (params.logdir.back() != DIRECTORY_SEPARATOR) {
|
||||
params.logdir += DIRECTORY_SEPARATOR;
|
||||
}
|
||||
} else if (arg == "--save-all-logits" || arg == "--kl-divergence-base") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.logits_file = argv[i];
|
||||
} else if (arg == "--perplexity" || arg == "--all-logits") {
|
||||
params.logits_all = true;
|
||||
} else if (arg == "--ppl-stride") {
|
||||
@@ -616,6 +689,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
break;
|
||||
}
|
||||
params.ppl_stride = std::stoi(argv[i]);
|
||||
} else if (arg == "-ptc" || arg == "--print-token-count") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_print = std::stoi(argv[i]);
|
||||
} else if (arg == "--ppl-output-type") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -630,6 +709,24 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
break;
|
||||
}
|
||||
params.hellaswag_tasks = std::stoi(argv[i]);
|
||||
} else if (arg == "--winogrande") {
|
||||
params.winogrande = true;
|
||||
} else if (arg == "--winogrande-tasks") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.winogrande_tasks = std::stoi(argv[i]);
|
||||
} else if (arg == "--multiple-choice") {
|
||||
params.multiple_choice = true;
|
||||
} else if (arg == "--multiple-choice-tasks") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.multiple_choice_tasks = std::stoi(argv[i]);
|
||||
} else if (arg == "--kl-divergence") {
|
||||
params.kl_divergence = true;
|
||||
} else if (arg == "--ignore-eos") {
|
||||
params.ignore_eos = true;
|
||||
} else if (arg == "--no-penalize-nl") {
|
||||
@@ -798,7 +895,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
printf("\n");
|
||||
printf("options:\n");
|
||||
printf(" -h, --help show this help message and exit\n");
|
||||
printf(" --version show version and build info\n");
|
||||
printf(" --version show version and build info\n");
|
||||
printf(" -i, --interactive run in interactive mode\n");
|
||||
printf(" --interactive-first run in interactive mode and wait for input right away\n");
|
||||
printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n");
|
||||
@@ -812,6 +909,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads);
|
||||
printf(" -tb N, --threads-batch N\n");
|
||||
printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n");
|
||||
printf(" -td N, --threads-draft N");
|
||||
printf(" number of threads to use during generation (default: same as --threads)");
|
||||
printf(" -tbd N, --threads-batch-draft N\n");
|
||||
printf(" number of threads to use during batch and prompt processing (default: same as --threads-draft)\n");
|
||||
printf(" -p PROMPT, --prompt PROMPT\n");
|
||||
printf(" prompt to start generation with (default: empty)\n");
|
||||
printf(" -e, --escape process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
|
||||
@@ -825,6 +926,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
printf(" --in-suffix STRING string to suffix after user inputs with (default: empty)\n");
|
||||
printf(" -f FNAME, --file FNAME\n");
|
||||
printf(" prompt file to start generation.\n");
|
||||
printf(" -bf FNAME, --binary-file FNAME\n");
|
||||
printf(" binary file containing multiple choice tasks.\n");
|
||||
printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
|
||||
printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
|
||||
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||
@@ -871,6 +974,11 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
printf(" --logits-all return logits for all tokens in the batch (default: disabled)\n");
|
||||
printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
|
||||
printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
|
||||
printf(" --winogrande compute Winogrande score over random tasks from datafile supplied with -f\n");
|
||||
printf(" --winogrande-tasks N number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks);
|
||||
printf(" --multiple-choice compute multiple choice score over random tasks from datafile supplied with -f\n");
|
||||
printf(" --multiple-choice-tasks N number of tasks to use when computing the multiple choice score (default: %zu)\n", params.winogrande_tasks);
|
||||
printf(" --kl-divergence computes KL-divergence to logits provided via --kl-divergence-base");
|
||||
printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
|
||||
printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
|
||||
printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
|
||||
@@ -895,16 +1003,22 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
printf(" number of layers to store in VRAM\n");
|
||||
printf(" -ngld N, --n-gpu-layers-draft N\n");
|
||||
printf(" number of layers to store in VRAM for the draft model\n");
|
||||
printf(" -ts SPLIT --tensor-split SPLIT\n");
|
||||
printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
|
||||
printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
printf(" -nommq, --no-mul-mat-q\n");
|
||||
printf(" use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
|
||||
printf(" Not recommended since this is both slower and uses more VRAM.\n");
|
||||
#endif // GGML_USE_CUBLAS
|
||||
#endif
|
||||
printf(" --verbose-prompt print prompt before generation\n");
|
||||
printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
|
||||
printf(" how to split the model across multiple GPUs, one of:\n");
|
||||
printf(" - none: use one GPU only\n");
|
||||
printf(" - layer (default): split layers and KV across GPUs\n");
|
||||
printf(" - row: split rows across GPUs\n");
|
||||
printf(" -ts SPLIT, --tensor-split SPLIT\n");
|
||||
printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
|
||||
printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
|
||||
printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
|
||||
#endif // LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
|
||||
printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
|
||||
printf(" -gan N, --grp-attn-n N\n");
|
||||
printf(" group-attention factor (default: %d)\n", params.grp_attn_n);
|
||||
printf(" -gaw N, --grp-attn-w N\n");
|
||||
printf(" group-attention width (default: %.1f)\n", (double)params.grp_attn_w);
|
||||
printf(" -dkvc, --dump-kv-cache\n");
|
||||
printf(" verbose print of the KV cache\n");
|
||||
printf(" -nkvo, --no-kv-offload\n");
|
||||
@@ -926,6 +1040,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
printf(" --override-kv KEY=TYPE:VALUE\n");
|
||||
printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
|
||||
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
|
||||
printf(" -ptc N, --print-token-count N\n");
|
||||
printf(" print token count every N tokens (default: %d)\n", params.n_print);
|
||||
printf("\n");
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
log_print_usage();
|
||||
@@ -1015,6 +1131,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
|
||||
mparams.n_gpu_layers = params.n_gpu_layers;
|
||||
}
|
||||
mparams.main_gpu = params.main_gpu;
|
||||
mparams.split_mode = params.split_mode;
|
||||
mparams.tensor_split = params.tensor_split;
|
||||
mparams.use_mmap = params.use_mmap;
|
||||
mparams.use_mlock = params.use_mlock;
|
||||
@@ -1029,6 +1146,9 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
|
||||
}
|
||||
|
||||
static ggml_type kv_cache_type_from_str(const std::string & s) {
|
||||
if (s == "f32") {
|
||||
return GGML_TYPE_F32;
|
||||
}
|
||||
if (s == "f16") {
|
||||
return GGML_TYPE_F16;
|
||||
}
|
||||
@@ -1399,7 +1519,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
||||
fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
|
||||
@@ -1540,6 +1659,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
||||
fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
|
||||
fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
|
||||
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
|
||||
fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
@@ -46,7 +46,9 @@ struct gpt_params {
|
||||
uint32_t seed = -1; // RNG seed
|
||||
|
||||
int32_t n_threads = get_num_physical_cores();
|
||||
int32_t n_threads_draft = -1;
|
||||
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
|
||||
int32_t n_threads_batch_draft = -1;
|
||||
int32_t n_predict = -1; // new tokens to predict
|
||||
int32_t n_ctx = 512; // context size
|
||||
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
|
||||
@@ -59,9 +61,13 @@ struct gpt_params {
|
||||
float p_split = 0.1f; // speculative decoding split probability
|
||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
|
||||
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
|
||||
llama_split_mode split_mode = LLAMA_SPLIT_LAYER; // how to split the model across GPUs
|
||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
|
||||
int32_t n_beams = 0; // if non-zero then use beam search of given width.
|
||||
int32_t grp_attn_n = 1; // group-attention factor
|
||||
int32_t grp_attn_w = 512; // group-attention width
|
||||
int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
|
||||
float rope_freq_base = 0.0f; // RoPE base frequency
|
||||
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
|
||||
float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
|
||||
@@ -85,6 +91,7 @@ struct gpt_params {
|
||||
std::string input_suffix = ""; // string to suffix user inputs with
|
||||
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
|
||||
std::string logdir = ""; // directory in which to save YAML log files
|
||||
std::string logits_file = ""; // file for saving *all* logits
|
||||
|
||||
std::vector<llama_model_kv_override> kv_overrides;
|
||||
|
||||
@@ -99,6 +106,14 @@ struct gpt_params {
|
||||
bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
|
||||
size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
|
||||
|
||||
bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
|
||||
size_t winogrande_tasks= 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
|
||||
|
||||
bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
|
||||
size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
|
||||
|
||||
bool kl_divergence = false; // compute KL-divergence
|
||||
|
||||
bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
|
||||
bool random_prompt = false; // do not randomize prompt if none provided
|
||||
bool use_color = false; // use color to distinguish generations and inputs
|
||||
@@ -122,6 +137,7 @@ struct gpt_params {
|
||||
bool use_mlock = false; // use mlock to keep model in memory
|
||||
bool numa = false; // attempt optimizations that help on some NUMA systems
|
||||
bool verbose_prompt = false; // print prompt tokens before generation
|
||||
bool display_prompt = true; // print prompt before generation
|
||||
bool infill = false; // use infill mode
|
||||
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
|
||||
bool no_kv_offload = false; // disable KV offloading
|
||||
@@ -240,4 +256,3 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);
|
||||
|
||||
// Dump the KV cache view showing individual sequences in each cell (long output).
|
||||
void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
|
||||
|
||||
|
||||
@@ -13,6 +13,7 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
|
||||
// will be empty (default) if there are parse errors
|
||||
if (result->parsed_grammar.rules.empty()) {
|
||||
fprintf(stderr, "%s: failed to parse grammar\n", __func__);
|
||||
delete result;
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
@@ -129,6 +130,8 @@ static void sampler_queue(
|
||||
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
|
||||
|
||||
const float temp = params.temp;
|
||||
const float dynatemp_range = params.dynatemp_range;
|
||||
const float dynatemp_exponent = params.dynatemp_exponent;
|
||||
const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
|
||||
const float top_p = params.top_p;
|
||||
const float min_p = params.min_p;
|
||||
@@ -143,7 +146,15 @@ static void sampler_queue(
|
||||
case 'y': llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break;
|
||||
case 'p': llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break;
|
||||
case 'm': llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break;
|
||||
case 't': llama_sample_temp (ctx_main, &cur_p, temp); break;
|
||||
case 't':
|
||||
if (dynatemp_range > 0) {
|
||||
float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
|
||||
float dynatemp_max = std::max(0.0f, temp + dynatemp_range);
|
||||
llama_sample_entropy(ctx_main, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent);
|
||||
} else {
|
||||
llama_sample_temp(ctx_main, &cur_p, temp);
|
||||
}
|
||||
break;
|
||||
default : break;
|
||||
}
|
||||
}
|
||||
@@ -190,6 +201,11 @@ static llama_token llama_sampling_sample_impl(
|
||||
logits[it->first] += it->second;
|
||||
}
|
||||
|
||||
if (ctx_cfg) {
|
||||
float * logits_guidance = llama_get_logits_ith(ctx_cfg, idx);
|
||||
llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
|
||||
}
|
||||
|
||||
cur.clear();
|
||||
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||
@@ -198,10 +214,6 @@ static llama_token llama_sampling_sample_impl(
|
||||
|
||||
llama_token_data_array cur_p = { cur.data(), cur.size(), false };
|
||||
|
||||
if (ctx_cfg) {
|
||||
llama_sample_classifier_free_guidance(ctx_main, &cur_p, ctx_cfg, params.cfg_scale);
|
||||
}
|
||||
|
||||
// apply penalties
|
||||
const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev;
|
||||
const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n);
|
||||
|
||||
@@ -17,7 +17,9 @@ typedef struct llama_sampling_params {
|
||||
float min_p = 0.05f; // 0.0 = disabled
|
||||
float tfs_z = 1.00f; // 1.0 = disabled
|
||||
float typical_p = 1.00f; // 1.0 = disabled
|
||||
float temp = 0.80f; // 1.0 = disabled
|
||||
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
|
||||
float dynatemp_range = 0.00f; // 0.0 = disabled
|
||||
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
|
||||
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
||||
float penalty_repeat = 1.10f; // 1.0 = disabled
|
||||
float penalty_freq = 0.00f; // 0.0 = disabled
|
||||
|
||||
@@ -10,7 +10,7 @@ import re
|
||||
import sys
|
||||
from enum import IntEnum
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast, Optional
|
||||
from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -23,6 +23,15 @@ if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
import gguf
|
||||
|
||||
|
||||
# check for any of the given keys in the dictionary and return the value of the first key found
|
||||
def get_key_opts(d, keys):
|
||||
for k in keys:
|
||||
if k in d:
|
||||
return d[k]
|
||||
print(f"Could not find any of {keys}")
|
||||
sys.exit()
|
||||
|
||||
|
||||
###### MODEL DEFINITIONS ######
|
||||
|
||||
class SentencePieceTokenTypes(IntEnum):
|
||||
@@ -180,6 +189,8 @@ class Model:
|
||||
return StableLMModel
|
||||
if model_architecture == "QWenLMHeadModel":
|
||||
return QwenModel
|
||||
if model_architecture == "Qwen2ForCausalLM":
|
||||
return Model
|
||||
if model_architecture == "MixtralForCausalLM":
|
||||
return MixtralModel
|
||||
if model_architecture == "GPT2LMHeadModel":
|
||||
@@ -188,6 +199,10 @@ class Model:
|
||||
return Phi2Model
|
||||
if model_architecture == "PlamoForCausalLM":
|
||||
return PlamoModel
|
||||
if model_architecture == "CodeShellForCausalLM":
|
||||
return CodeShellModel
|
||||
if model_architecture == "OrionForCausalLM":
|
||||
return OrionModel
|
||||
return Model
|
||||
|
||||
def _is_model_safetensors(self) -> bool:
|
||||
@@ -225,6 +240,8 @@ class Model:
|
||||
return gguf.MODEL_ARCH.STABLELM
|
||||
if arch == "QWenLMHeadModel":
|
||||
return gguf.MODEL_ARCH.QWEN
|
||||
if arch == "Qwen2ForCausalLM":
|
||||
return gguf.MODEL_ARCH.QWEN2
|
||||
if arch == "MixtralForCausalLM":
|
||||
return gguf.MODEL_ARCH.LLAMA
|
||||
if arch == "GPT2LMHeadModel":
|
||||
@@ -233,6 +250,10 @@ class Model:
|
||||
return gguf.MODEL_ARCH.PHI2
|
||||
if arch == "PlamoForCausalLM":
|
||||
return gguf.MODEL_ARCH.PLAMO
|
||||
if arch == "CodeShellForCausalLM":
|
||||
return gguf.MODEL_ARCH.CODESHELL
|
||||
if arch == "OrionForCausalLM":
|
||||
return gguf.MODEL_ARCH.ORION
|
||||
|
||||
raise NotImplementedError(f'Architecture "{arch}" not supported!')
|
||||
|
||||
@@ -272,6 +293,58 @@ class Model:
|
||||
special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
def _set_vocab_qwen(self):
|
||||
dir_model = self.dir_model
|
||||
hparams = self.hparams
|
||||
tokens: list[bytearray] = []
|
||||
toktypes: list[int] = []
|
||||
|
||||
from transformers import AutoTokenizer
|
||||
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
|
||||
vocab_size = hparams["vocab_size"]
|
||||
assert max(tokenizer.get_vocab().values()) < vocab_size
|
||||
|
||||
merges = []
|
||||
vocab = {}
|
||||
mergeable_ranks = tokenizer.mergeable_ranks
|
||||
for token, rank in mergeable_ranks.items():
|
||||
vocab[QwenModel.token_bytes_to_string(token)] = rank
|
||||
if len(token) == 1:
|
||||
continue
|
||||
merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
|
||||
assert len(merged) == 2
|
||||
merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
|
||||
|
||||
# for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
|
||||
added_vocab = tokenizer.special_tokens
|
||||
reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in (vocab | added_vocab).items()}
|
||||
|
||||
for i in range(vocab_size):
|
||||
if i not in reverse_vocab:
|
||||
pad_token = f"[PAD{i}]".encode("utf-8")
|
||||
tokens.append(bytearray(pad_token))
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
elif reverse_vocab[i] in added_vocab:
|
||||
tokens.append(reverse_vocab[i])
|
||||
toktypes.append(gguf.TokenType.CONTROL)
|
||||
else:
|
||||
tokens.append(reverse_vocab[i])
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
|
||||
self.gguf_writer.add_tokenizer_model("gpt2")
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
|
||||
special_vocab.merges = merges
|
||||
# only add special tokens when they were not already loaded from config.json
|
||||
if len(special_vocab.special_token_ids) == 0:
|
||||
special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
|
||||
special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
|
||||
# this one is usually not in config.json anyway
|
||||
special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
def _set_vocab_sentencepiece(self):
|
||||
from sentencepiece import SentencePieceProcessor
|
||||
|
||||
@@ -470,7 +543,8 @@ class MPTModel(Model):
|
||||
# map tensor names
|
||||
if "scales" in name:
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales"))
|
||||
new_name = new_name.replace("scales", "act.scales")
|
||||
if new_name is not None:
|
||||
new_name = new_name.replace("scales", "act.scales")
|
||||
else:
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||
if new_name is None:
|
||||
@@ -502,6 +576,83 @@ class MPTModel(Model):
|
||||
self.gguf_writer.add_tensor("output.weight", data)
|
||||
|
||||
|
||||
class OrionModel(Model):
|
||||
def set_vocab(self):
|
||||
self._set_vocab_sentencepiece()
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
block_count = self.hparams["num_hidden_layers"]
|
||||
head_count = self.hparams["num_attention_heads"]
|
||||
head_count_kv = self.hparams.get("num_key_value_heads", head_count)
|
||||
hf_repo = self.hparams.get("_name_or_path", "")
|
||||
|
||||
ctx_length = 0
|
||||
if "max_sequence_length" in self.hparams:
|
||||
ctx_length = self.hparams["max_sequence_length"]
|
||||
elif "max_position_embeddings" in self.hparams:
|
||||
ctx_length = self.hparams["max_position_embeddings"]
|
||||
elif "model_max_length" in self.hparams:
|
||||
ctx_length = self.hparams["model_max_length"]
|
||||
else:
|
||||
print("gguf: can not find ctx length parameter.")
|
||||
sys.exit()
|
||||
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
self.gguf_writer.add_name(self.dir_model.name)
|
||||
self.gguf_writer.add_source_hf_repo(hf_repo)
|
||||
self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
|
||||
self.gguf_writer.add_context_length(ctx_length)
|
||||
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
|
||||
self.gguf_writer.add_block_count(block_count)
|
||||
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
|
||||
self.gguf_writer.add_head_count(head_count)
|
||||
self.gguf_writer.add_head_count_kv(head_count_kv)
|
||||
self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"])
|
||||
|
||||
def write_tensors(self):
|
||||
# Collect tensors from generator object
|
||||
model_kv = dict(self.get_tensors())
|
||||
block_count = self.hparams["num_hidden_layers"]
|
||||
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
|
||||
|
||||
for name, data_torch in model_kv.items():
|
||||
# we don't need these
|
||||
if name.endswith(".rotary_emb.inv_freq"):
|
||||
continue
|
||||
|
||||
old_dtype = data_torch.dtype
|
||||
|
||||
# convert any unsupported data types to float32
|
||||
if data_torch.dtype not in (torch.float16, torch.float32):
|
||||
data_torch = data_torch.to(torch.float32)
|
||||
|
||||
data = data_torch.squeeze().numpy()
|
||||
|
||||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print(f"Can not map tensor {name!r}")
|
||||
sys.exit()
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
||||
# if f32 desired, convert any float16 to float32
|
||||
if self.ftype == 0 and data_dtype == np.float16:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
|
||||
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
self.gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
|
||||
class BaichuanModel(Model):
|
||||
def set_vocab(self):
|
||||
self._set_vocab_sentencepiece()
|
||||
@@ -817,10 +968,17 @@ class PersimmonModel(Model):
|
||||
hidden_size = self.hparams["hidden_size"]
|
||||
|
||||
self.gguf_writer.add_name('persimmon-8b-chat')
|
||||
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
|
||||
self.gguf_writer.add_embedding_length(hidden_size)
|
||||
self.gguf_writer.add_block_count(block_count)
|
||||
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
|
||||
self.gguf_writer.add_rope_dimension_count(hidden_size // head_count)
|
||||
|
||||
# NOTE: not sure about this change - why does the model not have a rope dimension count when it is smaller
|
||||
# than the head size?
|
||||
# ref: https://github.com/ggerganov/llama.cpp/pull/4889
|
||||
# self.gguf_writer.add_rope_dimension_count(hidden_size // head_count)
|
||||
self.gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2)
|
||||
|
||||
self.gguf_writer.add_head_count(head_count)
|
||||
self.gguf_writer.add_head_count_kv(head_count_kv)
|
||||
self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
|
||||
@@ -852,6 +1010,13 @@ class PersimmonModel(Model):
|
||||
|
||||
|
||||
class StableLMModel(Model):
|
||||
def set_vocab(self):
|
||||
if (self.dir_model / "tokenizer.json").is_file():
|
||||
self._set_vocab_gpt2()
|
||||
else:
|
||||
# StableLM 2 1.6B uses a vocab in a similar format to Qwen's vocab
|
||||
self._set_vocab_qwen()
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
hparams = self.hparams
|
||||
block_count = hparams["num_hidden_layers"]
|
||||
@@ -880,7 +1045,7 @@ class QwenModel(Model):
|
||||
return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
|
||||
|
||||
@staticmethod
|
||||
def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: Optional[int] = None) -> list[bytes]:
|
||||
def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
|
||||
parts = [bytes([b]) for b in token]
|
||||
while True:
|
||||
min_idx = None
|
||||
@@ -897,52 +1062,7 @@ class QwenModel(Model):
|
||||
return parts
|
||||
|
||||
def set_vocab(self):
|
||||
dir_model = self.dir_model
|
||||
hparams = self.hparams
|
||||
tokens: list[bytearray] = []
|
||||
toktypes: list[int] = []
|
||||
|
||||
from transformers import AutoTokenizer
|
||||
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
|
||||
vocab_size = hparams["vocab_size"]
|
||||
assert max(tokenizer.get_vocab().values()) < vocab_size
|
||||
|
||||
merges = []
|
||||
vocab = {}
|
||||
mergeable_ranks = tokenizer.mergeable_ranks
|
||||
for token, rank in mergeable_ranks.items():
|
||||
vocab[self.token_bytes_to_string(token)] = rank
|
||||
if len(token) == 1:
|
||||
continue
|
||||
merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
|
||||
assert len(merged) == 2
|
||||
merges.append(' '.join(map(self.token_bytes_to_string, merged)))
|
||||
|
||||
reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in vocab.items()}
|
||||
added_vocab = tokenizer.special_tokens
|
||||
|
||||
for i in range(vocab_size):
|
||||
if i not in reverse_vocab:
|
||||
pad_token = f"[PAD{i}]".encode("utf-8")
|
||||
tokens.append(bytearray(pad_token))
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
elif reverse_vocab[i] in added_vocab:
|
||||
tokens.append(reverse_vocab[i])
|
||||
toktypes.append(gguf.TokenType.CONTROL)
|
||||
else:
|
||||
tokens.append(reverse_vocab[i])
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
|
||||
self.gguf_writer.add_tokenizer_model("gpt2")
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
|
||||
special_vocab.merges = merges
|
||||
special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
|
||||
special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
|
||||
special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
self._set_vocab_qwen()
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
self.gguf_writer.add_name("Qwen")
|
||||
@@ -1061,17 +1181,22 @@ class GPT2Model(Model):
|
||||
|
||||
class Phi2Model(Model):
|
||||
def set_gguf_parameters(self):
|
||||
block_count = self.hparams["n_layer"]
|
||||
block_count = get_key_opts(self.hparams, ["num_hidden_layers", "n_layer"])
|
||||
|
||||
rot_pct = get_key_opts(self.hparams, ["partial_rotary_factor"])
|
||||
n_embd = get_key_opts(self.hparams, ["hidden_size", "n_embd"])
|
||||
n_head = get_key_opts(self.hparams, ["num_attention_heads", "n_head"])
|
||||
|
||||
self.gguf_writer.add_name("Phi2")
|
||||
self.gguf_writer.add_context_length(self.hparams["n_positions"])
|
||||
self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
|
||||
self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
|
||||
self.gguf_writer.add_context_length(get_key_opts(self.hparams, ["n_positions", "max_position_embeddings"]))
|
||||
|
||||
self.gguf_writer.add_embedding_length(n_embd)
|
||||
self.gguf_writer.add_feed_forward_length(4 * n_embd)
|
||||
self.gguf_writer.add_block_count(block_count)
|
||||
self.gguf_writer.add_head_count(self.hparams["n_head"])
|
||||
self.gguf_writer.add_head_count_kv(self.hparams["n_head"])
|
||||
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
|
||||
self.gguf_writer.add_rope_dimension_count(self.hparams["rotary_dim"])
|
||||
self.gguf_writer.add_head_count(n_head)
|
||||
self.gguf_writer.add_head_count_kv(n_head)
|
||||
self.gguf_writer.add_layer_norm_eps(get_key_opts(self.hparams, ["layer_norm_epsilon", "layer_norm_eps"]))
|
||||
self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
self.gguf_writer.add_add_bos_token(False)
|
||||
|
||||
@@ -1155,6 +1280,70 @@ class PlamoModel(Model):
|
||||
self.gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
|
||||
class CodeShellModel(Model):
|
||||
def set_gguf_parameters(self):
|
||||
block_count = self.hparams["n_layer"]
|
||||
|
||||
self.gguf_writer.add_name("CodeShell")
|
||||
self.gguf_writer.add_context_length(self.hparams["n_positions"])
|
||||
self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
|
||||
self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
|
||||
self.gguf_writer.add_block_count(block_count)
|
||||
self.gguf_writer.add_head_count(self.hparams["n_head"])
|
||||
self.gguf_writer.add_head_count_kv(self.hparams["num_query_groups"])
|
||||
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
self.gguf_writer.add_rope_freq_base(10000.0)
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(1.0)
|
||||
|
||||
def write_tensors(self):
|
||||
block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
|
||||
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
|
||||
tensors = dict(self.get_tensors())
|
||||
has_lm_head = "lm_head.weight" in tensors.keys() or "output.weight" in tensors.keys()
|
||||
for name, data_torch in tensors.items():
|
||||
# we don't need these
|
||||
if name.endswith((".attn.rotary_emb.inv_freq")):
|
||||
continue
|
||||
|
||||
old_dtype = data_torch.dtype
|
||||
|
||||
# convert any unsupported data types to float32
|
||||
if data_torch.dtype not in (torch.float16, torch.float32):
|
||||
data_torch = data_torch.to(torch.float32)
|
||||
|
||||
data = data_torch.squeeze().numpy()
|
||||
|
||||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print(f"Can not map tensor {name!r}")
|
||||
sys.exit()
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
||||
# if f32 desired, convert any float16 to float32
|
||||
if self.ftype == 0 and data_dtype == np.float16:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
|
||||
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
|
||||
self.gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
if not has_lm_head and name == "transformer.wte.weight":
|
||||
self.gguf_writer.add_tensor("output.weight", data)
|
||||
print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")
|
||||
|
||||
###### CONVERSION LOGIC ######
|
||||
|
||||
|
||||
@@ -1192,7 +1381,7 @@ def main() -> None:
|
||||
|
||||
if args.awq_path:
|
||||
sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
|
||||
from awq.apply_awq import add_scale_weights
|
||||
from awq.apply_awq import add_scale_weights # type: ignore[import-not-found]
|
||||
tmp_model_path = args.model / "weighted_model"
|
||||
dir_model = tmp_model_path
|
||||
if tmp_model_path.is_dir():
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import struct
|
||||
import sys
|
||||
from enum import IntEnum
|
||||
@@ -9,7 +10,6 @@ from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
|
||||
import os
|
||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
|
||||
import gguf
|
||||
@@ -371,15 +371,11 @@ def handle_metadata(cfg, hp):
|
||||
params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
|
||||
else:
|
||||
raise ValueError('Unable to load metadata')
|
||||
vocab = convert.load_vocab(
|
||||
cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir,
|
||||
cfg.vocabtype)
|
||||
# FIXME: Respect cfg.vocab_dir?
|
||||
svocab = gguf.SpecialVocab(cfg.model_metadata_dir,
|
||||
load_merges = cfg.vocabtype == 'bpe',
|
||||
n_vocab = vocab.vocab_size)
|
||||
vocab_path = Path(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir)
|
||||
vocab_factory = convert.VocabFactory(vocab_path)
|
||||
vocab, special_vocab = vocab_factory.load_vocab(cfg.vocabtype, cfg.model_metadata_dir)
|
||||
convert.check_vocab_size(params, vocab)
|
||||
return (params, vocab, svocab)
|
||||
return params, vocab, special_vocab
|
||||
|
||||
|
||||
def handle_args():
|
||||
|
||||
@@ -5,17 +5,16 @@ import json
|
||||
import os
|
||||
import struct
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any, BinaryIO, Sequence
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from pathlib import Path
|
||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
|
||||
import gguf
|
||||
|
||||
|
||||
NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}
|
||||
|
||||
|
||||
@@ -60,7 +59,14 @@ if __name__ == '__main__':
|
||||
input_model = os.path.join(sys.argv[1], "adapter_model.bin")
|
||||
output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")
|
||||
|
||||
model = torch.load(input_model, map_location="cpu")
|
||||
if os.path.exists(input_model):
|
||||
model = torch.load(input_model, map_location="cpu")
|
||||
else:
|
||||
input_model = os.path.join(sys.argv[1], "adapter_model.safetensors")
|
||||
# lazy import load_file only if lora is in safetensors format.
|
||||
from safetensors.torch import load_file
|
||||
model = load_file(input_model, device="cpu")
|
||||
|
||||
arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"
|
||||
|
||||
if arch_name not in gguf.MODEL_ARCH_NAMES.values():
|
||||
|
||||
@@ -1,11 +1,13 @@
|
||||
#!/usr/bin/env python3
|
||||
import torch
|
||||
import os
|
||||
from pprint import pprint
|
||||
import sys
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from pprint import pprint
|
||||
|
||||
import torch
|
||||
from sentencepiece import SentencePieceProcessor
|
||||
|
||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
|
||||
import gguf
|
||||
@@ -69,7 +71,7 @@ def main():
|
||||
persimmon_model = torch.load(args.ckpt_path)
|
||||
hparams = persimmon_model['args']
|
||||
pprint(hparams)
|
||||
tensors = {}
|
||||
tensors: dict[str, torch.Tensor] = {}
|
||||
_flatten_dict(persimmon_model['model'], tensors, None)
|
||||
|
||||
arch = gguf.MODEL_ARCH.PERSIMMON
|
||||
|
||||
493
convert.py
@@ -19,11 +19,10 @@ import sys
|
||||
import time
|
||||
import zipfile
|
||||
from abc import ABCMeta, abstractmethod
|
||||
from collections import OrderedDict
|
||||
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, Optional, TypeVar, cast
|
||||
from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar
|
||||
|
||||
import numpy as np
|
||||
from sentencepiece import SentencePieceProcessor
|
||||
@@ -328,139 +327,229 @@ class Params:
|
||||
return params
|
||||
|
||||
|
||||
class VocabLoader:
|
||||
def __init__(self, params: Params, fname_tokenizer: Path) -> None:
|
||||
#
|
||||
# vocab
|
||||
#
|
||||
|
||||
class BpeVocab:
|
||||
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
|
||||
self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
|
||||
try:
|
||||
self.vocab = self.bpe_tokenizer["model"]["vocab"]
|
||||
except KeyError:
|
||||
self.vocab = self.bpe_tokenizer
|
||||
added_tokens: dict[str, int]
|
||||
if fname_added_tokens is not None:
|
||||
# FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
|
||||
added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
|
||||
else:
|
||||
# Fall back to trying to find the added tokens in tokenizer.json
|
||||
tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json'
|
||||
if not tokenizer_json_file.is_file():
|
||||
added_tokens = {}
|
||||
else:
|
||||
tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8"))
|
||||
added_tokens = dict(
|
||||
(item['content'], item['id'])
|
||||
for item in tokenizer_json.get('added_tokens', [])
|
||||
# Added tokens here can be duplicates of the main vocabulary.
|
||||
if item['content'] not in self.bpe_tokenizer)
|
||||
|
||||
vocab_size: int = len(self.vocab)
|
||||
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
|
||||
actual_ids = sorted(added_tokens.values())
|
||||
if expected_ids != actual_ids:
|
||||
expected_end_id = vocab_size + len(actual_ids) - 1
|
||||
raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}")
|
||||
|
||||
items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
|
||||
self.added_tokens_dict = added_tokens
|
||||
self.added_tokens_list = [text for (text, idx) in items]
|
||||
self.vocab_size_base: int = vocab_size
|
||||
self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
|
||||
self.fname_tokenizer = fname_tokenizer
|
||||
self.fname_added_tokens = fname_added_tokens
|
||||
|
||||
def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
|
||||
|
||||
for i, _ in enumerate(self.vocab):
|
||||
yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL
|
||||
|
||||
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||
for text in self.added_tokens_list:
|
||||
score = -1000.0
|
||||
yield text.encode("utf-8"), score, gguf.TokenType.CONTROL
|
||||
|
||||
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||
yield from self.bpe_tokens()
|
||||
yield from self.added_tokens()
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
|
||||
|
||||
|
||||
class SentencePieceVocab:
|
||||
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
|
||||
self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
|
||||
added_tokens: dict[str, int]
|
||||
if fname_added_tokens is not None:
|
||||
added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
|
||||
else:
|
||||
added_tokens = {}
|
||||
|
||||
vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
|
||||
|
||||
new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
|
||||
expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
|
||||
actual_new_ids = sorted(new_tokens.keys())
|
||||
|
||||
if expected_new_ids != actual_new_ids:
|
||||
raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
|
||||
|
||||
# Token pieces that were added to the base vocabulary.
|
||||
self.added_tokens_dict = added_tokens
|
||||
self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
|
||||
self.vocab_size_base = vocab_size
|
||||
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
|
||||
self.fname_tokenizer = fname_tokenizer
|
||||
self.fname_added_tokens = fname_added_tokens
|
||||
|
||||
def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||
tokenizer = self.sentencepiece_tokenizer
|
||||
for i in range(tokenizer.vocab_size()):
|
||||
piece = tokenizer.id_to_piece(i)
|
||||
text: bytes = piece.encode("utf-8")
|
||||
score: float = tokenizer.get_score(i)
|
||||
|
||||
toktype = gguf.TokenType.NORMAL
|
||||
if tokenizer.is_unknown(i):
|
||||
toktype = gguf.TokenType.UNKNOWN
|
||||
if tokenizer.is_control(i):
|
||||
toktype = gguf.TokenType.CONTROL
|
||||
|
||||
# NOTE: I think added_tokens are user defined.
|
||||
# ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
|
||||
# if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
|
||||
|
||||
if tokenizer.is_unused(i):
|
||||
toktype = gguf.TokenType.UNUSED
|
||||
if tokenizer.is_byte(i):
|
||||
toktype = gguf.TokenType.BYTE
|
||||
|
||||
yield text, score, toktype
|
||||
|
||||
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||
for text in self.added_tokens_list:
|
||||
score = -1000.0
|
||||
yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
|
||||
|
||||
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||
yield from self.sentencepiece_tokens()
|
||||
yield from self.added_tokens()
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
|
||||
|
||||
|
||||
class HfVocab:
|
||||
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None) -> None:
|
||||
try:
|
||||
from transformers import AutoTokenizer
|
||||
except ImportError as e:
|
||||
raise ImportError(
|
||||
"To use VocabLoader, please install the `transformers` package. "
|
||||
"To use HfVocab, please install the `transformers` package. "
|
||||
"You can install it with `pip install transformers`."
|
||||
) from e
|
||||
|
||||
try:
|
||||
self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer), trust_remote_code=True)
|
||||
except ValueError:
|
||||
self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer), use_fast=False, trust_remote_code=True)
|
||||
print("fname_tokenizer:", fname_tokenizer)
|
||||
# Allow the tokenizer to default to slow or fast versions.
|
||||
# Explicitly set tokenizer to use local paths.
|
||||
self.tokenizer = AutoTokenizer.from_pretrained(
|
||||
fname_tokenizer,
|
||||
cache_dir=fname_tokenizer,
|
||||
local_files_only=True,
|
||||
)
|
||||
|
||||
self.added_tokens_dict: OrderedDict[str, int] = OrderedDict()
|
||||
# Initialize lists and dictionaries for added tokens
|
||||
self.added_tokens_list = []
|
||||
self.added_tokens_dict = dict()
|
||||
self.added_tokens_ids = set()
|
||||
|
||||
for tok, tokidx in sorted(self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]):
|
||||
if tokidx >= params.n_vocab or tokidx < self.tokenizer.vocab_size:
|
||||
continue
|
||||
# Process added tokens
|
||||
for tok, tokidx in sorted(
|
||||
self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]
|
||||
):
|
||||
# Only consider added tokens that are not in the base vocabulary
|
||||
if tokidx >= self.tokenizer.vocab_size:
|
||||
self.added_tokens_list.append(tok)
|
||||
self.added_tokens_dict[tok] = tokidx
|
||||
self.added_tokens_ids.add(tokidx)
|
||||
|
||||
self.added_tokens_dict[tok] = tokidx
|
||||
|
||||
self.unk_token_id: int = self.tokenizer.unk_token_id
|
||||
self.specials: dict[str, int] = {
|
||||
# Store special tokens and their IDs
|
||||
self.specials = {
|
||||
tok: self.tokenizer.get_vocab()[tok]
|
||||
for tok in self.tokenizer.all_special_tokens
|
||||
}
|
||||
self.special_ids: set[int] = set(self.tokenizer.all_special_ids)
|
||||
self.reverse_vocab = {id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()}
|
||||
self.vocab_size_base: int = self.tokenizer.vocab_size
|
||||
self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_dict)
|
||||
self.fname_tokenizer: Path = fname_tokenizer
|
||||
self.special_ids = set(self.tokenizer.all_special_ids)
|
||||
|
||||
vocab_file = "tokenizer.model"
|
||||
path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file)
|
||||
if path_candidate is not None:
|
||||
self.spm = SentencePieceProcessor(str(path_candidate))
|
||||
print(self.spm.vocab_size(), self.vocab_size_base)
|
||||
else:
|
||||
self.spm = None
|
||||
# Set vocabulary sizes
|
||||
self.vocab_size_base = self.tokenizer.vocab_size
|
||||
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
|
||||
|
||||
self.fname_tokenizer = fname_tokenizer
|
||||
self.fname_added_tokens = fname_added_tokens
|
||||
|
||||
def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||
added_tokens_ids = set(self.added_tokens_dict.values())
|
||||
reverse_vocab = {
|
||||
id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()
|
||||
}
|
||||
|
||||
for i in range(self.vocab_size_base):
|
||||
if i in added_tokens_ids:
|
||||
for token_id in range(self.vocab_size_base):
|
||||
# Skip processing added tokens here
|
||||
if token_id in self.added_tokens_ids:
|
||||
continue
|
||||
|
||||
text = self.reverse_vocab[i].encode("utf-8")
|
||||
yield text, self.get_token_score(i), self.get_token_type(i)
|
||||
# Convert token text to bytes
|
||||
token_text = reverse_vocab[token_id].encode("utf-8")
|
||||
|
||||
def get_token_type(self, token_id: int) -> gguf.TokenType:
|
||||
toktype = gguf.TokenType.NORMAL
|
||||
# Yield token text, score, and type
|
||||
yield token_text, self.get_token_score(token_id), self.get_token_type(
|
||||
token_id, self.special_ids # Reuse already stored special IDs
|
||||
)
|
||||
|
||||
if self.spm is not None and token_id < self.spm.vocab_size():
|
||||
if self.spm.is_unknown(token_id):
|
||||
toktype = gguf.TokenType.UNKNOWN
|
||||
if self.spm.is_control(token_id):
|
||||
toktype = gguf.TokenType.CONTROL
|
||||
if self.spm.is_unused(token_id):
|
||||
toktype = gguf.TokenType.UNUSED
|
||||
if self.spm.is_byte(token_id):
|
||||
toktype = gguf.TokenType.BYTE
|
||||
else:
|
||||
token = self.reverse_vocab[token_id]
|
||||
if token_id == self.unk_token_id:
|
||||
toktype = gguf.TokenType.UNKNOWN
|
||||
elif token_id in self.special_ids:
|
||||
toktype = gguf.TokenType.CONTROL
|
||||
elif len(token) == 6 and token.startswith("<0x") and token.endswith(">"):
|
||||
toktype = gguf.TokenType.BYTE
|
||||
|
||||
return toktype
|
||||
def get_token_type(self, token_id: int, special_ids: set[int]) -> gguf.TokenType:
|
||||
# Determine token type based on whether it's a special token
|
||||
return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
|
||||
|
||||
def get_token_score(self, token_id: int) -> float:
|
||||
if self.spm is not None and token_id < self.spm.vocab_size():
|
||||
return cast(float, self.spm.get_score(token_id))
|
||||
return 0.0
|
||||
# Placeholder for actual logic to determine the token's score
|
||||
# This needs to be implemented based on specific requirements
|
||||
return -1000.0 # Default score
|
||||
|
||||
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||
|
||||
for text in self.added_tokens_dict:
|
||||
for text in self.added_tokens_list:
|
||||
if text in self.specials:
|
||||
|
||||
toktype = self.get_token_type(self.specials[text])
|
||||
toktype = self.get_token_type(self.specials[text], self.special_ids)
|
||||
score = self.get_token_score(self.specials[text])
|
||||
|
||||
else:
|
||||
toktype = gguf.TokenType.USER_DEFINED
|
||||
score = -1000.0
|
||||
|
||||
yield text.encode("utf-8"), score, toktype
|
||||
|
||||
def has_newline_token(self) -> bool:
|
||||
return '<0x0A>' in self.tokenizer.vocab or '\n' in self.tokenizer.vocab
|
||||
def has_newline_token(self):
|
||||
return "<0x0A>" in self.tokenizer.vocab or "\n" in self.tokenizer.vocab
|
||||
|
||||
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||
yield from self.hf_tokens()
|
||||
yield from self.added_tokens()
|
||||
|
||||
def get_vocab_type(self) -> str:
|
||||
path_candidates = []
|
||||
vocab_file = "tokenizer.model"
|
||||
path_candidates.append(vocab_file)
|
||||
path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file)
|
||||
if path_candidate is not None:
|
||||
return "llama"
|
||||
|
||||
vocab_file = "vocab.json"
|
||||
path_candidates.append(vocab_file)
|
||||
path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file)
|
||||
if path_candidate is not None:
|
||||
return "gpt2"
|
||||
|
||||
vocab_file = "tokenizer.json"
|
||||
path_candidates.append(vocab_file)
|
||||
path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file)
|
||||
if path_candidate:
|
||||
if not self.has_newline_token():
|
||||
return "gpt2"
|
||||
return "llama"
|
||||
|
||||
raise FileNotFoundError(
|
||||
f"Could not find {path_candidates} in {self.fname_tokenizer} or its parent; "
|
||||
"if it's in another directory, pass the directory as --vocab-dir"
|
||||
)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"<VocabLoader with {self.vocab_size_base} base tokens and {len(self.added_tokens_dict)} added tokens>"
|
||||
return f"<HfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
|
||||
|
||||
|
||||
Vocab: TypeAlias = 'VocabLoader'
|
||||
Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab"
|
||||
|
||||
|
||||
#
|
||||
@@ -636,7 +725,7 @@ def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
|
||||
else:
|
||||
model = merge_sharded([mp.model for mp in models_plus])
|
||||
|
||||
return ModelPlus(model, paths, format, vocab)
|
||||
return ModelPlus(model, paths, format, vocab) # pytype: disable=wrong-arg-types
|
||||
|
||||
|
||||
def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor:
|
||||
@@ -817,7 +906,7 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
|
||||
executor_class = ProcessPoolExecutor
|
||||
else:
|
||||
executor_class = ThreadPoolExecutor
|
||||
with executor_class(max_workers = max_workers) as executor:
|
||||
with executor_class(max_workers=max_workers) as executor:
|
||||
futures: list[concurrent.futures.Future[Out]] = []
|
||||
done = False
|
||||
for _ in range(concurrency):
|
||||
@@ -839,27 +928,35 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
|
||||
|
||||
|
||||
def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None:
|
||||
if params.n_vocab != vocab.vocab_size:
|
||||
if params.n_vocab == vocab.vocab_size:
|
||||
print("Ignoring added_tokens.json since model matches vocab size without it.")
|
||||
vocab.added_tokens_dict = OrderedDict()
|
||||
vocab.vocab_size = vocab.vocab_size
|
||||
return
|
||||
# Handle special case where the model's vocab size is not set
|
||||
if params.n_vocab == -1:
|
||||
raise ValueError(
|
||||
f"The model's vocab size is set to -1 in params.json. Please update it manually. Maybe {vocab.vocab_size}?"
|
||||
)
|
||||
|
||||
if pad_vocab and params.n_vocab > vocab.vocab_size:
|
||||
pad_count = params.n_vocab - vocab.vocab_size
|
||||
print(f'Padding vocab with {pad_count} token(s) - <dummy00001> through <dummy{pad_count:05}>')
|
||||
for i in range(1, (params.n_vocab - vocab.vocab_size) + 1):
|
||||
vocab.added_tokens_dict[f'<dummy{i:05}>'] = -1
|
||||
vocab.vocab_size = params.n_vocab
|
||||
return
|
||||
msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}"
|
||||
msg += f" has {vocab.vocab_size})."
|
||||
if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20:
|
||||
msg += f" Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})."
|
||||
if vocab.vocab_size < params.n_vocab:
|
||||
msg += " Possibly try using the --padvocab option."
|
||||
raise Exception(msg)
|
||||
# Check for a vocab size mismatch
|
||||
if params.n_vocab == vocab.vocab_size:
|
||||
print("Ignoring added_tokens.json since model matches vocab size without it.")
|
||||
return
|
||||
|
||||
if pad_vocab and params.n_vocab > vocab.vocab_size:
|
||||
pad_count = params.n_vocab - vocab.vocab_size
|
||||
print(
|
||||
f"Padding vocab with {pad_count} token(s) - <dummy00001> through <dummy{pad_count:05}>"
|
||||
)
|
||||
for i in range(1, pad_count + 1):
|
||||
vocab.added_tokens_dict[f"<dummy{i:05}>"] = -1
|
||||
vocab.added_tokens_list.append(f"<dummy{i:05}>")
|
||||
vocab.vocab_size = params.n_vocab
|
||||
return
|
||||
|
||||
msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer} has {vocab.vocab_size})."
|
||||
if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20:
|
||||
msg += f" Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})."
|
||||
if vocab.vocab_size < params.n_vocab:
|
||||
msg += " Add the --pad-vocab option and try again."
|
||||
|
||||
raise Exception(msg)
|
||||
|
||||
|
||||
class OutputFile:
|
||||
@@ -912,18 +1009,46 @@ class OutputFile:
|
||||
if params.ftype is not None:
|
||||
self.gguf.add_file_type(params.ftype)
|
||||
|
||||
def add_meta_vocab(self, vocab: Vocab) -> None:
|
||||
def handle_tokenizer_model(self, vocab: Vocab) -> str:
|
||||
# Map the vocab types to the supported tokenizer models
|
||||
tokenizer_model = {
|
||||
SentencePieceVocab: "llama",
|
||||
HfVocab: "llama",
|
||||
BpeVocab: "gpt2",
|
||||
}.get(type(vocab))
|
||||
|
||||
# Block if vocab type is not predefined
|
||||
if tokenizer_model is None:
|
||||
raise ValueError("Unknown vocab type: Not supported")
|
||||
|
||||
return tokenizer_model
|
||||
|
||||
def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]:
|
||||
tokens = []
|
||||
scores = []
|
||||
toktypes = []
|
||||
|
||||
# NOTE: `all_tokens` returns the base vocabulary and added tokens
|
||||
for text, score, toktype in vocab.all_tokens():
|
||||
tokens.append(text)
|
||||
scores.append(score)
|
||||
toktypes.append(toktype)
|
||||
|
||||
vocab_type = vocab.get_vocab_type()
|
||||
self.gguf.add_tokenizer_model(vocab_type)
|
||||
assert len(tokens) == vocab.vocab_size
|
||||
|
||||
return tokens, scores, toktypes
|
||||
|
||||
def add_meta_vocab(self, vocab: Vocab) -> None:
|
||||
# Handle the tokenizer model
|
||||
tokenizer_model = self.handle_tokenizer_model(vocab)
|
||||
|
||||
# Ensure that tokenizer_model is added to the GGUF model
|
||||
self.gguf.add_tokenizer_model(tokenizer_model)
|
||||
|
||||
# Extract model vocabulary for model conversion
|
||||
tokens, scores, toktypes = self.extract_vocabulary_from_model(vocab)
|
||||
|
||||
# Add extracted token information for model conversion
|
||||
self.gguf.add_token_list(tokens)
|
||||
self.gguf.add_token_scores(scores)
|
||||
self.gguf.add_token_types(toktypes)
|
||||
@@ -936,7 +1061,7 @@ class OutputFile:
|
||||
raw_dtype = getattr(tensor.data_type, 'ggml_type', None)
|
||||
data_type = getattr(tensor.data_type, 'quantized_type', None) or tensor.data_type.dtype
|
||||
data_nbytes = tensor.data_type.elements_to_bytes(n_elements)
|
||||
self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes, raw_dtype = raw_dtype)
|
||||
self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype)
|
||||
|
||||
def write_meta(self) -> None:
|
||||
self.gguf.write_header_to_file()
|
||||
@@ -951,8 +1076,7 @@ class OutputFile:
|
||||
@staticmethod
|
||||
def write_vocab_only(
|
||||
fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
|
||||
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
|
||||
pad_vocab: bool = False,
|
||||
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
|
||||
) -> None:
|
||||
check_vocab_size(params, vocab, pad_vocab = pad_vocab)
|
||||
|
||||
@@ -983,11 +1107,10 @@ class OutputFile:
|
||||
@staticmethod
|
||||
def write_all(
|
||||
fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab,
|
||||
concurrency: int = DEFAULT_CONCURRENCY,
|
||||
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
|
||||
concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
|
||||
pad_vocab: bool = False,
|
||||
) -> None:
|
||||
check_vocab_size(params, vocab, pad_vocab = pad_vocab)
|
||||
check_vocab_size(params, vocab, pad_vocab=pad_vocab)
|
||||
|
||||
of = OutputFile(fname_out, endianess=endianess)
|
||||
|
||||
@@ -1006,7 +1129,10 @@ class OutputFile:
|
||||
# tensor data
|
||||
ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency)
|
||||
if ftype == GGMLFileType.MostlyQ8_0:
|
||||
ndarrays = bounded_parallel_map(OutputFile.maybe_do_quantize, ndarrays_inner, concurrency = concurrency, max_workers = concurrency, use_processpool_executor = True)
|
||||
ndarrays = bounded_parallel_map(
|
||||
OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency,
|
||||
use_processpool_executor=True,
|
||||
)
|
||||
else:
|
||||
ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
|
||||
|
||||
@@ -1015,7 +1141,9 @@ class OutputFile:
|
||||
elapsed = time.time() - start
|
||||
size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
|
||||
padi = len(str(len(model)))
|
||||
print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}")
|
||||
print(
|
||||
f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
|
||||
)
|
||||
of.gguf.write_tensor_data(ndarray)
|
||||
|
||||
of.close()
|
||||
@@ -1145,17 +1273,74 @@ def load_some_model(path: Path) -> ModelPlus:
|
||||
return model_plus
|
||||
|
||||
|
||||
def find_vocab_file_path(path: Path, vocab_file: str) -> Optional[Path]:
|
||||
path2 = path / vocab_file
|
||||
# Use `.parent` instead of /.. to handle the symlink case better.
|
||||
path3 = path.parent / vocab_file
|
||||
class VocabFactory:
|
||||
def __init__(self, path: Path):
|
||||
self.path = path
|
||||
self.files: dict[str, Path | None] = {
|
||||
"tokenizer.model": None,
|
||||
"vocab.json": None,
|
||||
"tokenizer.json": None,
|
||||
}
|
||||
self._detect_files()
|
||||
|
||||
if path2.exists():
|
||||
return path2
|
||||
if path3.exists():
|
||||
return path3
|
||||
def _detect_files(self):
|
||||
for file in self.files.keys():
|
||||
file_path = self.path / file
|
||||
parent_file_path = self.path.parent / file
|
||||
if file_path.exists():
|
||||
self.files[file] = file_path
|
||||
elif parent_file_path.exists():
|
||||
self.files[file] = parent_file_path
|
||||
print(f"Found vocab files: {self.files}")
|
||||
|
||||
return None
|
||||
def _select_file(self, vocabtype: str | None) -> Path:
|
||||
if vocabtype in ["spm", "bpe"]:
|
||||
for file_key in self.files.keys():
|
||||
if (file := self.files[file_key]) is not None:
|
||||
return file
|
||||
raise FileNotFoundError(f"{vocabtype} vocab not found.")
|
||||
if vocabtype == "hfft":
|
||||
# For Hugging Face Fast Tokenizer, return the directory path instead of a specific file
|
||||
return self.path
|
||||
raise ValueError(f"Unsupported vocabulary type {vocabtype}")
|
||||
|
||||
def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab:
|
||||
load_merges = vocabtype == "bpe"
|
||||
n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None
|
||||
return gguf.SpecialVocab(
|
||||
model_parent_path,
|
||||
load_merges=load_merges,
|
||||
special_token_types=None, # Predetermined or passed as a parameter
|
||||
n_vocab=n_vocab,
|
||||
)
|
||||
|
||||
def load_vocab(self, vocabtype: str, model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
|
||||
path = self._select_file(vocabtype)
|
||||
print(f"Loading vocab file '{path}', type '{vocabtype}'")
|
||||
|
||||
added_tokens_path = path.parent / "added_tokens.json"
|
||||
vocab: Vocab
|
||||
if vocabtype == "bpe":
|
||||
vocab = BpeVocab(
|
||||
path, added_tokens_path if added_tokens_path.exists() else None
|
||||
)
|
||||
elif vocabtype == "spm":
|
||||
vocab = SentencePieceVocab(
|
||||
path, added_tokens_path if added_tokens_path.exists() else None
|
||||
)
|
||||
elif vocabtype == "hfft":
|
||||
vocab = HfVocab(
|
||||
path, added_tokens_path if added_tokens_path.exists() else None
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"Unsupported vocabulary type {vocabtype}")
|
||||
# FIXME: Respect --vocab-dir?
|
||||
special_vocab = self._create_special_vocab(
|
||||
vocab,
|
||||
vocabtype,
|
||||
model_parent_path,
|
||||
)
|
||||
return vocab, special_vocab
|
||||
|
||||
|
||||
def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
|
||||
@@ -1186,6 +1371,7 @@ def main(args_in: list[str] | None = None) -> None:
|
||||
if np.uint32(1) == np.uint32(1).newbyteorder("<"):
|
||||
# We currently only support Q8_0 output on little endian systems.
|
||||
output_choices.append("q8_0")
|
||||
vocab_types = ["spm", "bpe", "hfft"]
|
||||
parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
|
||||
parser.add_argument("--awq-path", type=Path, help="Path to scale awq cache file", default=None)
|
||||
parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
|
||||
@@ -1193,17 +1379,18 @@ def main(args_in: list[str] | None = None) -> None:
|
||||
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
|
||||
parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
|
||||
parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
|
||||
parser.add_argument("--vocab-type", choices=vocab_types, help="The vocabulary format used to define the tokenizer model (default: spm)", default="spm")
|
||||
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
|
||||
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
|
||||
parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
|
||||
parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY)
|
||||
parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
|
||||
parser.add_argument("--padvocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
|
||||
parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default=DEFAULT_CONCURRENCY)
|
||||
parser.add_argument("--big-endian", action="store_true", help="model is executed on big endian machine")
|
||||
parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
|
||||
|
||||
args = parser.parse_args(args_in)
|
||||
if args.awq_path:
|
||||
sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
|
||||
from awq.apply_awq import add_scale_weights
|
||||
from awq.apply_awq import add_scale_weights # type: ignore[import-not-found]
|
||||
tmp_model_path = args.model / "weighted_model"
|
||||
if tmp_model_path.is_dir():
|
||||
print(f"{tmp_model_path} exists as a weighted model.")
|
||||
@@ -1228,7 +1415,7 @@ def main(args_in: list[str] | None = None) -> None:
|
||||
do_dump_model(model_plus)
|
||||
return
|
||||
endianess = gguf.GGUFEndian.LITTLE
|
||||
if args.bigendian:
|
||||
if args.big_endian:
|
||||
endianess = gguf.GGUFEndian.BIG
|
||||
|
||||
params = Params.load(model_plus)
|
||||
@@ -1249,34 +1436,26 @@ def main(args_in: list[str] | None = None) -> None:
|
||||
|
||||
print(f"params = {params}")
|
||||
|
||||
vocab: Vocab
|
||||
model_parent_path = model_plus.paths[0].parent
|
||||
vocab_path = Path(args.vocab_dir or args.model or model_parent_path)
|
||||
vocab_factory = VocabFactory(vocab_path)
|
||||
vocab, special_vocab = vocab_factory.load_vocab(args.vocab_type, model_parent_path)
|
||||
|
||||
if args.vocab_only:
|
||||
if not args.outfile:
|
||||
raise ValueError("need --outfile if using --vocab-only")
|
||||
# FIXME: Try to respect vocab_dir somehow?
|
||||
vocab = VocabLoader(params, args.vocab_dir or args.model)
|
||||
special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
|
||||
load_merges = True,
|
||||
n_vocab = vocab.vocab_size)
|
||||
outfile = args.outfile
|
||||
OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
|
||||
endianess = endianess, pad_vocab = args.padvocab)
|
||||
endianess=endianess, pad_vocab=args.pad_vocab)
|
||||
print(f"Wrote {outfile}")
|
||||
return
|
||||
|
||||
if model_plus.vocab is not None and args.vocab_dir is None:
|
||||
vocab = model_plus.vocab
|
||||
else:
|
||||
vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
|
||||
vocab = VocabLoader(params, vocab_dir)
|
||||
|
||||
# FIXME: Try to respect vocab_dir somehow?
|
||||
print(f"Vocab info: {vocab}")
|
||||
special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
|
||||
load_merges = True,
|
||||
n_vocab = vocab.vocab_size)
|
||||
|
||||
print(f"Special vocab info: {special_vocab}")
|
||||
|
||||
model = model_plus.model
|
||||
model = convert_model_names(model, params)
|
||||
ftype = pick_output_type(model, args.outtype)
|
||||
@@ -1287,7 +1466,7 @@ def main(args_in: list[str] | None = None) -> None:
|
||||
print(f"Writing {outfile}, format {ftype}")
|
||||
|
||||
OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
|
||||
concurrency = args.concurrency, endianess = endianess, pad_vocab = args.padvocab)
|
||||
concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab)
|
||||
print(f"Wrote {outfile}")
|
||||
|
||||
|
||||
|
||||
@@ -23,6 +23,9 @@ else()
|
||||
add_subdirectory(infill)
|
||||
add_subdirectory(llama-bench)
|
||||
add_subdirectory(llava)
|
||||
if (LLAMA_SYCL)
|
||||
add_subdirectory(sycl)
|
||||
endif()
|
||||
add_subdirectory(main)
|
||||
add_subdirectory(tokenize)
|
||||
add_subdirectory(parallel)
|
||||
@@ -36,9 +39,7 @@ else()
|
||||
add_subdirectory(lookahead)
|
||||
add_subdirectory(lookup)
|
||||
add_subdirectory(train-text-from-scratch)
|
||||
if (LLAMA_METAL)
|
||||
add_subdirectory(metal)
|
||||
endif()
|
||||
add_subdirectory(imatrix)
|
||||
if (LLAMA_BUILD_SERVER)
|
||||
add_subdirectory(server)
|
||||
endif()
|
||||
|
||||
@@ -88,7 +88,10 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_model_params model_params = llama_model_default_params();
|
||||
|
||||
const std::vector<float> t_split (LLAMA_MAX_DEVICES, 0.0f);
|
||||
|
||||
model_params.n_gpu_layers = n_gpu_layers;
|
||||
model_params.tensor_split = t_split.data();
|
||||
|
||||
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
|
||||
|
||||
|
||||
@@ -194,7 +194,7 @@ int main(int argc, char ** argv) {
|
||||
// Set up a the benchmark matrices
|
||||
// printf("Creating new tensor q11 & Running quantize\n");
|
||||
struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
|
||||
ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements, hist_cur.data());
|
||||
ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements/m11->ne[0], m11->ne[0], hist_cur.data(), nullptr);
|
||||
|
||||
// Set up a the compute graph
|
||||
// printf("Creating new tensor q31\n");
|
||||
@@ -207,7 +207,7 @@ int main(int argc, char ** argv) {
|
||||
// Set up a second graph computation to make sure we override the CPU cache lines
|
||||
// printf("Creating new tensor q12 & Running quantize\n");
|
||||
struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
|
||||
ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements, hist_cur.data());
|
||||
ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements/m12->ne[0], m12->ne[0], hist_cur.data(), nullptr);
|
||||
|
||||
// printf("Creating new tensor q32\n");
|
||||
struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
|
||||
|
||||
@@ -245,9 +245,8 @@ static struct lora_data * load_lora(struct lora_info * info) {
|
||||
params_ggml.no_alloc = true;
|
||||
result->ctx = ggml_init(params_ggml);
|
||||
|
||||
uint32_t LLAMA_FILE_MAGIC_LORA = 0x67676C61; // 'ggla'
|
||||
uint32_t magic = file.read_u32();
|
||||
if (magic != LLAMA_FILE_MAGIC_LORA) {
|
||||
if (magic != LLAMA_FILE_MAGIC_GGLA) {
|
||||
die_fmt("unexpected lora header file magic in '%s'", info->filename.c_str());
|
||||
}
|
||||
uint32_t version = file.read_u32();
|
||||
|
||||
@@ -1138,9 +1138,8 @@ static void save_as_llama_lora(const char * filename, struct my_llama_lora * lor
|
||||
return tn_buf.data();
|
||||
};
|
||||
|
||||
uint32_t LLAMA_FILE_MAGIC_LORA = 0x67676C61; // 'ggla'
|
||||
// write_magic
|
||||
file.write_u32(LLAMA_FILE_MAGIC_LORA); // magic
|
||||
file.write_u32(LLAMA_FILE_MAGIC_GGLA); // magic
|
||||
file.write_u32(1); // version
|
||||
// write_hparams
|
||||
file.write_u32(lora->hparams.lora_r);
|
||||
@@ -1800,7 +1799,9 @@ int main(int argc, char ** argv) {
|
||||
std::vector<llama_token> train_tokens;
|
||||
std::vector<size_t> train_samples_begin;
|
||||
std::vector<size_t> train_samples_size;
|
||||
printf("%s: tokenize training data\n", __func__);
|
||||
printf("%s: tokenize training data from %s\n", __func__, params.common.fn_train_data);
|
||||
printf("%s: sample-start: %s\n", __func__, params.common.sample_start.c_str());
|
||||
printf("%s: include-sample-start: %s\n", __func__, params.common.include_sample_start ? "true" : "false");
|
||||
tokenize_file(lctx,
|
||||
params.common.fn_train_data,
|
||||
params.common.sample_start,
|
||||
|
||||
5
examples/imatrix/CMakeLists.txt
Normal file
@@ -0,0 +1,5 @@
|
||||
set(TARGET imatrix)
|
||||
add_executable(${TARGET} imatrix.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
32
examples/imatrix/README.md
Normal file
@@ -0,0 +1,32 @@
|
||||
# llama.cpp/examples/imatrix
|
||||
|
||||
Compute an importance matrix for a model and given text dataset. Can be used during quantization to enchance the quality of the quantum models.
|
||||
More information is available here: https://github.com/ggerganov/llama.cpp/pull/4861
|
||||
|
||||
## Usage
|
||||
|
||||
```
|
||||
./imatrix -m <some_fp_model> -f <some_training_data> [-o <output_file>] [--verbosity <verbosity_level>]
|
||||
[-ofreq num_chunks] [-ow <0 or 1>] [other common params]
|
||||
```
|
||||
|
||||
Here `-m` with a model name and `-f` with a file containing training data (such as e.g. `wiki.train.raw`) are mandatory.
|
||||
The parameters in square brackets are optional and have the following meaning:
|
||||
* `-o` (or `--output-file`) specifies the name of the file where the computed data will be stored. If missing `imatrix.dat` is used.
|
||||
* `--verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`.
|
||||
* `-ofreq` (or `--output-frequency`) specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks)
|
||||
* `-ow` (or `--output-weight`) specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
|
||||
|
||||
For faster computation, make sure to use GPU offloading via the `-ngl` argument
|
||||
|
||||
## Example
|
||||
|
||||
```bash
|
||||
LLAMA_CUBLAS=1 make -j
|
||||
|
||||
# generate importance matrix (imatrix.dat)
|
||||
./imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
|
||||
|
||||
# use the imatrix to perform a Q4_K_M quantization
|
||||
./quantize --imatrix imatrix.dat ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m
|
||||
```
|
||||
513
examples/imatrix/imatrix.cpp
Normal file
@@ -0,0 +1,513 @@
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
#include <sstream>
|
||||
#include <thread>
|
||||
#include <mutex>
|
||||
#include <vector>
|
||||
#include <fstream>
|
||||
#include <unordered_map>
|
||||
#include <algorithm>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
struct Stats {
|
||||
std::vector<float> values;
|
||||
int ncall = 0;
|
||||
};
|
||||
|
||||
struct StatParams {
|
||||
std::string ofile = "imatrix.dat";
|
||||
int n_output_frequency = 10;
|
||||
int verbosity = 1;
|
||||
int keep_every = 0;
|
||||
bool collect_output_weight = false;
|
||||
};
|
||||
|
||||
class IMatrixCollector {
|
||||
public:
|
||||
IMatrixCollector() = default;
|
||||
void set_parameters(StatParams&& params) { m_params = std::move(params); }
|
||||
bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
|
||||
void save_imatrix() const;
|
||||
private:
|
||||
std::unordered_map<std::string, Stats> m_stats;
|
||||
StatParams m_params;
|
||||
std::mutex m_mutex;
|
||||
int m_last_call = 0;
|
||||
std::vector<float> m_src1_data;
|
||||
std::vector<int> m_ids; // the expert ids from ggml_mul_mat_id
|
||||
//
|
||||
void save_imatrix(const char * file_name) const;
|
||||
void keep_imatrix(int ncall) const;
|
||||
};
|
||||
|
||||
bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
|
||||
GGML_UNUSED(user_data);
|
||||
|
||||
const struct ggml_tensor * src0 = t->src[0];
|
||||
const struct ggml_tensor * src1 = t->src[1];
|
||||
|
||||
// when ask is true, the scheduler wants to know if we are interested in data from this tensor
|
||||
// if we return true, a follow-up call will be made with ask=false in which we can do the actual collection
|
||||
if (ask) {
|
||||
if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications
|
||||
if (t->op != GGML_OP_MUL_MAT) return false;
|
||||
if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
|
||||
if (!(strncmp(src0->name, "blk.", 4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
std::lock_guard<std::mutex> lock(m_mutex);
|
||||
|
||||
// copy the data from the GPU memory if needed
|
||||
const bool is_host = ggml_backend_buffer_is_host(src1->buffer);
|
||||
|
||||
if (!is_host) {
|
||||
m_src1_data.resize(ggml_nelements(src1));
|
||||
ggml_backend_tensor_get(src1, m_src1_data.data(), 0, ggml_nbytes(src1));
|
||||
}
|
||||
|
||||
const float * data = is_host ? (const float *) src1->data : m_src1_data.data();
|
||||
|
||||
if (t->op == GGML_OP_MUL_MAT_ID) {
|
||||
const int idx = ((int32_t *) t->op_params)[0];
|
||||
const int n_as = ((int32_t *) t->op_params)[1];
|
||||
|
||||
// the top-k selected expert ids are stored in the src0 tensor
|
||||
// for simplicity, always copy src0 to host, because it is small
|
||||
// take into account that src0 is not contiguous!
|
||||
GGML_ASSERT(src0->ne[1] == src1->ne[1]);
|
||||
GGML_ASSERT(n_as*ggml_nrows(src0)*sizeof(int) == GGML_PAD(ggml_nbytes(src0), n_as*sizeof(int)));
|
||||
m_ids.resize(ggml_nbytes(src0)/sizeof(int));
|
||||
ggml_backend_tensor_get(src0, m_ids.data(), 0, ggml_nbytes(src0));
|
||||
|
||||
// loop over all possible experts, regardless if they are used or not in the batch
|
||||
// this is necessary to guarantee equal number of "ncall" for each tensor
|
||||
for (int ex = 0; ex < n_as; ++ex) {
|
||||
src0 = t->src[2 + ex];
|
||||
auto& e = m_stats[src0->name];
|
||||
if (e.values.empty()) {
|
||||
e.values.resize(src1->ne[0], 0);
|
||||
}
|
||||
else if (e.values.size() != (size_t)src1->ne[0]) {
|
||||
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]);
|
||||
exit(1); //GGML_ASSERT(false);
|
||||
}
|
||||
// NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger
|
||||
// using the following line, we can correct for that if needed
|
||||
//if (idx == t->src[0]->ne[0] - 1) ++e.ncall;
|
||||
++e.ncall;
|
||||
if (m_params.verbosity > 1) {
|
||||
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
|
||||
}
|
||||
for (int row = 0; row < (int)src1->ne[1]; ++row) {
|
||||
const int excur = m_ids[row*n_as + idx];
|
||||
GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check
|
||||
if (excur != ex) continue;
|
||||
const float * x = data + row * src1->ne[0];
|
||||
for (int j = 0; j < (int)src1->ne[0]; ++j) {
|
||||
e.values[j] += x[j]*x[j];
|
||||
}
|
||||
}
|
||||
if (e.ncall > m_last_call) {
|
||||
m_last_call = e.ncall;
|
||||
if (m_last_call % m_params.n_output_frequency == 0) {
|
||||
save_imatrix();
|
||||
}
|
||||
if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) {
|
||||
keep_imatrix(m_last_call);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
auto& e = m_stats[src0->name];
|
||||
if (e.values.empty()) {
|
||||
e.values.resize(src1->ne[0], 0);
|
||||
}
|
||||
else if (e.values.size() != (size_t)src1->ne[0]) {
|
||||
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]);
|
||||
exit(1); //GGML_ASSERT(false);
|
||||
}
|
||||
++e.ncall;
|
||||
if (m_params.verbosity > 1) {
|
||||
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
|
||||
}
|
||||
for (int row = 0; row < (int)src1->ne[1]; ++row) {
|
||||
const float * x = data + row * src1->ne[0];
|
||||
for (int j = 0; j < (int)src1->ne[0]; ++j) {
|
||||
e.values[j] += x[j]*x[j];
|
||||
}
|
||||
}
|
||||
if (e.ncall > m_last_call) {
|
||||
m_last_call = e.ncall;
|
||||
if (m_last_call % m_params.n_output_frequency == 0) {
|
||||
save_imatrix();
|
||||
}
|
||||
if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) {
|
||||
keep_imatrix(m_last_call);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void IMatrixCollector::save_imatrix() const {
|
||||
save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str());
|
||||
}
|
||||
|
||||
void IMatrixCollector::keep_imatrix(int ncall) const {
|
||||
auto file_name = m_params.ofile;
|
||||
if (file_name.empty()) file_name = "imatrix.dat";
|
||||
file_name += ".at_";
|
||||
file_name += std::to_string(ncall);
|
||||
save_imatrix(file_name.c_str());
|
||||
}
|
||||
|
||||
void IMatrixCollector::save_imatrix(const char * fname) const {
|
||||
std::ofstream out(fname, std::ios::binary);
|
||||
int n_entries = m_stats.size();
|
||||
out.write((const char*)&n_entries, sizeof(n_entries));
|
||||
for (auto& p : m_stats) {
|
||||
int len = p.first.size();
|
||||
out.write((const char*)&len, sizeof(len));
|
||||
out.write(p.first.c_str(), len);
|
||||
out.write((const char*)&p.second.ncall, sizeof(p.second.ncall));
|
||||
int nval = p.second.values.size();
|
||||
out.write((const char*)&nval, sizeof(nval));
|
||||
if (nval > 0) out.write((const char*)p.second.values.data(), nval*sizeof(float));
|
||||
}
|
||||
if (m_params.verbosity > 0) {
|
||||
fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n",__func__,m_last_call,fname);
|
||||
}
|
||||
}
|
||||
|
||||
static IMatrixCollector g_collector;
|
||||
|
||||
static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
|
||||
return g_collector.collect_imatrix(t, ask, user_data);
|
||||
}
|
||||
|
||||
|
||||
struct results_log_softmax {
|
||||
double log_softmax;
|
||||
float logit;
|
||||
float prob;
|
||||
};
|
||||
|
||||
static std::vector<float> softmax(const std::vector<float>& logits) {
|
||||
std::vector<float> probs(logits.size());
|
||||
float max_logit = logits[0];
|
||||
for (float v : logits) {
|
||||
max_logit = std::max(max_logit, v);
|
||||
}
|
||||
double sum_exp = 0.0;
|
||||
for (size_t i = 0; i < logits.size(); i++) {
|
||||
// Subtract the maximum logit value from the current logit value for numerical stability
|
||||
const float logit = logits[i] - max_logit;
|
||||
const float exp_logit = expf(logit);
|
||||
sum_exp += exp_logit;
|
||||
probs[i] = exp_logit;
|
||||
}
|
||||
for (size_t i = 0; i < probs.size(); i++) {
|
||||
probs[i] /= sum_exp;
|
||||
}
|
||||
return probs;
|
||||
}
|
||||
|
||||
static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
|
||||
float max_logit = logits[0];
|
||||
for (int i = 1; i < n_vocab; ++i) {
|
||||
max_logit = std::max(max_logit, logits[i]);
|
||||
}
|
||||
double sum_exp = 0.0;
|
||||
for (int i = 0; i < n_vocab; ++i) {
|
||||
sum_exp += expf(logits[i] - max_logit);
|
||||
}
|
||||
return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp};
|
||||
}
|
||||
|
||||
static void process_logits(
|
||||
int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
|
||||
double & nll, double & nll2, float * logit_history, float * prob_history
|
||||
) {
|
||||
std::mutex mutex;
|
||||
int counter = 0;
|
||||
auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
|
||||
double local_nll = 0;
|
||||
double local_nll2 = 0;
|
||||
while (true) {
|
||||
std::unique_lock<std::mutex> lock(mutex);
|
||||
int i = counter++;
|
||||
if (i >= n_token) {
|
||||
nll += local_nll; nll2 += local_nll2;
|
||||
break;
|
||||
}
|
||||
lock.unlock();
|
||||
const results_log_softmax results = log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]);
|
||||
const double v = -results.log_softmax;
|
||||
local_nll += v;
|
||||
local_nll2 += v*v;
|
||||
|
||||
logit_history[i] = results.logit;
|
||||
prob_history[i] = results.prob;
|
||||
}
|
||||
};
|
||||
for (auto & w : workers) {
|
||||
w = std::thread(compute);
|
||||
}
|
||||
compute();
|
||||
for (auto & w : workers) {
|
||||
w.join();
|
||||
}
|
||||
}
|
||||
|
||||
static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl) {
|
||||
|
||||
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
|
||||
auto tim1 = std::chrono::high_resolution_clock::now();
|
||||
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
|
||||
|
||||
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
|
||||
|
||||
auto tim2 = std::chrono::high_resolution_clock::now();
|
||||
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
|
||||
|
||||
if (int(tokens.size()) < 2*n_ctx) {
|
||||
fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n",__func__,2*n_ctx,
|
||||
n_ctx);
|
||||
fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
|
||||
return false;
|
||||
}
|
||||
|
||||
std::vector<float> logit_history;
|
||||
std::vector<float> prob_history;
|
||||
|
||||
if (compute_ppl) {
|
||||
logit_history.resize(tokens.size());
|
||||
prob_history.resize(tokens.size());
|
||||
}
|
||||
|
||||
const int n_chunk_max = tokens.size() / n_ctx;
|
||||
|
||||
const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
|
||||
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
|
||||
const int n_batch = params.n_batch;
|
||||
|
||||
int count = 0;
|
||||
double nll = 0.0;
|
||||
double nll2 = 0.0;
|
||||
|
||||
fprintf(stderr, "%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch);
|
||||
|
||||
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
|
||||
|
||||
const int num_batches = (n_ctx + n_batch - 1) / n_batch;
|
||||
|
||||
std::vector<float> logits;
|
||||
if (compute_ppl && num_batches > 1) {
|
||||
logits.reserve((size_t)n_ctx * n_vocab);
|
||||
}
|
||||
|
||||
for (int i = 0; i < n_chunk; ++i) {
|
||||
const int start = i * n_ctx;
|
||||
const int end = start + n_ctx;
|
||||
|
||||
std::vector<float> logits;
|
||||
|
||||
const auto t_start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// clear the KV cache
|
||||
llama_kv_cache_clear(ctx);
|
||||
|
||||
for (int j = 0; j < num_batches; ++j) {
|
||||
const int batch_start = start + j * n_batch;
|
||||
const int batch_size = std::min(end - batch_start, n_batch);
|
||||
|
||||
// save original token and restore it after eval
|
||||
const auto token_org = tokens[batch_start];
|
||||
|
||||
// add BOS token for the first batch of each chunk
|
||||
if (add_bos && j == 0) {
|
||||
tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
|
||||
}
|
||||
|
||||
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
// restore the original token in case it was set to BOS
|
||||
tokens[batch_start] = token_org;
|
||||
|
||||
if (compute_ppl && num_batches > 1) {
|
||||
const auto * batch_logits = llama_get_logits(ctx);
|
||||
logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
|
||||
}
|
||||
}
|
||||
|
||||
const auto t_end = std::chrono::high_resolution_clock::now();
|
||||
|
||||
if (i == 0) {
|
||||
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
|
||||
fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
|
||||
int total_seconds = (int)(t_total * n_chunk);
|
||||
if (total_seconds >= 60*60) {
|
||||
fprintf(stderr, "%d hours ", total_seconds / (60*60));
|
||||
total_seconds = total_seconds % (60*60);
|
||||
}
|
||||
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
|
||||
}
|
||||
|
||||
if (compute_ppl) {
|
||||
const int first = n_ctx/2;
|
||||
const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
|
||||
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
|
||||
workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
|
||||
count += n_ctx - first - 1;
|
||||
|
||||
printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
|
||||
fflush(stdout);
|
||||
|
||||
logits.clear();
|
||||
}
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
if (compute_ppl) {
|
||||
nll2 /= count;
|
||||
nll /= count;
|
||||
const double ppl = exp(nll);
|
||||
nll2 -= nll * nll;
|
||||
if (nll2 > 0) {
|
||||
nll2 = sqrt(nll2/(count-1));
|
||||
printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
|
||||
} else {
|
||||
printf("Unexpected negative standard deviation of log(prob)\n");
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
|
||||
StatParams sparams;
|
||||
bool compute_ppl = true;
|
||||
std::vector<char*> args;
|
||||
args.push_back(argv[0]);
|
||||
int iarg = 1;
|
||||
for (; iarg < argc-1; ++iarg) {
|
||||
std::string arg{argv[iarg]};
|
||||
if (arg == "-o" || arg == "--output-file") {
|
||||
sparams.ofile = argv[++iarg];
|
||||
}
|
||||
else if (arg == "-ofreq" || arg == "--output-frequency") {
|
||||
sparams.n_output_frequency = std::stoi(argv[++iarg]);
|
||||
}
|
||||
else if (arg == "-ow" || arg == "--output-weight") {
|
||||
sparams.collect_output_weight = std::stoi(argv[++iarg]);
|
||||
}
|
||||
else if (arg == "--verbosity") {
|
||||
sparams.verbosity = std::stoi(argv[++iarg]);
|
||||
} else if (arg == "--no-ppl") {
|
||||
compute_ppl = false;
|
||||
} else if (arg == "--keep-imatrix") {
|
||||
sparams.keep_every = std::stoi(argv[++iarg]);
|
||||
} else {
|
||||
args.push_back(argv[iarg]);
|
||||
}
|
||||
}
|
||||
if (iarg < argc) {
|
||||
std::string arg{argv[iarg]};
|
||||
if (arg == "--no-ppl") {
|
||||
compute_ppl = false;
|
||||
} else {
|
||||
args.push_back(argv[iarg]);
|
||||
}
|
||||
}
|
||||
|
||||
gpt_params params;
|
||||
params.n_batch = 512;
|
||||
if (!gpt_params_parse(args.size(), args.data(), params)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
g_collector.set_parameters(std::move(sparams));
|
||||
|
||||
params.logits_all = true;
|
||||
params.n_batch = std::min(params.n_batch, params.n_ctx);
|
||||
|
||||
print_build_info();
|
||||
|
||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
||||
params.seed = time(NULL);
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
|
||||
|
||||
std::mt19937 rng(params.seed);
|
||||
if (params.random_prompt) {
|
||||
params.prompt = gpt_random_prompt(rng);
|
||||
}
|
||||
|
||||
llama_backend_init(params.numa);
|
||||
|
||||
llama_model_params mparams = llama_model_params_from_gpt_params(params);
|
||||
|
||||
llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
|
||||
if (model == NULL) {
|
||||
fprintf(stderr, "%s: error: unable to load model\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
llama_context_params cparams = llama_context_params_from_gpt_params(params);
|
||||
|
||||
// pass the callback to the backend scheduler
|
||||
// it will be executed for each node during the graph computation
|
||||
cparams.cb_eval = ik_collect_imatrix;
|
||||
cparams.cb_eval_user_data = NULL;
|
||||
|
||||
llama_context * ctx = llama_new_context_with_model(model, cparams);
|
||||
if (ctx == NULL) {
|
||||
fprintf(stderr, "%s: error: unable to create context\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
const int n_ctx_train = llama_n_ctx_train(model);
|
||||
if (params.n_ctx > n_ctx_train) {
|
||||
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
||||
__func__, n_ctx_train, params.n_ctx);
|
||||
}
|
||||
|
||||
// print system information
|
||||
{
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s\n", get_system_info(params).c_str());
|
||||
}
|
||||
|
||||
bool OK = compute_imatrix(ctx, params, compute_ppl);
|
||||
if (!OK) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
g_collector.save_imatrix();
|
||||
|
||||
llama_print_timings(ctx);
|
||||
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -241,7 +241,7 @@ int main(int argc, char ** argv) {
|
||||
LOG("add_bos: %d\n", add_bos);
|
||||
|
||||
bool suff_rm_leading_spc = params.escape;
|
||||
if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
|
||||
if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
|
||||
params.input_suffix.erase(0, 1);
|
||||
suff_rm_leading_spc = false;
|
||||
}
|
||||
|
||||
@@ -128,6 +128,25 @@ static std::string get_gpu_info() {
|
||||
// command line params
|
||||
enum output_formats {CSV, JSON, MARKDOWN, SQL};
|
||||
|
||||
static const char * output_format_str(output_formats format) {
|
||||
switch (format) {
|
||||
case CSV: return "csv";
|
||||
case JSON: return "json";
|
||||
case MARKDOWN: return "md";
|
||||
case SQL: return "sql";
|
||||
default: GGML_ASSERT(!"invalid output format");
|
||||
}
|
||||
}
|
||||
|
||||
static const char * split_mode_str(llama_split_mode mode) {
|
||||
switch (mode) {
|
||||
case LLAMA_SPLIT_NONE: return "none";
|
||||
case LLAMA_SPLIT_LAYER: return "layer";
|
||||
case LLAMA_SPLIT_ROW: return "row";
|
||||
default: GGML_ASSERT(!"invalid split mode");
|
||||
}
|
||||
}
|
||||
|
||||
struct cmd_params {
|
||||
std::vector<std::string> model;
|
||||
std::vector<int> n_prompt;
|
||||
@@ -137,7 +156,9 @@ struct cmd_params {
|
||||
std::vector<ggml_type> type_v;
|
||||
std::vector<int> n_threads;
|
||||
std::vector<int> n_gpu_layers;
|
||||
std::vector<llama_split_mode> split_mode;
|
||||
std::vector<int> main_gpu;
|
||||
std::vector<bool> no_kv_offload;
|
||||
std::vector<bool> mul_mat_q;
|
||||
std::vector<std::array<float, LLAMA_MAX_DEVICES>> tensor_split;
|
||||
int reps;
|
||||
@@ -154,7 +175,9 @@ static const cmd_params cmd_params_defaults = {
|
||||
/* type_v */ {GGML_TYPE_F16},
|
||||
/* n_threads */ {get_num_physical_cores()},
|
||||
/* n_gpu_layers */ {99},
|
||||
/* split_mode */ {LLAMA_SPLIT_LAYER},
|
||||
/* main_gpu */ {0},
|
||||
/* no_kv_offload */ {false},
|
||||
/* mul_mat_q */ {true},
|
||||
/* tensor_split */ {{}},
|
||||
/* reps */ 5,
|
||||
@@ -167,20 +190,22 @@ static void print_usage(int /* argc */, char ** argv) {
|
||||
printf("\n");
|
||||
printf("options:\n");
|
||||
printf(" -h, --help\n");
|
||||
printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
|
||||
printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
|
||||
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
|
||||
printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
|
||||
printf(" -ctk <t>, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
|
||||
printf(" -ctv <t>, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
|
||||
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
|
||||
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
|
||||
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
|
||||
printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
|
||||
printf(" -ts, --tensor_split <ts0/ts1/..> \n");
|
||||
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
|
||||
printf(" -o, --output <csv|json|md|sql> (default: %s)\n", cmd_params_defaults.output_format == CSV ? "csv" : cmd_params_defaults.output_format == JSON ? "json" : cmd_params_defaults.output_format == MARKDOWN ? "md" : "sql");
|
||||
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
|
||||
printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
|
||||
printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
|
||||
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
|
||||
printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
|
||||
printf(" -ctk <t>, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
|
||||
printf(" -ctv <t>, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
|
||||
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
|
||||
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
|
||||
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
|
||||
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
|
||||
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
|
||||
printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
|
||||
printf(" -ts, --tensor_split <ts0/ts1/..> (default: 0)\n");
|
||||
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
|
||||
printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
|
||||
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
|
||||
printf("\n");
|
||||
printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
|
||||
}
|
||||
@@ -303,12 +328,41 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
}
|
||||
auto p = split<int>(argv[i], split_delim);
|
||||
params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
|
||||
} else if (arg == "-sm" || arg == "--split-mode") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = split<std::string>(argv[i], split_delim);
|
||||
std::vector<llama_split_mode> modes;
|
||||
for (const auto & m : p) {
|
||||
llama_split_mode mode;
|
||||
if (m == "none") {
|
||||
mode = LLAMA_SPLIT_NONE;
|
||||
} else if (m == "layer") {
|
||||
mode = LLAMA_SPLIT_LAYER;
|
||||
} else if (m == "row") {
|
||||
mode = LLAMA_SPLIT_ROW;
|
||||
} else {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
modes.push_back(mode);
|
||||
}
|
||||
params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end());
|
||||
} else if (arg == "-mg" || arg == "--main-gpu") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.main_gpu = split<int>(argv[i], split_delim);
|
||||
} else if (arg == "-nkvo" || arg == "--no-kv-offload") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = split<bool>(argv[i], split_delim);
|
||||
params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
|
||||
} else if (arg == "-mmq" || arg == "--mul-mat-q") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -382,7 +436,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; }
|
||||
if (params.type_v.empty()) { params.type_v = cmd_params_defaults.type_v; }
|
||||
if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
|
||||
if (params.split_mode.empty()) { params.split_mode = cmd_params_defaults.split_mode; }
|
||||
if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
|
||||
if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
|
||||
if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
|
||||
if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
|
||||
if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
|
||||
@@ -399,7 +455,9 @@ struct cmd_params_instance {
|
||||
ggml_type type_v;
|
||||
int n_threads;
|
||||
int n_gpu_layers;
|
||||
llama_split_mode split_mode;
|
||||
int main_gpu;
|
||||
bool no_kv_offload;
|
||||
bool mul_mat_q;
|
||||
std::array<float, LLAMA_MAX_DEVICES> tensor_split;
|
||||
|
||||
@@ -407,6 +465,7 @@ struct cmd_params_instance {
|
||||
llama_model_params mparams = llama_model_default_params();
|
||||
|
||||
mparams.n_gpu_layers = n_gpu_layers;
|
||||
mparams.split_mode = split_mode;
|
||||
mparams.main_gpu = main_gpu;
|
||||
mparams.tensor_split = tensor_split.data();
|
||||
|
||||
@@ -416,6 +475,7 @@ struct cmd_params_instance {
|
||||
bool equal_mparams(const cmd_params_instance & other) const {
|
||||
return model == other.model &&
|
||||
n_gpu_layers == other.n_gpu_layers &&
|
||||
split_mode == other.split_mode &&
|
||||
main_gpu == other.main_gpu &&
|
||||
tensor_split == other.tensor_split;
|
||||
}
|
||||
@@ -428,54 +488,26 @@ struct cmd_params_instance {
|
||||
cparams.type_k = type_k;
|
||||
cparams.type_v = type_v;
|
||||
cparams.mul_mat_q = mul_mat_q;
|
||||
cparams.offload_kqv = !no_kv_offload;
|
||||
|
||||
return cparams;
|
||||
}
|
||||
};
|
||||
|
||||
static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_params & params, int n_gen, int n_prompt) {
|
||||
std::vector<cmd_params_instance> instances;
|
||||
|
||||
for (const auto & m : params.model)
|
||||
for (const auto & nl : params.n_gpu_layers)
|
||||
for (const auto & mg : params.main_gpu)
|
||||
for (const auto & ts : params.tensor_split)
|
||||
for (const auto & nb : params.n_batch)
|
||||
for (const auto & tk : params.type_k)
|
||||
for (const auto & tv : params.type_v)
|
||||
for (const auto & mmq : params.mul_mat_q)
|
||||
for (const auto & nt : params.n_threads) {
|
||||
cmd_params_instance instance = {
|
||||
/* .model = */ m,
|
||||
/* .n_prompt = */ n_prompt,
|
||||
/* .n_gen = */ n_gen,
|
||||
/* .n_batch = */ nb,
|
||||
/* .type_k = */ tk,
|
||||
/* .type_v = */ tv,
|
||||
/* .n_threads = */ nt,
|
||||
/* .n_gpu_layers = */ nl,
|
||||
/* .main_gpu = */ mg,
|
||||
/* .mul_mat_q = */ mmq,
|
||||
/* .tensor_split = */ ts,
|
||||
};
|
||||
instances.push_back(instance);
|
||||
}
|
||||
return instances;
|
||||
}
|
||||
|
||||
static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_params & params) {
|
||||
std::vector<cmd_params_instance> instances;
|
||||
|
||||
#if 1
|
||||
// this ordering minimizes the number of times that each model needs to be reloaded
|
||||
for (const auto & m : params.model)
|
||||
for (const auto & nl : params.n_gpu_layers)
|
||||
for (const auto & sm : params.split_mode)
|
||||
for (const auto & mg : params.main_gpu)
|
||||
for (const auto & ts : params.tensor_split)
|
||||
for (const auto & nb : params.n_batch)
|
||||
for (const auto & tk : params.type_k)
|
||||
for (const auto & tv : params.type_v)
|
||||
for (const auto & mmq : params.mul_mat_q)
|
||||
for (const auto & nkvo : params.no_kv_offload)
|
||||
for (const auto & nt : params.n_threads) {
|
||||
for (const auto & n_prompt : params.n_prompt) {
|
||||
if (n_prompt == 0) {
|
||||
@@ -490,7 +522,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
/* .type_v = */ tv,
|
||||
/* .n_threads = */ nt,
|
||||
/* .n_gpu_layers = */ nl,
|
||||
/* .split_mode = */ sm,
|
||||
/* .main_gpu = */ mg,
|
||||
/* .no_kv_offload= */ nkvo,
|
||||
/* .mul_mat_q = */ mmq,
|
||||
/* .tensor_split = */ ts,
|
||||
};
|
||||
@@ -510,31 +544,15 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
/* .type_v = */ tv,
|
||||
/* .n_threads = */ nt,
|
||||
/* .n_gpu_layers = */ nl,
|
||||
/* .split_mode = */ sm,
|
||||
/* .main_gpu = */ mg,
|
||||
/* .no_kv_offload= */ nkvo,
|
||||
/* .mul_mat_q = */ mmq,
|
||||
/* .tensor_split = */ ts,
|
||||
};
|
||||
instances.push_back(instance);
|
||||
}
|
||||
}
|
||||
#else
|
||||
// this ordering separates the prompt and generation tests
|
||||
for (const auto & n_prompt : params.n_prompt) {
|
||||
if (n_prompt == 0) {
|
||||
continue;
|
||||
}
|
||||
auto instances_prompt = get_cmd_params_instances_int(params, 0, n_prompt);
|
||||
instances.insert(instances.end(), instances_prompt.begin(), instances_prompt.end());
|
||||
}
|
||||
|
||||
for (const auto & n_gen : params.n_gen) {
|
||||
if (n_gen == 0) {
|
||||
continue;
|
||||
}
|
||||
auto instances_gen = get_cmd_params_instances_int(params, n_gen, 0);
|
||||
instances.insert(instances.end(), instances_gen.begin(), instances_gen.end());
|
||||
}
|
||||
#endif
|
||||
|
||||
return instances;
|
||||
}
|
||||
@@ -544,6 +562,7 @@ struct test {
|
||||
static const int build_number;
|
||||
static const bool cuda;
|
||||
static const bool opencl;
|
||||
static const bool vulkan;
|
||||
static const bool metal;
|
||||
static const bool gpu_blas;
|
||||
static const bool blas;
|
||||
@@ -558,7 +577,9 @@ struct test {
|
||||
ggml_type type_k;
|
||||
ggml_type type_v;
|
||||
int n_gpu_layers;
|
||||
llama_split_mode split_mode;
|
||||
int main_gpu;
|
||||
bool no_kv_offload;
|
||||
bool mul_mat_q;
|
||||
std::array<float, LLAMA_MAX_DEVICES> tensor_split;
|
||||
int n_prompt;
|
||||
@@ -578,7 +599,9 @@ struct test {
|
||||
type_k = inst.type_k;
|
||||
type_v = inst.type_v;
|
||||
n_gpu_layers = inst.n_gpu_layers;
|
||||
split_mode = inst.split_mode;
|
||||
main_gpu = inst.main_gpu;
|
||||
no_kv_offload = inst.no_kv_offload;
|
||||
mul_mat_q = inst.mul_mat_q;
|
||||
tensor_split = inst.tensor_split;
|
||||
n_prompt = inst.n_prompt;
|
||||
@@ -621,6 +644,9 @@ struct test {
|
||||
if (opencl) {
|
||||
return "OpenCL";
|
||||
}
|
||||
if (vulkan) {
|
||||
return "Vulkan";
|
||||
}
|
||||
if (metal) {
|
||||
return "Metal";
|
||||
}
|
||||
@@ -636,11 +662,13 @@ struct test {
|
||||
static const std::vector<std::string> & get_fields() {
|
||||
static const std::vector<std::string> fields = {
|
||||
"build_commit", "build_number",
|
||||
"cuda", "opencl", "metal", "gpu_blas", "blas",
|
||||
"cuda", "opencl", "vulkan", "metal", "gpu_blas", "blas",
|
||||
"cpu_info", "gpu_info",
|
||||
"model_filename", "model_type", "model_size", "model_n_params",
|
||||
"n_batch", "n_threads", "type_k", "type_v",
|
||||
"n_gpu_layers", "main_gpu", "mul_mat_q", "tensor_split",
|
||||
"n_gpu_layers", "split_mode",
|
||||
"main_gpu", "no_kv_offload",
|
||||
"mul_mat_q", "tensor_split",
|
||||
"n_prompt", "n_gen", "test_time",
|
||||
"avg_ns", "stddev_ns",
|
||||
"avg_ts", "stddev_ts"
|
||||
@@ -658,8 +686,8 @@ struct test {
|
||||
field == "avg_ns" || field == "stddev_ns") {
|
||||
return INT;
|
||||
}
|
||||
if (field == "cuda" || field == "opencl" || field == "metal" || field == "gpu_blas" || field == "blas" ||
|
||||
field == "f16_kv" || field == "mul_mat_q") {
|
||||
if (field == "cuda" || field == "opencl" || field == "vulkan"|| field == "metal" || field == "gpu_blas" || field == "blas" ||
|
||||
field == "f16_kv" || field == "no_kv_offload" || field == "mul_mat_q") {
|
||||
return BOOL;
|
||||
}
|
||||
if (field == "avg_ts" || field == "stddev_ts") {
|
||||
@@ -686,11 +714,13 @@ struct test {
|
||||
}
|
||||
std::vector<std::string> values = {
|
||||
build_commit, std::to_string(build_number),
|
||||
std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
|
||||
std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
|
||||
cpu_info, gpu_info,
|
||||
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
|
||||
std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
|
||||
std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), tensor_split_str,
|
||||
std::to_string(n_gpu_layers), split_mode_str(split_mode),
|
||||
std::to_string(main_gpu), std::to_string(no_kv_offload),
|
||||
std::to_string(mul_mat_q), tensor_split_str,
|
||||
std::to_string(n_prompt), std::to_string(n_gen), test_time,
|
||||
std::to_string(avg_ns()), std::to_string(stdev_ns()),
|
||||
std::to_string(avg_ts()), std::to_string(stdev_ts())
|
||||
@@ -712,6 +742,7 @@ const std::string test::build_commit = LLAMA_COMMIT;
|
||||
const int test::build_number = LLAMA_BUILD_NUMBER;
|
||||
const bool test::cuda = !!ggml_cpu_has_cublas();
|
||||
const bool test::opencl = !!ggml_cpu_has_clblast();
|
||||
const bool test::vulkan = !!ggml_cpu_has_vulkan();
|
||||
const bool test::metal = !!ggml_cpu_has_metal();
|
||||
const bool test::gpu_blas = !!ggml_cpu_has_gpublas();
|
||||
const bool test::blas = !!ggml_cpu_has_blas();
|
||||
@@ -845,12 +876,18 @@ struct markdown_printer : public printer {
|
||||
if (field == "n_gpu_layers") {
|
||||
return "ngl";
|
||||
}
|
||||
if (field == "split_mode") {
|
||||
return "sm";
|
||||
}
|
||||
if (field == "n_threads") {
|
||||
return "threads";
|
||||
}
|
||||
if (field == "mul_mat_q") {
|
||||
return "mmq";
|
||||
}
|
||||
if (field == "no_kv_offload") {
|
||||
return "nkvo";
|
||||
}
|
||||
if (field == "tensor_split") {
|
||||
return "ts";
|
||||
}
|
||||
@@ -882,9 +919,15 @@ struct markdown_printer : public printer {
|
||||
if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
|
||||
fields.push_back("main_gpu");
|
||||
}
|
||||
if (params.split_mode.size() > 1 || params.split_mode != cmd_params_defaults.split_mode) {
|
||||
fields.push_back("split_mode");
|
||||
}
|
||||
if (params.mul_mat_q.size() > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q) {
|
||||
fields.push_back("mul_mat_q");
|
||||
}
|
||||
if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) {
|
||||
fields.push_back("no_kv_offload");
|
||||
}
|
||||
if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
|
||||
fields.push_back("tensor_split");
|
||||
}
|
||||
|
||||
33
examples/llama.android/.gitignore
vendored
Normal file
@@ -0,0 +1,33 @@
|
||||
# Gradle files
|
||||
.gradle/
|
||||
build/
|
||||
|
||||
# Local configuration file (sdk path, etc)
|
||||
local.properties
|
||||
|
||||
# Log/OS Files
|
||||
*.log
|
||||
|
||||
# Android Studio generated files and folders
|
||||
captures/
|
||||
.externalNativeBuild/
|
||||
.cxx/
|
||||
*.apk
|
||||
output.json
|
||||
|
||||
# IntelliJ
|
||||
*.iml
|
||||
.idea/
|
||||
misc.xml
|
||||
deploymentTargetDropDown.xml
|
||||
render.experimental.xml
|
||||
|
||||
# Keystore files
|
||||
*.jks
|
||||
*.keystore
|
||||
|
||||
# Google Services (e.g. APIs or Firebase)
|
||||
google-services.json
|
||||
|
||||
# Android Profiling
|
||||
*.hprof
|
||||
0
examples/llama.android/README.md
Normal file
1
examples/llama.android/app/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
/build
|
||||
92
examples/llama.android/app/build.gradle.kts
Normal file
@@ -0,0 +1,92 @@
|
||||
plugins {
|
||||
id("com.android.application")
|
||||
id("org.jetbrains.kotlin.android")
|
||||
}
|
||||
|
||||
android {
|
||||
namespace = "com.example.llama"
|
||||
compileSdk = 34
|
||||
|
||||
ndkVersion = "26.1.10909125"
|
||||
|
||||
defaultConfig {
|
||||
applicationId = "com.example.llama"
|
||||
minSdk = 33
|
||||
targetSdk = 34
|
||||
versionCode = 1
|
||||
versionName = "1.0"
|
||||
|
||||
testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner"
|
||||
vectorDrawables {
|
||||
useSupportLibrary = true
|
||||
}
|
||||
ndk {
|
||||
// Workaround for https://github.com/llvm/llvm-project/issues/65820
|
||||
// affecting armeabi-v7a. Skip armeabi-v7a when invoked with
|
||||
// -Pskip-armeabi-v7a (e.g., ./gradlew build -Pskip-armeabi-v7a).
|
||||
if (project.hasProperty("skip-armeabi-v7a")) {
|
||||
abiFilters += listOf("arm64-v8a", "x86_64", "x86")
|
||||
}
|
||||
}
|
||||
externalNativeBuild {
|
||||
cmake {
|
||||
arguments += "-DCMAKE_BUILD_TYPE=Release"
|
||||
cppFlags += listOf()
|
||||
arguments += listOf()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
buildTypes {
|
||||
release {
|
||||
isMinifyEnabled = false
|
||||
proguardFiles(
|
||||
getDefaultProguardFile("proguard-android-optimize.txt"),
|
||||
"proguard-rules.pro"
|
||||
)
|
||||
}
|
||||
}
|
||||
compileOptions {
|
||||
sourceCompatibility = JavaVersion.VERSION_1_8
|
||||
targetCompatibility = JavaVersion.VERSION_1_8
|
||||
}
|
||||
kotlinOptions {
|
||||
jvmTarget = "1.8"
|
||||
}
|
||||
buildFeatures {
|
||||
compose = true
|
||||
}
|
||||
composeOptions {
|
||||
kotlinCompilerExtensionVersion = "1.5.1"
|
||||
}
|
||||
packaging {
|
||||
resources {
|
||||
excludes += "/META-INF/{AL2.0,LGPL2.1}"
|
||||
}
|
||||
}
|
||||
externalNativeBuild {
|
||||
cmake {
|
||||
path = file("src/main/cpp/CMakeLists.txt")
|
||||
version = "3.22.1"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
dependencies {
|
||||
|
||||
implementation("androidx.core:core-ktx:1.12.0")
|
||||
implementation("androidx.lifecycle:lifecycle-runtime-ktx:2.6.2")
|
||||
implementation("androidx.activity:activity-compose:1.8.2")
|
||||
implementation(platform("androidx.compose:compose-bom:2023.08.00"))
|
||||
implementation("androidx.compose.ui:ui")
|
||||
implementation("androidx.compose.ui:ui-graphics")
|
||||
implementation("androidx.compose.ui:ui-tooling-preview")
|
||||
implementation("androidx.compose.material3:material3")
|
||||
testImplementation("junit:junit:4.13.2")
|
||||
androidTestImplementation("androidx.test.ext:junit:1.1.5")
|
||||
androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
|
||||
androidTestImplementation(platform("androidx.compose:compose-bom:2023.08.00"))
|
||||
androidTestImplementation("androidx.compose.ui:ui-test-junit4")
|
||||
debugImplementation("androidx.compose.ui:ui-tooling")
|
||||
debugImplementation("androidx.compose.ui:ui-test-manifest")
|
||||
}
|
||||
21
examples/llama.android/app/proguard-rules.pro
vendored
Normal file
@@ -0,0 +1,21 @@
|
||||
# Add project specific ProGuard rules here.
|
||||
# You can control the set of applied configuration files using the
|
||||
# proguardFiles setting in build.gradle.
|
||||
#
|
||||
# For more details, see
|
||||
# http://developer.android.com/guide/developing/tools/proguard.html
|
||||
|
||||
# If your project uses WebView with JS, uncomment the following
|
||||
# and specify the fully qualified class name to the JavaScript interface
|
||||
# class:
|
||||
#-keepclassmembers class fqcn.of.javascript.interface.for.webview {
|
||||
# public *;
|
||||
#}
|
||||
|
||||
# Uncomment this to preserve the line number information for
|
||||
# debugging stack traces.
|
||||
#-keepattributes SourceFile,LineNumberTable
|
||||
|
||||
# If you keep the line number information, uncomment this to
|
||||
# hide the original source file name.
|
||||
#-renamesourcefileattribute SourceFile
|
||||
30
examples/llama.android/app/src/main/AndroidManifest.xml
Normal file
@@ -0,0 +1,30 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<manifest xmlns:android="http://schemas.android.com/apk/res/android"
|
||||
xmlns:tools="http://schemas.android.com/tools">
|
||||
|
||||
<uses-permission android:name="android.permission.INTERNET" />
|
||||
|
||||
<application
|
||||
android:allowBackup="true"
|
||||
android:dataExtractionRules="@xml/data_extraction_rules"
|
||||
android:fullBackupContent="@xml/backup_rules"
|
||||
android:icon="@mipmap/ic_launcher"
|
||||
android:label="@string/app_name"
|
||||
android:roundIcon="@mipmap/ic_launcher_round"
|
||||
android:supportsRtl="true"
|
||||
android:theme="@style/Theme.LlamaAndroid"
|
||||
>
|
||||
|
||||
<activity
|
||||
android:name=".MainActivity"
|
||||
android:exported="true"
|
||||
android:theme="@style/Theme.LlamaAndroid">
|
||||
<intent-filter>
|
||||
<action android:name="android.intent.action.MAIN" />
|
||||
|
||||
<category android:name="android.intent.category.LAUNCHER" />
|
||||
</intent-filter>
|
||||
</activity>
|
||||
</application>
|
||||
|
||||
</manifest>
|
||||
50
examples/llama.android/app/src/main/cpp/CMakeLists.txt
Normal file
@@ -0,0 +1,50 @@
|
||||
|
||||
# For more information about using CMake with Android Studio, read the
|
||||
# documentation: https://d.android.com/studio/projects/add-native-code.html.
|
||||
# For more examples on how to use CMake, see https://github.com/android/ndk-samples.
|
||||
|
||||
# Sets the minimum CMake version required for this project.
|
||||
cmake_minimum_required(VERSION 3.22.1)
|
||||
|
||||
# Declares the project name. The project name can be accessed via ${ PROJECT_NAME},
|
||||
# Since this is the top level CMakeLists.txt, the project name is also accessible
|
||||
# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level
|
||||
# build script scope).
|
||||
project("llama-android")
|
||||
|
||||
include(FetchContent)
|
||||
FetchContent_Declare(
|
||||
llama
|
||||
GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
|
||||
GIT_TAG master
|
||||
)
|
||||
|
||||
# Also provides "common"
|
||||
FetchContent_MakeAvailable(llama)
|
||||
|
||||
# Creates and names a library, sets it as either STATIC
|
||||
# or SHARED, and provides the relative paths to its source code.
|
||||
# You can define multiple libraries, and CMake builds them for you.
|
||||
# Gradle automatically packages shared libraries with your APK.
|
||||
#
|
||||
# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define
|
||||
# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
|
||||
# is preferred for the same purpose.
|
||||
#
|
||||
# In order to load a library into your app from Java/Kotlin, you must call
|
||||
# System.loadLibrary() and pass the name of the library defined here;
|
||||
# for GameActivity/NativeActivity derived applications, the same library name must be
|
||||
# used in the AndroidManifest.xml file.
|
||||
add_library(${CMAKE_PROJECT_NAME} SHARED
|
||||
# List C/C++ source files with relative paths to this CMakeLists.txt.
|
||||
llama-android.cpp)
|
||||
|
||||
# Specifies libraries CMake should link to your target library. You
|
||||
# can link libraries from various origins, such as libraries defined in this
|
||||
# build script, prebuilt third-party libraries, or Android system libraries.
|
||||
target_link_libraries(${CMAKE_PROJECT_NAME}
|
||||
# List libraries link to the target library
|
||||
llama
|
||||
common
|
||||
android
|
||||
log)
|
||||
394
examples/llama.android/app/src/main/cpp/llama-android.cpp
Normal file
@@ -0,0 +1,394 @@
|
||||
#include <android/log.h>
|
||||
#include <jni.h>
|
||||
#include <iomanip>
|
||||
#include <math.h>
|
||||
#include <string>
|
||||
#include <unistd.h>
|
||||
#include "llama.h"
|
||||
#include "common/common.h"
|
||||
|
||||
// Write C++ code here.
|
||||
//
|
||||
// Do not forget to dynamically load the C++ library into your application.
|
||||
//
|
||||
// For instance,
|
||||
//
|
||||
// In MainActivity.java:
|
||||
// static {
|
||||
// System.loadLibrary("llama-android");
|
||||
// }
|
||||
//
|
||||
// Or, in MainActivity.kt:
|
||||
// companion object {
|
||||
// init {
|
||||
// System.loadLibrary("llama-android")
|
||||
// }
|
||||
// }
|
||||
|
||||
#define TAG "llama-android.cpp"
|
||||
#define LOGi(...) __android_log_print(ANDROID_LOG_INFO, TAG, __VA_ARGS__)
|
||||
#define LOGe(...) __android_log_print(ANDROID_LOG_ERROR, TAG, __VA_ARGS__)
|
||||
|
||||
jclass la_int_var;
|
||||
jmethodID la_int_var_value;
|
||||
jmethodID la_int_var_inc;
|
||||
|
||||
static void log_callback(ggml_log_level level, const char * fmt, void * data) {
|
||||
if (level == GGML_LOG_LEVEL_ERROR) __android_log_print(ANDROID_LOG_ERROR, TAG, fmt, data);
|
||||
else if (level == GGML_LOG_LEVEL_INFO) __android_log_print(ANDROID_LOG_INFO, TAG, fmt, data);
|
||||
else if (level == GGML_LOG_LEVEL_WARN) __android_log_print(ANDROID_LOG_WARN, TAG, fmt, data);
|
||||
else __android_log_print(ANDROID_LOG_DEFAULT, TAG, fmt, data);
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT jlong JNICALL
|
||||
Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) {
|
||||
llama_model_params model_params = llama_model_default_params();
|
||||
|
||||
auto path_to_model = env->GetStringUTFChars(filename, 0);
|
||||
LOGi("Loading model from %s", path_to_model);
|
||||
|
||||
auto model = llama_load_model_from_file(path_to_model, model_params);
|
||||
env->ReleaseStringUTFChars(filename, path_to_model);
|
||||
|
||||
if (!model) {
|
||||
LOGe("load_model() failed");
|
||||
env->ThrowNew(env->FindClass("java/lang/IllegalStateException"), "load_model() failed");
|
||||
return 0;
|
||||
}
|
||||
|
||||
return reinterpret_cast<jlong>(model);
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_com_example_llama_Llm_free_1model(JNIEnv *, jobject, jlong model) {
|
||||
llama_free_model(reinterpret_cast<llama_model *>(model));
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT jlong JNICALL
|
||||
Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) {
|
||||
auto model = reinterpret_cast<llama_model *>(jmodel);
|
||||
|
||||
if (!model) {
|
||||
LOGe("new_context(): model cannot be null");
|
||||
env->ThrowNew(env->FindClass("java/lang/IllegalArgumentException"), "Model cannot be null");
|
||||
return 0;
|
||||
}
|
||||
|
||||
int n_threads = std::max(1, std::min(8, (int) sysconf(_SC_NPROCESSORS_ONLN) - 2));
|
||||
LOGi("Using %d threads", n_threads);
|
||||
|
||||
llama_context_params ctx_params = llama_context_default_params();
|
||||
ctx_params.seed = 1234;
|
||||
ctx_params.n_ctx = 2048;
|
||||
ctx_params.n_threads = n_threads;
|
||||
ctx_params.n_threads_batch = n_threads;
|
||||
|
||||
llama_context * context = llama_new_context_with_model(model, ctx_params);
|
||||
|
||||
if (!context) {
|
||||
LOGe("llama_new_context_with_model() returned null)");
|
||||
env->ThrowNew(env->FindClass("java/lang/IllegalStateException"),
|
||||
"llama_new_context_with_model() returned null)");
|
||||
return 0;
|
||||
}
|
||||
|
||||
return reinterpret_cast<jlong>(context);
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_com_example_llama_Llm_free_1context(JNIEnv *, jobject, jlong context) {
|
||||
llama_free(reinterpret_cast<llama_context *>(context));
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_com_example_llama_Llm_backend_1free(JNIEnv *, jobject) {
|
||||
llama_backend_free();
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_com_example_llama_Llm_log_1to_1android(JNIEnv *, jobject) {
|
||||
llama_log_set(log_callback, NULL);
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT jstring JNICALL
|
||||
Java_com_example_llama_Llm_bench_1model(
|
||||
JNIEnv *env,
|
||||
jobject,
|
||||
jlong context_pointer,
|
||||
jlong model_pointer,
|
||||
jlong batch_pointer,
|
||||
jint pp,
|
||||
jint tg,
|
||||
jint pl,
|
||||
jint nr
|
||||
) {
|
||||
auto pp_avg = 0.0;
|
||||
auto tg_avg = 0.0;
|
||||
auto pp_std = 0.0;
|
||||
auto tg_std = 0.0;
|
||||
|
||||
const auto context = reinterpret_cast<llama_context *>(context_pointer);
|
||||
const auto model = reinterpret_cast<llama_model *>(model_pointer);
|
||||
const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
|
||||
|
||||
const int n_ctx = llama_n_ctx(context);
|
||||
|
||||
LOGi("n_ctx = %d", n_ctx);
|
||||
|
||||
int i, j;
|
||||
int nri;
|
||||
for (nri = 0; nri < nr; nri++) {
|
||||
LOGi("Benchmark prompt processing (pp)");
|
||||
|
||||
llama_batch_clear(*batch);
|
||||
|
||||
const int n_tokens = pp;
|
||||
for (i = 0; i < n_tokens; i++) {
|
||||
llama_batch_add(*batch, 0, i, { 0 }, false);
|
||||
}
|
||||
|
||||
batch->logits[batch->n_tokens - 1] = true;
|
||||
llama_kv_cache_clear(context);
|
||||
|
||||
const auto t_pp_start = ggml_time_us();
|
||||
if (llama_decode(context, *batch) != 0) {
|
||||
LOGi("llama_decode() failed during prompt processing");
|
||||
}
|
||||
const auto t_pp_end = ggml_time_us();
|
||||
|
||||
// bench text generation
|
||||
|
||||
LOGi("Benchmark text generation (tg)");
|
||||
|
||||
llama_kv_cache_clear(context);
|
||||
const auto t_tg_start = ggml_time_us();
|
||||
for (i = 0; i < tg; i++) {
|
||||
|
||||
llama_batch_clear(*batch);
|
||||
for (j = 0; j < pl; j++) {
|
||||
llama_batch_add(*batch, 0, i, { j }, true);
|
||||
}
|
||||
|
||||
LOGi("llama_decode() text generation: %d", i);
|
||||
if (llama_decode(context, *batch) != 0) {
|
||||
LOGi("llama_decode() failed during text generation");
|
||||
}
|
||||
}
|
||||
|
||||
const auto t_tg_end = ggml_time_us();
|
||||
|
||||
llama_kv_cache_clear(context);
|
||||
|
||||
const auto t_pp = double(t_pp_end - t_pp_start) / 1000000.0;
|
||||
const auto t_tg = double(t_tg_end - t_tg_start) / 1000000.0;
|
||||
|
||||
const auto speed_pp = double(pp) / t_pp;
|
||||
const auto speed_tg = double(pl * tg) / t_tg;
|
||||
|
||||
pp_avg += speed_pp;
|
||||
tg_avg += speed_tg;
|
||||
|
||||
pp_std += speed_pp * speed_pp;
|
||||
tg_std += speed_tg * speed_tg;
|
||||
|
||||
LOGi("pp %f t/s, tg %f t/s", speed_pp, speed_tg);
|
||||
}
|
||||
|
||||
pp_avg /= double(nr);
|
||||
tg_avg /= double(nr);
|
||||
|
||||
if (nr > 1) {
|
||||
pp_std = sqrt(pp_std / double(nr - 1) - pp_avg * pp_avg * double(nr) / double(nr - 1));
|
||||
tg_std = sqrt(tg_std / double(nr - 1) - tg_avg * tg_avg * double(nr) / double(nr - 1));
|
||||
} else {
|
||||
pp_std = 0;
|
||||
tg_std = 0;
|
||||
}
|
||||
|
||||
char model_desc[128];
|
||||
llama_model_desc(model, model_desc, sizeof(model_desc));
|
||||
|
||||
const auto model_size = double(llama_model_size(model)) / 1024.0 / 1024.0 / 1024.0;
|
||||
const auto model_n_params = double(llama_model_n_params(model)) / 1e9;
|
||||
|
||||
const auto backend = "(Android)"; // TODO: What should this be?
|
||||
|
||||
std::stringstream result;
|
||||
result << std::setprecision(2);
|
||||
result << "| model | size | params | backend | test | t/s |\n";
|
||||
result << "| --- | --- | --- | --- | --- | --- |\n";
|
||||
result << "| " << model_desc << " | " << model_size << "GiB | " << model_n_params << "B | " << backend << " | pp " << pp << " | " << pp_avg << " ± " << pp_std << " |\n";
|
||||
result << "| " << model_desc << " | " << model_size << "GiB | " << model_n_params << "B | " << backend << " | tg " << tg << " | " << tg_avg << " ± " << tg_std << " |\n";
|
||||
|
||||
return env->NewStringUTF(result.str().c_str());
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_com_example_llama_Llm_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
|
||||
llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT jlong JNICALL
|
||||
Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
|
||||
|
||||
// Source: Copy of llama.cpp:llama_batch_init but heap-allocated.
|
||||
|
||||
llama_batch *batch = new llama_batch {
|
||||
0,
|
||||
nullptr,
|
||||
nullptr,
|
||||
nullptr,
|
||||
nullptr,
|
||||
nullptr,
|
||||
nullptr,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
};
|
||||
|
||||
if (embd) {
|
||||
batch->embd = (float *) malloc(sizeof(float) * n_tokens * embd);
|
||||
} else {
|
||||
batch->token = (llama_token *) malloc(sizeof(llama_token) * n_tokens);
|
||||
}
|
||||
|
||||
batch->pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens);
|
||||
batch->n_seq_id = (int32_t *) malloc(sizeof(int32_t) * n_tokens);
|
||||
batch->seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * n_tokens);
|
||||
for (int i = 0; i < n_tokens; ++i) {
|
||||
batch->seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max);
|
||||
}
|
||||
batch->logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens);
|
||||
|
||||
return reinterpret_cast<jlong>(batch);
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject, jboolean numa) {
|
||||
llama_backend_init(numa);
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT jstring JNICALL
|
||||
Java_com_example_llama_Llm_system_1info(JNIEnv *env, jobject) {
|
||||
return env->NewStringUTF(llama_print_system_info());
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT jint JNICALL
|
||||
Java_com_example_llama_Llm_completion_1init(
|
||||
JNIEnv *env,
|
||||
jobject,
|
||||
jlong context_pointer,
|
||||
jlong batch_pointer,
|
||||
jstring jtext,
|
||||
jint n_len
|
||||
) {
|
||||
|
||||
const auto text = env->GetStringUTFChars(jtext, 0);
|
||||
const auto context = reinterpret_cast<llama_context *>(context_pointer);
|
||||
const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
|
||||
|
||||
const auto tokens_list = llama_tokenize(context, text, 1);
|
||||
|
||||
auto n_ctx = llama_n_ctx(context);
|
||||
auto n_kv_req = tokens_list.size() + (n_len - tokens_list.size());
|
||||
|
||||
LOGi("n_len = %d, n_ctx = %d, n_kv_req = %d", n_len, n_ctx, n_kv_req);
|
||||
|
||||
if (n_kv_req > n_ctx) {
|
||||
LOGe("error: n_kv_req > n_ctx, the required KV cache size is not big enough");
|
||||
}
|
||||
|
||||
for (auto id : tokens_list) {
|
||||
LOGi("%s", llama_token_to_piece(context, id).c_str());
|
||||
}
|
||||
|
||||
llama_batch_clear(*batch);
|
||||
|
||||
// evaluate the initial prompt
|
||||
for (auto i = 0; i < tokens_list.size(); i++) {
|
||||
llama_batch_add(*batch, tokens_list[i], i, { 0 }, false);
|
||||
}
|
||||
|
||||
// llama_decode will output logits only for the last token of the prompt
|
||||
batch->logits[batch->n_tokens - 1] = true;
|
||||
|
||||
if (llama_decode(context, *batch) != 0) {
|
||||
LOGe("llama_decode() failed");
|
||||
}
|
||||
|
||||
env->ReleaseStringUTFChars(jtext, text);
|
||||
|
||||
return batch->n_tokens;
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT jstring JNICALL
|
||||
Java_com_example_llama_Llm_completion_1loop(
|
||||
JNIEnv * env,
|
||||
jobject,
|
||||
jlong context_pointer,
|
||||
jlong batch_pointer,
|
||||
jint n_len,
|
||||
jobject intvar_ncur
|
||||
) {
|
||||
const auto context = reinterpret_cast<llama_context *>(context_pointer);
|
||||
const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
|
||||
const auto model = llama_get_model(context);
|
||||
|
||||
if (!la_int_var) la_int_var = env->GetObjectClass(intvar_ncur);
|
||||
if (!la_int_var_value) la_int_var_value = env->GetMethodID(la_int_var, "getValue", "()I");
|
||||
if (!la_int_var_inc) la_int_var_inc = env->GetMethodID(la_int_var, "inc", "()V");
|
||||
|
||||
auto n_vocab = llama_n_vocab(model);
|
||||
auto logits = llama_get_logits_ith(context, batch->n_tokens - 1);
|
||||
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||
candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
|
||||
}
|
||||
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
|
||||
// sample the most likely token
|
||||
const auto new_token_id = llama_sample_token_greedy(context, &candidates_p);
|
||||
|
||||
const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
|
||||
if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
|
||||
return env->NewStringUTF("");
|
||||
}
|
||||
|
||||
auto new_token_chars = llama_token_to_piece(context, new_token_id);
|
||||
LOGi("new_token_chars: `%s`", new_token_chars.c_str());
|
||||
auto new_token = env->NewStringUTF(new_token_chars.c_str());
|
||||
|
||||
llama_batch_clear(*batch);
|
||||
llama_batch_add(*batch, new_token_id, n_cur, { 0 }, true);
|
||||
|
||||
env->CallVoidMethod(intvar_ncur, la_int_var_inc);
|
||||
|
||||
if (llama_decode(context, *batch) != 0) {
|
||||
LOGe("llama_decode() returned null");
|
||||
}
|
||||
|
||||
return new_token;
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_com_example_llama_Llm_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
|
||||
llama_kv_cache_clear(reinterpret_cast<llama_context *>(context));
|
||||
}
|
||||
@@ -0,0 +1,119 @@
|
||||
package com.example.llama
|
||||
|
||||
import android.app.DownloadManager
|
||||
import android.net.Uri
|
||||
import android.util.Log
|
||||
import androidx.compose.material3.Button
|
||||
import androidx.compose.material3.Text
|
||||
import androidx.compose.runtime.Composable
|
||||
import androidx.compose.runtime.getValue
|
||||
import androidx.compose.runtime.mutableDoubleStateOf
|
||||
import androidx.compose.runtime.mutableStateOf
|
||||
import androidx.compose.runtime.remember
|
||||
import androidx.compose.runtime.rememberCoroutineScope
|
||||
import androidx.compose.runtime.setValue
|
||||
import androidx.core.database.getLongOrNull
|
||||
import androidx.core.net.toUri
|
||||
import kotlinx.coroutines.delay
|
||||
import kotlinx.coroutines.launch
|
||||
import java.io.File
|
||||
|
||||
data class Downloadable(val name: String, val source: Uri, val destination: File) {
|
||||
companion object {
|
||||
@JvmStatic
|
||||
private val tag: String? = this::class.qualifiedName
|
||||
|
||||
sealed interface State
|
||||
data object Ready: State
|
||||
data class Downloading(val id: Long): State
|
||||
data class Downloaded(val downloadable: Downloadable): State
|
||||
data class Error(val message: String): State
|
||||
|
||||
@JvmStatic
|
||||
@Composable
|
||||
fun Button(viewModel: MainViewModel, dm: DownloadManager, item: Downloadable) {
|
||||
var status: State by remember {
|
||||
mutableStateOf(
|
||||
if (item.destination.exists()) Downloaded(item)
|
||||
else Ready
|
||||
)
|
||||
}
|
||||
var progress by remember { mutableDoubleStateOf(0.0) }
|
||||
|
||||
val coroutineScope = rememberCoroutineScope()
|
||||
|
||||
suspend fun waitForDownload(result: Downloading, item: Downloadable): State {
|
||||
while (true) {
|
||||
val cursor = dm.query(DownloadManager.Query().setFilterById(result.id))
|
||||
|
||||
if (cursor == null) {
|
||||
Log.e(tag, "dm.query() returned null")
|
||||
return Error("dm.query() returned null")
|
||||
}
|
||||
|
||||
if (!cursor.moveToFirst() || cursor.count < 1) {
|
||||
cursor.close()
|
||||
Log.i(tag, "cursor.moveToFirst() returned false or cursor.count < 1, download canceled?")
|
||||
return Ready
|
||||
}
|
||||
|
||||
val pix = cursor.getColumnIndex(DownloadManager.COLUMN_BYTES_DOWNLOADED_SO_FAR)
|
||||
val tix = cursor.getColumnIndex(DownloadManager.COLUMN_TOTAL_SIZE_BYTES)
|
||||
val sofar = cursor.getLongOrNull(pix) ?: 0
|
||||
val total = cursor.getLongOrNull(tix) ?: 1
|
||||
cursor.close()
|
||||
|
||||
if (sofar == total) {
|
||||
return Downloaded(item)
|
||||
}
|
||||
|
||||
progress = (sofar * 1.0) / total
|
||||
|
||||
delay(1000L)
|
||||
}
|
||||
}
|
||||
|
||||
fun onClick() {
|
||||
when (val s = status) {
|
||||
is Downloaded -> {
|
||||
viewModel.load(item.destination.path)
|
||||
}
|
||||
|
||||
is Downloading -> {
|
||||
coroutineScope.launch {
|
||||
status = waitForDownload(s, item)
|
||||
}
|
||||
}
|
||||
|
||||
else -> {
|
||||
item.destination.delete()
|
||||
|
||||
val request = DownloadManager.Request(item.source).apply {
|
||||
setTitle("Downloading model")
|
||||
setDescription("Downloading model: ${item.name}")
|
||||
setAllowedNetworkTypes(DownloadManager.Request.NETWORK_WIFI)
|
||||
setDestinationUri(item.destination.toUri())
|
||||
}
|
||||
|
||||
viewModel.log("Saving ${item.name} to ${item.destination.path}")
|
||||
Log.i(tag, "Saving ${item.name} to ${item.destination.path}")
|
||||
|
||||
val id = dm.enqueue(request)
|
||||
status = Downloading(id)
|
||||
onClick()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Button(onClick = { onClick() }, enabled = status !is Downloading) {
|
||||
when (status) {
|
||||
is Downloading -> Text(text = "Downloading ${(progress * 100).toInt()}%")
|
||||
is Downloaded -> Text("Load ${item.name}")
|
||||
is Ready -> Text("Download ${item.name}")
|
||||
is Error -> Text("Download ${item.name}")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,172 @@
|
||||
package com.example.llama
|
||||
|
||||
import android.util.Log
|
||||
import kotlinx.coroutines.CoroutineDispatcher
|
||||
import kotlinx.coroutines.asCoroutineDispatcher
|
||||
import kotlinx.coroutines.flow.Flow
|
||||
import kotlinx.coroutines.flow.flow
|
||||
import kotlinx.coroutines.flow.flowOn
|
||||
import kotlinx.coroutines.withContext
|
||||
import java.util.concurrent.Executors
|
||||
import kotlin.concurrent.thread
|
||||
|
||||
class Llm {
|
||||
private val tag: String? = this::class.simpleName
|
||||
|
||||
private val threadLocalState: ThreadLocal<State> = ThreadLocal.withInitial { State.Idle }
|
||||
|
||||
private val runLoop: CoroutineDispatcher = Executors.newSingleThreadExecutor {
|
||||
thread(start = false, name = "Llm-RunLoop") {
|
||||
Log.d(tag, "Dedicated thread for native code: ${Thread.currentThread().name}")
|
||||
|
||||
// No-op if called more than once.
|
||||
System.loadLibrary("llama-android")
|
||||
|
||||
// Set llama log handler to Android
|
||||
log_to_android()
|
||||
backend_init(false)
|
||||
|
||||
Log.d(tag, system_info())
|
||||
|
||||
it.run()
|
||||
}.apply {
|
||||
uncaughtExceptionHandler = Thread.UncaughtExceptionHandler { _, exception: Throwable ->
|
||||
Log.e(tag, "Unhandled exception", exception)
|
||||
}
|
||||
}
|
||||
}.asCoroutineDispatcher()
|
||||
|
||||
private val nlen: Int = 64
|
||||
|
||||
private external fun log_to_android()
|
||||
private external fun load_model(filename: String): Long
|
||||
private external fun free_model(model: Long)
|
||||
private external fun new_context(model: Long): Long
|
||||
private external fun free_context(context: Long)
|
||||
private external fun backend_init(numa: Boolean)
|
||||
private external fun backend_free()
|
||||
private external fun free_batch(batch: Long)
|
||||
private external fun new_batch(nTokens: Int, embd: Int, nSeqMax: Int): Long
|
||||
private external fun bench_model(
|
||||
context: Long,
|
||||
model: Long,
|
||||
batch: Long,
|
||||
pp: Int,
|
||||
tg: Int,
|
||||
pl: Int,
|
||||
nr: Int
|
||||
): String
|
||||
|
||||
private external fun system_info(): String
|
||||
|
||||
private external fun completion_init(
|
||||
context: Long,
|
||||
batch: Long,
|
||||
text: String,
|
||||
nLen: Int
|
||||
): Int
|
||||
|
||||
private external fun completion_loop(
|
||||
context: Long,
|
||||
batch: Long,
|
||||
nLen: Int,
|
||||
ncur: IntVar
|
||||
): String
|
||||
|
||||
private external fun kv_cache_clear(context: Long)
|
||||
|
||||
suspend fun bench(pp: Int, tg: Int, pl: Int, nr: Int = 1): String {
|
||||
return withContext(runLoop) {
|
||||
when (val state = threadLocalState.get()) {
|
||||
is State.Loaded -> {
|
||||
Log.d(tag, "bench(): $state")
|
||||
bench_model(state.context, state.model, state.batch, pp, tg, pl, nr)
|
||||
}
|
||||
|
||||
else -> throw IllegalStateException("No model loaded")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
suspend fun load(pathToModel: String) {
|
||||
withContext(runLoop) {
|
||||
when (threadLocalState.get()) {
|
||||
is State.Idle -> {
|
||||
val model = load_model(pathToModel)
|
||||
if (model == 0L) throw IllegalStateException("load_model() failed")
|
||||
|
||||
val context = new_context(model)
|
||||
if (context == 0L) throw IllegalStateException("new_context() failed")
|
||||
|
||||
val batch = new_batch(512, 0, 1)
|
||||
if (batch == 0L) throw IllegalStateException("new_batch() failed")
|
||||
|
||||
Log.i(tag, "Loaded model $pathToModel")
|
||||
threadLocalState.set(State.Loaded(model, context, batch))
|
||||
}
|
||||
else -> throw IllegalStateException("Model already loaded")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fun send(message: String): Flow<String> = flow {
|
||||
when (val state = threadLocalState.get()) {
|
||||
is State.Loaded -> {
|
||||
val ncur = IntVar(completion_init(state.context, state.batch, message, nlen))
|
||||
while (ncur.value <= nlen) {
|
||||
val str = completion_loop(state.context, state.batch, nlen, ncur)
|
||||
if (str.isEmpty()) {
|
||||
break
|
||||
}
|
||||
emit(str)
|
||||
}
|
||||
kv_cache_clear(state.context)
|
||||
}
|
||||
else -> {}
|
||||
}
|
||||
}.flowOn(runLoop)
|
||||
|
||||
/**
|
||||
* Unloads the model and frees resources.
|
||||
*
|
||||
* This is a no-op if there's no model loaded.
|
||||
*/
|
||||
suspend fun unload() {
|
||||
withContext(runLoop) {
|
||||
when (val state = threadLocalState.get()) {
|
||||
is State.Loaded -> {
|
||||
free_context(state.context)
|
||||
free_model(state.model)
|
||||
free_batch(state.batch)
|
||||
|
||||
threadLocalState.set(State.Idle)
|
||||
}
|
||||
else -> {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
companion object {
|
||||
private class IntVar(value: Int) {
|
||||
@Volatile
|
||||
var value: Int = value
|
||||
private set
|
||||
|
||||
fun inc() {
|
||||
synchronized(this) {
|
||||
value += 1
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private sealed interface State {
|
||||
data object Idle: State
|
||||
data class Loaded(val model: Long, val context: Long, val batch: Long): State
|
||||
}
|
||||
|
||||
// Enforce only one instance of Llm.
|
||||
private val _instance: Llm = Llm()
|
||||
|
||||
fun instance(): Llm = _instance
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,154 @@
|
||||
package com.example.llama
|
||||
|
||||
import android.app.ActivityManager
|
||||
import android.app.DownloadManager
|
||||
import android.content.ClipData
|
||||
import android.content.ClipboardManager
|
||||
import android.net.Uri
|
||||
import android.os.Bundle
|
||||
import android.os.StrictMode
|
||||
import android.os.StrictMode.VmPolicy
|
||||
import android.text.format.Formatter
|
||||
import androidx.activity.ComponentActivity
|
||||
import androidx.activity.compose.setContent
|
||||
import androidx.activity.viewModels
|
||||
import androidx.compose.foundation.layout.Box
|
||||
import androidx.compose.foundation.layout.Column
|
||||
import androidx.compose.foundation.layout.Row
|
||||
import androidx.compose.foundation.layout.fillMaxSize
|
||||
import androidx.compose.foundation.layout.padding
|
||||
import androidx.compose.foundation.lazy.LazyColumn
|
||||
import androidx.compose.foundation.lazy.items
|
||||
import androidx.compose.foundation.lazy.rememberLazyListState
|
||||
import androidx.compose.material3.Button
|
||||
import androidx.compose.material3.LocalContentColor
|
||||
import androidx.compose.material3.MaterialTheme
|
||||
import androidx.compose.material3.OutlinedTextField
|
||||
import androidx.compose.material3.Surface
|
||||
import androidx.compose.material3.Text
|
||||
import androidx.compose.runtime.Composable
|
||||
import androidx.compose.ui.Modifier
|
||||
import androidx.compose.ui.unit.dp
|
||||
import androidx.core.content.getSystemService
|
||||
import com.example.llama.ui.theme.LlamaAndroidTheme
|
||||
import java.io.File
|
||||
|
||||
class MainActivity(
|
||||
activityManager: ActivityManager? = null,
|
||||
downloadManager: DownloadManager? = null,
|
||||
clipboardManager: ClipboardManager? = null,
|
||||
): ComponentActivity() {
|
||||
private val tag: String? = this::class.simpleName
|
||||
|
||||
private val activityManager by lazy { activityManager ?: getSystemService<ActivityManager>()!! }
|
||||
private val downloadManager by lazy { downloadManager ?: getSystemService<DownloadManager>()!! }
|
||||
private val clipboardManager by lazy { clipboardManager ?: getSystemService<ClipboardManager>()!! }
|
||||
|
||||
private val viewModel: MainViewModel by viewModels()
|
||||
|
||||
// Get a MemoryInfo object for the device's current memory status.
|
||||
private fun availableMemory(): ActivityManager.MemoryInfo {
|
||||
return ActivityManager.MemoryInfo().also { memoryInfo ->
|
||||
activityManager.getMemoryInfo(memoryInfo)
|
||||
}
|
||||
}
|
||||
|
||||
override fun onCreate(savedInstanceState: Bundle?) {
|
||||
super.onCreate(savedInstanceState)
|
||||
|
||||
StrictMode.setVmPolicy(
|
||||
VmPolicy.Builder(StrictMode.getVmPolicy())
|
||||
.detectLeakedClosableObjects()
|
||||
.build()
|
||||
)
|
||||
|
||||
val free = Formatter.formatFileSize(this, availableMemory().availMem)
|
||||
val total = Formatter.formatFileSize(this, availableMemory().totalMem)
|
||||
|
||||
viewModel.log("Current memory: $free / $total")
|
||||
viewModel.log("Downloads directory: ${getExternalFilesDir(null)}")
|
||||
|
||||
val extFilesDir = getExternalFilesDir(null)
|
||||
|
||||
val models = listOf(
|
||||
Downloadable(
|
||||
"Phi-2 7B (Q4_0, 1.6 GiB)",
|
||||
Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true"),
|
||||
File(extFilesDir, "phi-2-q4_0.gguf"),
|
||||
),
|
||||
Downloadable(
|
||||
"TinyLlama 1.1B (f16, 2.2 GiB)",
|
||||
Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true"),
|
||||
File(extFilesDir, "tinyllama-1.1-f16.gguf"),
|
||||
),
|
||||
Downloadable(
|
||||
"Phi 2 DPO (Q3_K_M, 1.48 GiB)",
|
||||
Uri.parse("https://huggingface.co/TheBloke/phi-2-dpo-GGUF/resolve/main/phi-2-dpo.Q3_K_M.gguf?download=true"),
|
||||
File(extFilesDir, "phi-2-dpo.Q3_K_M.gguf")
|
||||
),
|
||||
)
|
||||
|
||||
setContent {
|
||||
LlamaAndroidTheme {
|
||||
// A surface container using the 'background' color from the theme
|
||||
Surface(
|
||||
modifier = Modifier.fillMaxSize(),
|
||||
color = MaterialTheme.colorScheme.background
|
||||
) {
|
||||
MainCompose(
|
||||
viewModel,
|
||||
clipboardManager,
|
||||
downloadManager,
|
||||
models,
|
||||
)
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Composable
|
||||
fun MainCompose(
|
||||
viewModel: MainViewModel,
|
||||
clipboard: ClipboardManager,
|
||||
dm: DownloadManager,
|
||||
models: List<Downloadable>
|
||||
) {
|
||||
Column {
|
||||
val scrollState = rememberLazyListState()
|
||||
|
||||
Box(modifier = Modifier.weight(1f)) {
|
||||
LazyColumn(state = scrollState) {
|
||||
items(viewModel.messages) {
|
||||
Text(
|
||||
it,
|
||||
style = MaterialTheme.typography.bodyLarge.copy(color = LocalContentColor.current),
|
||||
modifier = Modifier.padding(16.dp)
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
OutlinedTextField(
|
||||
value = viewModel.message,
|
||||
onValueChange = { viewModel.updateMessage(it) },
|
||||
label = { Text("Message") },
|
||||
)
|
||||
Row {
|
||||
Button({ viewModel.send() }) { Text("Send") }
|
||||
Button({ viewModel.bench(8, 4, 1) }) { Text("Bench") }
|
||||
Button({ viewModel.clear() }) { Text("Clear") }
|
||||
Button({
|
||||
viewModel.messages.joinToString("\n").let {
|
||||
clipboard.setPrimaryClip(ClipData.newPlainText("", it))
|
||||
}
|
||||
}) { Text("Copy") }
|
||||
}
|
||||
|
||||
Column {
|
||||
for (model in models) {
|
||||
Downloadable.Button(viewModel, dm, model)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,104 @@
|
||||
package com.example.llama
|
||||
|
||||
import android.util.Log
|
||||
import androidx.compose.runtime.getValue
|
||||
import androidx.compose.runtime.mutableStateOf
|
||||
import androidx.compose.runtime.setValue
|
||||
import androidx.lifecycle.ViewModel
|
||||
import androidx.lifecycle.viewModelScope
|
||||
import kotlinx.coroutines.flow.catch
|
||||
import kotlinx.coroutines.launch
|
||||
|
||||
class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
|
||||
companion object {
|
||||
@JvmStatic
|
||||
private val NanosPerSecond = 1_000_000_000.0
|
||||
}
|
||||
|
||||
private val tag: String? = this::class.simpleName
|
||||
|
||||
var messages by mutableStateOf(listOf("Initializing..."))
|
||||
private set
|
||||
|
||||
var message by mutableStateOf("")
|
||||
private set
|
||||
|
||||
override fun onCleared() {
|
||||
super.onCleared()
|
||||
|
||||
viewModelScope.launch {
|
||||
try {
|
||||
llm.unload()
|
||||
} catch (exc: IllegalStateException) {
|
||||
messages += exc.message!!
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fun send() {
|
||||
val text = message
|
||||
message = ""
|
||||
|
||||
// Add to messages console.
|
||||
messages += text
|
||||
messages += ""
|
||||
|
||||
viewModelScope.launch {
|
||||
llm.send(text)
|
||||
.catch {
|
||||
Log.e(tag, "send() failed", it)
|
||||
messages += it.message!!
|
||||
}
|
||||
.collect { messages = messages.dropLast(1) + (messages.last() + it) }
|
||||
}
|
||||
}
|
||||
|
||||
fun bench(pp: Int, tg: Int, pl: Int, nr: Int = 1) {
|
||||
viewModelScope.launch {
|
||||
try {
|
||||
val start = System.nanoTime()
|
||||
val warmupResult = llm.bench(pp, tg, pl, nr)
|
||||
val end = System.nanoTime()
|
||||
|
||||
messages += warmupResult
|
||||
|
||||
val warmup = (end - start).toDouble() / NanosPerSecond
|
||||
messages += "Warm up time: $warmup seconds, please wait..."
|
||||
|
||||
if (warmup > 5.0) {
|
||||
messages += "Warm up took too long, aborting benchmark"
|
||||
return@launch
|
||||
}
|
||||
|
||||
messages += llm.bench(512, 128, 1, 3)
|
||||
} catch (exc: IllegalStateException) {
|
||||
Log.e(tag, "bench() failed", exc)
|
||||
messages += exc.message!!
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fun load(pathToModel: String) {
|
||||
viewModelScope.launch {
|
||||
try {
|
||||
llm.load(pathToModel)
|
||||
messages += "Loaded $pathToModel"
|
||||
} catch (exc: IllegalStateException) {
|
||||
Log.e(tag, "load() failed", exc)
|
||||
messages += exc.message!!
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fun updateMessage(newMessage: String) {
|
||||
message = newMessage
|
||||
}
|
||||
|
||||
fun clear() {
|
||||
messages = listOf()
|
||||
}
|
||||
|
||||
fun log(message: String) {
|
||||
messages += message
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,11 @@
|
||||
package com.example.llama.ui.theme
|
||||
|
||||
import androidx.compose.ui.graphics.Color
|
||||
|
||||
val Purple80 = Color(0xFFD0BCFF)
|
||||
val PurpleGrey80 = Color(0xFFCCC2DC)
|
||||
val Pink80 = Color(0xFFEFB8C8)
|
||||
|
||||
val Purple40 = Color(0xFF6650a4)
|
||||
val PurpleGrey40 = Color(0xFF625b71)
|
||||
val Pink40 = Color(0xFF7D5260)
|
||||
@@ -0,0 +1,70 @@
|
||||
package com.example.llama.ui.theme
|
||||
|
||||
import android.app.Activity
|
||||
import android.os.Build
|
||||
import androidx.compose.foundation.isSystemInDarkTheme
|
||||
import androidx.compose.material3.MaterialTheme
|
||||
import androidx.compose.material3.darkColorScheme
|
||||
import androidx.compose.material3.dynamicDarkColorScheme
|
||||
import androidx.compose.material3.dynamicLightColorScheme
|
||||
import androidx.compose.material3.lightColorScheme
|
||||
import androidx.compose.runtime.Composable
|
||||
import androidx.compose.runtime.SideEffect
|
||||
import androidx.compose.ui.graphics.toArgb
|
||||
import androidx.compose.ui.platform.LocalContext
|
||||
import androidx.compose.ui.platform.LocalView
|
||||
import androidx.core.view.WindowCompat
|
||||
|
||||
private val DarkColorScheme = darkColorScheme(
|
||||
primary = Purple80,
|
||||
secondary = PurpleGrey80,
|
||||
tertiary = Pink80
|
||||
)
|
||||
|
||||
private val LightColorScheme = lightColorScheme(
|
||||
primary = Purple40,
|
||||
secondary = PurpleGrey40,
|
||||
tertiary = Pink40
|
||||
|
||||
/* Other default colors to override
|
||||
background = Color(0xFFFFFBFE),
|
||||
surface = Color(0xFFFFFBFE),
|
||||
onPrimary = Color.White,
|
||||
onSecondary = Color.White,
|
||||
onTertiary = Color.White,
|
||||
onBackground = Color(0xFF1C1B1F),
|
||||
onSurface = Color(0xFF1C1B1F),
|
||||
*/
|
||||
)
|
||||
|
||||
@Composable
|
||||
fun LlamaAndroidTheme(
|
||||
darkTheme: Boolean = isSystemInDarkTheme(),
|
||||
// Dynamic color is available on Android 12+
|
||||
dynamicColor: Boolean = true,
|
||||
content: @Composable () -> Unit
|
||||
) {
|
||||
val colorScheme = when {
|
||||
dynamicColor && Build.VERSION.SDK_INT >= Build.VERSION_CODES.S -> {
|
||||
val context = LocalContext.current
|
||||
if (darkTheme) dynamicDarkColorScheme(context) else dynamicLightColorScheme(context)
|
||||
}
|
||||
|
||||
darkTheme -> DarkColorScheme
|
||||
else -> LightColorScheme
|
||||
}
|
||||
val view = LocalView.current
|
||||
if (!view.isInEditMode) {
|
||||
SideEffect {
|
||||
val window = (view.context as Activity).window
|
||||
window.statusBarColor = colorScheme.primary.toArgb()
|
||||
WindowCompat.getInsetsController(window, view).isAppearanceLightStatusBars = darkTheme
|
||||
}
|
||||
}
|
||||
|
||||
MaterialTheme(
|
||||
colorScheme = colorScheme,
|
||||
typography = Typography,
|
||||
content = content
|
||||
)
|
||||
}
|
||||
@@ -0,0 +1,34 @@
|
||||
package com.example.llama.ui.theme
|
||||
|
||||
import androidx.compose.material3.Typography
|
||||
import androidx.compose.ui.text.TextStyle
|
||||
import androidx.compose.ui.text.font.FontFamily
|
||||
import androidx.compose.ui.text.font.FontWeight
|
||||
import androidx.compose.ui.unit.sp
|
||||
|
||||
// Set of Material typography styles to start with
|
||||
val Typography = Typography(
|
||||
bodyLarge = TextStyle(
|
||||
fontFamily = FontFamily.Default,
|
||||
fontWeight = FontWeight.Normal,
|
||||
fontSize = 16.sp,
|
||||
lineHeight = 24.sp,
|
||||
letterSpacing = 0.5.sp
|
||||
)
|
||||
/* Other default text styles to override
|
||||
titleLarge = TextStyle(
|
||||
fontFamily = FontFamily.Default,
|
||||
fontWeight = FontWeight.Normal,
|
||||
fontSize = 22.sp,
|
||||
lineHeight = 28.sp,
|
||||
letterSpacing = 0.sp
|
||||
),
|
||||
labelSmall = TextStyle(
|
||||
fontFamily = FontFamily.Default,
|
||||
fontWeight = FontWeight.Medium,
|
||||
fontSize = 11.sp,
|
||||
lineHeight = 16.sp,
|
||||
letterSpacing = 0.5.sp
|
||||
)
|
||||
*/
|
||||
)
|
||||
@@ -0,0 +1,170 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<vector xmlns:android="http://schemas.android.com/apk/res/android"
|
||||
android:width="108dp"
|
||||
android:height="108dp"
|
||||
android:viewportWidth="108"
|
||||
android:viewportHeight="108">
|
||||
<path
|
||||
android:fillColor="#3DDC84"
|
||||
android:pathData="M0,0h108v108h-108z" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M9,0L9,108"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M19,0L19,108"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M29,0L29,108"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M39,0L39,108"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M49,0L49,108"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M59,0L59,108"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M69,0L69,108"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M79,0L79,108"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M89,0L89,108"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M99,0L99,108"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M0,9L108,9"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M0,19L108,19"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M0,29L108,29"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M0,39L108,39"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M0,49L108,49"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M0,59L108,59"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M0,69L108,69"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M0,79L108,79"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M0,89L108,89"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M0,99L108,99"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M19,29L89,29"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M19,39L89,39"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M19,49L89,49"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M19,59L89,59"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M19,69L89,69"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M19,79L89,79"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M29,19L29,89"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M39,19L39,89"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M49,19L49,89"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M59,19L59,89"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M69,19L69,89"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M79,19L79,89"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
</vector>
|
||||
@@ -0,0 +1,30 @@
|
||||
<vector xmlns:android="http://schemas.android.com/apk/res/android"
|
||||
xmlns:aapt="http://schemas.android.com/aapt"
|
||||
android:width="108dp"
|
||||
android:height="108dp"
|
||||
android:viewportWidth="108"
|
||||
android:viewportHeight="108">
|
||||
<path android:pathData="M31,63.928c0,0 6.4,-11 12.1,-13.1c7.2,-2.6 26,-1.4 26,-1.4l38.1,38.1L107,108.928l-32,-1L31,63.928z">
|
||||
<aapt:attr name="android:fillColor">
|
||||
<gradient
|
||||
android:endX="85.84757"
|
||||
android:endY="92.4963"
|
||||
android:startX="42.9492"
|
||||
android:startY="49.59793"
|
||||
android:type="linear">
|
||||
<item
|
||||
android:color="#44000000"
|
||||
android:offset="0.0" />
|
||||
<item
|
||||
android:color="#00000000"
|
||||
android:offset="1.0" />
|
||||
</gradient>
|
||||
</aapt:attr>
|
||||
</path>
|
||||
<path
|
||||
android:fillColor="#FFFFFF"
|
||||
android:fillType="nonZero"
|
||||
android:pathData="M65.3,45.828l3.8,-6.6c0.2,-0.4 0.1,-0.9 -0.3,-1.1c-0.4,-0.2 -0.9,-0.1 -1.1,0.3l-3.9,6.7c-6.3,-2.8 -13.4,-2.8 -19.7,0l-3.9,-6.7c-0.2,-0.4 -0.7,-0.5 -1.1,-0.3C38.8,38.328 38.7,38.828 38.9,39.228l3.8,6.6C36.2,49.428 31.7,56.028 31,63.928h46C76.3,56.028 71.8,49.428 65.3,45.828zM43.4,57.328c-0.8,0 -1.5,-0.5 -1.8,-1.2c-0.3,-0.7 -0.1,-1.5 0.4,-2.1c0.5,-0.5 1.4,-0.7 2.1,-0.4c0.7,0.3 1.2,1 1.2,1.8C45.3,56.528 44.5,57.328 43.4,57.328L43.4,57.328zM64.6,57.328c-0.8,0 -1.5,-0.5 -1.8,-1.2s-0.1,-1.5 0.4,-2.1c0.5,-0.5 1.4,-0.7 2.1,-0.4c0.7,0.3 1.2,1 1.2,1.8C66.5,56.528 65.6,57.328 64.6,57.328L64.6,57.328z"
|
||||
android:strokeWidth="1"
|
||||
android:strokeColor="#00000000" />
|
||||
</vector>
|
||||
@@ -0,0 +1,6 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<adaptive-icon xmlns:android="http://schemas.android.com/apk/res/android">
|
||||
<background android:drawable="@drawable/ic_launcher_background" />
|
||||
<foreground android:drawable="@drawable/ic_launcher_foreground" />
|
||||
<monochrome android:drawable="@drawable/ic_launcher_foreground" />
|
||||
</adaptive-icon>
|
||||
@@ -0,0 +1,6 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<adaptive-icon xmlns:android="http://schemas.android.com/apk/res/android">
|
||||
<background android:drawable="@drawable/ic_launcher_background" />
|
||||
<foreground android:drawable="@drawable/ic_launcher_foreground" />
|
||||
<monochrome android:drawable="@drawable/ic_launcher_foreground" />
|
||||
</adaptive-icon>
|
||||
|
After Width: | Height: | Size: 1.4 KiB |
|
After Width: | Height: | Size: 2.8 KiB |
|
After Width: | Height: | Size: 982 B |
|
After Width: | Height: | Size: 1.7 KiB |
|
After Width: | Height: | Size: 1.9 KiB |
|
After Width: | Height: | Size: 3.8 KiB |
|
After Width: | Height: | Size: 2.8 KiB |
|
After Width: | Height: | Size: 5.8 KiB |
|
After Width: | Height: | Size: 3.8 KiB |
|
After Width: | Height: | Size: 7.6 KiB |
10
examples/llama.android/app/src/main/res/values/colors.xml
Normal file
@@ -0,0 +1,10 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<resources>
|
||||
<color name="purple_200">#FFBB86FC</color>
|
||||
<color name="purple_500">#FF6200EE</color>
|
||||
<color name="purple_700">#FF3700B3</color>
|
||||
<color name="teal_200">#FF03DAC5</color>
|
||||
<color name="teal_700">#FF018786</color>
|
||||
<color name="black">#FF000000</color>
|
||||
<color name="white">#FFFFFFFF</color>
|
||||
</resources>
|
||||
@@ -0,0 +1,3 @@
|
||||
<resources>
|
||||
<string name="app_name">LlamaAndroid</string>
|
||||
</resources>
|
||||
@@ -0,0 +1,5 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<resources>
|
||||
|
||||
<style name="Theme.LlamaAndroid" parent="android:Theme.Material.Light.NoActionBar" />
|
||||
</resources>
|
||||
13
examples/llama.android/app/src/main/res/xml/backup_rules.xml
Normal file
@@ -0,0 +1,13 @@
|
||||
<?xml version="1.0" encoding="utf-8"?><!--
|
||||
Sample backup rules file; uncomment and customize as necessary.
|
||||
See https://developer.android.com/guide/topics/data/autobackup
|
||||
for details.
|
||||
Note: This file is ignored for devices older that API 31
|
||||
See https://developer.android.com/about/versions/12/backup-restore
|
||||
-->
|
||||
<full-backup-content>
|
||||
<!--
|
||||
<include domain="sharedpref" path="."/>
|
||||
<exclude domain="sharedpref" path="device.xml"/>
|
||||
-->
|
||||
</full-backup-content>
|
||||
@@ -0,0 +1,19 @@
|
||||
<?xml version="1.0" encoding="utf-8"?><!--
|
||||
Sample data extraction rules file; uncomment and customize as necessary.
|
||||
See https://developer.android.com/about/versions/12/backup-restore#xml-changes
|
||||
for details.
|
||||
-->
|
||||
<data-extraction-rules>
|
||||
<cloud-backup>
|
||||
<!-- TODO: Use <include> and <exclude> to control what is backed up.
|
||||
<include .../>
|
||||
<exclude .../>
|
||||
-->
|
||||
</cloud-backup>
|
||||
<!--
|
||||
<device-transfer>
|
||||
<include .../>
|
||||
<exclude .../>
|
||||
</device-transfer>
|
||||
-->
|
||||
</data-extraction-rules>
|
||||
5
examples/llama.android/build.gradle.kts
Normal file
@@ -0,0 +1,5 @@
|
||||
// Top-level build file where you can add configuration options common to all sub-projects/modules.
|
||||
plugins {
|
||||
id("com.android.application") version "8.2.0" apply false
|
||||
id("org.jetbrains.kotlin.android") version "1.9.0" apply false
|
||||
}
|
||||
23
examples/llama.android/gradle.properties
Normal file
@@ -0,0 +1,23 @@
|
||||
# Project-wide Gradle settings.
|
||||
# IDE (e.g. Android Studio) users:
|
||||
# Gradle settings configured through the IDE *will override*
|
||||
# any settings specified in this file.
|
||||
# For more details on how to configure your build environment visit
|
||||
# http://www.gradle.org/docs/current/userguide/build_environment.html
|
||||
# Specifies the JVM arguments used for the daemon process.
|
||||
# The setting is particularly useful for tweaking memory settings.
|
||||
org.gradle.jvmargs=-Xmx2048m -Dfile.encoding=UTF-8
|
||||
# When configured, Gradle will run in incubating parallel mode.
|
||||
# This option should only be used with decoupled projects. More details, visit
|
||||
# http://www.gradle.org/docs/current/userguide/multi_project_builds.html#sec:decoupled_projects
|
||||
# org.gradle.parallel=true
|
||||
# AndroidX package structure to make it clearer which packages are bundled with the
|
||||
# Android operating system, and which are packaged with your app's APK
|
||||
# https://developer.android.com/topic/libraries/support-library/androidx-rn
|
||||
android.useAndroidX=true
|
||||
# Kotlin code style for this project: "official" or "obsolete":
|
||||
kotlin.code.style=official
|
||||
# Enables namespacing of each library's R class so that its R class includes only the
|
||||
# resources declared in the library itself and none from the library's dependencies,
|
||||
# thereby reducing the size of the R class for that library
|
||||
android.nonTransitiveRClass=true
|
||||
BIN
examples/llama.android/gradle/wrapper/gradle-wrapper.jar
vendored
Normal file
6
examples/llama.android/gradle/wrapper/gradle-wrapper.properties
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
#Thu Dec 21 14:31:09 AEDT 2023
|
||||
distributionBase=GRADLE_USER_HOME
|
||||
distributionPath=wrapper/dists
|
||||
distributionUrl=https\://services.gradle.org/distributions/gradle-8.2-bin.zip
|
||||
zipStoreBase=GRADLE_USER_HOME
|
||||
zipStorePath=wrapper/dists
|
||||
185
examples/llama.android/gradlew
vendored
Executable file
@@ -0,0 +1,185 @@
|
||||
#!/usr/bin/env sh
|
||||
|
||||
#
|
||||
# Copyright 2015 the original author or authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# https://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
## Gradle start up script for UN*X
|
||||
##
|
||||
##############################################################################
|
||||
|
||||
# Attempt to set APP_HOME
|
||||
# Resolve links: $0 may be a link
|
||||
PRG="$0"
|
||||
# Need this for relative symlinks.
|
||||
while [ -h "$PRG" ] ; do
|
||||
ls=`ls -ld "$PRG"`
|
||||
link=`expr "$ls" : '.*-> \(.*\)$'`
|
||||
if expr "$link" : '/.*' > /dev/null; then
|
||||
PRG="$link"
|
||||
else
|
||||
PRG=`dirname "$PRG"`"/$link"
|
||||
fi
|
||||
done
|
||||
SAVED="`pwd`"
|
||||
cd "`dirname \"$PRG\"`/" >/dev/null
|
||||
APP_HOME="`pwd -P`"
|
||||
cd "$SAVED" >/dev/null
|
||||
|
||||
APP_NAME="Gradle"
|
||||
APP_BASE_NAME=`basename "$0"`
|
||||
|
||||
# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
|
||||
DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
|
||||
|
||||
# Use the maximum available, or set MAX_FD != -1 to use that value.
|
||||
MAX_FD="maximum"
|
||||
|
||||
warn () {
|
||||
echo "$*"
|
||||
}
|
||||
|
||||
die () {
|
||||
echo
|
||||
echo "$*"
|
||||
echo
|
||||
exit 1
|
||||
}
|
||||
|
||||
# OS specific support (must be 'true' or 'false').
|
||||
cygwin=false
|
||||
msys=false
|
||||
darwin=false
|
||||
nonstop=false
|
||||
case "`uname`" in
|
||||
CYGWIN* )
|
||||
cygwin=true
|
||||
;;
|
||||
Darwin* )
|
||||
darwin=true
|
||||
;;
|
||||
MINGW* )
|
||||
msys=true
|
||||
;;
|
||||
NONSTOP* )
|
||||
nonstop=true
|
||||
;;
|
||||
esac
|
||||
|
||||
CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
|
||||
|
||||
|
||||
# Determine the Java command to use to start the JVM.
|
||||
if [ -n "$JAVA_HOME" ] ; then
|
||||
if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
|
||||
# IBM's JDK on AIX uses strange locations for the executables
|
||||
JAVACMD="$JAVA_HOME/jre/sh/java"
|
||||
else
|
||||
JAVACMD="$JAVA_HOME/bin/java"
|
||||
fi
|
||||
if [ ! -x "$JAVACMD" ] ; then
|
||||
die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
|
||||
|
||||
Please set the JAVA_HOME variable in your environment to match the
|
||||
location of your Java installation."
|
||||
fi
|
||||
else
|
||||
JAVACMD="java"
|
||||
which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
|
||||
|
||||
Please set the JAVA_HOME variable in your environment to match the
|
||||
location of your Java installation."
|
||||
fi
|
||||
|
||||
# Increase the maximum file descriptors if we can.
|
||||
if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
|
||||
MAX_FD_LIMIT=`ulimit -H -n`
|
||||
if [ $? -eq 0 ] ; then
|
||||
if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
|
||||
MAX_FD="$MAX_FD_LIMIT"
|
||||
fi
|
||||
ulimit -n $MAX_FD
|
||||
if [ $? -ne 0 ] ; then
|
||||
warn "Could not set maximum file descriptor limit: $MAX_FD"
|
||||
fi
|
||||
else
|
||||
warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
|
||||
fi
|
||||
fi
|
||||
|
||||
# For Darwin, add options to specify how the application appears in the dock
|
||||
if $darwin; then
|
||||
GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
|
||||
fi
|
||||
|
||||
# For Cygwin or MSYS, switch paths to Windows format before running java
|
||||
if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then
|
||||
APP_HOME=`cygpath --path --mixed "$APP_HOME"`
|
||||
CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
|
||||
|
||||
JAVACMD=`cygpath --unix "$JAVACMD"`
|
||||
|
||||
# We build the pattern for arguments to be converted via cygpath
|
||||
ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
|
||||
SEP=""
|
||||
for dir in $ROOTDIRSRAW ; do
|
||||
ROOTDIRS="$ROOTDIRS$SEP$dir"
|
||||
SEP="|"
|
||||
done
|
||||
OURCYGPATTERN="(^($ROOTDIRS))"
|
||||
# Add a user-defined pattern to the cygpath arguments
|
||||
if [ "$GRADLE_CYGPATTERN" != "" ] ; then
|
||||
OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
|
||||
fi
|
||||
# Now convert the arguments - kludge to limit ourselves to /bin/sh
|
||||
i=0
|
||||
for arg in "$@" ; do
|
||||
CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
|
||||
CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
|
||||
|
||||
if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
|
||||
eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
|
||||
else
|
||||
eval `echo args$i`="\"$arg\""
|
||||
fi
|
||||
i=`expr $i + 1`
|
||||
done
|
||||
case $i in
|
||||
0) set -- ;;
|
||||
1) set -- "$args0" ;;
|
||||
2) set -- "$args0" "$args1" ;;
|
||||
3) set -- "$args0" "$args1" "$args2" ;;
|
||||
4) set -- "$args0" "$args1" "$args2" "$args3" ;;
|
||||
5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
|
||||
6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
|
||||
7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
|
||||
8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
|
||||
9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
|
||||
esac
|
||||
fi
|
||||
|
||||
# Escape application args
|
||||
save () {
|
||||
for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
|
||||
echo " "
|
||||
}
|
||||
APP_ARGS=`save "$@"`
|
||||
|
||||
# Collect all arguments for the java command, following the shell quoting and substitution rules
|
||||
eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
|
||||
|
||||
exec "$JAVACMD" "$@"
|
||||
17
examples/llama.android/settings.gradle.kts
Normal file
@@ -0,0 +1,17 @@
|
||||
pluginManagement {
|
||||
repositories {
|
||||
google()
|
||||
mavenCentral()
|
||||
gradlePluginPortal()
|
||||
}
|
||||
}
|
||||
dependencyResolutionManagement {
|
||||
repositoriesMode.set(RepositoriesMode.FAIL_ON_PROJECT_REPOS)
|
||||
repositories {
|
||||
google()
|
||||
mavenCentral()
|
||||
}
|
||||
}
|
||||
|
||||
rootProject.name = "LlamaAndroid"
|
||||
include(":app")
|
||||
@@ -1,7 +1,12 @@
|
||||
# llama.swiftui
|
||||
# llama.cpp/examples/llama.swiftui
|
||||
|
||||
Local inference of llama.cpp on an iPhone.
|
||||
So far I only tested with starcoder 1B model, but it can most likely handle 7B models as well.
|
||||
Local inference of llama.cpp on an iPhone. This is a sample app that can be used as a starting
|
||||
point for more advanced projects.
|
||||
|
||||
For usage instructions and performance stats, check the following discussion: https://github.com/ggerganov/llama.cpp/discussions/4508
|
||||
|
||||

|
||||
|
||||
Video demonstration:
|
||||
|
||||
https://github.com/bachittle/llama.cpp/assets/39804642/e290827a-4edb-4093-9642-2a5e399ec545
|
||||
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
|
||||
/* Begin PBXBuildFile section */
|
||||
549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; };
|
||||
79E1D9CD2B4CD16E005F8E46 /* InputButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 79E1D9CC2B4CD16E005F8E46 /* InputButton.swift */; };
|
||||
7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */; };
|
||||
8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */; };
|
||||
8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83782AC328BD0096AF73 /* ContentView.swift */; };
|
||||
@@ -22,6 +23,7 @@
|
||||
|
||||
/* Begin PBXFileReference section */
|
||||
549479CA2AC9E16000E0F78B /* Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = System/Library/Frameworks/Metal.framework; sourceTree = SDKROOT; };
|
||||
79E1D9CC2B4CD16E005F8E46 /* InputButton.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = InputButton.swift; sourceTree = "<group>"; };
|
||||
7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = DownloadButton.swift; sourceTree = "<group>"; };
|
||||
8A1C83732AC328BD0096AF73 /* llama.swiftui.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = llama.swiftui.app; sourceTree = BUILT_PRODUCTS_DIR; };
|
||||
8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = llama_swiftuiApp.swift; sourceTree = "<group>"; };
|
||||
@@ -119,6 +121,7 @@
|
||||
7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */,
|
||||
8A1C83782AC328BD0096AF73 /* ContentView.swift */,
|
||||
F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */,
|
||||
79E1D9CC2B4CD16E005F8E46 /* InputButton.swift */,
|
||||
);
|
||||
path = UI;
|
||||
sourceTree = "<group>";
|
||||
@@ -213,6 +216,7 @@
|
||||
8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */,
|
||||
8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */,
|
||||
7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */,
|
||||
79E1D9CD2B4CD16E005F8E46 /* InputButton.swift in Sources */,
|
||||
);
|
||||
runOnlyForDeploymentPostprocessing = 0;
|
||||
};
|
||||
@@ -345,7 +349,7 @@
|
||||
CLANG_ENABLE_MODULES = YES;
|
||||
CODE_SIGN_STYLE = Automatic;
|
||||
CURRENT_PROJECT_VERSION = 1;
|
||||
DEVELOPMENT_TEAM = STLSG3FG8Q;
|
||||
DEVELOPMENT_TEAM = K5UQJPP73A;
|
||||
ENABLE_PREVIEWS = YES;
|
||||
GENERATE_INFOPLIST_FILE = YES;
|
||||
INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
|
||||
@@ -377,7 +381,7 @@
|
||||
CLANG_ENABLE_MODULES = YES;
|
||||
CODE_SIGN_STYLE = Automatic;
|
||||
CURRENT_PROJECT_VERSION = 1;
|
||||
DEVELOPMENT_TEAM = STLSG3FG8Q;
|
||||
DEVELOPMENT_TEAM = K5UQJPP73A;
|
||||
ENABLE_PREVIEWS = YES;
|
||||
GENERATE_INFOPLIST_FILE = YES;
|
||||
INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
|
||||
|
||||
@@ -1,9 +1,19 @@
|
||||
import Foundation
|
||||
|
||||
struct Model: Identifiable {
|
||||
var id = UUID()
|
||||
var name: String
|
||||
var url: String
|
||||
var filename: String
|
||||
var status: String?
|
||||
}
|
||||
|
||||
@MainActor
|
||||
class LlamaState: ObservableObject {
|
||||
@Published var messageLog = ""
|
||||
@Published var cacheCleared = false
|
||||
@Published var downloadedModels: [Model] = []
|
||||
@Published var undownloadedModels: [Model] = []
|
||||
let NS_PER_S = 1_000_000_000.0
|
||||
|
||||
private var llamaContext: LlamaContext?
|
||||
@@ -13,23 +23,102 @@ class LlamaState: ObservableObject {
|
||||
}
|
||||
|
||||
init() {
|
||||
loadModelsFromDisk()
|
||||
loadDefaultModels()
|
||||
}
|
||||
|
||||
private func loadModelsFromDisk() {
|
||||
do {
|
||||
let documentsURL = getDocumentsDirectory()
|
||||
let modelURLs = try FileManager.default.contentsOfDirectory(at: documentsURL, includingPropertiesForKeys: nil, options: [.skipsHiddenFiles, .skipsSubdirectoryDescendants])
|
||||
for modelURL in modelURLs {
|
||||
let modelName = modelURL.deletingPathExtension().lastPathComponent
|
||||
downloadedModels.append(Model(name: modelName, url: "", filename: modelURL.lastPathComponent, status: "downloaded"))
|
||||
}
|
||||
} catch {
|
||||
print("Error loading models from disk: \(error)")
|
||||
}
|
||||
}
|
||||
|
||||
private func loadDefaultModels() {
|
||||
do {
|
||||
try loadModel(modelUrl: defaultModelUrl)
|
||||
} catch {
|
||||
messageLog += "Error!\n"
|
||||
}
|
||||
|
||||
for model in defaultModels {
|
||||
let fileURL = getDocumentsDirectory().appendingPathComponent(model.filename)
|
||||
if FileManager.default.fileExists(atPath: fileURL.path) {
|
||||
|
||||
} else {
|
||||
var undownloadedModel = model
|
||||
undownloadedModel.status = "download"
|
||||
undownloadedModels.append(undownloadedModel)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func getDocumentsDirectory() -> URL {
|
||||
let paths = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)
|
||||
return paths[0]
|
||||
}
|
||||
private let defaultModels: [Model] = [
|
||||
Model(name: "TinyLlama-1.1B (Q4_0, 0.6 GiB)",url: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true",filename: "tinyllama-1.1b-1t-openorca.Q4_0.gguf", status: "download"),
|
||||
Model(
|
||||
name: "TinyLlama-1.1B Chat (Q8_0, 1.1 GiB)",
|
||||
url: "https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q8_0.gguf?download=true",
|
||||
filename: "tinyllama-1.1b-chat-v1.0.Q8_0.gguf", status: "download"
|
||||
),
|
||||
|
||||
Model(
|
||||
name: "TinyLlama-1.1B (F16, 2.2 GiB)",
|
||||
url: "https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true",
|
||||
filename: "tinyllama-1.1b-f16.gguf", status: "download"
|
||||
),
|
||||
|
||||
Model(
|
||||
name: "Phi-2.7B (Q4_0, 1.6 GiB)",
|
||||
url: "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true",
|
||||
filename: "phi-2-q4_0.gguf", status: "download"
|
||||
),
|
||||
|
||||
Model(
|
||||
name: "Phi-2.7B (Q8_0, 2.8 GiB)",
|
||||
url: "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q8_0.gguf?download=true",
|
||||
filename: "phi-2-q8_0.gguf", status: "download"
|
||||
),
|
||||
|
||||
Model(
|
||||
name: "Mistral-7B-v0.1 (Q4_0, 3.8 GiB)",
|
||||
url: "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF/resolve/main/mistral-7b-v0.1.Q4_0.gguf?download=true",
|
||||
filename: "mistral-7b-v0.1.Q4_0.gguf", status: "download"
|
||||
),
|
||||
Model(
|
||||
name: "OpenHermes-2.5-Mistral-7B (Q3_K_M, 3.52 GiB)",
|
||||
url: "https://huggingface.co/TheBloke/OpenHermes-2.5-Mistral-7B-GGUF/resolve/main/openhermes-2.5-mistral-7b.Q3_K_M.gguf?download=true",
|
||||
filename: "openhermes-2.5-mistral-7b.Q3_K_M.gguf", status: "download"
|
||||
)
|
||||
]
|
||||
func loadModel(modelUrl: URL?) throws {
|
||||
if let modelUrl {
|
||||
messageLog += "Loading model...\n"
|
||||
llamaContext = try LlamaContext.create_context(path: modelUrl.path())
|
||||
messageLog += "Loaded model \(modelUrl.lastPathComponent)\n"
|
||||
|
||||
// Assuming that the model is successfully loaded, update the downloaded models
|
||||
updateDownloadedModels(modelName: modelUrl.lastPathComponent, status: "downloaded")
|
||||
} else {
|
||||
messageLog += "Load a model from the list below\n"
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private func updateDownloadedModels(modelName: String, status: String) {
|
||||
undownloadedModels.removeAll { $0.name == modelName }
|
||||
}
|
||||
|
||||
|
||||
func complete(text: String) async {
|
||||
guard let llamaContext else {
|
||||
return
|
||||
|
||||
@@ -2,115 +2,57 @@ import SwiftUI
|
||||
|
||||
struct ContentView: View {
|
||||
@StateObject var llamaState = LlamaState()
|
||||
|
||||
@State private var multiLineText = ""
|
||||
|
||||
private static func cleanupModelCaches() {
|
||||
// Delete all models (*.gguf)
|
||||
let fileManager = FileManager.default
|
||||
let documentsUrl = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0]
|
||||
do {
|
||||
let fileURLs = try fileManager.contentsOfDirectory(at: documentsUrl, includingPropertiesForKeys: nil)
|
||||
for fileURL in fileURLs {
|
||||
if fileURL.pathExtension == "gguf" {
|
||||
try fileManager.removeItem(at: fileURL)
|
||||
}
|
||||
}
|
||||
} catch {
|
||||
print("Error while enumerating files \(documentsUrl.path): \(error.localizedDescription)")
|
||||
}
|
||||
}
|
||||
@State private var showingHelp = false // To track if Help Sheet should be shown
|
||||
|
||||
var body: some View {
|
||||
VStack {
|
||||
ScrollView(.vertical, showsIndicators: true) {
|
||||
Text(llamaState.messageLog)
|
||||
.font(.system(size: 12))
|
||||
.frame(maxWidth: .infinity, alignment: .leading)
|
||||
NavigationView {
|
||||
VStack {
|
||||
ScrollView(.vertical, showsIndicators: true) {
|
||||
Text(llamaState.messageLog)
|
||||
.font(.system(size: 12))
|
||||
.frame(maxWidth: .infinity, alignment: .leading)
|
||||
.padding()
|
||||
.onTapGesture {
|
||||
UIApplication.shared.sendAction(#selector(UIResponder.resignFirstResponder), to: nil, from: nil, for: nil)
|
||||
}
|
||||
}
|
||||
|
||||
TextEditor(text: $multiLineText)
|
||||
.frame(height: 80)
|
||||
.padding()
|
||||
.border(Color.gray, width: 0.5)
|
||||
|
||||
HStack {
|
||||
Button("Send") {
|
||||
sendText()
|
||||
}
|
||||
|
||||
Button("Bench") {
|
||||
bench()
|
||||
}
|
||||
|
||||
Button("Clear") {
|
||||
clear()
|
||||
}
|
||||
|
||||
Button("Copy") {
|
||||
UIPasteboard.general.string = llamaState.messageLog
|
||||
}
|
||||
}
|
||||
.buttonStyle(.bordered)
|
||||
.padding()
|
||||
.onTapGesture {
|
||||
UIApplication.shared.sendAction(#selector(UIResponder.resignFirstResponder), to: nil, from: nil, for: nil)
|
||||
}
|
||||
}
|
||||
|
||||
TextEditor(text: $multiLineText)
|
||||
.frame(height: 80)
|
||||
NavigationLink(destination: DrawerView(llamaState: llamaState)) {
|
||||
Text("View Models")
|
||||
}
|
||||
.padding()
|
||||
.border(Color.gray, width: 0.5)
|
||||
|
||||
HStack {
|
||||
Button("Send") {
|
||||
sendText()
|
||||
}
|
||||
|
||||
Button("Bench") {
|
||||
bench()
|
||||
}
|
||||
|
||||
Button("Clear") {
|
||||
clear()
|
||||
}
|
||||
|
||||
Button("Copy") {
|
||||
UIPasteboard.general.string = llamaState.messageLog
|
||||
}
|
||||
}.buttonStyle(.bordered)
|
||||
|
||||
VStack(alignment: .leading) {
|
||||
DownloadButton(
|
||||
llamaState: llamaState,
|
||||
modelName: "TinyLlama-1.1B (Q4_0, 0.6 GiB)",
|
||||
modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true",
|
||||
filename: "tinyllama-1.1b-1t-openorca.Q4_0.gguf"
|
||||
)
|
||||
|
||||
DownloadButton(
|
||||
llamaState: llamaState,
|
||||
modelName: "TinyLlama-1.1B (Q8_0, 1.1 GiB)",
|
||||
modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q8_0.gguf?download=true",
|
||||
filename: "tinyllama-1.1b-1t-openorca.Q8_0.gguf"
|
||||
)
|
||||
|
||||
DownloadButton(
|
||||
llamaState: llamaState,
|
||||
modelName: "TinyLlama-1.1B (F16, 2.2 GiB)",
|
||||
modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true",
|
||||
filename: "tinyllama-1.1b-f16.gguf"
|
||||
)
|
||||
|
||||
DownloadButton(
|
||||
llamaState: llamaState,
|
||||
modelName: "Phi-2.7B (Q4_0, 1.6 GiB)",
|
||||
modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true",
|
||||
filename: "phi-2-q4_0.gguf"
|
||||
)
|
||||
|
||||
DownloadButton(
|
||||
llamaState: llamaState,
|
||||
modelName: "Phi-2.7B (Q8_0, 2.8 GiB)",
|
||||
modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q8_0.gguf?download=true",
|
||||
filename: "phi-2-q8_0.gguf"
|
||||
)
|
||||
|
||||
DownloadButton(
|
||||
llamaState: llamaState,
|
||||
modelName: "Mistral-7B-v0.1 (Q4_0, 3.8 GiB)",
|
||||
modelUrl: "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF/resolve/main/mistral-7b-v0.1.Q4_0.gguf?download=true",
|
||||
filename: "mistral-7b-v0.1.Q4_0.gguf"
|
||||
)
|
||||
|
||||
Button("Clear downloaded models") {
|
||||
ContentView.cleanupModelCaches()
|
||||
llamaState.cacheCleared = true
|
||||
}
|
||||
|
||||
LoadCustomButton(llamaState: llamaState)
|
||||
}
|
||||
.padding(.top, 4)
|
||||
.font(.system(size: 12))
|
||||
.frame(maxWidth: .infinity, alignment: .leading)
|
||||
.padding()
|
||||
.navigationBarTitle("Model Settings", displayMode: .inline)
|
||||
|
||||
}
|
||||
.padding()
|
||||
}
|
||||
|
||||
func sendText() {
|
||||
@@ -131,8 +73,73 @@ struct ContentView: View {
|
||||
await llamaState.clear()
|
||||
}
|
||||
}
|
||||
struct DrawerView: View {
|
||||
|
||||
@ObservedObject var llamaState: LlamaState
|
||||
@State private var showingHelp = false
|
||||
func delete(at offsets: IndexSet) {
|
||||
offsets.forEach { offset in
|
||||
let model = llamaState.downloadedModels[offset]
|
||||
let fileURL = getDocumentsDirectory().appendingPathComponent(model.filename)
|
||||
do {
|
||||
try FileManager.default.removeItem(at: fileURL)
|
||||
} catch {
|
||||
print("Error deleting file: \(error)")
|
||||
}
|
||||
}
|
||||
|
||||
// Remove models from downloadedModels array
|
||||
llamaState.downloadedModels.remove(atOffsets: offsets)
|
||||
}
|
||||
|
||||
func getDocumentsDirectory() -> URL {
|
||||
let paths = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)
|
||||
return paths[0]
|
||||
}
|
||||
var body: some View {
|
||||
List {
|
||||
Section(header: Text("Download Models From Hugging Face")) {
|
||||
HStack {
|
||||
InputButton(llamaState: llamaState)
|
||||
}
|
||||
}
|
||||
Section(header: Text("Downloaded Models")) {
|
||||
ForEach(llamaState.downloadedModels) { model in
|
||||
DownloadButton(llamaState: llamaState, modelName: model.name, modelUrl: model.url, filename: model.filename)
|
||||
}
|
||||
.onDelete(perform: delete)
|
||||
}
|
||||
Section(header: Text("Default Models")) {
|
||||
ForEach(llamaState.undownloadedModels) { model in
|
||||
DownloadButton(llamaState: llamaState, modelName: model.name, modelUrl: model.url, filename: model.filename)
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
.listStyle(GroupedListStyle())
|
||||
.navigationBarTitle("Model Settings", displayMode: .inline).toolbar {
|
||||
ToolbarItem(placement: .navigationBarTrailing) {
|
||||
Button("Help") {
|
||||
showingHelp = true
|
||||
}
|
||||
}
|
||||
}.sheet(isPresented: $showingHelp) { // Sheet for help modal
|
||||
VStack(alignment: .leading) {
|
||||
VStack(alignment: .leading) {
|
||||
Text("1. Make sure the model is in GGUF Format")
|
||||
.padding()
|
||||
Text("2. Copy the download link of the quantized model")
|
||||
.padding()
|
||||
}
|
||||
Spacer()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//#Preview {
|
||||
// ContentView()
|
||||
//}
|
||||
struct ContentView_Previews: PreviewProvider {
|
||||
static var previews: some View {
|
||||
ContentView()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -53,6 +53,8 @@ struct DownloadButton: View {
|
||||
|
||||
llamaState.cacheCleared = false
|
||||
|
||||
let model = Model(name: modelName, url: modelUrl, filename: filename, status: "downloaded")
|
||||
llamaState.downloadedModels.append(model)
|
||||
status = "downloaded"
|
||||
}
|
||||
} catch let err {
|
||||
|
||||
131
examples/llama.swiftui/llama.swiftui/UI/InputButton.swift
Normal file
@@ -0,0 +1,131 @@
|
||||
import SwiftUI
|
||||
|
||||
struct InputButton: View {
|
||||
@ObservedObject var llamaState: LlamaState
|
||||
@State private var inputLink: String = ""
|
||||
@State private var status: String = "download"
|
||||
@State private var filename: String = ""
|
||||
|
||||
@State private var downloadTask: URLSessionDownloadTask?
|
||||
@State private var progress = 0.0
|
||||
@State private var observation: NSKeyValueObservation?
|
||||
|
||||
private static func extractModelInfo(from link: String) -> (modelName: String, filename: String)? {
|
||||
guard let url = URL(string: link),
|
||||
let lastPathComponent = url.lastPathComponent.components(separatedBy: ".").first,
|
||||
let modelName = lastPathComponent.components(separatedBy: "-").dropLast().joined(separator: "-").removingPercentEncoding,
|
||||
let filename = lastPathComponent.removingPercentEncoding else {
|
||||
return nil
|
||||
}
|
||||
|
||||
return (modelName, filename)
|
||||
}
|
||||
|
||||
private static func getFileURL(filename: String) -> URL {
|
||||
FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0].appendingPathComponent(filename)
|
||||
}
|
||||
|
||||
private func download() {
|
||||
guard let extractedInfo = InputButton.extractModelInfo(from: inputLink) else {
|
||||
// Handle invalid link or extraction failure
|
||||
return
|
||||
}
|
||||
|
||||
let (modelName, filename) = extractedInfo
|
||||
self.filename = filename // Set the state variable
|
||||
|
||||
status = "downloading"
|
||||
print("Downloading model \(modelName) from \(inputLink)")
|
||||
guard let url = URL(string: inputLink) else { return }
|
||||
let fileURL = InputButton.getFileURL(filename: filename)
|
||||
|
||||
downloadTask = URLSession.shared.downloadTask(with: url) { temporaryURL, response, error in
|
||||
if let error = error {
|
||||
print("Error: \(error.localizedDescription)")
|
||||
return
|
||||
}
|
||||
|
||||
guard let response = response as? HTTPURLResponse, (200...299).contains(response.statusCode) else {
|
||||
print("Server error!")
|
||||
return
|
||||
}
|
||||
|
||||
do {
|
||||
if let temporaryURL = temporaryURL {
|
||||
try FileManager.default.copyItem(at: temporaryURL, to: fileURL)
|
||||
print("Writing to \(filename) completed")
|
||||
|
||||
llamaState.cacheCleared = false
|
||||
|
||||
let model = Model(name: modelName, url: self.inputLink, filename: filename, status: "downloaded")
|
||||
llamaState.downloadedModels.append(model)
|
||||
status = "downloaded"
|
||||
}
|
||||
} catch let err {
|
||||
print("Error: \(err.localizedDescription)")
|
||||
}
|
||||
}
|
||||
|
||||
observation = downloadTask?.progress.observe(\.fractionCompleted) { progress, _ in
|
||||
self.progress = progress.fractionCompleted
|
||||
}
|
||||
|
||||
downloadTask?.resume()
|
||||
}
|
||||
|
||||
var body: some View {
|
||||
VStack {
|
||||
HStack {
|
||||
TextField("Paste Quantized Download Link", text: $inputLink)
|
||||
.textFieldStyle(RoundedBorderTextFieldStyle())
|
||||
|
||||
Button(action: {
|
||||
downloadTask?.cancel()
|
||||
status = "download"
|
||||
}) {
|
||||
Text("Cancel")
|
||||
}
|
||||
}
|
||||
|
||||
if status == "download" {
|
||||
Button(action: download) {
|
||||
Text("Download Custom Model")
|
||||
}
|
||||
} else if status == "downloading" {
|
||||
Button(action: {
|
||||
downloadTask?.cancel()
|
||||
status = "download"
|
||||
}) {
|
||||
Text("Downloading \(Int(progress * 100))%")
|
||||
}
|
||||
} else if status == "downloaded" {
|
||||
Button(action: {
|
||||
let fileURL = InputButton.getFileURL(filename: self.filename)
|
||||
if !FileManager.default.fileExists(atPath: fileURL.path) {
|
||||
download()
|
||||
return
|
||||
}
|
||||
do {
|
||||
try llamaState.loadModel(modelUrl: fileURL)
|
||||
} catch let err {
|
||||
print("Error: \(err.localizedDescription)")
|
||||
}
|
||||
}) {
|
||||
Text("Load Custom Model")
|
||||
}
|
||||
} else {
|
||||
Text("Unknown status")
|
||||
}
|
||||
}
|
||||
.onDisappear() {
|
||||
downloadTask?.cancel()
|
||||
}
|
||||
.onChange(of: llamaState.cacheCleared) { newValue in
|
||||
if newValue {
|
||||
downloadTask?.cancel()
|
||||
let fileURL = InputButton.getFileURL(filename: self.filename)
|
||||
status = FileManager.default.fileExists(atPath: fileURL.path) ? "downloaded" : "download"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -6,7 +6,7 @@
|
||||
" Similarly, you could add an insert mode keybind with
|
||||
" inoremap <C-B> <Cmd>call llama#doLlamaGen()<CR>
|
||||
"
|
||||
" g:llama_api_url and g:llama_overrides can be configured in your .vimrc
|
||||
" g:llama_api_url, g:llama_api_key and g:llama_overrides can be configured in your .vimrc
|
||||
" let g:llama_api_url = "192.168.1.10:8080"
|
||||
" llama_overrides can also be set through buffer/window scopes. For instance
|
||||
" autocmd filetype python let b:llama_overrides = {"temp": 0.2}
|
||||
@@ -82,6 +82,9 @@ func llama#doLlamaGen()
|
||||
endif
|
||||
let l:querydata.prompt = join(l:buflines, "\n")
|
||||
let l:curlcommand = copy(s:curlcommand)
|
||||
if exists("g:llama_api_key")
|
||||
call extend(l:curlcommand, ['--header', 'Authorization: Bearer ' .. g:llama_api_key])
|
||||
endif
|
||||
let l:curlcommand[2] = json_encode(l:querydata)
|
||||
let b:job = job_start(l:curlcommand, {"callback": function("s:callbackHandler", [l:cbuffer])})
|
||||
endfunction
|
||||
|
||||
131
examples/llava/MobileVLM-README.md
Normal file
@@ -0,0 +1,131 @@
|
||||
# MobileVLM
|
||||
|
||||
Currently this implementation supports [MobileVLM-v1.7](https://huggingface.co/mtgv/MobileVLM-1.7B) variants.
|
||||
|
||||
for more information, please go to [Meituan-AutoML/MobileVLM](https://github.com/Meituan-AutoML/MobileVLM)
|
||||
|
||||
The implementation is based on llava, and is compatible with llava and mobileVLM. The usage is basically same as llava.
|
||||
|
||||
## Usage
|
||||
Build with cmake or run `make llava-cli` to build it.
|
||||
|
||||
After building, run: `./llava-cli` to see the usage. For example:
|
||||
|
||||
```sh
|
||||
./llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
|
||||
--mmproj MobileVLM-1.7B/mmproj-model-f16.gguf \
|
||||
--image path/to/an/image.jpg \
|
||||
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? Answer the question using a single word or phrase. ASSISTANT:"
|
||||
```
|
||||
|
||||
## Model conversion
|
||||
|
||||
- Clone `mobileVLM-1.7B` and `clip-vit-large-patch14-336` locally:
|
||||
|
||||
```sh
|
||||
git clone https://huggingface.co/mtgv/MobileVLM-1.7B
|
||||
|
||||
git clone https://huggingface.co/openai/clip-vit-large-patch14-336
|
||||
```
|
||||
|
||||
2. Use `llava-surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
|
||||
|
||||
```sh
|
||||
python ./examples/llava/llava-surgery.py -m path/to/MobileVLM-1.7B
|
||||
```
|
||||
|
||||
3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` to convert the LLaVA image encoder to GGUF:
|
||||
|
||||
```sh
|
||||
python ./examples/llava/convert-image-encoder-to-gguf \
|
||||
-m path/to/clip-vit-large-patch14-336 \
|
||||
--llava-projector path/to/MobileVLM-1.7B/llava.projector \
|
||||
--output-dir path/to/MobileVLM-1.7B \
|
||||
--projector-type ldp
|
||||
```
|
||||
|
||||
4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
|
||||
|
||||
```sh
|
||||
python ./convert.py path/to/MobileVLM-1.7B
|
||||
```
|
||||
|
||||
5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k`
|
||||
```sh
|
||||
./quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
|
||||
```
|
||||
|
||||
Now both the LLaMA part and the image encoder is in the `MobileVLM-1.7B` directory.
|
||||
|
||||
## Android compile and run
|
||||
### compile
|
||||
refer to `examples/llava/android/build_64.sh`
|
||||
```sh
|
||||
mkdir examples/llava/android/build_64
|
||||
cd examples/llava/android/build_64
|
||||
../build_64.sh
|
||||
```
|
||||
### run on Android
|
||||
refer to `android/adb_run.sh`, modify resources' `name` and `path`
|
||||
|
||||
## some result on Android with `Snapdragon 888` chip
|
||||
### case 1
|
||||
**input**
|
||||
```sh
|
||||
/data/local/tmp/llava-cli \
|
||||
-m /data/local/tmp/ggml-model-q4_k.gguf \
|
||||
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
|
||||
-t 4 \
|
||||
--image /data/local/tmp/demo.jpg \
|
||||
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? \nAnswer the question using a single word or phrase. ASSISTANT:"
|
||||
```
|
||||
**output**
|
||||
```sh
|
||||
encode_image_with_clip: image encoded in 21148.71 ms by CLIP ( 146.87 ms per image patch)
|
||||
Susan Wise Bauer
|
||||
llama_print_timings: load time = 23574.72 ms
|
||||
llama_print_timings: sample time = 1.24 ms / 6 runs ( 0.21 ms per token, 4850.44 tokens per second)
|
||||
llama_print_timings: prompt eval time = 12460.15 ms / 246 tokens ( 50.65 ms per token, 19.74 tokens per second)
|
||||
llama_print_timings: eval time = 424.86 ms / 6 runs ( 70.81 ms per token, 14.12 tokens per second)
|
||||
llama_print_timings: total time = 34731.93 ms
|
||||
```
|
||||
### case 2
|
||||
**input**
|
||||
```sh
|
||||
/data/local/tmp/llava-cli \
|
||||
-m /data/local/tmp/ggml-model-q4_k.gguf \
|
||||
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
|
||||
-t 4 \
|
||||
--image /data/local/tmp/cat.jpeg \
|
||||
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"
|
||||
```
|
||||
|
||||
**output**
|
||||
```sh
|
||||
encode_image_with_clip: image encoded in 21149.51 ms by CLIP ( 146.87 ms per image patch)
|
||||
The image depicts a cat sitting in the grass near some tall green plants.
|
||||
llama_print_timings: load time = 23257.32 ms
|
||||
llama_print_timings: sample time = 5.25 ms / 18 runs ( 0.29 ms per token, 3430.53 tokens per second)
|
||||
llama_print_timings: prompt eval time = 11900.73 ms / 232 tokens ( 51.30 ms per token, 19.49 tokens per second)
|
||||
llama_print_timings: eval time = 1279.03 ms / 18 runs ( 71.06 ms per token, 14.07 tokens per second)
|
||||
llama_print_timings: total time = 34570.79 ms
|
||||
```
|
||||
|
||||
## Minor shortcomings
|
||||
The `n_patch` of output in `ldp` is 1/4 of the input. In order to implement quickly, we uniformly modified `clip_n_patches` function to a quarter. when counting the time consumption, the calculated time will be 4 times bigger than the real cost.
|
||||
|
||||
## TODO
|
||||
|
||||
- [ ] Support non-CPU backend for the new operators, such as `depthwise`, `hardswish`, `hardsigmoid`
|
||||
- [ ] Optimize LDP projector performance
|
||||
|
||||
- Optimize the structure definition to avoid unnecessary memory rearrangements, to reduce the use of `ggml_permute_cpy`;
|
||||
- Optimize operator implementation (ARM CPU/NVIDIA GPU): such as depthwise conv, hardswish, hardsigmoid, etc.
|
||||
- [ ] run MobileVLM on `Jetson Orin`
|
||||
- [ ] Support more model variants, such as `MobileVLM-3B`.
|
||||
|
||||
|
||||
## contributor
|
||||
```sh
|
||||
zhangjidong05, yangyang260, huyiming03, chenxiaotao03
|
||||
```
|
||||
53
examples/llava/android/adb_run.sh
Executable file
@@ -0,0 +1,53 @@
|
||||
#!/bin/bash
|
||||
|
||||
model_dir="/Users/cxt/model/llm/mobileVLM/MobileVLM-1.7B_processed"
|
||||
projector_name="mmproj-model-f16.gguf"
|
||||
llama_name="ggml-model-q4_k.gguf"
|
||||
img_dir="/Users/cxt/model/llm"
|
||||
img_name="demo.jpg"
|
||||
prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? \nAnswer the question using a single word or phrase. ASSISTANT:"
|
||||
# img_name="cat.jpeg"
|
||||
# prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"
|
||||
|
||||
program_dir="build_64/bin"
|
||||
binName="llava-cli"
|
||||
n_threads=4
|
||||
|
||||
|
||||
deviceDir="/data/local/tmp"
|
||||
saveDir="output"
|
||||
if [ ! -d ${saveDir} ]; then
|
||||
mkdir ${saveDir}
|
||||
fi
|
||||
|
||||
|
||||
function android_run() {
|
||||
# # copy resource into device
|
||||
# adb push ${model_dir}/${projector_name} ${deviceDir}/${projector_name}
|
||||
# adb push ${model_dir}/${llama_name} ${deviceDir}/${llama_name}
|
||||
adb push ${img_dir}/${img_name} ${deviceDir}/${img_name}
|
||||
# copy program into device
|
||||
adb push ${program_dir}/${binName} ${deviceDir}/${binName}
|
||||
adb shell "chmod 0777 ${deviceDir}/${binName}"
|
||||
|
||||
# run
|
||||
adb shell "echo cd ${deviceDir} ${deviceDir}/${binName} \
|
||||
-m ${deviceDir}/${llama_name} \
|
||||
--mmproj ${deviceDir}/${projector_name} \
|
||||
-t ${n_threads} \
|
||||
--image ${deviceDir}/${img_name} \
|
||||
-p \"${prompt}\" \
|
||||
> ${deviceDir}/${modelName}_${projector_name}_${n_threads}_${img_name}.txt"
|
||||
adb shell "cd ${deviceDir}; pwd; ${deviceDir}/${binName} \
|
||||
-m ${deviceDir}/${llama_name} \
|
||||
--mmproj ${deviceDir}/${projector_name} \
|
||||
-t ${n_threads} \
|
||||
--image ${deviceDir}/${img_name} \
|
||||
-p \"${prompt}\" \
|
||||
>> ${deviceDir}/${modelName}_${projector_name}_${n_threads}_${img_name}.txt 2>&1"
|
||||
adb pull ${deviceDir}/${modelName}_${projector_name}_${n_threads}_${img_name}.txt ${saveDir}
|
||||
}
|
||||
|
||||
android_run
|
||||
|
||||
echo "android_run is Done!"
|
||||
8
examples/llava/android/build_64.sh
Executable file
@@ -0,0 +1,8 @@
|
||||
#!/bin/bash
|
||||
cmake ../../../../ \
|
||||
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DANDROID_ABI="arm64-v8a" \
|
||||
-DANDROID_PLATFORM=android-23 $1
|
||||
|
||||
make -j4
|
||||
@@ -2,17 +2,6 @@
|
||||
// so there might be still unnecessary artifacts hanging around
|
||||
// I'll gradually clean and extend it
|
||||
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <regex>
|
||||
#include <stdexcept>
|
||||
#include <vector>
|
||||
|
||||
#include "clip.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml-alloc.h"
|
||||
@@ -29,6 +18,19 @@
|
||||
#define STB_IMAGE_IMPLEMENTATION
|
||||
#include "stb_image.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <regex>
|
||||
#include <stdexcept>
|
||||
#include <vector>
|
||||
#include <sstream>
|
||||
#include <cinttypes>
|
||||
|
||||
static std::string format(const char * fmt, ...) {
|
||||
va_list ap;
|
||||
va_list ap2;
|
||||
@@ -67,6 +69,7 @@ static std::string format(const char * fmt, ...) {
|
||||
#define KEY_PATCH_SIZE "clip.vision.patch_size"
|
||||
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
|
||||
#define KEY_IMAGE_STD "clip.vision.image_std"
|
||||
#define KEY_PROJ_TYPE "clip.projector_type"
|
||||
|
||||
//
|
||||
// tensor name constants
|
||||
@@ -89,6 +92,22 @@ static std::string format(const char * fmt, ...) {
|
||||
#define TN_TEXT_PROJ "text_projection.weight"
|
||||
#define TN_VIS_PROJ "visual_projection.weight"
|
||||
#define TN_LLAVA_PROJ "mm.%d.%s"
|
||||
#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
|
||||
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
|
||||
|
||||
|
||||
enum projector_type {
|
||||
PROJECTOR_TYPE_MLP,
|
||||
PROJECTOR_TYPE_MLP_NORM,
|
||||
PROJECTOR_TYPE_LDP,
|
||||
PROJECTOR_TYPE_UNKNOWN,
|
||||
};
|
||||
|
||||
static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
||||
{ PROJECTOR_TYPE_MLP, "mlp" },
|
||||
{ PROJECTOR_TYPE_LDP, "ldp" },
|
||||
};
|
||||
|
||||
|
||||
//
|
||||
// utilities to get data from a gguf file
|
||||
@@ -126,26 +145,94 @@ static struct ggml_tensor * get_tensor(struct ggml_context * ctx, const std::str
|
||||
}
|
||||
|
||||
static std::string get_ftype(int ftype) {
|
||||
switch (ftype) {
|
||||
case 0:
|
||||
return "f32";
|
||||
case 1:
|
||||
return "f16";
|
||||
case 2:
|
||||
return "q4_0";
|
||||
case 3:
|
||||
return "q4_1";
|
||||
case 6:
|
||||
return "q5_0";
|
||||
case 7:
|
||||
return "q5_1";
|
||||
case 8:
|
||||
return "q8_0";
|
||||
default:
|
||||
throw std::runtime_error(format("%s: Unrecognized file type: %d\n", __func__, ftype));
|
||||
return ggml_type_name(static_cast<ggml_type>(ftype));
|
||||
}
|
||||
|
||||
static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
|
||||
switch (type) {
|
||||
case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]);
|
||||
case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]);
|
||||
case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]);
|
||||
case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]);
|
||||
case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]);
|
||||
case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]);
|
||||
case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]);
|
||||
case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]);
|
||||
case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]);
|
||||
case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]);
|
||||
case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false";
|
||||
default: return format("unknown type %d", type);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
|
||||
std::string result;
|
||||
for (size_t pos = 0; ; pos += search.length()) {
|
||||
auto new_pos = s.find(search, pos);
|
||||
if (new_pos == std::string::npos) {
|
||||
result += s.substr(pos, s.size() - pos);
|
||||
break;
|
||||
}
|
||||
result += s.substr(pos, new_pos - pos) + replace;
|
||||
pos = new_pos;
|
||||
}
|
||||
s = std::move(result);
|
||||
}
|
||||
|
||||
static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
|
||||
const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
|
||||
|
||||
switch (type) {
|
||||
case GGUF_TYPE_STRING:
|
||||
return gguf_get_val_str(ctx_gguf, i);
|
||||
case GGUF_TYPE_ARRAY:
|
||||
{
|
||||
const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
|
||||
int arr_n = gguf_get_arr_n(ctx_gguf, i);
|
||||
const void * data = gguf_get_arr_data(ctx_gguf, i);
|
||||
std::stringstream ss;
|
||||
ss << "[";
|
||||
for (int j = 0; j < arr_n; j++) {
|
||||
if (arr_type == GGUF_TYPE_STRING) {
|
||||
std::string val = gguf_get_arr_str(ctx_gguf, i, j);
|
||||
// escape quotes
|
||||
replace_all(val, "\\", "\\\\");
|
||||
replace_all(val, "\"", "\\\"");
|
||||
ss << '"' << val << '"';
|
||||
} else if (arr_type == GGUF_TYPE_ARRAY) {
|
||||
ss << "???";
|
||||
} else {
|
||||
ss << gguf_data_to_str(arr_type, data, j);
|
||||
}
|
||||
if (j < arr_n - 1) {
|
||||
ss << ", ";
|
||||
}
|
||||
}
|
||||
ss << "]";
|
||||
return ss.str();
|
||||
}
|
||||
default:
|
||||
return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
|
||||
}
|
||||
}
|
||||
|
||||
static void print_tensor_info(const ggml_tensor* tensor, const char* prefix = "") {
|
||||
size_t tensor_size = ggml_nbytes(tensor);
|
||||
printf("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
|
||||
prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
|
||||
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type));
|
||||
}
|
||||
|
||||
static projector_type clip_projector_type_from_string(const std::string & name) {
|
||||
for (const auto & kv : PROJECTOR_TYPE_NAMES) { // NOLINT
|
||||
if (kv.second == name) {
|
||||
return kv.first;
|
||||
}
|
||||
}
|
||||
return PROJECTOR_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
//
|
||||
// image data
|
||||
//
|
||||
@@ -218,10 +305,44 @@ struct clip_vision_model {
|
||||
struct ggml_tensor * projection;
|
||||
|
||||
// LLaVA projection
|
||||
struct ggml_tensor * mm_0_w;
|
||||
struct ggml_tensor * mm_0_b;
|
||||
struct ggml_tensor * mm_2_w;
|
||||
struct ggml_tensor * mm_2_b;
|
||||
struct ggml_tensor * mm_0_w = NULL;
|
||||
struct ggml_tensor * mm_0_b = NULL;
|
||||
struct ggml_tensor * mm_2_w = NULL;
|
||||
struct ggml_tensor * mm_2_b = NULL;
|
||||
|
||||
// Yi type models with mlp+normalization projection
|
||||
struct ggml_tensor * mm_1_w = NULL; // Yi type models have 0, 1, 3, 4
|
||||
struct ggml_tensor * mm_1_b = NULL;
|
||||
struct ggml_tensor * mm_3_w = NULL;
|
||||
struct ggml_tensor * mm_3_b = NULL;
|
||||
struct ggml_tensor * mm_4_w = NULL;
|
||||
struct ggml_tensor * mm_4_b = NULL;
|
||||
|
||||
// MobileVLM projection
|
||||
struct ggml_tensor * mm_model_mlp_1_w;
|
||||
struct ggml_tensor * mm_model_mlp_1_b;
|
||||
struct ggml_tensor * mm_model_mlp_3_w;
|
||||
struct ggml_tensor * mm_model_mlp_3_b;
|
||||
struct ggml_tensor * mm_model_block_1_block_0_0_w;
|
||||
struct ggml_tensor * mm_model_block_1_block_0_1_w;
|
||||
struct ggml_tensor * mm_model_block_1_block_0_1_b;
|
||||
struct ggml_tensor * mm_model_block_1_block_1_fc1_w;
|
||||
struct ggml_tensor * mm_model_block_1_block_1_fc1_b;
|
||||
struct ggml_tensor * mm_model_block_1_block_1_fc2_w;
|
||||
struct ggml_tensor * mm_model_block_1_block_1_fc2_b;
|
||||
struct ggml_tensor * mm_model_block_1_block_2_0_w;
|
||||
struct ggml_tensor * mm_model_block_1_block_2_1_w;
|
||||
struct ggml_tensor * mm_model_block_1_block_2_1_b;
|
||||
struct ggml_tensor * mm_model_block_2_block_0_0_w;
|
||||
struct ggml_tensor * mm_model_block_2_block_0_1_w;
|
||||
struct ggml_tensor * mm_model_block_2_block_0_1_b;
|
||||
struct ggml_tensor * mm_model_block_2_block_1_fc1_w;
|
||||
struct ggml_tensor * mm_model_block_2_block_1_fc1_b;
|
||||
struct ggml_tensor * mm_model_block_2_block_1_fc2_w;
|
||||
struct ggml_tensor * mm_model_block_2_block_1_fc2_b;
|
||||
struct ggml_tensor * mm_model_block_2_block_2_0_w;
|
||||
struct ggml_tensor * mm_model_block_2_block_2_1_w;
|
||||
struct ggml_tensor * mm_model_block_2_block_2_1_b;
|
||||
};
|
||||
|
||||
struct clip_ctx {
|
||||
@@ -230,6 +351,7 @@ struct clip_ctx {
|
||||
bool has_llava_projector = false;
|
||||
|
||||
struct clip_vision_model vision_model;
|
||||
projector_type proj_type = PROJECTOR_TYPE_MLP;
|
||||
|
||||
float image_mean[3];
|
||||
float image_std[3];
|
||||
@@ -347,6 +469,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||
// pre-layernorm
|
||||
{
|
||||
embeddings = ggml_norm(ctx0, embeddings, eps);
|
||||
ggml_set_name(embeddings, "pre_ln");
|
||||
|
||||
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b);
|
||||
}
|
||||
@@ -447,16 +570,156 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||
free(patches_data);
|
||||
}
|
||||
|
||||
// shape [1, 576, 1024]
|
||||
// ne is whcn, ne = [1024, 576, 1, 1]
|
||||
embeddings = ggml_get_rows(ctx0, embeddings, patches);
|
||||
|
||||
// mm projection 0
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
|
||||
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
|
||||
// print_tensor_info(embeddings, "embeddings");
|
||||
|
||||
embeddings = ggml_gelu(ctx0, embeddings);
|
||||
// llava projector
|
||||
if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
|
||||
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
|
||||
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
|
||||
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
|
||||
embeddings = ggml_gelu(ctx0, embeddings);
|
||||
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
|
||||
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
|
||||
|
||||
} else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
|
||||
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
|
||||
// ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
|
||||
// First LayerNorm
|
||||
embeddings = ggml_norm(ctx0, embeddings, eps);
|
||||
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w),
|
||||
model.mm_1_b);
|
||||
|
||||
// GELU activation
|
||||
embeddings = ggml_gelu(ctx0, embeddings);
|
||||
|
||||
// Second linear layer
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings);
|
||||
embeddings = ggml_add(ctx0, embeddings, model.mm_3_b);
|
||||
|
||||
// Second LayerNorm
|
||||
embeddings = ggml_norm(ctx0, embeddings, eps);
|
||||
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w),
|
||||
model.mm_4_b);
|
||||
}
|
||||
else if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
|
||||
// MobileVLM projector
|
||||
int n_patch = 24;
|
||||
struct ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings);
|
||||
mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b);
|
||||
mlp_1 = ggml_gelu(ctx0, mlp_1);
|
||||
struct ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1);
|
||||
mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b);
|
||||
// mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1]
|
||||
|
||||
// block 1
|
||||
struct ggml_tensor * block_1 = nullptr;
|
||||
{
|
||||
// transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24]
|
||||
mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3));
|
||||
mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
|
||||
// stride = 1, padding = 1, bias is nullptr
|
||||
block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
|
||||
|
||||
// layer norm
|
||||
// // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
|
||||
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
|
||||
// block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
|
||||
block_1 = ggml_norm(ctx0, block_1, eps);
|
||||
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b);
|
||||
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
|
||||
|
||||
// block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
|
||||
// hardswish
|
||||
struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
|
||||
|
||||
block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
|
||||
// block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
|
||||
// pointwise conv
|
||||
block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
|
||||
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1);
|
||||
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b);
|
||||
block_1 = ggml_relu(ctx0, block_1);
|
||||
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1);
|
||||
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b);
|
||||
block_1 = ggml_hardsigmoid(ctx0, block_1);
|
||||
// block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1]
|
||||
block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
|
||||
block_1 = ggml_mul(ctx0, block_1_hw, block_1);
|
||||
|
||||
int w = block_1->ne[0], h = block_1->ne[1];
|
||||
block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
|
||||
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
|
||||
|
||||
// block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
|
||||
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1);
|
||||
block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
|
||||
|
||||
// block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
|
||||
block_1 = ggml_norm(ctx0, block_1, eps);
|
||||
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b);
|
||||
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
|
||||
// block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
|
||||
// residual
|
||||
block_1 = ggml_add(ctx0, mlp_3, block_1);
|
||||
}
|
||||
|
||||
// block_2
|
||||
{
|
||||
// stride = 2
|
||||
block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);
|
||||
|
||||
// block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
|
||||
// layer norm
|
||||
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
|
||||
// block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
|
||||
block_1 = ggml_norm(ctx0, block_1, eps);
|
||||
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b);
|
||||
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
|
||||
// block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
|
||||
// hardswish
|
||||
struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
|
||||
|
||||
// not sure the parameters is right for globalAvgPooling
|
||||
block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
|
||||
// block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
|
||||
// pointwise conv
|
||||
block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
|
||||
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1);
|
||||
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b);
|
||||
block_1 = ggml_relu(ctx0, block_1);
|
||||
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1);
|
||||
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b);
|
||||
block_1 = ggml_hardsigmoid(ctx0, block_1);
|
||||
|
||||
// block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
|
||||
block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
|
||||
block_1 = ggml_mul(ctx0, block_1_hw, block_1);
|
||||
|
||||
int w = block_1->ne[0], h = block_1->ne[1];
|
||||
block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
|
||||
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
|
||||
// block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
|
||||
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1);
|
||||
block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
|
||||
|
||||
|
||||
// block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
|
||||
block_1 = ggml_norm(ctx0, block_1, eps);
|
||||
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b);
|
||||
block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]);
|
||||
// block_1 shape = [1, 144, 2048], ne = [2048, 144, 1]
|
||||
}
|
||||
embeddings = block_1;
|
||||
}
|
||||
else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
}
|
||||
|
||||
// build the graph
|
||||
@@ -502,16 +765,47 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
printf("\n");
|
||||
}
|
||||
const int n_tensors = gguf_get_n_tensors(ctx);
|
||||
|
||||
// kv
|
||||
if (verbosity >= 3) {
|
||||
const int n_kv = gguf_get_n_kv(ctx);
|
||||
const int n_kv = gguf_get_n_kv(ctx);
|
||||
printf("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
|
||||
__func__, n_kv, n_tensors, fname);
|
||||
{
|
||||
std::map<enum ggml_type, uint32_t> n_type;
|
||||
|
||||
for (int i = 0; i < n_kv; ++i) {
|
||||
const char * key = gguf_get_key(ctx, i);
|
||||
for (int i = 0; i < n_tensors; i++) {
|
||||
enum ggml_type type = gguf_get_tensor_type(ctx, i);
|
||||
|
||||
printf("%s: kv[%d]: key = %s\n", __func__, i, key);
|
||||
n_type[type]++;
|
||||
}
|
||||
|
||||
printf("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
|
||||
for (int i = 0; i < n_kv; i++) {
|
||||
const char * name = gguf_get_key(ctx, i);
|
||||
const enum gguf_type type = gguf_get_kv_type(ctx, i);
|
||||
const std::string type_name =
|
||||
type == GGUF_TYPE_ARRAY
|
||||
? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx, i)), gguf_get_arr_n(ctx, i))
|
||||
: gguf_type_name(type);
|
||||
|
||||
std::string value = gguf_kv_to_str(ctx, i);
|
||||
const size_t MAX_VALUE_LEN = 40;
|
||||
if (value.size() > MAX_VALUE_LEN) {
|
||||
value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
|
||||
}
|
||||
replace_all(value, "\n", "\\n");
|
||||
|
||||
printf("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
|
||||
}
|
||||
|
||||
// print type counts
|
||||
for (auto & kv : n_type) {
|
||||
if (kv.second == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
printf("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
// data
|
||||
@@ -520,12 +814,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
for (int i = 0; i < n_tensors; ++i) {
|
||||
const char * name = gguf_get_tensor_name(ctx, i);
|
||||
const size_t offset = gguf_get_tensor_offset(ctx, i);
|
||||
enum ggml_type type = gguf_get_tensor_type(ctx, i);
|
||||
struct ggml_tensor * cur = ggml_get_tensor(meta, name);
|
||||
size_t tensor_size = ggml_nbytes(cur);
|
||||
buffer_size += tensor_size;
|
||||
if (verbosity >= 3) {
|
||||
printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu\n", __func__, i,
|
||||
ggml_n_dims(cur), cur->name, tensor_size, offset);
|
||||
printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
|
||||
__func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -533,6 +828,24 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
buffer_size += n_tensors * 128 /* CLIP PADDING */;
|
||||
|
||||
clip_ctx * new_clip = new clip_ctx;
|
||||
|
||||
// update projector type
|
||||
{
|
||||
int idx = gguf_find_key(ctx, KEY_PROJ_TYPE);
|
||||
if (idx != -1) {
|
||||
const std::string proj_type = gguf_get_val_str(ctx, idx);
|
||||
new_clip->proj_type = clip_projector_type_from_string(proj_type);
|
||||
}
|
||||
else {
|
||||
new_clip->proj_type = PROJECTOR_TYPE_MLP;
|
||||
}
|
||||
if (new_clip->proj_type == PROJECTOR_TYPE_MLP) {
|
||||
if (gguf_find_tensor(ctx, format(TN_LLAVA_PROJ, 3, "weight").c_str()) != -1) {
|
||||
new_clip->proj_type = PROJECTOR_TYPE_MLP_NORM;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
new_clip->backend = ggml_backend_cuda_init(0);
|
||||
printf("%s: CLIP using CUDA backend\n", __func__);
|
||||
@@ -543,6 +856,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
printf("%s: CLIP using Metal backend\n", __func__);
|
||||
#endif
|
||||
|
||||
|
||||
if (!new_clip->backend) {
|
||||
new_clip->backend = ggml_backend_cpu_init();
|
||||
printf("%s: CLIP using CPU backend\n", __func__);
|
||||
@@ -676,10 +990,63 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
|
||||
vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
|
||||
vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
|
||||
vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
|
||||
vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
|
||||
vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
|
||||
vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
|
||||
|
||||
// LLaVA projection
|
||||
if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) {
|
||||
vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
|
||||
vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
|
||||
try {
|
||||
// Yi-type llava
|
||||
vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "weight"));
|
||||
vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "bias"));
|
||||
} catch (std::runtime_error & e) { }
|
||||
try {
|
||||
// missing in Yi-type llava
|
||||
vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
|
||||
vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
|
||||
} catch (std::runtime_error & e) { }
|
||||
try {
|
||||
// Yi-type llava
|
||||
vision_model.mm_3_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "weight"));
|
||||
vision_model.mm_3_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "bias"));
|
||||
} catch (std::runtime_error & e) { }
|
||||
try {
|
||||
// Yi-type llava
|
||||
vision_model.mm_4_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "weight"));
|
||||
vision_model.mm_4_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "bias"));
|
||||
} catch (std::runtime_error & e) { }
|
||||
}
|
||||
else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
|
||||
// MobileVLM projection
|
||||
vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight"));
|
||||
vision_model.mm_model_mlp_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "bias"));
|
||||
vision_model.mm_model_mlp_3_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "weight"));
|
||||
vision_model.mm_model_mlp_3_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "bias"));
|
||||
vision_model.mm_model_block_1_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight"));
|
||||
vision_model.mm_model_block_1_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight"));
|
||||
vision_model.mm_model_block_1_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias"));
|
||||
vision_model.mm_model_block_1_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight"));
|
||||
vision_model.mm_model_block_1_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias"));
|
||||
vision_model.mm_model_block_1_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight"));
|
||||
vision_model.mm_model_block_1_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias"));
|
||||
vision_model.mm_model_block_1_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight"));
|
||||
vision_model.mm_model_block_1_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight"));
|
||||
vision_model.mm_model_block_1_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias"));
|
||||
vision_model.mm_model_block_2_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight"));
|
||||
vision_model.mm_model_block_2_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight"));
|
||||
vision_model.mm_model_block_2_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias"));
|
||||
vision_model.mm_model_block_2_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight"));
|
||||
vision_model.mm_model_block_2_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias"));
|
||||
vision_model.mm_model_block_2_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight"));
|
||||
vision_model.mm_model_block_2_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias"));
|
||||
vision_model.mm_model_block_2_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight"));
|
||||
vision_model.mm_model_block_2_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight"));
|
||||
vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias"));
|
||||
}
|
||||
else {
|
||||
std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
|
||||
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
|
||||
}
|
||||
|
||||
vision_model.layers.resize(hparams.n_layer);
|
||||
for (int il = 0; il < hparams.n_layer; ++il) {
|
||||
@@ -931,26 +1298,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
||||
|
||||
ggml_type type = GGML_TYPE_Q4_1;
|
||||
|
||||
switch (itype) {
|
||||
case 2:
|
||||
type = GGML_TYPE_Q4_0;
|
||||
break;
|
||||
case 3:
|
||||
type = GGML_TYPE_Q4_1;
|
||||
break;
|
||||
case 6:
|
||||
type = GGML_TYPE_Q5_0;
|
||||
break;
|
||||
case 7:
|
||||
type = GGML_TYPE_Q5_1;
|
||||
break;
|
||||
case 8:
|
||||
type = GGML_TYPE_Q8_0;
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype);
|
||||
return false;
|
||||
};
|
||||
assert(itype < GGML_TYPE_COUNT);
|
||||
type = static_cast<ggml_type>(itype);
|
||||
|
||||
auto * ctx_clip = clip_model_load(fname_inp, 2);
|
||||
|
||||
@@ -982,7 +1331,6 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
||||
".*weight",
|
||||
};
|
||||
|
||||
std::vector<uint8_t> read_data(512);
|
||||
std::vector<uint8_t> work(512);
|
||||
std::vector<float> conv_buf(512);
|
||||
std::vector<int64_t> hist_all(1 << 4, 0);
|
||||
@@ -1010,6 +1358,10 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
||||
|
||||
if (quantize) {
|
||||
new_type = type;
|
||||
if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) {
|
||||
new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type
|
||||
// fprintf(stderr, "%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
|
||||
}
|
||||
const size_t n_elms = ggml_nelements(cur);
|
||||
float * f32_data;
|
||||
|
||||
@@ -1054,6 +1406,21 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
||||
case GGML_TYPE_Q8_0: {
|
||||
new_size = ggml_quantize_q8_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
|
||||
} break;
|
||||
case GGML_TYPE_Q2_K: {
|
||||
new_size = ggml_quantize_q2_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
|
||||
} break;
|
||||
case GGML_TYPE_Q3_K: {
|
||||
new_size = ggml_quantize_q3_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
|
||||
} break;
|
||||
case GGML_TYPE_Q4_K: {
|
||||
new_size = ggml_quantize_q4_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
|
||||
} break;
|
||||
case GGML_TYPE_Q5_K: {
|
||||
new_size = ggml_quantize_q5_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
|
||||
} break;
|
||||
case GGML_TYPE_Q6_K: {
|
||||
new_size = ggml_quantize_q6_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
|
||||
} break;
|
||||
default: {
|
||||
fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, new_type);
|
||||
return false;
|
||||
@@ -1114,13 +1481,27 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
||||
}
|
||||
|
||||
int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
||||
return ctx->vision_model.mm_2_b->ne[0];
|
||||
if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
|
||||
return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0];
|
||||
}
|
||||
else if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
|
||||
return ctx->vision_model.mm_2_b->ne[0];
|
||||
} else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
|
||||
return ctx->vision_model.mm_3_b->ne[0];
|
||||
}
|
||||
else {
|
||||
std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
|
||||
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
|
||||
}
|
||||
}
|
||||
|
||||
int clip_n_patches(const struct clip_ctx * ctx) {
|
||||
auto & params = ctx->vision_model.hparams;
|
||||
|
||||
return (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
|
||||
int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
|
||||
if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
|
||||
n_patches /= 4;
|
||||
}
|
||||
return n_patches;
|
||||
}
|
||||
|
||||
size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
|
||||
|
||||
@@ -81,6 +81,7 @@ ap.add_argument("--vision-only", action="store_true", required=False,
|
||||
ap.add_argument("--clip_model_is_vision", action="store_true", required=False,
|
||||
help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
|
||||
ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
|
||||
ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp")
|
||||
ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values")
|
||||
ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values")
|
||||
ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
|
||||
@@ -174,6 +175,8 @@ elif args.vision_only and not has_llava_projector:
|
||||
fout.add_description("vision-only CLIP model")
|
||||
elif has_llava_projector:
|
||||
fout.add_description("image encoder for LLaVA")
|
||||
# add projector type
|
||||
fout.add_string("clip.projector_type", args.projector_type)
|
||||
else:
|
||||
fout.add_description("two-tower CLIP model")
|
||||
|
||||
@@ -218,7 +221,8 @@ if has_llava_projector:
|
||||
projector = torch.load(args.llava_projector)
|
||||
for name, data in projector.items():
|
||||
name = get_tensor_name(name)
|
||||
if data.ndim == 2:
|
||||
# pw and dw conv ndim==4
|
||||
if data.ndim == 2 or data.ndim == 4:
|
||||
data = data.squeeze().numpy().astype(np.float16)
|
||||
else:
|
||||
data = data.squeeze().numpy().astype(np.float32)
|
||||
|
||||
@@ -148,10 +148,35 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
|
||||
const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
|
||||
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx_llava->ctx_llama));
|
||||
|
||||
// llava chat format is "<system_prompt>\nUSER:<image_embeddings>\n<textual_prompt>\nASSISTANT:"
|
||||
eval_string(ctx_llava->ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params->n_batch, &n_past, add_bos);
|
||||
std::string system_prompt, user_prompt;
|
||||
size_t image_pos = prompt.find("<image>");
|
||||
if (image_pos != std::string::npos) {
|
||||
// new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
|
||||
|
||||
system_prompt = prompt.substr(0, image_pos);
|
||||
user_prompt = prompt.substr(image_pos + std::string("<image>").length());
|
||||
// We replace \n with actual newlines in user_prompt, just in case -e was not used in templating string
|
||||
size_t pos = 0;
|
||||
while ((pos = user_prompt.find("\\n", pos)) != std::string::npos) {
|
||||
user_prompt.replace(pos, 2, "\n");
|
||||
pos += 1; // Advance past the replaced newline
|
||||
}
|
||||
while ((pos = system_prompt.find("\\n", pos)) != std::string::npos) {
|
||||
system_prompt.replace(pos, 2, "\n");
|
||||
pos += 1; // Advance past the replaced newline
|
||||
}
|
||||
|
||||
printf("system_prompt: %s\n", system_prompt.c_str());
|
||||
printf("user_prompt: %s\n", user_prompt.c_str());
|
||||
} else {
|
||||
// llava-1.5 native mode
|
||||
system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:";
|
||||
user_prompt = prompt + "\nASSISTANT:";
|
||||
}
|
||||
|
||||
eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, add_bos);
|
||||
llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
|
||||
eval_string(ctx_llava->ctx_llama, (prompt + "\nASSISTANT:").c_str(), params->n_batch, &n_past, false);
|
||||
eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
|
||||
|
||||
// generate the response
|
||||
|
||||
@@ -162,6 +187,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
|
||||
for (int i = 0; i < max_tgt_len; i++) {
|
||||
const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
|
||||
if (strcmp(tmp, "</s>") == 0) break;
|
||||
if (strstr(tmp, "###")) break; // Yi-VL behavior
|
||||
|
||||
printf("%s", tmp);
|
||||
fflush(stdout);
|
||||
@@ -243,6 +269,9 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
auto image_embed = load_image(ctx_llava, ¶ms);
|
||||
if (!image_embed) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
// process the prompt
|
||||
process_prompt(ctx_llava, image_embed, ¶ms, params.prompt);
|
||||
|
||||
@@ -39,6 +39,17 @@ static std::ostringstream * g_output_ss;
|
||||
static std::vector<llama_token> * g_output_tokens;
|
||||
static bool is_interacting = false;
|
||||
|
||||
static bool file_exists(const std::string &path) {
|
||||
std::ifstream f(path.c_str());
|
||||
return f.good();
|
||||
}
|
||||
|
||||
static bool file_is_empty(const std::string &path) {
|
||||
std::ifstream f;
|
||||
f.exceptions(std::ifstream::failbit | std::ifstream::badbit);
|
||||
f.open(path.c_str(), std::ios::in | std::ios::binary | std::ios::ate);
|
||||
return f.tellg() == 0;
|
||||
}
|
||||
|
||||
static void write_logfile(
|
||||
const llama_context * ctx, const gpt_params & params, const llama_model * model,
|
||||
@@ -215,12 +226,12 @@ int main(int argc, char ** argv) {
|
||||
|
||||
if (!path_session.empty()) {
|
||||
LOG_TEE("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
|
||||
|
||||
// fopen to check for existing session
|
||||
FILE * fp = std::fopen(path_session.c_str(), "rb");
|
||||
if (fp != NULL) {
|
||||
std::fclose(fp);
|
||||
|
||||
if (!file_exists(path_session)) {
|
||||
LOG_TEE("%s: session file does not exist, will create.\n", __func__);
|
||||
} else if (file_is_empty(path_session)) {
|
||||
LOG_TEE("%s: The session file is empty. A new session will be initialized.\n", __func__);
|
||||
} else {
|
||||
// The file exists and is not empty
|
||||
session_tokens.resize(n_ctx);
|
||||
size_t n_token_count_out = 0;
|
||||
if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
|
||||
@@ -229,10 +240,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
session_tokens.resize(n_token_count_out);
|
||||
llama_set_rng_seed(ctx, params.seed);
|
||||
|
||||
LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());
|
||||
} else {
|
||||
LOG_TEE("%s: session file does not exist, will create\n", __func__);
|
||||
LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -439,6 +447,21 @@ int main(int argc, char ** argv) {
|
||||
LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
|
||||
LOG_TEE("sampling order: \n%s\n", llama_sampling_order_print(sparams).c_str());
|
||||
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
||||
|
||||
// group-attention state
|
||||
// number of grouped KV tokens so far (used only if params.grp_attn_n > 1)
|
||||
int ga_i = 0;
|
||||
|
||||
const int ga_n = params.grp_attn_n;
|
||||
const int ga_w = params.grp_attn_w;
|
||||
|
||||
if (ga_n != 1) {
|
||||
GGML_ASSERT(ga_n > 0 && "grp_attn_n must be positive"); // NOLINT
|
||||
GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n"); // NOLINT
|
||||
//GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT
|
||||
//GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT
|
||||
LOG_TEE("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w);
|
||||
}
|
||||
LOG_TEE("\n\n");
|
||||
|
||||
if (params.interactive) {
|
||||
@@ -462,6 +485,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
bool is_antiprompt = false;
|
||||
bool input_echo = true;
|
||||
bool display = true;
|
||||
bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < embd_inp.size();
|
||||
|
||||
int n_past = 0;
|
||||
@@ -476,6 +500,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// the first thing we will do is to output the prompt, so set color accordingly
|
||||
console::set_display(console::prompt);
|
||||
display = params.display_prompt;
|
||||
|
||||
std::vector<llama_token> embd;
|
||||
std::vector<llama_token> embd_guidance;
|
||||
@@ -485,7 +510,7 @@ int main(int argc, char ** argv) {
|
||||
while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
|
||||
// predict
|
||||
if (!embd.empty()) {
|
||||
// Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
|
||||
// Note: (n_ctx - 4) here is to match the logic for commandline prompt handling via
|
||||
// --prompt or --file which uses the same value.
|
||||
int max_embd_size = n_ctx - 4;
|
||||
|
||||
@@ -500,37 +525,61 @@ int main(int argc, char ** argv) {
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
// infinite text generation via context swapping
|
||||
// if we run out of context:
|
||||
// - take the n_keep first tokens from the original prompt (via n_past)
|
||||
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
|
||||
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
|
||||
if (params.n_predict == -2) {
|
||||
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
|
||||
break;
|
||||
if (ga_n == 1) {
|
||||
// infinite text generation via context shifting
|
||||
// if we run out of context:
|
||||
// - take the n_keep first tokens from the original prompt (via n_past)
|
||||
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
|
||||
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
|
||||
if (params.n_predict == -2) {
|
||||
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
|
||||
break;
|
||||
}
|
||||
|
||||
const int n_left = n_past - params.n_keep - 1;
|
||||
const int n_discard = n_left/2;
|
||||
|
||||
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
|
||||
n_past, n_left, n_ctx, params.n_keep, n_discard);
|
||||
|
||||
llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
|
||||
llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
|
||||
|
||||
n_past -= n_discard;
|
||||
|
||||
if (ctx_guidance) {
|
||||
n_past_guidance -= n_discard;
|
||||
}
|
||||
|
||||
LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
|
||||
|
||||
LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
|
||||
|
||||
LOG("clear session path\n");
|
||||
path_session.clear();
|
||||
}
|
||||
} else {
|
||||
// context extension via Self-Extend
|
||||
while (n_past >= ga_i + ga_w) {
|
||||
const int ib = (ga_n*ga_i)/ga_w;
|
||||
const int bd = (ga_w/ga_n)*(ga_n - 1);
|
||||
const int dd = (ga_w/ga_n) - ib*bd - ga_w;
|
||||
|
||||
const int n_left = n_past - params.n_keep - 1;
|
||||
const int n_discard = n_left/2;
|
||||
LOG("\n");
|
||||
LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd);
|
||||
LOG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
|
||||
LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);
|
||||
|
||||
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
|
||||
n_past, n_left, n_ctx, params.n_keep, n_discard);
|
||||
llama_kv_cache_seq_shift(ctx, 0, ga_i, n_past, ib*bd);
|
||||
llama_kv_cache_seq_div (ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
|
||||
llama_kv_cache_seq_shift(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd);
|
||||
|
||||
llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
|
||||
llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
|
||||
n_past -= bd;
|
||||
|
||||
n_past -= n_discard;
|
||||
ga_i += ga_w/ga_n;
|
||||
|
||||
if (ctx_guidance) {
|
||||
n_past_guidance -= n_discard;
|
||||
LOG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i);
|
||||
}
|
||||
|
||||
LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
|
||||
|
||||
LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
|
||||
|
||||
LOG("clear session path\n");
|
||||
path_session.clear();
|
||||
}
|
||||
|
||||
// try to reuse a matching prefix from the loaded session instead of re-eval (via n_past)
|
||||
@@ -611,6 +660,10 @@ int main(int argc, char ** argv) {
|
||||
n_past += n_eval;
|
||||
|
||||
LOG("n_past = %d\n", n_past);
|
||||
// Display total tokens alongside total time
|
||||
if (params.n_print > 0 && n_past % params.n_print == 0) {
|
||||
LOG_TEE("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx);
|
||||
}
|
||||
}
|
||||
|
||||
if (!embd.empty() && !path_session.empty()) {
|
||||
@@ -664,7 +717,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// display text
|
||||
if (input_echo) {
|
||||
if (input_echo && display) {
|
||||
for (auto id : embd) {
|
||||
const std::string token_str = llama_token_to_piece(ctx, id);
|
||||
printf("%s", token_str.c_str());
|
||||
@@ -681,6 +734,7 @@ int main(int argc, char ** argv) {
|
||||
// reset color to default if there is no pending user input
|
||||
if (input_echo && (int) embd_inp.size() == n_consumed) {
|
||||
console::set_display(console::reset);
|
||||
display = true;
|
||||
}
|
||||
|
||||
// if not currently processing queued inputs;
|
||||
@@ -753,6 +807,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// color user input only
|
||||
console::set_display(console::user_input);
|
||||
display = params.display_prompt;
|
||||
|
||||
std::string line;
|
||||
bool another_line = true;
|
||||
@@ -763,6 +818,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// done taking input, reset color
|
||||
console::set_display(console::reset);
|
||||
display = true;
|
||||
|
||||
// Add tokens to embd only if the input buffer is non-empty
|
||||
// Entering a empty line lets the user pass control back
|
||||
|
||||
@@ -1,4 +0,0 @@
|
||||
set(TEST_TARGET metal)
|
||||
add_executable(${TEST_TARGET} metal.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TEST_TARGET} PRIVATE ggml)
|
||||
@@ -1,103 +0,0 @@
|
||||
// Evaluate a statically exported ggml computation graph with Metal
|
||||
//
|
||||
// - First, export a LLaMA graph:
|
||||
//
|
||||
// $ ./bin/main -m ../models/7B/ggml-model-q4_0.gguf --export
|
||||
//
|
||||
// - Run this tool to evaluate the exported graph:
|
||||
//
|
||||
// $ ./bin/metal llama.ggml
|
||||
//
|
||||
// The purpose of this tool is mostly for debugging and demonstration purposes.
|
||||
// The main limitation of exporting computation graphs is that their sizes are static which often
|
||||
// can be a problem for real-world applications.
|
||||
//
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-metal.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <cstdlib>
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
ggml_time_init();
|
||||
|
||||
if (argc != 2) {
|
||||
fprintf(stderr, "Usage: %s llama.ggml\n", argv[0]);
|
||||
return -1;
|
||||
}
|
||||
|
||||
const char * fname_cgraph = argv[1];
|
||||
|
||||
// load the compute graph
|
||||
struct ggml_context * ctx_data = NULL;
|
||||
struct ggml_context * ctx_eval = NULL;
|
||||
|
||||
struct ggml_cgraph * gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);
|
||||
|
||||
// this allocates all Metal resources and memory buffers
|
||||
auto * ctx_metal = ggml_metal_init(1);
|
||||
|
||||
const size_t max_size_data = ggml_get_max_tensor_size(ctx_data);
|
||||
const size_t max_size_eval = ggml_get_max_tensor_size(ctx_eval);
|
||||
ggml_metal_add_buffer(ctx_metal, "data", ggml_get_mem_buffer(ctx_data), ggml_get_mem_size(ctx_data), max_size_data);
|
||||
ggml_metal_add_buffer(ctx_metal, "eval", ggml_get_mem_buffer(ctx_eval), ggml_get_mem_size(ctx_eval), max_size_eval);
|
||||
|
||||
// main
|
||||
{
|
||||
struct ggml_tensor * input = ggml_graph_get_tensor(gf, "embd");
|
||||
*(int32_t *) input->data = 1; // BOS
|
||||
|
||||
ggml_metal_set_tensor(ctx_metal, input);
|
||||
|
||||
// warmup
|
||||
ggml_metal_graph_compute(ctx_metal, gf);
|
||||
|
||||
const int n_iter = 16;
|
||||
|
||||
const int64_t t0 = ggml_time_us();
|
||||
|
||||
// the actual inference happens here
|
||||
for (int i = 0; i < n_iter; ++i) {
|
||||
ggml_metal_graph_compute(ctx_metal, gf);
|
||||
}
|
||||
|
||||
const int64_t t1 = ggml_time_us();
|
||||
|
||||
printf("time: %.2f ms, %.2f ms/tok\n", (t1 - t0) / 1000.0, (t1 - t0) / 1000.0 / n_iter);
|
||||
}
|
||||
|
||||
// debug output
|
||||
{
|
||||
struct ggml_tensor * logits = gf->nodes[gf->n_nodes - 1];
|
||||
ggml_metal_get_tensor(ctx_metal, logits);
|
||||
|
||||
float * ptr = (float *) ggml_get_data(logits);
|
||||
|
||||
printf("logits: ");
|
||||
for (int i = 0; i < 10; i++) {
|
||||
printf("%8.4f ", ptr[i]);
|
||||
}
|
||||
printf("\n");
|
||||
int imax = 0;
|
||||
double sum = 0.0;
|
||||
double vmax = -1e9;
|
||||
for (int i = 0; i < 32000; i++) {
|
||||
sum += (double) ptr[i];
|
||||
if (ptr[i] > vmax) {
|
||||
vmax = ptr[i];
|
||||
imax = i;
|
||||
}
|
||||
}
|
||||
printf("sum: %f, imax = %d, vmax = %f\n", sum, imax, vmax);
|
||||
}
|
||||
|
||||
ggml_metal_free(ctx_metal);
|
||||
|
||||
ggml_free(ctx_data);
|
||||
ggml_free(ctx_eval);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||