Compare commits

..

1 Commits

Author SHA1 Message Date
Jared Van Bortel
7c87353e61 common : remove incorrect --model-draft default 2023-12-21 12:17:12 -05:00
175 changed files with 12826 additions and 25283 deletions

View File

@@ -14,8 +14,7 @@ ARG CUDA_DOCKER_ARCH=all
RUN apt-get update && \
apt-get install -y build-essential python3 python3-pip git
COPY requirements.txt requirements.txt
COPY requirements requirements
COPY requirements.txt requirements.txt
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt

View File

@@ -23,8 +23,7 @@ ARG ROCM_DOCKER_ARCH=\
gfx1101 \
gfx1102
COPY requirements.txt requirements.txt
COPY requirements requirements
COPY requirements.txt requirements.txt
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt

View File

@@ -5,8 +5,7 @@ FROM ubuntu:$UBUNTU_VERSION as build
RUN apt-get update && \
apt-get install -y build-essential python3 python3-pip git
COPY requirements.txt requirements.txt
COPY requirements requirements
COPY requirements.txt requirements.txt
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt

View File

@@ -23,8 +23,7 @@ ARG ROCM_DOCKER_ARCH=\
gfx1101 \
gfx1102
COPY requirements.txt requirements.txt
COPY requirements requirements
COPY requirements.txt requirements.txt
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt

View File

@@ -1,22 +0,0 @@
{
perSystem =
{ config, lib, ... }:
{
apps =
let
inherit (config.packages) default;
binaries = [
"llama"
"llama-embedding"
"llama-server"
"quantize"
"train-text-from-scratch"
];
mkApp = name: {
type = "app";
program = "${default}/bin/${name}";
};
in
lib.genAttrs binaries mkApp;
};
}

View File

@@ -1,13 +0,0 @@
{
perSystem =
{ config, lib, ... }:
{
devShells =
lib.concatMapAttrs
(name: package: {
${name} = package.passthru.shell;
${name + "-extra"} = package.passthru.shell-extra;
})
config.packages;
};
}

View File

@@ -1,39 +0,0 @@
{ inputs, ... }:
{
perSystem =
{
config,
system,
lib,
pkgsCuda,
...
}:
{
legacyPackages =
let
caps.llamaPackagesXavier = "7.2";
caps.llamaPackagesOrin = "8.7";
caps.llamaPackagesTX2 = "6.2";
caps.llamaPackagesNano = "5.3";
pkgsFor =
cap:
import inputs.nixpkgs {
inherit system;
config = {
cudaSupport = true;
cudaCapabilities = [ cap ];
cudaEnableForwardCompat = false;
inherit (pkgsCuda.config) allowUnfreePredicate;
};
};
in
builtins.mapAttrs (name: cap: (pkgsFor cap).callPackage ./scope.nix { }) caps;
packages = lib.optionalAttrs (system == "aarch64-linux") {
jetson-xavier = config.legacyPackages.llamaPackagesXavier.llama-cpp;
jetson-orin = config.legacyPackages.llamaPackagesOrin.llama-cpp;
jetson-nano = config.legacyPackages.llamaPackagesNano.llama-cpp;
};
};
}

View File

@@ -1,35 +0,0 @@
{ inputs, ... }:
{
# The _module.args definitions are passed on to modules as arguments. E.g.
# the module `{ pkgs ... }: { /* config */ }` implicitly uses
# `_module.args.pkgs` (defined in this case by flake-parts).
perSystem =
{ system, ... }:
{
_module.args = {
pkgsCuda = import inputs.nixpkgs {
inherit system;
# Ensure dependencies use CUDA consistently (e.g. that openmpi, ucc,
# and ucx are built with CUDA support)
config.cudaSupport = true;
config.allowUnfreePredicate =
p:
builtins.all
(
license:
license.free
|| builtins.elem license.shortName [
"CUDA EULA"
"cuDNN EULA"
]
)
(p.meta.licenses or [ p.meta.license ]);
};
# Ensure dependencies use ROCm consistently
pkgsRocm = import inputs.nixpkgs {
inherit system;
config.rocmSupport = true;
};
};
};
}

View File

@@ -1,265 +0,0 @@
{
lib,
config,
stdenv,
mkShell,
cmake,
ninja,
pkg-config,
git,
python3,
mpi,
openblas, # TODO: Use the generic `blas` so users could switch between alternative implementations
cudaPackages,
darwin,
rocmPackages,
clblast,
useBlas ? builtins.all (x: !x) [
useCuda
useMetalKit
useOpenCL
useRocm
],
useCuda ? config.cudaSupport,
useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
useMpi ? false, # Increases the runtime closure size by ~700M
useOpenCL ? false,
useRocm ? config.rocmSupport,
llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
}@inputs:
let
inherit (lib)
cmakeBool
cmakeFeature
optionals
strings
versionOlder
;
# It's necessary to consistently use backendStdenv when building with CUDA support,
# otherwise we get libstdc++ errors downstream.
stdenv = throw "Use effectiveStdenv instead";
effectiveStdenv = if useCuda then cudaPackages.backendStdenv else inputs.stdenv;
suffices =
lib.optionals useBlas [ "BLAS" ]
++ lib.optionals useCuda [ "CUDA" ]
++ lib.optionals useMetalKit [ "MetalKit" ]
++ lib.optionals useMpi [ "MPI" ]
++ lib.optionals useOpenCL [ "OpenCL" ]
++ lib.optionals useRocm [ "ROCm" ];
pnameSuffix =
strings.optionalString (suffices != [ ])
"-${strings.concatMapStringsSep "-" strings.toLower suffices}";
descriptionSuffix =
strings.optionalString (suffices != [ ])
", accelerated with ${strings.concatStringsSep ", " suffices}";
# TODO: package the Python in this repository in a Nix-like way.
# It'd be nice to migrate to buildPythonPackage, as well as ensure this repo
# is PEP 517-compatible, and ensure the correct .dist-info is generated.
# https://peps.python.org/pep-0517/
llama-python = python3.withPackages (
ps: [
ps.numpy
ps.sentencepiece
]
);
# TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime
llama-python-extra = python3.withPackages (
ps: [
ps.numpy
ps.sentencepiece
ps.torchWithoutCuda
ps.transformers
]
);
# apple_sdk is supposed to choose sane defaults, no need to handle isAarch64
# separately
darwinBuildInputs =
with darwin.apple_sdk.frameworks;
[
Accelerate
CoreVideo
CoreGraphics
]
++ optionals useMetalKit [ MetalKit ];
cudaBuildInputs = with cudaPackages; [
cuda_cccl.dev # <nv/target>
# A temporary hack for reducing the closure size, remove once cudaPackages
# have stopped using lndir: https://github.com/NixOS/nixpkgs/issues/271792
cuda_cudart.dev
cuda_cudart.lib
cuda_cudart.static
libcublas.dev
libcublas.lib
libcublas.static
];
rocmBuildInputs = with rocmPackages; [
clr
hipblas
rocblas
];
in
effectiveStdenv.mkDerivation (
finalAttrs: {
pname = "llama-cpp${pnameSuffix}";
version = llamaVersion;
src = lib.cleanSourceWith {
filter =
name: type:
!(builtins.any (_: _) [
(lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths
(name == "README.md") # Ignore *.md changes whe computing outPaths
(lib.hasPrefix "." name) # Skip hidden files and directories
]);
src = lib.cleanSource ../../.;
};
postPatch = ''
substituteInPlace ./ggml-metal.m \
--replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
# TODO: Package up each Python script or service appropriately.
# If we were to migrate to buildPythonPackage and prepare the `pyproject.toml`,
# we could make those *.py into setuptools' entrypoints
substituteInPlace ./*.py --replace "/usr/bin/env python" "${llama-python}/bin/python"
'';
nativeBuildInputs =
[
cmake
ninja
pkg-config
git
]
++ optionals useCuda [
cudaPackages.cuda_nvcc
# TODO: Replace with autoAddDriverRunpath
# once https://github.com/NixOS/nixpkgs/pull/275241 has been merged
cudaPackages.autoAddOpenGLRunpathHook
];
buildInputs =
optionals effectiveStdenv.isDarwin darwinBuildInputs
++ optionals useCuda cudaBuildInputs
++ optionals useMpi [ mpi ]
++ optionals useOpenCL [ clblast ]
++ optionals useRocm rocmBuildInputs;
cmakeFlags =
[
(cmakeBool "LLAMA_NATIVE" true)
(cmakeBool "LLAMA_BUILD_SERVER" true)
(cmakeBool "BUILD_SHARED_LIBS" true)
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
(cmakeBool "LLAMA_BLAS" useBlas)
(cmakeBool "LLAMA_CLBLAST" useOpenCL)
(cmakeBool "LLAMA_CUBLAS" useCuda)
(cmakeBool "LLAMA_HIPBLAS" useRocm)
(cmakeBool "LLAMA_METAL" useMetalKit)
(cmakeBool "LLAMA_MPI" useMpi)
]
++ optionals useCuda [
(
with cudaPackages.flags;
cmakeFeature "CMAKE_CUDA_ARCHITECTURES" (
builtins.concatStringsSep ";" (map dropDot cudaCapabilities)
)
)
]
++ optionals useRocm [
(cmakeFeature "CMAKE_C_COMPILER" "hipcc")
(cmakeFeature "CMAKE_CXX_COMPILER" "hipcc")
# Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM
# in https://github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt
# and select the line that matches the current nixpkgs version of rocBLAS.
# Should likely use `rocmPackages.clr.gpuTargets`.
"-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
]
++ optionals useMetalKit [ (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1") ]
++ optionals useBlas [ (lib.cmakeFeature "LLAMA_BLAS_VENDOR" "OpenBLAS") ];
# TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
# if they haven't been added yet.
postInstall = ''
mv $out/bin/main $out/bin/llama
mv $out/bin/server $out/bin/llama-server
mkdir -p $out/include
cp $src/llama.h $out/include/
'';
# Define the shells here, but don't add in the inputsFrom to avoid recursion.
passthru = {
inherit
useBlas
useCuda
useMetalKit
useMpi
useOpenCL
useRocm
;
shell = mkShell {
name = "shell-${finalAttrs.finalPackage.name}";
description = "contains numpy and sentencepiece";
buildInputs = [ llama-python ];
inputsFrom = [ finalAttrs.finalPackage ];
};
shell-extra = mkShell {
name = "shell-extra-${finalAttrs.finalPackage.name}";
description = "contains numpy, sentencepiece, torchWithoutCuda, and transformers";
buildInputs = [ llama-python-extra ];
inputsFrom = [ finalAttrs.finalPackage ];
};
};
meta = {
# Configurations we don't want even the CI to evaluate. Results in the
# "unsupported platform" messages. This is mostly a no-op, because
# cudaPackages would've refused to evaluate anyway.
badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin;
# Configurations that are known to result in build failures. Can be
# overridden by importing Nixpkgs with `allowBroken = true`.
broken = (useMetalKit && !effectiveStdenv.isDarwin);
description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
homepage = "https://github.com/ggerganov/llama.cpp/";
license = lib.licenses.mit;
# Accommodates `nix run` and `lib.getExe`
mainProgram = "llama";
# These people might respond, on the best effort basis, if you ping them
# in case of Nix-specific regressions or for reviewing Nix-specific PRs.
# Consider adding yourself to this list if you want to ensure this flake
# stays maintained and you're willing to invest your time. Do not add
# other people without their consent. Consider removing people after
# they've been unreachable for long periods of time.
# Note that lib.maintainers is defined in Nixpkgs, but you may just add
# an attrset following the same format as in
# https://github.com/NixOS/nixpkgs/blob/f36a80e54da29775c78d7eff0e628c2b4e34d1d7/maintainers/maintainer-list.nix
maintainers = with lib.maintainers; [
philiptaron
SomeoneSerge
];
# Extend `badPlatforms` instead
platforms = lib.platforms.all;
};
}
)

View File

@@ -1,12 +0,0 @@
{
lib,
newScope,
llamaVersion ? "0.0.0",
}:
lib.makeScope newScope (
self: {
inherit llamaVersion;
llama-cpp = self.callPackage ./package.nix { };
}
)

View File

@@ -6,4 +6,179 @@ assignees: ''
---
Please include information about your system, the steps to reproduce the bug, and the version of llama.cpp that you are using. If possible, please provide a minimal code example that reproduces the bug.
# Prerequisites
Please answer the following questions for yourself before submitting an issue.
- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now.
- [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed).
- [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new bug or useful enhancement to share.
# Expected Behavior
Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do.
# Current Behavior
Please provide a detailed written description of what `llama.cpp` did, instead.
# Environment and Context
Please provide detailed information about your computer setup. This is important in case the issue is not reproducible except for under certain specific conditions.
* Physical (or virtual) hardware you are using, e.g. for Linux:
`$ lscpu`
* Operating System, e.g. for Linux:
`$ uname -a`
* SDK version, e.g. for Linux:
```
$ python3 --version
$ make --version
$ g++ --version
```
# Failure Information (for bugs)
Please help provide information about the failure / bug.
# Steps to Reproduce
Please provide detailed steps for reproducing the issue. We are not sitting in front of your screen, so the more detail the better.
1. step 1
2. step 2
3. step 3
4. etc.
# Failure Logs
Please include any relevant log snippets or files. If it works under one configuration but not under another, please provide logs for both configurations and their corresponding outputs so it is easy to see where behavior changes.
Also, please try to **avoid using screenshots** if at all possible. Instead, copy/paste the console output and use [Github's markdown](https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax) to cleanly format your logs for easy readability.
Example environment info:
```
llama.cpp$ git log | head -1
commit 2af23d30434a677c6416812eea52ccc0af65119c
llama.cpp$ lscpu | egrep "AMD|Flags"
Vendor ID: AuthenticAMD
Model name: AMD Ryzen Threadripper 1950X 16-Core Processor
Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid amd_dcm aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb hw_pstate ssbd ibpb vmmcall fsgsbase bmi1 avx2 smep bmi2 rdseed adx smap clflushopt sha_ni xsaveopt xsavec xgetbv1 xsaves clzero irperf xsaveerptr arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif overflow_recov succor smca sme sev
Virtualization: AMD-V
llama.cpp$ python3 --version
Python 3.10.9
llama.cpp$ pip list | egrep "torch|numpy|sentencepiece"
numpy 1.24.2
numpydoc 1.5.0
sentencepiece 0.1.97
torch 1.13.1
torchvision 0.14.1
llama.cpp$ make --version | head -1
GNU Make 4.3
$ md5sum ./models/65B/ggml-model-q4_0.bin
dbdd682cce80e2d6e93cefc7449df487 ./models/65B/ggml-model-q4_0.bin
```
Example run with the Linux command [perf](https://www.brendangregg.com/perf.html)
```
llama.cpp$ perf stat ./main -m ./models/65B/ggml-model-q4_0.bin -t 16 -n 1024 -p "Please close your issue when it has been answered."
main: seed = 1679149377
llama_model_load: loading model from './models/65B/ggml-model-q4_0.bin' - please wait ...
llama_model_load: n_vocab = 32000
llama_model_load: n_ctx = 512
llama_model_load: n_embd = 8192
llama_model_load: n_mult = 256
llama_model_load: n_head = 64
llama_model_load: n_layer = 80
llama_model_load: n_rot = 128
llama_model_load: f16 = 2
llama_model_load: n_ff = 22016
llama_model_load: n_parts = 8
llama_model_load: ggml ctx size = 41477.73 MB
llama_model_load: memory_size = 2560.00 MB, n_mem = 40960
llama_model_load: loading model part 1/8 from './models/65B/ggml-model-q4_0.bin'
llama_model_load: .......................................................................................... done
llama_model_load: model size = 4869.09 MB / num tensors = 723
llama_model_load: loading model part 2/8 from './models/65B/ggml-model-q4_0.bin.1'
llama_model_load: .......................................................................................... done
llama_model_load: model size = 4869.09 MB / num tensors = 723
llama_model_load: loading model part 3/8 from './models/65B/ggml-model-q4_0.bin.2'
llama_model_load: .......................................................................................... done
llama_model_load: model size = 4869.09 MB / num tensors = 723
llama_model_load: loading model part 4/8 from './models/65B/ggml-model-q4_0.bin.3'
llama_model_load: .......................................................................................... done
llama_model_load: model size = 4869.09 MB / num tensors = 723
llama_model_load: loading model part 5/8 from './models/65B/ggml-model-q4_0.bin.4'
llama_model_load: .......................................................................................... done
llama_model_load: model size = 4869.09 MB / num tensors = 723
llama_model_load: loading model part 6/8 from './models/65B/ggml-model-q4_0.bin.5'
llama_model_load: .......................................................................................... done
llama_model_load: model size = 4869.09 MB / num tensors = 723
llama_model_load: loading model part 7/8 from './models/65B/ggml-model-q4_0.bin.6'
llama_model_load: .......................................................................................... done
llama_model_load: model size = 4869.09 MB / num tensors = 723
llama_model_load: loading model part 8/8 from './models/65B/ggml-model-q4_0.bin.7'
llama_model_load: .......................................................................................... done
llama_model_load: model size = 4869.09 MB / num tensors = 723
system_info: n_threads = 16 / 32 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 |
main: prompt: 'Please close your issue when it has been answered.'
main: number of tokens in prompt = 11
1 -> ''
12148 -> 'Please'
3802 -> ' close'
596 -> ' your'
2228 -> ' issue'
746 -> ' when'
372 -> ' it'
756 -> ' has'
1063 -> ' been'
7699 -> ' answered'
29889 -> '.'
sampling parameters: temp = 0.800000, top_k = 40, top_p = 0.950000, repeat_last_n = 64, repeat_penalty = 1.300000
Please close your issue when it has been answered.
@duncan-donut: I'm trying to figure out what kind of "support" you need for this script and why, exactly? Is there a question about how the code works that hasn't already been addressed in one or more comments below this ticket, or are we talking something else entirely like some sorta bugfixing job because your server setup is different from mine??
I can understand if your site needs to be running smoothly and you need help with a fix of sorts but there should really be nothing wrong here that the code itself could not handle. And given that I'm getting reports about how it works perfectly well on some other servers, what exactly are we talking? A detailed report will do wonders in helping us get this resolved for ya quickly so please take your time and describe the issue(s) you see as clearly & concisely as possible!!
@duncan-donut: I'm not sure if you have access to cPanel but you could try these instructions. It is worth a shot! Let me know how it goes (or what error message, exactly!) when/if ya give that code a go? [end of text]
main: mem per token = 71159620 bytes
main: load time = 19309.95 ms
main: sample time = 168.62 ms
main: predict time = 223895.61 ms / 888.47 ms per token
main: total time = 246406.42 ms
Performance counter stats for './main -m ./models/65B/ggml-model-q4_0.bin -t 16 -n 1024 -p Please close your issue when it has been answered.':
3636882.89 msec task-clock # 14.677 CPUs utilized
13509 context-switches # 3.714 /sec
2436 cpu-migrations # 0.670 /sec
10476679 page-faults # 2.881 K/sec
13133115082869 cycles # 3.611 GHz (16.77%)
29314462753 stalled-cycles-frontend # 0.22% frontend cycles idle (16.76%)
10294402631459 stalled-cycles-backend # 78.39% backend cycles idle (16.74%)
23479217109614 instructions # 1.79 insn per cycle
# 0.44 stalled cycles per insn (16.76%)
2353072268027 branches # 647.002 M/sec (16.77%)
1998682780 branch-misses # 0.08% of all branches (16.76%)
247.802177522 seconds time elapsed
3618.573072000 seconds user
18.491698000 seconds sys
```

View File

@@ -515,30 +515,6 @@ jobs:
- name: Build Xcode project
run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
android-build:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v3
- name: Set up JDK
uses: actions/setup-java@v3
with:
java-version: 17
distribution: zulu
- name: Setup Android SDK
uses: android-actions/setup-android@v3
with:
log-accepted-android-sdk-licenses: false
- name: Build
run: |
cd examples/llama.android
# Skip armeabi-v7a for now (https://github.com/llvm/llvm-project/issues/65820).
./gradlew build --no-daemon -Pskip-armeabi-v7a
# freeBSD-latest:
# runs-on: macos-12

View File

@@ -52,36 +52,6 @@ jobs:
username: ${{ github.repository_owner }}
password: ${{ secrets.GITHUB_TOKEN }}
# https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example
- name: Free Disk Space (Ubuntu)
uses: jlumbroso/free-disk-space@main
with:
# this might remove tools that are actually needed,
# if set to "true" but frees about 6 GB
tool-cache: false
# all of these default to true, but feel free to set to
# "false" if necessary for your workflow
android: true
dotnet: true
haskell: true
large-packages: true
docker-images: true
swap-storage: true
- name: Determine tag name
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
else
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi
- name: Build and push Docker image (versioned)
if: github.event_name == 'push'
uses: docker/build-push-action@v4
@@ -89,7 +59,7 @@ jobs:
context: .
push: true
platforms: ${{ matrix.config.platforms }}
tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
file: ${{ matrix.config.dockerfile }}
- name: Build and push Docker image (tagged)
@@ -98,5 +68,5 @@ jobs:
context: .
push: ${{ github.event_name == 'push' }}
platforms: ${{ matrix.config.platforms }}
tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}"
file: ${{ matrix.config.dockerfile }}

View File

@@ -1,55 +0,0 @@
name: Nix aarch64 builds
on:
workflow_dispatch: # allows manual triggering
push:
branches:
- master
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
pull_request:
types: [opened, synchronize, reopened]
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
jobs:
nix-build-aarch64:
if: ${{ vars.CACHIX_NAME != '' }}
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Install QEMU
# Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654
run: |
sudo apt-get update
sudo apt-get install -y qemu-user-static qemu-system-aarch64
sudo usermod -a -G kvm $USER
- name: Install Nix
uses: DeterminateSystems/nix-installer-action@v9
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
extra-conf: |
extra-platforms = aarch64-linux
extra-system-features = nixos-test kvm
extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
- uses: DeterminateSystems/magic-nix-cache-action@v2
with:
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
- name: Set-up cachix to push the results to
uses: cachix/cachix-action@v13
with:
authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
name: ${{ vars.CACHIX_NAME }}
- name: Show all output paths
run: >
nix run github:nix-community/nix-eval-jobs
-- --gc-roots-dir gcroot
--flake
".#packages.aarch64-linux"
- name: Build
run: >
nix run github:Mic92/nix-fast-build
-- --skip-cached --no-nom
--systems aarch64-linux
--flake
".#checks.aarch64-linux"

View File

@@ -1,71 +0,0 @@
name: Nix CI
on:
workflow_dispatch: # allows manual triggering
push:
branches:
- master
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
pull_request:
types: [opened, synchronize, reopened]
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
jobs:
nix-eval:
strategy:
fail-fast: false
matrix:
os: [ ubuntu-latest, macos-latest ]
runs-on: ${{ matrix.os }}
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Install Nix
uses: DeterminateSystems/nix-installer-action@v9
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
extra-conf: |
extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
- uses: DeterminateSystems/magic-nix-cache-action@v2
with:
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
- name: List all flake outputs
run: nix flake show --all-systems
- name: Show all output paths
run: >
nix run github:nix-community/nix-eval-jobs
-- --gc-roots-dir gcroot
--flake
".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)"
nix-build:
if: ${{ vars.CACHIX_NAME != '' }}
strategy:
fail-fast: false
matrix:
os: [ ubuntu-latest, macos-latest ]
runs-on: ${{ matrix.os }}
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Install Nix
uses: DeterminateSystems/nix-installer-action@v9
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
extra-conf: |
extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
- uses: DeterminateSystems/magic-nix-cache-action@v2
with:
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
- name: Set-up cachix to push the results to
uses: cachix/cachix-action@v13
with:
authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
name: ${{ vars.CACHIX_NAME }}
- name: Build
run: >
nix run github:Mic92/nix-fast-build
-- --skip-cached --no-nom
--flake
".#checks.$(nix eval --raw --impure --expr builtins.currentSystem)"

View File

@@ -1,22 +0,0 @@
name: update-flake-lock
on:
workflow_dispatch:
schedule:
- cron: '0 0 * * 0' # runs weekly on Sunday at 00:00
jobs:
lockfile:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Install Nix
uses: DeterminateSystems/nix-installer-action@main
- name: Update flake.lock
uses: DeterminateSystems/update-flake-lock@main
with:
pr-title: "nix: update flake.lock"
pr-labels: |
nix
pr-reviewers: philiptaron,SomeoneSerge
token: ${{ secrets.FLAKE_TOKEN }}

View File

@@ -1,36 +0,0 @@
# Make the flake discoverable on https://flakestry.dev and https://flakehub.com/flakes
name: "Publish a flake to flakestry & flakehub"
on:
push:
tags:
- "*"
workflow_dispatch:
inputs:
tag:
description: "The existing tag to publish"
type: "string"
required: true
jobs:
flakestry-publish:
runs-on: ubuntu-latest
permissions:
id-token: "write"
contents: "read"
steps:
- uses: flakestry/flakestry-publish@main
with:
version: "${{ inputs.tag || github.ref_name }}"
flakehub-publish:
runs-on: "ubuntu-latest"
permissions:
id-token: "write"
contents: "read"
steps:
- uses: "actions/checkout@v4"
with:
ref: "${{ (inputs.tag != null) && format('refs/tags/{0}', inputs.tag) || '' }}"
- uses: "DeterminateSystems/nix-installer-action@main"
- uses: "DeterminateSystems/flakehub-push@main"
with:
visibility: "public"
tag: "${{ inputs.tag }}"

View File

@@ -1,29 +0,0 @@
name: Python check requirements.txt
on:
push:
paths:
- 'scripts/check-requirements.sh'
- 'convert*.py'
- 'requirements.txt'
- 'requirements/*.txt'
pull_request:
paths:
- 'scripts/check-requirements.sh'
- 'convert*.py'
- 'requirements.txt'
- 'requirements/*.txt'
jobs:
python-check-requirements:
runs-on: ubuntu-latest
name: check-requirements
steps:
- name: Check out source repository
uses: actions/checkout@v3
- name: Set up Python environment
uses: actions/setup-python@v4
with:
python-version: "3.11"
- name: Run check-requirements.sh script
run: bash scripts/check-requirements.sh nocleanup

3
.gitignore vendored
View File

@@ -43,16 +43,13 @@ models-mnt
/embedding
/gguf
/gguf-llama-simple
/imatrix
/infill
/libllama.so
/llama-bench
/llava-cli
/lookahead
/lookup
/main
/metal
/passkey
/perplexity
/q8dot
/quantize

View File

@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
cmake_minimum_required(VERSION 3.13) # for add_link_options
project("llama.cpp" C CXX)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -76,10 +76,6 @@ if (NOT MSVC)
option(LLAMA_F16C "llama: enable F16C" ${INS_ENB})
endif()
if (WIN32)
option(LLAMA_WIN_VER "llama: Windows Version" 0x602)
endif()
# 3rd party libs
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
option(LLAMA_BLAS "llama: use BLAS" OFF)
@@ -95,11 +91,9 @@ set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for
set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
"llama: max. batch size for using peer access")
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF)
option(LLAMA_MPI "llama: use MPI" OFF)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
@@ -159,9 +153,9 @@ if (APPLE AND LLAMA_ACCELERATE)
endif()
if (LLAMA_METAL)
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
find_library(METAL_FRAMEWORK Metal REQUIRED)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
find_library(METAL_FRAMEWORK Metal REQUIRED)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
message(STATUS "Metal framework found")
set(GGML_HEADERS_METAL ggml-metal.h)
@@ -178,35 +172,6 @@ if (LLAMA_METAL)
# copy ggml-metal.metal to bin directory
configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
if (LLAMA_METAL_SHADER_DEBUG)
# custom command to do the following:
# xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
# xcrun -sdk macosx metallib ggml-metal.air -o default.metallib
#
# note: this is the only way I found to disable fast-math in Metal. it's ugly, but at least it works
# disabling fast math is needed in order to pass tests/test-backend-ops
# note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1
# note: unfortunately, we have to call it default.metallib instead of ggml.metallib
# ref: https://github.com/ggerganov/whisper.cpp/issues/1720
set(XC_FLAGS -fno-fast-math -fno-inline -g)
if (LLAMA_QKK_64)
set(XC_FLAGS ${XC_FLAGS} -DQK_K=64)
endif()
add_custom_command(
OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
COMMAND xcrun -sdk macosx metallib ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
DEPENDS ggml-metal.metal
COMMENT "Compiling Metal kernels"
)
add_custom_target(
ggml-metal ALL
DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
)
endif()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
${FOUNDATION_LIBRARY}
${METAL_FRAMEWORK}
@@ -234,11 +199,7 @@ if (LLAMA_BLAS)
if (${LLAMA_BLAS_VENDOR} MATCHES "Generic")
pkg_check_modules(DepBLAS REQUIRED blas)
elseif (${LLAMA_BLAS_VENDOR} MATCHES "OpenBLAS")
# As of openblas v0.3.22, the 64-bit is named openblas64.pc
pkg_check_modules(DepBLAS openblas64)
if (NOT DepBLAS_FOUND)
pkg_check_modules(DepBLAS REQUIRED openblas)
endif()
pkg_check_modules(DepBLAS REQUIRED openblas)
elseif (${LLAMA_BLAS_VENDOR} MATCHES "FLAME")
pkg_check_modules(DepBLAS REQUIRED blis)
elseif (${LLAMA_BLAS_VENDOR} MATCHES "ATLAS")
@@ -340,8 +301,6 @@ if (LLAMA_CUBLAS)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
endif()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver)
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
# 52 == lowest CUDA 12 standard
# 60 == f16 CUDA intrinsics
@@ -418,9 +377,6 @@ if (LLAMA_HIPBLAS)
if (${hipblas_FOUND} AND ${hip_FOUND})
message(STATUS "HIP and hipBLAS found")
add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
if (LLAMA_HIP_UMA)
add_compile_definitions(GGML_HIP_UMA)
endif()
add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)
if (BUILD_SHARED_LIBS)
set_target_properties(ggml-rocm PROPERTIES POSITION_INDEPENDENT_CODE ON)
@@ -594,13 +550,6 @@ if (NOT MSVC)
endif()
endif()
function(add_compile_option_cpp ARG)
# Adds a compile option to C/C++ only, but not for Cuda.
# Use, e.g., for CPU-architecture flags.
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:${ARG}>)
add_compile_options($<$<COMPILE_LANGUAGE:C>:${ARG}>)
endfunction()
if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64"))
message(STATUS "ARM detected")
if (MSVC)
@@ -635,7 +584,8 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE
include(cmake/FindSIMD.cmake)
endif ()
if (LLAMA_AVX512)
add_compile_option_cpp(/arch:AVX512)
add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX512>)
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX512>)
# MSVC has no compile-time flags enabling specific
# AVX512 extensions, neither it defines the
# macros corresponding to the extensions.
@@ -649,35 +599,37 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
endif()
elseif (LLAMA_AVX2)
add_compile_option_cpp(/arch:AVX2)
add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX2>)
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX2>)
elseif (LLAMA_AVX)
add_compile_option_cpp(/arch:AVX)
add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX>)
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX>)
endif()
else()
if (LLAMA_NATIVE)
add_compile_option_cpp(-march=native)
add_compile_options(-march=native)
endif()
if (LLAMA_F16C)
add_compile_option_cpp(-mf16c)
add_compile_options(-mf16c)
endif()
if (LLAMA_FMA)
add_compile_option_cpp(-mfma)
add_compile_options(-mfma)
endif()
if (LLAMA_AVX)
add_compile_option_cpp(-mavx)
add_compile_options(-mavx)
endif()
if (LLAMA_AVX2)
add_compile_option_cpp(-mavx2)
add_compile_options(-mavx2)
endif()
if (LLAMA_AVX512)
add_compile_option_cpp(-mavx512f)
add_compile_option_cpp(-mavx512bw)
add_compile_options(-mavx512f)
add_compile_options(-mavx512bw)
endif()
if (LLAMA_AVX512_VBMI)
add_compile_option_cpp(-mavx512vbmi)
add_compile_options(-mavx512vbmi)
endif()
if (LLAMA_AVX512_VNNI)
add_compile_option_cpp(-mavx512vnni)
add_compile_options(-mavx512vnni)
endif()
endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
@@ -694,7 +646,7 @@ endif()
if (MINGW)
# Target Windows 8 for PrefetchVirtualMemory
add_compile_definitions(_WIN32_WINNT=${LLAMA_WIN_VER})
add_compile_definitions(_WIN32_WINNT=0x602)
endif()
#

View File

@@ -1,8 +1,8 @@
# Define the default target now so that it is always the first target
BUILD_TARGETS = \
main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o
speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead tests/test-c.o
# Binaries only useful for tests
TEST_TARGETS = \
@@ -43,6 +43,10 @@ ifeq ($(UNAME_S),Darwin)
endif
endif
ifneq '' '$(or $(filter clean,$(MAKECMDGOALS)),$(LLAMA_METAL))'
BUILD_TARGETS += metal
endif
default: $(BUILD_TARGETS)
test: $(TEST_TARGETS)
@@ -61,7 +65,7 @@ test: $(TEST_TARGETS)
./$$test_target; \
fi; \
if [ $$? -ne 0 ]; then \
printf 'Test %s FAILED!\n\n' $$test_target; \
printf 'Test $$test_target FAILED!\n\n' $$test_target; \
failures=$$(( failures + 1 )); \
else \
printf 'Test %s passed.\n\n' $$test_target; \
@@ -278,17 +282,8 @@ endif
ifneq ($(filter aarch64%,$(UNAME_M)),)
# Apple M1, M2, etc.
# Raspberry Pi 3, 4, Zero 2 (64-bit)
# Nvidia Jetson
MK_CFLAGS += -mcpu=native
MK_CXXFLAGS += -mcpu=native
JETSON_RELEASE_INFO = $(shell jetson_release)
ifdef JETSON_RELEASE_INFO
ifneq ($(filter TX2%,$(JETSON_RELEASE_INFO)),)
JETSON_EOL_MODULE_DETECT = 1
CC = aarch64-unknown-linux-gnu-gcc
cxx = aarch64-unknown-linux-gnu-g++
endif
endif
endif
ifneq ($(filter armv6%,$(UNAME_M)),)
@@ -362,16 +357,15 @@ ifdef LLAMA_BLIS
endif # LLAMA_BLIS
ifdef LLAMA_CUBLAS
MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include -I/usr/local/cuda/targets/aarch64-linux/include
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/lib/wsl/lib
MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
MK_LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
OBJS += ggml-cuda.o
MK_NVCCFLAGS = -use_fast_math
ifndef JETSON_EOL_MODULE_DETECT
MK_NVCCFLAGS += --forward-unknown-to-host-compiler
endif # JETSON_EOL_MODULE_DETECT
MK_NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math
ifdef LLAMA_DEBUG
MK_NVCCFLAGS += -lineinfo
endif # LLAMA_DEBUG
endif
ifdef LLAMA_CUDA_NVCC
NVCC = $(LLAMA_CUDA_NVCC)
else
@@ -423,11 +417,7 @@ ifdef LLAMA_CUDA_CCBIN
MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
endif
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
ifdef JETSON_EOL_MODULE_DETECT
$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
else
$(NVCC) $(BASE_CXXFLAGS) $(NVCCFLAGS) -Wno-pedantic -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
endif # JETSON_EOL_MODULE_DETECT
endif # LLAMA_CUBLAS
ifdef LLAMA_CLBLAST
@@ -462,9 +452,6 @@ ifdef LLAMA_HIPBLAS
LLAMA_CUDA_MMV_Y ?= 1
LLAMA_CUDA_KQUANTS_ITER ?= 2
MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
ifdef LLAMA_HIP_UMA
MK_CPPFLAGS += -DGGML_HIP_UMA
endif # LLAMA_HIP_UMA
MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS))
@@ -610,9 +597,6 @@ quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.
perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
imatrix: examples/imatrix/imatrix.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
@@ -622,7 +606,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual
gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
@@ -661,11 +645,10 @@ parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
ifdef LLAMA_METAL
metal: examples/metal/metal.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
endif
ifeq ($(UNAME_S),Darwin)
swift: examples/batched.swift

View File

@@ -13,17 +13,21 @@ let package = Package(
products: [
.library(name: "llama", targets: ["llama"]),
],
dependencies: [
.package(url: "https://github.com/ggerganov/ggml.git", .branch("release"))
],
targets: [
.target(
name: "llama",
dependencies: ["ggml"],
path: ".",
exclude: ["ggml-metal.metal"],
exclude: [],
sources: [
"ggml.c",
"llama.cpp",
"ggml-alloc.c",
"ggml-backend.c",
"ggml-quants.c",
"ggml-metal.m",
],
resources: [
.process("ggml-metal.metal")
],
publicHeadersPath: "spm-headers",
cSettings: [

View File

@@ -10,7 +10,6 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
### Hot topics
- New SOTA quantized models, including pure 2-bits: https://huggingface.co/ikawrakow
- Collecting Apple Silicon performance stats:
- M-series: https://github.com/ggerganov/llama.cpp/discussions/4167
- A-series: https://github.com/ggerganov/llama.cpp/discussions/4508
@@ -103,8 +102,6 @@ as the main playground for developing new features for the [ggml](https://github
- [x] [Deepseek models](https://huggingface.co/models?search=deepseek-ai/deepseek)
- [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen)
- [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
- [x] [PLaMo-13B](https://github.com/ggerganov/llama.cpp/pull/3557)
- [x] [GPT-2](https://huggingface.co/gpt2)
**Multimodal models:**
@@ -119,7 +116,6 @@ as the main playground for developing new features for the [ggml](https://github
- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
- JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp)
- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
- Rust: [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
- C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
@@ -127,7 +123,6 @@ as the main playground for developing new features for the [ggml](https://github
- Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
- React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
- Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
- Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
**UI:**
@@ -136,8 +131,6 @@ as the main playground for developing new features for the [ggml](https://github
- [withcatai/catai](https://github.com/withcatai/catai)
- [semperai/amica](https://github.com/semperai/amica)
- [psugihara/FreeChat](https://github.com/psugihara/FreeChat)
- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
- [iohub/collama](https://github.com/iohub/coLLaMA)
---
@@ -388,37 +381,20 @@ Building the program with BLAS support may lead to some performance improvements
Check [BLIS.md](docs/BLIS.md) for more information.
- #### Intel oneMKL
- Using manual oneAPI installation:
By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. Otherwise please install oneAPI and follow the below steps:
```bash
mkdir build
cd build
source /opt/intel/oneapi/setvars.sh # You can skip this step if in oneapi-runtime docker image, only required for manual installation
cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON
cmake --build . --config Release
```
- #### Intel MKL
- Using oneAPI docker image:
If you do not want to source the environment vars and install oneAPI manually, you can also build the code using intel docker container: [oneAPI-runtime](https://hub.docker.com/r/intel/oneapi-runtime)
By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. You may also specify it by:
```bash
mkdir build
cd build
cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON
cmake --build . --config Release
```
Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni.
Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.
```bash
mkdir build
cd build
cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
cmake --build . --config Release
```
- #### cuBLAS
This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
For Jetson user, if you have Jetson Orin, you can try this: [Offical Support](https://www.jetson-ai-lab.com/tutorial_text-generation.html). If you are using an old model(nano/TX2), need some additional operations before compiling.
- Using `make`:
```bash
make LLAMA_CUBLAS=1
@@ -456,21 +432,14 @@ Building the program with BLAS support may lead to some performance improvements
```bash
make LLAMA_HIPBLAS=1
```
- Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
- Using `CMake` for Linux:
```bash
CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ \
cmake -H. -Bbuild -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
&& cmake --build build -- -j 16
mkdir build
cd build
CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ cmake .. -DLLAMA_HIPBLAS=ON
cmake --build .
```
On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON"`.
However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
- Using `make` (example for target gfx1030, build with 16 CPU threads):
```bash
make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gxf1030
```
- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS):
```bash
set PATH=%HIP_PATH%\bin;%PATH%
mkdir build
@@ -479,11 +448,10 @@ Building the program with BLAS support may lead to some performance improvements
cmake --build .
```
Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
Find your gpu version string by matching the most significant version information from `rocminfo | grep gfx | head -1 | awk '{print $2}'` with the list of processors, e.g. `gfx1035` maps to `gfx1030`.
The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 or 11.0.0 on RDNA3.
The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
| Option | Legal values | Default | Description |
@@ -1014,8 +982,6 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /
- There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit
- Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
- See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
- Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
- Matrix multiplication is unconventional: [`z = ggml_mul_mat(ctx, x, y)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means `zT = x @ yT`
### Docs

View File

@@ -1,116 +0,0 @@
# AWQ: Activation-aware Weight Quantization for LLM - version apply to llamacpp
[[Paper](https://arxiv.org/abs/2306.00978)][[Original Repo](https://github.com/mit-han-lab/llm-awq)][[Easy-to-use Repo](https://github.com/casper-hansen/AutoAWQ)]
**Supported models:**
- [X] LLaMA
- [x] LLaMA 2
- [X] MPT
- [X] Mistral AI v0.1
- [ ] Bloom
- [ ] Mixtral MoE
**TODO:**
- [x] Update version work with both MPT and MPT-AWQ model
- [ ] Add OPT model
- [ ] Add Bloom model
- [ ] Add Mixtral MoE
- [ ] Support w3, w2
## Contents
- [Install](##Install)
- [Convert](##Convert)
- [Quantize](##Quantize)
- [Test](##Test)
- [Benchmark](##Benchmark)
- [Results](##Results)
## Install
Install requirements
```bash
pip install -r requirements.txt
```
Get the pre-computed AWQ search results for multiple model families, including LLaMA, LLaMA2, MPT, OPT
```bash
git clone https://huggingface.co/datasets/mit-han-lab/awq-model-zoo awq_cache
```
## Convert
Example for llama model
```bash
# For llama7b and llama2 models
python convert.py models/llama-7b/ --awq-path awq_cache/llama-7b-w4-g128.pt --outfile models/llama_7b_fp16.gguf
# For mistral and mpt models
python convert-hf-to-gguf.py models/mpt-7b/ --awq-path awq_cache/mpt-7b-w4-g128.pt --outfile models/mpt_7b_fp16.gguf
```
## Quantize
```bash
# We only benchmark and confirm the results on q4_0, q4_1, and q2_k types.
./quantize models/llama_7b_fp16.gguf models/llama_7b_q4_0.gguf q4_0
```
## Test
```bash
# For all models.
./build/bin/main -m models/llama_7b_q4_0.gguf -n 128 --prompt "Once upon a time"
```
## Benchmark
The perplexity measurements in table above are done against the `wikitext2` test dataset (https://paperswithcode.com/dataset/wikitext-2), with context length of 512.
```bash
# For llama and llama2, and mistral models.
./perplexity -m models/llama_7b_q4_0.gguf -f datasets/wikitext-2-raw/wiki.test.raw
```
## Results
Results are run on OpenBLAS (CPU) and CuBLAS (GPU) for fair comparison
We use three types of llamacpp quantization methods to work with our version, including q4_0, q4_1, and q2_k
### Llama 7B (Build with OpenBLAS)
| Model | Measure | F16 | Q4_0 | Q4_1 | Q2_K |
|-----------:|--------------|-------:|-------:|-------:|-------:|
|Llama 7B | perplexity | 5.9066 | 6.1214 | 6.0643 | 6.5808 |
|Llama 7B | file size | 12.9G | 3.5G | 3.9G | 2.7G |
|Llama 7B | bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |
|AWQ-LLama 7B| perplexity | 5.9175 | 6.0252 | 5.9987 | 6.3692 |
|AWQ-LLama 7B| file size | 12.9G | 3.5G | 3.9G | 2.7G |
|AWQ-LLama 7B| bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |
### Llama2 7B (Build with CuBLAS)
| Model | Measure | F16 | Q4_0 | Q4_1 | Q2_K |
|------------:|--------------|-------:|-------:|-------:|-------:|
|Llama2 7B | perplexity | 5.8664 | 6.0260 | 6.0656 | 6.4496 |
|Llama2 7B | file size | 12.9G | 3.5G | 3.9G | 2.7G |
|Llama2 7B | bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |
|AWQ-LLama2 7B| perplexity | 5.8801 | 6.0054 | 5.9849 | 6.3650 |
|AWQ-LLama2 7B| file size | 12.9G | 3.5G | 3.9G | 2.7G |
|AWQ-LLama2 7B| bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |
### Mistral 7B v0.1 (Build with CuBLAS)
| Model | Measure | F16 | Q4_0 | Q4_1 | Q2_K |
|-------------:|--------------|-------:|-------:|-------:|-------:|
|Mistral 7B | perplexity | 5.6931 | 5.8202 | 5.8268 | 6.1645 |
|Mistral 7B | file size | 14.5G | 4.1G | 4.5G | 3.1G |
|Mistral 7B | bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |
|AWQ-Mistral 7B| perplexity | 5.6934 | 5.8020 | 5.7691 | 6.0426 |
|AWQ-Mistral 7B| file size | 14.5G | 4.1G | 4.5G | 3.1G |
|AWQ-Mistral 7B| bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |
### MPT 7B (Build with OpenBLAS)
| Model | Measure | F16 | Q4_0 | Q4_1 | Q2_K |
|---------:|--------------|-------:|-------:|-------:|--------:|
|MPT 7B | perplexity | 8.4369 | 8.7956 | 8.6265 | 11.4913 |
|MPT 7B | file size | 13.7G | 3.9G | 4.3G | 2.8G |
|MPT 7B | bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |
|AWQ-MPT 7B| perplexity | 8.4944 | 8.7053 | 8.6750 | 10.2873|
|AWQ-MPT 7B| file size | 13.7G | 3.9G | 4.3G | 2.8G |
|AWQ-MPT 7B| bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |

View File

@@ -1,254 +0,0 @@
"""
Implements the AWQ for llama.cpp use cases.
Original paper: https://arxiv.org/abs/2306.00978
This code is based on versions of the AWQ implementation found in the following repositories:
* https://github.com/mit-han-lab/llm-awq
* https://github.com/casper-hansen/AutoAWQ
"""
import os
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoConfig
from transformers.models.bloom.modeling_bloom import BloomGelu
from transformers.models.llama.modeling_llama import LlamaRMSNorm
from transformers.activations import GELUActivation
class ScaledActivation(nn.Module):
"""
ScaledActivation module wraps an existing activation function and applies a
scale factor to its output.
Args:
module (nn.Module): The activation function to be scaled.
scales (torch.Tensor): A tensor of size (num_features,) containing the initial
scale factors for each feature.
Returns:
torch.Tensor: The scaled output of the activation function.
"""
def __init__(self, module, scales):
super().__init__()
self.act = module
self.scales = nn.Parameter(scales.data)
def forward(self, x):
return self.act(x) / self.scales.view(1, 1, -1).to(x.device)
def set_op_by_name(layer, name, new_module):
"""
Set the new module for given module's name.
Args:
layer (nn.Module): The layer in which to replace the submodule.
name (str): The path to the submodule to be replaced, using dot notation
to access nested modules.
new_module (nn.Module): The new module to replace the existing one.
"""
levels = name.split(".")
if len(levels) > 1:
mod_ = layer
for l_idx in range(len(levels) - 1):
if levels[l_idx].isdigit():
mod_ = mod_[int(levels[l_idx])]
else:
mod_ = getattr(mod_, levels[l_idx])
setattr(mod_, levels[-1], new_module)
else:
setattr(layer, name, new_module)
def get_op_by_name(module, op_name):
"""
Retrieves a submodule within a given layer based on its name.
Args:
module (nn.Module): The layer containing the submodule to find.
op_name (str): The name of the submodule.
Returns:
nn.Module: The requested submodule found within the given layer.
Raises:
ValueError: If the specified submodule cannot be found within the layer.
"""
for name, m in module.named_modules():
if name == op_name:
return m
raise ValueError(f"Cannot find op {op_name} in module {module}")
@torch.no_grad()
def scale_ln_fcs(ln, fcs, scales):
"""
Scales the weights of a LayerNorm and a list of fully-connected layers proportionally.
Args:
ln (nn.LayerNorm): The LayerNorm module to be scaled.
fcs (List[nn.Linear]): A list of fully-connected layers to be scaled.
scales (torch.Tensor): A 1D tensor of size (num_features,).
"""
if not isinstance(fcs, list):
fcs = [fcs]
scales = scales.to(ln.weight.device)
ln.weight.div_(scales)
if hasattr(ln, "bias") and ln.bias is not None:
ln.bias.div_(scales)
for fc in fcs:
fc.weight.mul_(scales.view(1, -1))
for p in ln.parameters():
assert torch.isnan(p).sum() == 0
for fc in fcs:
for p in fc.parameters():
assert torch.isnan(p).sum() == 0
@torch.no_grad()
def scale_fc_fc(fc1, fc2, scales):
"""
Scales the weights of two fully-connected layers in a specific pattern.
Args:
fc1 (nn.Linear): The first fully-connected layer to be scaled.
fc2 (nn.Linear): The second fully-connected layer to be scaled.
scales (torch.Tensor): A 1D tensor of size (num_features,).
"""
assert isinstance(fc1, nn.Linear)
assert isinstance(fc2, nn.Linear)
scales = scales.to(fc1.weight.device)
fc1.weight[-scales.size(0):].div_(scales.view(-1, 1))
if fc1.bias is not None:
fc1.bias.div_(scales.view(-1))
fc2.weight.mul_(scales.view(1, -1))
for p in fc1.parameters():
assert torch.isnan(p).sum() == 0
for p in fc2.parameters():
assert torch.isnan(p).sum() == 0
@torch.no_grad()
def scale_gelu_fc(gelu, fc, scales):
"""
Scales the weight of a GELU activation and a fully-connected layer proportionally.
Args:
gelu (Union[nn.GELU, BloomGelu, GELUActivation]): The GELU activation module to be scaled.
fc (nn.Linear): The fully-connected layer to be scaled.
scales (torch.Tensor): A 1D tensor of size (num_features,).
Raises:
TypeError: If the `gelu` module is not of type `nn.GELU`, `BloomGelu`, or `GELUActivation`.
TypeError: If the `fc` module is not of type `nn.Linear`.
"""
assert isinstance(gelu, (nn.GELU, BloomGelu, GELUActivation))
assert isinstance(fc, nn.Linear)
fc.weight.mul_(scales.view(1, -1).to(fc.weight.device))
for p in fc.parameters():
assert torch.isnan(p).sum() == 0
def apply_scale(module, scales_list, input_feat_dict=None):
"""
Applies different scaling strategies to layers based on their type and hierarchy within a given module.
Args:
module (nn.Module): The module containing the layers to be scaled.
scales_list (List[Tuple[str, List[str], torch.Tensor]]): A list of tuples containing:
* prev_op_name (str): The name of the preceding operation or module,
relative to which the layers to be scaled are located.
* layer_names (List[str]): A list of names of the layers to be scaled, relative to the preceding operation.
* scales (torch.Tensor): A 1D tensor of size (num_features,) containing the scaling factors for each feature.
input_feat_dict (Optional[Dict[str, torch.Tensor]]): A dictionary mapping layer names to their corresponding
input features (optional).
"""
for prev_op_name, layer_names, scales in scales_list:
prev_op = get_op_by_name(module, prev_op_name)
layers = [get_op_by_name(module, name) for name in layer_names]
prev_op.cuda()
for layer in layers:
layer.cuda()
scales.cuda()
if isinstance(prev_op, nn.Linear):
assert len(layers) == 1
scale_fc_fc(prev_op, layers[0], scales)
elif isinstance(prev_op, (nn.LayerNorm, LlamaRMSNorm)) or "rmsnorm" in str(prev_op.__class__).lower():
scale_ln_fcs(prev_op, layers, scales)
elif isinstance(prev_op, (nn.GELU, BloomGelu, GELUActivation)):
new_module = ScaledActivation(prev_op, scales)
set_op_by_name(module, prev_op_name, new_module)
scale_gelu_fc(prev_op, layers[0], scales)
else:
raise NotImplementedError(f"prev_op {type(prev_op)} not supported yet!")
# apply the scaling to input feat if given; prepare it for clipping
if input_feat_dict is not None:
for layer_name in layer_names:
inp = input_feat_dict[layer_name]
inp.div_(scales.view(1, -1).to(inp.device))
prev_op.cpu()
for layer in layers:
layer.cpu()
scales.cpu()
@torch.no_grad()
def apply_clip(module, clip_list):
"""
Applies element-wise clipping to the weight of a specific layer within a given module.
Args:
module (nn.Module): The module containing the layer to be clipped.
clip_list (List[Tuple[str, torch.Tensor]]): A list of tuples containing:
* name (str): The name of the layer to be clipped, relative to the root of the module.
* max_val (torch.Tensor): A 1D or 2D tensor defining the upper bound for each element of the layer's weight.
"""
for name, max_val in clip_list:
layer = get_op_by_name(module, name)
layer.cuda()
max_val = max_val.to(layer.weight.device)
org_shape = layer.weight.shape
layer.weight.data = layer.weight.data.reshape(*max_val.shape[:2], -1)
layer.weight.data = torch.clamp(layer.weight.data, -max_val, max_val)
layer.weight.data = layer.weight.data.reshape(org_shape)
layer.cpu()
def add_scale_weights(model_path, scale_path, tmp_path):
"""
Adds pre-computed Activation Weight Quantization (AWQ) results to a model,
including scaling factors and clipping bounds.
Args:
model_path (str): Path to the pre-trained model to be equipped with AWQ.
scale_path (str): Path to the AWQ scale factors (.pt file).
tmp_path (str): Path to the temporary directory where the equipped model will be saved.
"""
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_path, config=config, trust_remote_code=True
)
model.eval()
awq_results = torch.load(str(scale_path), map_location="cpu")
apply_scale(model, awq_results["scale"])
apply_clip(model, awq_results["clip"])
model.save_pretrained(str(tmp_path))
os.system(f"cp {str(model_path)}/tokenizer* {str(tmp_path)}")

View File

@@ -1,2 +0,0 @@
torch>=2.1.1
transformers>=4.32.0

View File

@@ -30,12 +30,6 @@ sd=`dirname $0`
cd $sd/../
SRC=`pwd`
CMAKE_EXTRA=""
if [ ! -z ${GG_BUILD_METAL} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_METAL_SHADER_DEBUG=ON"
fi
## helpers
# download a file if it does not exist or if it is outdated
@@ -87,8 +81,8 @@ function gg_run_ctest_debug {
set -e
(time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
(time cmake -DCMAKE_BUILD_TYPE=Debug .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
(time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
@@ -115,8 +109,8 @@ function gg_run_ctest_release {
set -e
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
(time cmake -DCMAKE_BUILD_TYPE=Release .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
if [ -z ${GG_BUILD_LOW_PERF} ]; then
(time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log

View File

@@ -65,4 +65,4 @@ endif()
target_include_directories(${TARGET} PUBLIC .)
target_compile_features(${TARGET} PUBLIC cxx_std_11)
target_link_libraries(${TARGET} PRIVATE build_info PUBLIC llama)
target_link_libraries(${TARGET} PRIVATE llama build_info)

View File

@@ -167,24 +167,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
if (params.n_threads_batch <= 0) {
params.n_threads_batch = std::thread::hardware_concurrency();
}
} else if (arg == "-td" || arg == "--threads-draft") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_threads_draft = std::stoi(argv[i]);
if (params.n_threads_draft <= 0) {
params.n_threads_draft = std::thread::hardware_concurrency();
}
} else if (arg == "-tbd" || arg == "--threads-batch-draft") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_threads_batch_draft = std::stoi(argv[i]);
if (params.n_threads_batch_draft <= 0) {
params.n_threads_batch_draft = std::thread::hardware_concurrency();
}
} else if (arg == "-p" || arg == "--prompt") {
if (++i >= argc) {
invalid_param = true;
@@ -238,20 +220,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
break;
}
params.n_ctx = std::stoi(argv[i]);
} else if (arg == "--grp-attn-n" || arg == "-gan") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.grp_attn_n = std::stoi(argv[i]);
} else if (arg == "--grp-attn-w" || arg == "-gaw") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.grp_attn_w = std::stoi(argv[i]);
} else if (arg == "--rope-freq-base") {
if (++i >= argc) {
invalid_param = true;
@@ -561,8 +529,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
invalid_param = true;
break;
}
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
params.n_gpu_layers = std::stoi(argv[i]);
#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD
#else
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
#endif
@@ -571,8 +540,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
invalid_param = true;
break;
}
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
params.n_gpu_layers_draft = std::stoi(argv[i]);
#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD
#else
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
#endif
@@ -581,44 +551,25 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
invalid_param = true;
break;
}
#ifdef GGML_USE_CUBLAS
params.main_gpu = std::stoi(argv[i]);
#ifndef GGML_USE_CUBLAS
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the main GPU has no effect.\n");
#endif // GGML_USE_CUBLAS
} else if (arg == "--split-mode" || arg == "-sm") {
if (++i >= argc) {
invalid_param = true;
break;
}
std::string arg_next = argv[i];
if (arg_next == "none") {
params.split_mode = LLAMA_SPLIT_NONE;
} else if (arg_next == "layer") {
params.split_mode = LLAMA_SPLIT_LAYER;
} else if (arg_next == "row") {
params.split_mode = LLAMA_SPLIT_ROW;
} else {
invalid_param = true;
break;
}
#ifndef GGML_USE_CUBLAS
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n");
#endif // GGML_USE_CUBLAS
#else
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
#endif
} else if (arg == "--tensor-split" || arg == "-ts") {
if (++i >= argc) {
invalid_param = true;
break;
}
#ifdef GGML_USE_CUBLAS
std::string arg_next = argv[i];
// split string by , and /
const std::regex regex{R"([,/]+)"};
std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
std::vector<std::string> split_arg{it, {}};
if (split_arg.size() >= LLAMA_MAX_DEVICES) {
invalid_param = true;
break;
}
GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
if (i < split_arg.size()) {
params.tensor_split[i] = std::stof(split_arg[i]);
@@ -626,8 +577,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
params.tensor_split[i] = 0.0f;
}
}
#ifndef GGML_USE_CUBLAS
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting a tensor split has no effect.\n");
#else
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
#endif // GGML_USE_CUBLAS
} else if (arg == "--no-mul-mat-q" || arg == "-nommq") {
#ifdef GGML_USE_CUBLAS
params.mul_mat_q = false;
#else
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n");
#endif // GGML_USE_CUBLAS
} else if (arg == "--no-mmap") {
params.use_mmap = false;
@@ -635,8 +592,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
params.numa = true;
} else if (arg == "--verbose-prompt") {
params.verbose_prompt = true;
} else if (arg == "--no-display-prompt") {
params.display_prompt = false;
} else if (arg == "-r" || arg == "--reverse-prompt") {
if (++i >= argc) {
invalid_param = true;
@@ -661,12 +616,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
break;
}
params.ppl_stride = std::stoi(argv[i]);
} else if (arg == "-ptc" || arg == "--print-token-count") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_print = std::stoi(argv[i]);
} else if (arg == "--ppl-output-type") {
if (++i >= argc) {
invalid_param = true;
@@ -849,7 +798,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf("\n");
printf("options:\n");
printf(" -h, --help show this help message and exit\n");
printf(" --version show version and build info\n");
printf(" --version show version and build info\n");
printf(" -i, --interactive run in interactive mode\n");
printf(" --interactive-first run in interactive mode and wait for input right away\n");
printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n");
@@ -863,10 +812,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads);
printf(" -tb N, --threads-batch N\n");
printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n");
printf(" -td N, --threads-draft N");
printf(" number of threads to use during generation (default: same as --threads)");
printf(" -tbd N, --threads-batch-draft N\n");
printf(" number of threads to use during batch and prompt processing (default: same as --threads-draft)\n");
printf(" -p PROMPT, --prompt PROMPT\n");
printf(" prompt to start generation with (default: empty)\n");
printf(" -e, --escape process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
@@ -950,22 +895,16 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" number of layers to store in VRAM\n");
printf(" -ngld N, --n-gpu-layers-draft N\n");
printf(" number of layers to store in VRAM for the draft model\n");
printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
printf(" how to split the model across multiple GPUs, one of:\n");
printf(" - none: use one GPU only\n");
printf(" - layer (default): split layers and KV across GPUs\n");
printf(" - row: split rows across GPUs\n");
printf(" -ts SPLIT, --tensor-split SPLIT\n");
printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
printf(" -ts SPLIT --tensor-split SPLIT\n");
printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
#ifdef GGML_USE_CUBLAS
printf(" -nommq, --no-mul-mat-q\n");
printf(" use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
printf(" Not recommended since this is both slower and uses more VRAM.\n");
#endif // GGML_USE_CUBLAS
#endif
printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
printf(" -gan N, --grp-attn-n N\n");
printf(" group-attention factor (default: %d)\n", params.grp_attn_n);
printf(" -gaw N, --grp-attn-w N\n");
printf(" group-attention width (default: %.1f)\n", (double)params.grp_attn_w);
printf(" --verbose-prompt print prompt before generation\n");
printf(" -dkvc, --dump-kv-cache\n");
printf(" verbose print of the KV cache\n");
printf(" -nkvo, --no-kv-offload\n");
@@ -987,8 +926,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
printf(" -ptc N, --print-token-count N\n");
printf(" print token count every N tokens (default: %d)\n", params.n_print);
printf("\n");
#ifndef LOG_DISABLE_LOGS
log_print_usage();
@@ -1078,7 +1015,6 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
mparams.n_gpu_layers = params.n_gpu_layers;
}
mparams.main_gpu = params.main_gpu;
mparams.split_mode = params.split_mode;
mparams.tensor_split = params.tensor_split;
mparams.use_mmap = params.use_mmap;
mparams.use_mlock = params.use_mlock;
@@ -1093,9 +1029,6 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
}
static ggml_type kv_cache_type_from_str(const std::string & s) {
if (s == "f32") {
return GGML_TYPE_F32;
}
if (s == "f16") {
return GGML_TYPE_F16;
}
@@ -1461,7 +1394,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER);
fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
fprintf(stream, "cpu_has_avx_vnni: %s\n", ggml_cpu_has_avx_vnni() ? "true" : "false");
fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
@@ -1607,7 +1539,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
}
//

View File

@@ -46,14 +46,12 @@ struct gpt_params {
uint32_t seed = -1; // RNG seed
int32_t n_threads = get_num_physical_cores();
int32_t n_threads_draft = -1;
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
int32_t n_threads_batch_draft = -1;
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 512; // context size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_draft = 8; // number of tokens to draft during speculative decoding
int32_t n_draft = 16; // number of tokens to draft during speculative decoding
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
int32_t n_parallel = 1; // number of parallel sequences to decode
int32_t n_sequences = 1; // number of sequences to decode
@@ -61,13 +59,9 @@ struct gpt_params {
float p_split = 0.1f; // speculative decoding split probability
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
llama_split_mode split_mode = LLAMA_SPLIT_LAYER; // how to split the model across GPUs
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
int32_t n_beams = 0; // if non-zero then use beam search of given width.
int32_t grp_attn_n = 1; // group-attention factor
int32_t grp_attn_w = 512; // group-attention width
int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
float rope_freq_base = 0.0f; // RoPE base frequency
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
@@ -128,7 +122,6 @@ struct gpt_params {
bool use_mlock = false; // use mlock to keep model in memory
bool numa = false; // attempt optimizations that help on some NUMA systems
bool verbose_prompt = false; // print prompt tokens before generation
bool display_prompt = true; // print prompt before generation
bool infill = false; // use infill mode
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
bool no_kv_offload = false; // disable KV offloading

View File

@@ -149,12 +149,11 @@ static void sampler_queue(
}
}
static llama_token llama_sampling_sample_impl(
llama_token llama_sampling_sample(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,
const int idx,
bool is_resampling) { // Add a parameter to indicate if we are resampling
const int idx) {
const llama_sampling_params & params = ctx_sampling->params;
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
@@ -174,27 +173,13 @@ static llama_token llama_sampling_sample_impl(
llama_token id = 0;
// Get a pointer to the logits
float * logits = llama_get_logits_ith(ctx_main, idx);
// Declare original_logits at the beginning of the function scope
std::vector<float> original_logits;
if (!is_resampling) {
// Only make a copy of the original logits if we are not in the resampling phase, not sure if I actually have to do this.
original_logits = std::vector<float>(logits, logits + llama_n_vocab(llama_get_model(ctx_main)));
}
// apply params.logit_bias map
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
logits[it->first] += it->second;
}
if (ctx_cfg) {
float * logits_guidance = llama_get_logits_ith(ctx_cfg, idx);
llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
}
cur.clear();
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
@@ -203,15 +188,17 @@ static llama_token llama_sampling_sample_impl(
llama_token_data_array cur_p = { cur.data(), cur.size(), false };
if (ctx_cfg) {
llama_sample_classifier_free_guidance(ctx_main, &cur_p, ctx_cfg, params.cfg_scale);
}
// apply penalties
const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev;
const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n);
if (penalty_tokens_used_size) {
if (!prev.empty()) {
const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];
llama_sample_repetition_penalties(ctx_main, &cur_p,
penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size,
penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present);
prev.data() + prev.size() - penalty_last_n,
penalty_last_n, penalty_repeat, penalty_freq, penalty_present);
if (!penalize_nl) {
for (size_t idx = 0; idx < cur_p.size; idx++) {
@@ -223,8 +210,7 @@ static llama_token llama_sampling_sample_impl(
}
}
// If we are in the resampling phase, apply grammar checks before sampling logic
if (is_resampling && ctx_sampling->grammar != NULL) {
if (ctx_sampling->grammar != NULL) {
llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
}
@@ -266,40 +252,9 @@ static llama_token llama_sampling_sample_impl(
}
}
if (ctx_sampling->grammar != NULL && !is_resampling) {
// Create an array with a single token data element for the sampled id
llama_token_data single_token_data = {id, logits[id], 0.0f};
llama_token_data_array single_token_data_array = { &single_token_data, 1, false };
// Apply grammar constraints to the single token
llama_sample_grammar(ctx_main, &single_token_data_array, ctx_sampling->grammar);
// Check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY
bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
// If the token is not valid according to the grammar, perform resampling
if (!is_valid) {
LOG("Resampling because token %d: '%s' does not meet grammar rules\n", id, llama_token_to_piece(ctx_main, id).c_str());
// Restore logits from the copy
std::copy(original_logits.begin(), original_logits.end(), logits);
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, true); // Pass true for is_resampling
}
}
return id;
}
llama_token llama_sampling_sample(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,
const int idx) {
// Call the implementation function with is_resampling set to false by default
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, false);
}
void llama_sampling_accept(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,

View File

@@ -36,9 +36,6 @@ typedef struct llama_sampling_params {
float cfg_scale = 1.f; // how strong is guidance
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
std::vector<llama_token> penalty_prompt_tokens;
bool use_penalty_prompt_tokens = false;
} llama_sampling_params;
// general sampler context

View File

@@ -1107,7 +1107,7 @@ void print_common_train_usage(int /*argc*/, char ** /*argv*/, const struct train
fprintf(stderr, " --sample-start STR Sets the starting point for samples after the specified pattern. If empty use every token position as sample start. (default '%s')\n", params->sample_start.c_str());
fprintf(stderr, " --include-sample-start Include the sample start in the samples. (default off)\n");
fprintf(stderr, " --escape process sample start escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
fprintf(stderr, " --overlapping-samples Samples may overlap, will include sample-start of second and following samples. When off, samples will end at begin of next sample. (default off)\n");
fprintf(stderr, " --overlapping-samples Samples my overlap, will include sample-start of second and following samples. When off, samples will end at begin of next sample. (default off)\n");
fprintf(stderr, " --fill-with-next-samples Samples shorter than context length will be followed by the next (shuffled) samples. (default off)\n");
fprintf(stderr, " --separate-with-eos When fill-with-next-samples, insert end-of-sequence token between samples.%s\n", params->separate_with_eos ? " (default)" : "");
fprintf(stderr, " --separate-with-bos When fill-with-next-samples, insert begin-of-sequence token between samples.%s\n", params->separate_with_bos ? " (default)" : "");

View File

@@ -23,15 +23,6 @@ if 'NO_LOCAL_GGUF' not in os.environ:
import gguf
# check for any of the given keys in the dictionary and return the value of the first key found
def get_key_opts(d, keys):
for k in keys:
if k in d:
return d[k]
print(f"Could not find any of {keys}")
sys.exit()
###### MODEL DEFINITIONS ######
class SentencePieceTokenTypes(IntEnum):
@@ -55,7 +46,7 @@ class Model:
self.part_names = self._get_part_names()
self.hparams = Model.load_hparams(self.dir_model)
self.model_arch = self._get_model_architecture()
self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False)
self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess)
def set_vocab(self):
self._set_vocab_gpt2()
@@ -191,12 +182,8 @@ class Model:
return QwenModel
if model_architecture == "MixtralForCausalLM":
return MixtralModel
if model_architecture == "GPT2LMHeadModel":
return GPT2Model
if model_architecture == "PhiForCausalLM":
return Phi2Model
if model_architecture == "PlamoForCausalLM":
return PlamoModel
return Model
def _is_model_safetensors(self) -> bool:
@@ -236,12 +223,8 @@ class Model:
return gguf.MODEL_ARCH.QWEN
if arch == "MixtralForCausalLM":
return gguf.MODEL_ARCH.LLAMA
if arch == "GPT2LMHeadModel":
return gguf.MODEL_ARCH.GPT2
if arch == "PhiForCausalLM":
return gguf.MODEL_ARCH.PHI2
if arch == "PlamoForCausalLM":
return gguf.MODEL_ARCH.PLAMO
raise NotImplementedError(f'Architecture "{arch}" not supported!')
@@ -251,7 +234,7 @@ class Model:
tokens: list[bytearray] = []
toktypes: list[int] = []
from transformers import AutoTokenizer
from transformers import AutoTokenizer # type: ignore[attr-defined]
tokenizer = AutoTokenizer.from_pretrained(dir_model)
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
assert max(tokenizer.vocab.values()) < vocab_size
@@ -266,11 +249,10 @@ class Model:
toktypes.append(gguf.TokenType.USER_DEFINED)
elif reverse_vocab[i] in added_vocab:
tokens.append(reverse_vocab[i])
if hasattr(tokenizer, "added_tokens_decoder"):
if tokenizer.added_tokens_decoder[i].special:
toktypes.append(gguf.TokenType.CONTROL)
else:
toktypes.append(gguf.TokenType.USER_DEFINED)
if tokenizer.added_tokens_decoder[i].special:
toktypes.append(gguf.TokenType.CONTROL)
else:
toktypes.append(gguf.TokenType.USER_DEFINED)
else:
tokens.append(reverse_vocab[i])
toktypes.append(gguf.TokenType.NORMAL)
@@ -478,11 +460,7 @@ class MPTModel(Model):
data = data_torch.squeeze().numpy()
# map tensor names
if "scales" in name:
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales"))
new_name = new_name.replace("scales", "act.scales")
else:
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None:
print(f"Can not map tensor {name!r}")
sys.exit()
@@ -827,17 +805,10 @@ class PersimmonModel(Model):
hidden_size = self.hparams["hidden_size"]
self.gguf_writer.add_name('persimmon-8b-chat')
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
self.gguf_writer.add_embedding_length(hidden_size)
self.gguf_writer.add_block_count(block_count)
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
# NOTE: not sure about this change - why does the model not have a rope dimension count when it is smaller
# than the head size?
# ref: https://github.com/ggerganov/llama.cpp/pull/4889
# self.gguf_writer.add_rope_dimension_count(hidden_size // head_count)
self.gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2)
self.gguf_writer.add_rope_dimension_count(hidden_size // head_count)
self.gguf_writer.add_head_count(head_count)
self.gguf_writer.add_head_count_kv(head_count_kv)
self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
@@ -873,7 +844,7 @@ class StableLMModel(Model):
hparams = self.hparams
block_count = hparams["num_hidden_layers"]
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_name(dir_model.name)
self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
self.gguf_writer.add_block_count(block_count)
@@ -919,7 +890,7 @@ class QwenModel(Model):
tokens: list[bytearray] = []
toktypes: list[int] = []
from transformers import AutoTokenizer
from transformers import AutoTokenizer # type: ignore[attr-defined]
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
vocab_size = hparams["vocab_size"]
assert max(tokenizer.get_vocab().values()) < vocab_size
@@ -1014,182 +985,32 @@ class QwenModel(Model):
self.gguf_writer.add_tensor(new_name, data)
class GPT2Model(Model):
def set_gguf_parameters(self):
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_block_count(self.hparams["n_layer"])
self.gguf_writer.add_context_length(self.hparams["n_ctx"])
self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
self.gguf_writer.add_head_count(self.hparams["n_head"])
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
self.gguf_writer.add_file_type(self.ftype)
def write_tensors(self):
block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
for name, data_torch in self.get_tensors():
# we don't need these
if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq", ".attn.bias")):
continue
if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight")):
data_torch = data_torch.transpose(1, 0)
old_dtype = data_torch.dtype
# convert any unsupported data types to float32
if data_torch.dtype not in (torch.float16, torch.float32):
data_torch = data_torch.to(torch.float32)
data = data_torch.squeeze().numpy()
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None:
print(f"Can not map tensor {name!r}")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if self.ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data)
# note: GPT2 output is tied to (same as) wte in original model
if new_name == "token_embd.weight":
print(f"output.weight, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor("output.weight", data)
class Phi2Model(Model):
def set_gguf_parameters(self):
block_count = get_key_opts(self.hparams, ["num_hidden_layers", "n_layer"])
rot_pct = get_key_opts(self.hparams, ["partial_rotary_factor"])
n_embd = get_key_opts(self.hparams, ["hidden_size", "n_embd"])
n_head = get_key_opts(self.hparams, ["num_attention_heads", "n_head"])
block_count = self.hparams["n_layer"]
self.gguf_writer.add_name("Phi2")
self.gguf_writer.add_context_length(get_key_opts(self.hparams, ["n_positions", "max_position_embeddings"]))
self.gguf_writer.add_embedding_length(n_embd)
self.gguf_writer.add_feed_forward_length(4 * n_embd)
self.gguf_writer.add_context_length(self.hparams["n_positions"])
self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
self.gguf_writer.add_block_count(block_count)
self.gguf_writer.add_head_count(n_head)
self.gguf_writer.add_head_count_kv(n_head)
self.gguf_writer.add_layer_norm_eps(get_key_opts(self.hparams, ["layer_norm_epsilon", "layer_norm_eps"]))
self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
self.gguf_writer.add_head_count(self.hparams["n_head"])
self.gguf_writer.add_head_count_kv(self.hparams["n_head"])
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
self.gguf_writer.add_rope_dimension_count(self.hparams["rotary_dim"])
self.gguf_writer.add_file_type(self.ftype)
self.gguf_writer.add_add_bos_token(False)
class PlamoModel(Model):
def set_vocab(self):
self._set_vocab_sentencepiece()
def set_gguf_parameters(self):
hparams = self.hparams
block_count = hparams["num_hidden_layers"]
self.gguf_writer.add_name("PLaMo")
self.gguf_writer.add_context_length(4096) # not in config.json
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
self.gguf_writer.add_block_count(block_count)
self.gguf_writer.add_head_count(hparams["num_attention_heads"])
self.gguf_writer.add_head_count_kv(5) # hparams["num_key_value_heads"]) is wrong
self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
def shuffle_attn_q_weight(self, data_torch):
assert data_torch.size() == (5120, 5120)
data_torch = data_torch.reshape(8, 5, 128, 5120)
data_torch = torch.permute(data_torch, (1, 0, 2, 3))
data_torch = torch.reshape(data_torch, (5120, 5120))
return data_torch
def shuffle_attn_output_weight(self, data_torch):
assert data_torch.size() == (5120, 5120)
data_torch = data_torch.reshape(5120, 8, 5, 128)
data_torch = torch.permute(data_torch, (0, 2, 1, 3))
data_torch = torch.reshape(data_torch, (5120, 5120))
return data_torch
def write_tensors(self):
block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
for name, data_torch in self.get_tensors():
if "self_attn.rotary_emb.inv_freq" in name:
continue
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None:
print(f"Can not map tensor {name!r}")
sys.exit()
# shuffle for broadcasting of gqa in ggml_mul_mat
if new_name.endswith("attn_q.weight"):
data_torch = self.shuffle_attn_q_weight(data_torch)
elif new_name.endswith("attn_output.weight"):
data_torch = self.shuffle_attn_output_weight(data_torch)
old_dtype = data_torch.dtype
# convert any unsupported data types to float32
if data_torch.dtype not in (torch.float16, torch.float32):
data_torch = data_torch.to(torch.float32)
data = data_torch.squeeze().numpy()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if self.ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data)
###### CONVERSION LOGIC ######
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(
description="Convert a huggingface model to a GGML compatible file")
parser = argparse.ArgumentParser(description="Convert a huggingface model to a GGML compatible file")
parser.add_argument(
"--vocab-only", action="store_true",
help="extract only the vocab",
)
parser.add_argument(
"--awq-path", type=Path, default=None,
help="Path to scale awq cache file")
parser.add_argument(
"--outfile", type=Path,
help="path to write to; default: based on input",
@@ -1207,62 +1028,43 @@ def parse_args() -> argparse.Namespace:
return parser.parse_args()
def main() -> None:
args = parse_args()
args = parse_args()
dir_model = args.model
dir_model = args.model
if not dir_model.is_dir():
print(f'Error: {args.model} is not a directory', file=sys.stderr)
sys.exit(1)
if args.awq_path:
sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
from awq.apply_awq import add_scale_weights
tmp_model_path = args.model / "weighted_model"
dir_model = tmp_model_path
if tmp_model_path.is_dir():
print(f"{tmp_model_path} exists as a weighted model.")
else:
tmp_model_path.mkdir(parents=True, exist_ok=True)
print("Saving new weighted model ...")
add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path))
print(f"Saved weighted model at {tmp_model_path}.")
ftype_map = {
"f32": gguf.GGMLQuantizationType.F32,
"f16": gguf.GGMLQuantizationType.F16,
}
if not dir_model.is_dir():
print(f'Error: {args.model} is not a directory', file=sys.stderr)
sys.exit(1)
if args.outfile is not None:
fname_out = args.outfile
else:
# output in the same directory as the model by default
fname_out = dir_model / f'ggml-model-{args.outtype}.gguf'
ftype_map = {
"f32": gguf.GGMLQuantizationType.F32,
"f16": gguf.GGMLQuantizationType.F16,
}
print(f"Loading model: {dir_model.name}")
if args.outfile is not None:
fname_out = args.outfile
hparams = Model.load_hparams(dir_model)
with torch.inference_mode():
model_class = Model.from_model_architecture(hparams["architectures"][0])
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)
print("Set model parameters")
model_instance.set_gguf_parameters()
print("Set model tokenizer")
model_instance.set_vocab()
if args.vocab_only:
print(f"Exporting model vocab to '{fname_out}'")
model_instance.write_vocab()
else:
# output in the same directory as the model by default
fname_out = dir_model / f'ggml-model-{args.outtype}.gguf'
print(f"Exporting model to '{fname_out}'")
model_instance.write()
print(f"Loading model: {dir_model.name}")
hparams = Model.load_hparams(dir_model)
with torch.inference_mode():
model_class = Model.from_model_architecture(hparams["architectures"][0])
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)
print("Set model parameters")
model_instance.set_gguf_parameters()
print("Set model tokenizer")
model_instance.set_vocab()
if args.vocab_only:
print(f"Exporting model vocab to '{fname_out}'")
model_instance.write_vocab()
else:
print(f"Exporting model to '{fname_out}'")
model_instance.write()
print(f"Model successfully exported to '{fname_out}'")
if __name__ == '__main__':
main()
print(f"Model successfully exported to '{fname_out}'")

View File

@@ -47,96 +47,95 @@ def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_ty
fout.seek((fout.tell() + 31) & -32)
if __name__ == '__main__':
if len(sys.argv) < 2:
print(f"Usage: python {sys.argv[0]} <path> [arch]")
print(
"Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
)
print(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)")
sys.exit(1)
if len(sys.argv) < 2:
print(f"Usage: python {sys.argv[0]} <path> [arch]")
print(
"Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
)
print(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)")
sys.exit(1)
input_json = os.path.join(sys.argv[1], "adapter_config.json")
input_model = os.path.join(sys.argv[1], "adapter_model.bin")
output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")
input_json = os.path.join(sys.argv[1], "adapter_config.json")
input_model = os.path.join(sys.argv[1], "adapter_model.bin")
output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")
model = torch.load(input_model, map_location="cpu")
arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"
model = torch.load(input_model, map_location="cpu")
arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"
if arch_name not in gguf.MODEL_ARCH_NAMES.values():
print(f"Error: unsupported architecture {arch_name}")
sys.exit(1)
if arch_name not in gguf.MODEL_ARCH_NAMES.values():
print(f"Error: unsupported architecture {arch_name}")
sys.exit(1)
arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)]
name_map = gguf.TensorNameMap(arch, 200) # 200 layers ought to be enough for anyone
arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)]
name_map = gguf.TensorNameMap(arch, 200) # 200 layers ought to be enough for anyone
with open(input_json, "r") as f:
params = json.load(f)
with open(input_json, "r") as f:
params = json.load(f)
if params["peft_type"] != "LORA":
print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
sys.exit(1)
if params["peft_type"] != "LORA":
print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
sys.exit(1)
if params["fan_in_fan_out"] is True:
print("Error: param fan_in_fan_out is not supported")
sys.exit(1)
if params["fan_in_fan_out"] is True:
print("Error: param fan_in_fan_out is not supported")
sys.exit(1)
if params["bias"] is not None and params["bias"] != "none":
print("Error: param bias is not supported")
sys.exit(1)
if params["bias"] is not None and params["bias"] != "none":
print("Error: param bias is not supported")
sys.exit(1)
# TODO: these seem to be layers that have been trained but without lora.
# doesn't seem widely used but eventually should be supported
if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0:
print("Error: param modules_to_save is not supported")
sys.exit(1)
# TODO: these seem to be layers that have been trained but without lora.
# doesn't seem widely used but eventually should be supported
if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0:
print("Error: param modules_to_save is not supported")
sys.exit(1)
with open(output_path, "wb") as fout:
fout.truncate()
with open(output_path, "wb") as fout:
fout.truncate()
write_file_header(fout, params)
for k, v in model.items():
orig_k = k
if k.endswith(".default.weight"):
k = k.replace(".default.weight", ".weight")
if k in ["llama_proj.weight", "llama_proj.bias"]:
continue
if k.endswith("lora_A.weight"):
if v.dtype != torch.float16 and v.dtype != torch.float32:
v = v.float()
v = v.T
else:
write_file_header(fout, params)
for k, v in model.items():
orig_k = k
if k.endswith(".default.weight"):
k = k.replace(".default.weight", ".weight")
if k in ["llama_proj.weight", "llama_proj.bias"]:
continue
if k.endswith("lora_A.weight"):
if v.dtype != torch.float16 and v.dtype != torch.float32:
v = v.float()
v = v.T
else:
v = v.float()
t = v.detach().numpy()
t = v.detach().numpy()
prefix = "base_model.model."
if k.startswith(prefix):
k = k[len(prefix) :]
prefix = "base_model.model."
if k.startswith(prefix):
k = k[len(prefix) :]
lora_suffixes = (".lora_A.weight", ".lora_B.weight")
if k.endswith(lora_suffixes):
suffix = k[-len(lora_suffixes[0]):]
k = k[: -len(lora_suffixes[0])]
else:
print(f"Error: unrecognized tensor name {orig_k}")
sys.exit(1)
lora_suffixes = (".lora_A.weight", ".lora_B.weight")
if k.endswith(lora_suffixes):
suffix = k[-len(lora_suffixes[0]):]
k = k[: -len(lora_suffixes[0])]
else:
print(f"Error: unrecognized tensor name {orig_k}")
sys.exit(1)
tname = name_map.get_name(k)
if tname is None:
print(f"Error: could not map tensor name {orig_k}")
print(" Note: the arch parameter must be specified if the model is not llama")
sys.exit(1)
tname = name_map.get_name(k)
if tname is None:
print(f"Error: could not map tensor name {orig_k}")
print(" Note: the arch parameter must be specified if the model is not llama")
sys.exit(1)
if suffix == ".lora_A.weight":
tname += ".weight.loraA"
elif suffix == ".lora_B.weight":
tname += ".weight.loraB"
else:
assert False
if suffix == ".lora_A.weight":
tname += ".weight.loraA"
elif suffix == ".lora_B.weight":
tname += ".weight.loraB"
else:
assert False
print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
write_tensor_header(fout, tname, t.shape, t.dtype)
t.tofile(fout)
print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
write_tensor_header(fout, tname, t.shape, t.dtype)
t.tofile(fout)
print(f"Converted {input_json} and {input_model} to {output_path}")
print(f"Converted {input_json} and {input_model} to {output_path}")

1
convert-persimmon-to-gguf.py Executable file → Normal file
View File

@@ -1,4 +1,3 @@
#!/usr/bin/env python3
import torch
import os
from pprint import pprint

File diff suppressed because it is too large Load Diff

View File

@@ -31,12 +31,12 @@ else()
add_subdirectory(quantize-stats)
add_subdirectory(save-load-state)
add_subdirectory(simple)
add_subdirectory(passkey)
add_subdirectory(speculative)
add_subdirectory(lookahead)
add_subdirectory(lookup)
add_subdirectory(train-text-from-scratch)
add_subdirectory(imatrix)
if (LLAMA_METAL)
add_subdirectory(metal)
endif()
if (LLAMA_BUILD_SERVER)
add_subdirectory(server)
endif()

View File

@@ -575,7 +575,10 @@ static struct ggml_tensor * forward(
// KQ_scaled = KQ / sqrt(n_embd/n_head)
// KQ_scaled shape [n_past + N, N, n_head, 1]
struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head));
struct ggml_tensor * KQ_scaled =
ggml_scale(ctx0,
KQ,
ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
// KQ_masked = mask_past(KQ_scaled)
// KQ_masked shape [n_past + N, N, n_head, 1]
@@ -841,7 +844,10 @@ static struct ggml_tensor * forward_batch(
// KQ_scaled = KQ / sqrt(n_embd/n_head)
// KQ_scaled shape [n_past + N, N, n_head, n_batch]
struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head));
struct ggml_tensor * KQ_scaled =
ggml_scale(ctx0,
KQ,
ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
assert_shape_4d(KQ_scaled, n_past + N, N, n_head, n_batch);
// KQ_masked = mask_past(KQ_scaled)
@@ -1125,7 +1131,10 @@ static struct ggml_tensor * forward_lora(
// KQ_scaled = KQ / sqrt(n_embd/n_head)
// KQ_scaled shape [n_past + N, N, n_head, 1]
struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head));
struct ggml_tensor * KQ_scaled =
ggml_scale(ctx0,
KQ,
ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
// KQ_masked = mask_past(KQ_scaled)
// KQ_masked shape [n_past + N, N, n_head, 1]

View File

@@ -1,61 +0,0 @@
#!/bin/bash
#
# Few-shot translation example.
# Requires a base model (i.e. no fine-tuned or instruct models).
#
# Usage:
#
# cd llama.cpp
# make -j
#
# ./examples/base-translate.sh <model-base> "<text>" [extra-main-args]
#
if [ $# -lt 2 ]; then
echo "Usage: ./base-translate.sh <model-base> \"<text>\" [extra-main-args]"
exit 1
fi
eargs=""
if [ $# -gt 2 ]; then
eargs="${@:3}"
fi
ftmp="__llama.cpp_example_tmp__.txt"
trap "rm -f $ftmp" EXIT
echo "Translate from English to French:
===
sea otter, peppermint, plush girafe:
sea otter => loutre de mer
peppermint => menthe poivrée
plush girafe => girafe peluche
===
violin
violin => violon
===
phone, computer, mouse, keyboard:
phone => téléphone
computer => ordinateur
mouse => souris
keyboard => clavier
===
" > $ftmp
echo "$2
" >> $ftmp
model=$1
# generate the most likely continuation until the string "===" is found
./main -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs

View File

@@ -88,10 +88,7 @@ int main(int argc, char ** argv) {
llama_model_params model_params = llama_model_default_params();
const std::vector<float> t_split (LLAMA_MAX_DEVICES, 0.0f);
model_params.n_gpu_layers = n_gpu_layers;
model_params.tensor_split = t_split.data();
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);

View File

@@ -69,7 +69,6 @@ int main(int argc, char ** argv) {
std::vector<llama_token> tokens_list;
tokens_list = ::llama_tokenize(model, params.prompt, true);
const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size())*n_parallel;
// initialize the context

View File

@@ -194,7 +194,7 @@ int main(int argc, char ** argv) {
// Set up a the benchmark matrices
// printf("Creating new tensor q11 & Running quantize\n");
struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements/m11->ne[0], m11->ne[0], hist_cur.data(), nullptr);
ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements, hist_cur.data());
// Set up a the compute graph
// printf("Creating new tensor q31\n");
@@ -207,7 +207,7 @@ int main(int argc, char ** argv) {
// Set up a second graph computation to make sure we override the CPU cache lines
// printf("Creating new tensor q12 & Running quantize\n");
struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements/m12->ne[0], m12->ne[0], hist_cur.data(), nullptr);
ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements, hist_cur.data());
// printf("Creating new tensor q32\n");
struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);

View File

@@ -245,8 +245,9 @@ static struct lora_data * load_lora(struct lora_info * info) {
params_ggml.no_alloc = true;
result->ctx = ggml_init(params_ggml);
uint32_t LLAMA_FILE_MAGIC_LORA = 0x67676C61; // 'ggla'
uint32_t magic = file.read_u32();
if (magic != LLAMA_FILE_MAGIC_GGLA) {
if (magic != LLAMA_FILE_MAGIC_LORA) {
die_fmt("unexpected lora header file magic in '%s'", info->filename.c_str());
}
uint32_t version = file.read_u32();
@@ -308,7 +309,7 @@ static struct ggml_cgraph * build_graph_lora(
) {
struct ggml_tensor * ab = ggml_mul_mat(ctx, lora_a, lora_b);
if (scaling != 1.0f) {
ab = ggml_scale(ctx, ab, scaling);
ab = ggml_scale(ctx, ab, ggml_new_f32(ctx, scaling));
}
struct ggml_tensor * res = ggml_add_inplace(ctx, tensor, ab);

View File

@@ -61,7 +61,7 @@ For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' L
--lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin
```
The scale numbers don't need to add up to one, and you can also use numbers greater than 1 to further increase the influence of an adapter. But making the values too big will sometimes result in worse output. Play around to find good values.
The scale numbers don't need to add up to one, and you can also use numbers greater than 1 to further increase the influence of an adapter. But making the values to big will sometimes result in worse output. Play around to find good values.
Gradient checkpointing reduces the memory requirements by ~50% but increases the runtime.
If you have enough RAM, you can make finetuning a bit faster by disabling checkpointing with `--no-checkpointing`.

View File

@@ -3,9 +3,15 @@
#include "llama.h"
#include "common.h"
#include "train.h"
#include <unordered_map>
#include <vector>
#include <cassert>
#include <climits>
#include <cstring>
#include <cstdarg>
#include <ctime>
#include <random>
#include <stdexcept>
#include <algorithm>
#include <string>
@@ -190,13 +196,13 @@ static const char * LLM_TENSOR_FFN_DOWN = "blk.%d.ffn_down";
static const char * LLM_TENSOR_FFN_UP = "blk.%d.ffn_up";
static void print_params(struct my_llama_hparams * params) {
printf("%s: n_vocab : %u\n", __func__, params->n_vocab);
printf("%s: n_ctx : %u\n", __func__, params->n_ctx);
printf("%s: n_embd : %u\n", __func__, params->n_embd);
printf("%s: n_ff : %u\n", __func__, params->n_ff);
printf("%s: n_head : %u\n", __func__, params->n_head);
printf("%s: n_head_kv : %u\n", __func__, params->n_head_kv);
printf("%s: n_layer : %u\n", __func__, params->n_layer);
printf("%s: n_vocab: %u\n", __func__, params->n_vocab);
printf("%s: n_ctx: %u\n", __func__, params->n_ctx);
printf("%s: n_embd: %u\n", __func__, params->n_embd);
printf("%s: n_ff: %u\n", __func__, params->n_ff);
printf("%s: n_head: %u\n", __func__, params->n_head);
printf("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
printf("%s: n_layer: %u\n", __func__, params->n_layer);
printf("%s: norm_rms_eps : %f\n", __func__, params->f_norm_rms_eps);
printf("%s: rope_freq_base : %f\n", __func__, params->rope_freq_base);
printf("%s: rope_freq_scale : %f\n", __func__, params->rope_freq_scale);
@@ -263,7 +269,7 @@ static void load_model_hparams_gguf(struct gguf_context * ctx, struct my_llama_h
float rope_freq_scale = 1.0f;
GGUF_GET_KEY(ctx, hparams->f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
GGUF_GET_KEY(ctx, hparams->rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
GGUF_GET_KEY(ctx, rope_freq_scale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
GGUF_GET_KEY(ctx, rope_freq_scale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
if (rope_freq_scale != 1.0f) {
hparams->rope_freq_scale = 1.0f / rope_freq_scale;
}
@@ -606,7 +612,6 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
const int n_rot = hparams.n_embd_head();
const int n_embd_head = hparams.n_embd_head();
const int n_embd_gqa = hparams.n_embd_gqa();
const float rms_norm_eps = hparams.f_norm_rms_eps;
const float rope_freq_base = hparams.rope_freq_base;
const float rope_freq_scale = hparams.rope_freq_scale;
@@ -675,7 +680,10 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
checkpoints.push_back(t01);
}
const float kv_scale = 1.0f/sqrtf(float(n_embd)/n_head);
struct ggml_tensor * kv_scale = NULL;
if (!enable_flash_attn) {
kv_scale = ggml_new_f32(ctx, 1.0f/sqrtf(float(n_embd)/n_head));
}
for (int il = 0; il < n_layer; ++il) {
struct my_llama_layer & layer = model->layers[il];
@@ -773,32 +781,32 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
// make sure some tensors are not reallocated by inserting new temporary nodes depending on them
int n_leafs_before = gb->n_leafs;
int n_nodes_before = gb->n_nodes;
struct ggml_tensor * one = ggml_new_f32(ctx, 1.0f);
// output tensors
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, 1.0f));
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, 1.0f));
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, one));
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, one));
// input gradient
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, 1.0f));
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, one));
GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL);
ggml_allocr_alloc(alloc, t36->grad);
// KQ_pos
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, 1.0f));
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, one));
// make sure base model tensors data cannot be used in viewable operations
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->tok_embeddings, 1.0f));
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->norm, 1.0f));
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->output, 1.0f));
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->tok_embeddings, one));
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->norm, one));
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->output, one));
for (int il = 0; il < n_layer; ++il) {
struct my_llama_layer & layer = model->layers[il];
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.attention_norm, 1.0f));
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_norm, 1.0f));
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wq, 1.0f));
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wk, 1.0f));
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wv, 1.0f));
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wo, 1.0f));
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w1, 1.0f));
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w2, 1.0f));
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w3, 1.0f));
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.attention_norm, one));
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_norm, one));
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wq, one));
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wk, one));
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wv, one));
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wo, one));
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w1, one));
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w2, one));
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w3, one));
}
// allocating checkpoints in one block to reduce memory fragmentation
@@ -1138,8 +1146,9 @@ static void save_as_llama_lora(const char * filename, struct my_llama_lora * lor
return tn_buf.data();
};
uint32_t LLAMA_FILE_MAGIC_LORA = 0x67676C61; // 'ggla'
// write_magic
file.write_u32(LLAMA_FILE_MAGIC_GGLA); // magic
file.write_u32(LLAMA_FILE_MAGIC_LORA); // magic
file.write_u32(1); // version
// write_hparams
file.write_u32(lora->hparams.lora_r);
@@ -1799,7 +1808,7 @@ int main(int argc, char ** argv) {
std::vector<llama_token> train_tokens;
std::vector<size_t> train_samples_begin;
std::vector<size_t> train_samples_size;
printf("%s: tokenize training data from %s\n", __func__, params.common.fn_train_data);
printf("%s: tokenize training data\n", __func__);
tokenize_file(lctx,
params.common.fn_train_data,
params.common.sample_start,

View File

@@ -1,5 +1,5 @@
set(TARGET gguf)
add_executable(${TARGET} gguf.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View File

@@ -1,4 +1,5 @@
#include "ggml.h"
#include "llama.h"
#include <cstdio>
#include <cinttypes>

View File

@@ -1,5 +0,0 @@
set(TARGET imatrix)
add_executable(${TARGET} imatrix.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View File

@@ -1,380 +0,0 @@
#include "common.h"
#include "llama.h"
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <sstream>
#include <thread>
#include <mutex>
#include <vector>
#include <fstream>
#include <unordered_map>
#include <algorithm>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
struct Stats {
std::vector<float> values;
int ncall = 0;
};
struct StatParams {
std::string ofile = "imatrix.dat";
int n_output_frequency = 10;
int verbosity = 1;
bool collect_output_weight = false;
};
class IMatrixCollector {
public:
IMatrixCollector() = default;
void set_parameters(StatParams&& params) { m_params = std::move(params); }
void collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1);
void save_imatrix() const;
private:
std::unordered_map<std::string, Stats> m_stats;
StatParams m_params;
std::mutex m_mutex;
int m_last_call = 0;
};
void IMatrixCollector::collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1) {
if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return;
if (!(strncmp(src0->name, "blk.", 4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return;
std::lock_guard<std::mutex> lock(m_mutex);
auto& e = m_stats[src0->name];
if (e.values.empty()) {
e.values.resize(src1->ne[0], 0);
}
else if (e.values.size() != (size_t)src1->ne[0]) {
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]);
exit(1); //GGML_ASSERT(false);
}
++e.ncall;
if (m_params.verbosity > 1) {
printf("%s[%d]: %s, %d x %d, %d\n",__func__,m_last_call,src0->name,(int)src1->ne[0],(int)src1->ne[1],(int)src1->type);
}
for (int row = 0; row < (int)src1->ne[1]; ++row) {
const float * x = (const float *)src1->data + row * src1->ne[0];
for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.values[j] += x[j]*x[j];
}
}
if (e.ncall > m_last_call) {
m_last_call = e.ncall;
if (m_last_call % m_params.n_output_frequency == 0) {
save_imatrix();
}
}
}
void IMatrixCollector::save_imatrix() const {
const char * fname = m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str();
std::ofstream out(fname, std::ios::binary);
int n_entries = m_stats.size();
out.write((const char*)&n_entries, sizeof(n_entries));
for (auto& p : m_stats) {
int len = p.first.size();
out.write((const char*)&len, sizeof(len));
out.write(p.first.c_str(), len);
out.write((const char*)&p.second.ncall, sizeof(p.second.ncall));
int nval = p.second.values.size();
out.write((const char*)&nval, sizeof(nval));
if (nval > 0) out.write((const char*)p.second.values.data(), nval*sizeof(float));
}
if (m_params.verbosity > 0) {
fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n",__func__,m_last_call,fname);
}
}
static IMatrixCollector g_collector;
static void ik_collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1) {
g_collector.collect_imatrix(src0, src1);
}
struct results_log_softmax {
double log_softmax;
float logit;
float prob;
};
static std::vector<float> softmax(const std::vector<float>& logits) {
std::vector<float> probs(logits.size());
float max_logit = logits[0];
for (float v : logits) {
max_logit = std::max(max_logit, v);
}
double sum_exp = 0.0;
for (size_t i = 0; i < logits.size(); i++) {
// Subtract the maximum logit value from the current logit value for numerical stability
const float logit = logits[i] - max_logit;
const float exp_logit = expf(logit);
sum_exp += exp_logit;
probs[i] = exp_logit;
}
for (size_t i = 0; i < probs.size(); i++) {
probs[i] /= sum_exp;
}
return probs;
}
static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
float max_logit = logits[0];
for (int i = 1; i < n_vocab; ++i) {
max_logit = std::max(max_logit, logits[i]);
}
double sum_exp = 0.0;
for (int i = 0; i < n_vocab; ++i) {
sum_exp += expf(logits[i] - max_logit);
}
return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp};
}
static void process_logits(
int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
double & nll, double & nll2, float * logit_history, float * prob_history
) {
std::mutex mutex;
int counter = 0;
auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
double local_nll = 0;
double local_nll2 = 0;
while (true) {
std::unique_lock<std::mutex> lock(mutex);
int i = counter++;
if (i >= n_token) {
nll += local_nll; nll2 += local_nll2;
break;
}
lock.unlock();
const results_log_softmax results = log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]);
const double v = -results.log_softmax;
local_nll += v;
local_nll2 += v*v;
logit_history[i] = results.logit;
prob_history[i] = results.prob;
}
};
for (auto & w : workers) {
w = std::thread(compute);
}
compute();
for (auto & w : workers) {
w.join();
}
}
static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
const int n_ctx = llama_n_ctx(ctx);
auto tim1 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
auto tim2 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
if (int(tokens.size()) < 2*n_ctx) {
fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n",__func__,2*n_ctx,
n_ctx);
fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
return false;
}
std::vector<float> logit_history;
logit_history.resize(tokens.size());
std::vector<float> prob_history;
prob_history.resize(tokens.size());
const int n_chunk_max = tokens.size() / n_ctx;
const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const int n_batch = params.n_batch;
int count = 0;
double nll = 0.0;
double nll2 = 0.0;
fprintf(stderr, "%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch);
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
for (int i = 0; i < n_chunk; ++i) {
const int start = i * n_ctx;
const int end = start + n_ctx;
const int num_batches = (n_ctx + n_batch - 1) / n_batch;
std::vector<float> logits;
const auto t_start = std::chrono::high_resolution_clock::now();
// clear the KV cache
llama_kv_cache_clear(ctx);
for (int j = 0; j < num_batches; ++j) {
const int batch_start = start + j * n_batch;
const int batch_size = std::min(end - batch_start, n_batch);
// save original token and restore it after eval
const auto token_org = tokens[batch_start];
// add BOS token for the first batch of each chunk
if (add_bos && j == 0) {
tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
}
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return false;
}
// restore the original token in case it was set to BOS
tokens[batch_start] = token_org;
const auto * batch_logits = llama_get_logits(ctx);
logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
}
const auto t_end = std::chrono::high_resolution_clock::now();
if (i == 0) {
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
int total_seconds = (int)(t_total * n_chunk);
if (total_seconds >= 60*60) {
fprintf(stderr, "%d hours ", total_seconds / (60*60));
total_seconds = total_seconds % (60*60);
}
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
}
const int first = n_ctx/2;
process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
count += n_ctx - first - 1;
printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
fflush(stdout);
}
printf("\n");
nll2 /= count;
nll /= count;
const double ppl = exp(nll);
nll2 -= nll * nll;
if (nll2 > 0) {
nll2 = sqrt(nll2/(count-1));
printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
} else {
printf("Unexpected negative standard deviation of log(prob)\n");
}
return true;
}
int main(int argc, char ** argv) {
StatParams sparams;
std::vector<char*> args;
args.push_back(argv[0]);
int iarg = 1;
for (; iarg < argc-1; ++iarg) {
std::string arg{argv[iarg]};
if (arg == "-o" || arg == "--output-file") {
sparams.ofile = argv[++iarg];
}
else if (arg == "-ofreq" || arg == "--output-frequency") {
sparams.n_output_frequency = std::stoi(argv[++iarg]);
}
else if (arg == "-ow" || arg == "--output-weight") {
sparams.collect_output_weight = std::stoi(argv[++iarg]);
}
else if (arg == "--verbosity") {
sparams.verbosity = std::stoi(argv[++iarg]);
} else {
args.push_back(argv[iarg]);
}
}
if (iarg < argc) {
args.push_back(argv[iarg]);
}
gpt_params params;
params.n_batch = 512;
if (!gpt_params_parse(args.size(), args.data(), params)) {
return 1;
}
g_collector.set_parameters(std::move(sparams));
ggml_set_imatrix_collection(ik_collect_imatrix);
params.logits_all = true;
params.n_batch = std::min(params.n_batch, params.n_ctx);
print_build_info();
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed);
if (params.random_prompt) {
params.prompt = gpt_random_prompt(rng);
}
llama_backend_init(params.numa);
llama_model * model;
llama_context * ctx;
// load the model and apply lora adapter, if any
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__);
return 1;
}
const int n_ctx_train = llama_n_ctx_train(model);
if (params.n_ctx > n_ctx_train) {
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, params.n_ctx);
}
// print system information
{
fprintf(stderr, "\n");
fprintf(stderr, "%s\n", get_system_info(params).c_str());
}
bool OK = compute_imatrix(ctx, params);
if (!OK) {
return 1;
}
g_collector.save_imatrix();
llama_print_timings(ctx);
llama_free(ctx);
llama_free_model(model);
llama_backend_free();
return 0;
}

View File

@@ -128,25 +128,6 @@ static std::string get_gpu_info() {
// command line params
enum output_formats {CSV, JSON, MARKDOWN, SQL};
static const char * output_format_str(output_formats format) {
switch (format) {
case CSV: return "csv";
case JSON: return "json";
case MARKDOWN: return "md";
case SQL: return "sql";
default: GGML_ASSERT(!"invalid output format");
}
}
static const char * split_mode_str(llama_split_mode mode) {
switch (mode) {
case LLAMA_SPLIT_NONE: return "none";
case LLAMA_SPLIT_LAYER: return "layer";
case LLAMA_SPLIT_ROW: return "row";
default: GGML_ASSERT(!"invalid split mode");
}
}
struct cmd_params {
std::vector<std::string> model;
std::vector<int> n_prompt;
@@ -156,9 +137,7 @@ struct cmd_params {
std::vector<ggml_type> type_v;
std::vector<int> n_threads;
std::vector<int> n_gpu_layers;
std::vector<llama_split_mode> split_mode;
std::vector<int> main_gpu;
std::vector<bool> no_kv_offload;
std::vector<bool> mul_mat_q;
std::vector<std::array<float, LLAMA_MAX_DEVICES>> tensor_split;
int reps;
@@ -175,9 +154,7 @@ static const cmd_params cmd_params_defaults = {
/* type_v */ {GGML_TYPE_F16},
/* n_threads */ {get_num_physical_cores()},
/* n_gpu_layers */ {99},
/* split_mode */ {LLAMA_SPLIT_LAYER},
/* main_gpu */ {0},
/* no_kv_offload */ {false},
/* mul_mat_q */ {true},
/* tensor_split */ {{}},
/* reps */ 5,
@@ -190,22 +167,20 @@ static void print_usage(int /* argc */, char ** argv) {
printf("\n");
printf("options:\n");
printf(" -h, --help\n");
printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
printf(" -ctk <t>, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
printf(" -ctv <t>, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
printf(" -ts, --tensor_split <ts0/ts1/..> (default: 0)\n");
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
printf(" -ctk <t>, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
printf(" -ctv <t>, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
printf(" -ts, --tensor_split <ts0/ts1/..> \n");
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
printf(" -o, --output <csv|json|md|sql> (default: %s)\n", cmd_params_defaults.output_format == CSV ? "csv" : cmd_params_defaults.output_format == JSON ? "json" : cmd_params_defaults.output_format == MARKDOWN ? "md" : "sql");
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
printf("\n");
printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
}
@@ -328,41 +303,12 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
}
auto p = split<int>(argv[i], split_delim);
params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
} else if (arg == "-sm" || arg == "--split-mode") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split<std::string>(argv[i], split_delim);
std::vector<llama_split_mode> modes;
for (const auto & m : p) {
llama_split_mode mode;
if (m == "none") {
mode = LLAMA_SPLIT_NONE;
} else if (m == "layer") {
mode = LLAMA_SPLIT_LAYER;
} else if (m == "row") {
mode = LLAMA_SPLIT_ROW;
} else {
invalid_param = true;
break;
}
modes.push_back(mode);
}
params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end());
} else if (arg == "-mg" || arg == "--main-gpu") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.main_gpu = split<int>(argv[i], split_delim);
} else if (arg == "-nkvo" || arg == "--no-kv-offload") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split<bool>(argv[i], split_delim);
params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
} else if (arg == "-mmq" || arg == "--mul-mat-q") {
if (++i >= argc) {
invalid_param = true;
@@ -436,9 +382,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; }
if (params.type_v.empty()) { params.type_v = cmd_params_defaults.type_v; }
if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
if (params.split_mode.empty()) { params.split_mode = cmd_params_defaults.split_mode; }
if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
@@ -455,9 +399,7 @@ struct cmd_params_instance {
ggml_type type_v;
int n_threads;
int n_gpu_layers;
llama_split_mode split_mode;
int main_gpu;
bool no_kv_offload;
bool mul_mat_q;
std::array<float, LLAMA_MAX_DEVICES> tensor_split;
@@ -465,7 +407,6 @@ struct cmd_params_instance {
llama_model_params mparams = llama_model_default_params();
mparams.n_gpu_layers = n_gpu_layers;
mparams.split_mode = split_mode;
mparams.main_gpu = main_gpu;
mparams.tensor_split = tensor_split.data();
@@ -475,7 +416,6 @@ struct cmd_params_instance {
bool equal_mparams(const cmd_params_instance & other) const {
return model == other.model &&
n_gpu_layers == other.n_gpu_layers &&
split_mode == other.split_mode &&
main_gpu == other.main_gpu &&
tensor_split == other.tensor_split;
}
@@ -488,26 +428,54 @@ struct cmd_params_instance {
cparams.type_k = type_k;
cparams.type_v = type_v;
cparams.mul_mat_q = mul_mat_q;
cparams.offload_kqv = !no_kv_offload;
return cparams;
}
};
static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_params & params) {
static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_params & params, int n_gen, int n_prompt) {
std::vector<cmd_params_instance> instances;
// this ordering minimizes the number of times that each model needs to be reloaded
for (const auto & m : params.model)
for (const auto & nl : params.n_gpu_layers)
for (const auto & sm : params.split_mode)
for (const auto & mg : params.main_gpu)
for (const auto & ts : params.tensor_split)
for (const auto & nb : params.n_batch)
for (const auto & tk : params.type_k)
for (const auto & tv : params.type_v)
for (const auto & mmq : params.mul_mat_q)
for (const auto & nkvo : params.no_kv_offload)
for (const auto & nt : params.n_threads) {
cmd_params_instance instance = {
/* .model = */ m,
/* .n_prompt = */ n_prompt,
/* .n_gen = */ n_gen,
/* .n_batch = */ nb,
/* .type_k = */ tk,
/* .type_v = */ tv,
/* .n_threads = */ nt,
/* .n_gpu_layers = */ nl,
/* .main_gpu = */ mg,
/* .mul_mat_q = */ mmq,
/* .tensor_split = */ ts,
};
instances.push_back(instance);
}
return instances;
}
static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_params & params) {
std::vector<cmd_params_instance> instances;
#if 1
// this ordering minimizes the number of times that each model needs to be reloaded
for (const auto & m : params.model)
for (const auto & nl : params.n_gpu_layers)
for (const auto & mg : params.main_gpu)
for (const auto & ts : params.tensor_split)
for (const auto & nb : params.n_batch)
for (const auto & tk : params.type_k)
for (const auto & tv : params.type_v)
for (const auto & mmq : params.mul_mat_q)
for (const auto & nt : params.n_threads) {
for (const auto & n_prompt : params.n_prompt) {
if (n_prompt == 0) {
@@ -522,9 +490,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .type_v = */ tv,
/* .n_threads = */ nt,
/* .n_gpu_layers = */ nl,
/* .split_mode = */ sm,
/* .main_gpu = */ mg,
/* .no_kv_offload= */ nkvo,
/* .mul_mat_q = */ mmq,
/* .tensor_split = */ ts,
};
@@ -544,15 +510,31 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .type_v = */ tv,
/* .n_threads = */ nt,
/* .n_gpu_layers = */ nl,
/* .split_mode = */ sm,
/* .main_gpu = */ mg,
/* .no_kv_offload= */ nkvo,
/* .mul_mat_q = */ mmq,
/* .tensor_split = */ ts,
};
instances.push_back(instance);
}
}
#else
// this ordering separates the prompt and generation tests
for (const auto & n_prompt : params.n_prompt) {
if (n_prompt == 0) {
continue;
}
auto instances_prompt = get_cmd_params_instances_int(params, 0, n_prompt);
instances.insert(instances.end(), instances_prompt.begin(), instances_prompt.end());
}
for (const auto & n_gen : params.n_gen) {
if (n_gen == 0) {
continue;
}
auto instances_gen = get_cmd_params_instances_int(params, n_gen, 0);
instances.insert(instances.end(), instances_gen.begin(), instances_gen.end());
}
#endif
return instances;
}
@@ -576,9 +558,7 @@ struct test {
ggml_type type_k;
ggml_type type_v;
int n_gpu_layers;
llama_split_mode split_mode;
int main_gpu;
bool no_kv_offload;
bool mul_mat_q;
std::array<float, LLAMA_MAX_DEVICES> tensor_split;
int n_prompt;
@@ -598,9 +578,7 @@ struct test {
type_k = inst.type_k;
type_v = inst.type_v;
n_gpu_layers = inst.n_gpu_layers;
split_mode = inst.split_mode;
main_gpu = inst.main_gpu;
no_kv_offload = inst.no_kv_offload;
mul_mat_q = inst.mul_mat_q;
tensor_split = inst.tensor_split;
n_prompt = inst.n_prompt;
@@ -662,9 +640,7 @@ struct test {
"cpu_info", "gpu_info",
"model_filename", "model_type", "model_size", "model_n_params",
"n_batch", "n_threads", "type_k", "type_v",
"n_gpu_layers", "split_mode",
"main_gpu", "no_kv_offload",
"mul_mat_q", "tensor_split",
"n_gpu_layers", "main_gpu", "mul_mat_q", "tensor_split",
"n_prompt", "n_gen", "test_time",
"avg_ns", "stddev_ns",
"avg_ts", "stddev_ts"
@@ -683,7 +659,7 @@ struct test {
return INT;
}
if (field == "cuda" || field == "opencl" || field == "metal" || field == "gpu_blas" || field == "blas" ||
field == "f16_kv" || field == "no_kv_offload" || field == "mul_mat_q") {
field == "f16_kv" || field == "mul_mat_q") {
return BOOL;
}
if (field == "avg_ts" || field == "stddev_ts") {
@@ -714,9 +690,7 @@ struct test {
cpu_info, gpu_info,
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
std::to_string(n_gpu_layers), split_mode_str(split_mode),
std::to_string(main_gpu), std::to_string(no_kv_offload),
std::to_string(mul_mat_q), tensor_split_str,
std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), tensor_split_str,
std::to_string(n_prompt), std::to_string(n_gen), test_time,
std::to_string(avg_ns()), std::to_string(stdev_ns()),
std::to_string(avg_ts()), std::to_string(stdev_ts())
@@ -871,18 +845,12 @@ struct markdown_printer : public printer {
if (field == "n_gpu_layers") {
return "ngl";
}
if (field == "split_mode") {
return "sm";
}
if (field == "n_threads") {
return "threads";
}
if (field == "mul_mat_q") {
return "mmq";
}
if (field == "no_kv_offload") {
return "nkvo";
}
if (field == "tensor_split") {
return "ts";
}
@@ -914,15 +882,9 @@ struct markdown_printer : public printer {
if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
fields.push_back("main_gpu");
}
if (params.split_mode.size() > 1 || params.split_mode != cmd_params_defaults.split_mode) {
fields.push_back("split_mode");
}
if (params.mul_mat_q.size() > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q) {
fields.push_back("mul_mat_q");
}
if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) {
fields.push_back("no_kv_offload");
}
if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
fields.push_back("tensor_split");
}

View File

@@ -1,33 +0,0 @@
# Gradle files
.gradle/
build/
# Local configuration file (sdk path, etc)
local.properties
# Log/OS Files
*.log
# Android Studio generated files and folders
captures/
.externalNativeBuild/
.cxx/
*.apk
output.json
# IntelliJ
*.iml
.idea/
misc.xml
deploymentTargetDropDown.xml
render.experimental.xml
# Keystore files
*.jks
*.keystore
# Google Services (e.g. APIs or Firebase)
google-services.json
# Android Profiling
*.hprof

View File

@@ -1 +0,0 @@
/build

View File

@@ -1,91 +0,0 @@
plugins {
id("com.android.application")
id("org.jetbrains.kotlin.android")
}
android {
namespace = "com.example.llama"
compileSdk = 34
ndkVersion = "26.1.10909125"
defaultConfig {
applicationId = "com.example.llama"
minSdk = 33
targetSdk = 34
versionCode = 1
versionName = "1.0"
testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner"
vectorDrawables {
useSupportLibrary = true
}
ndk {
// Workaround for https://github.com/llvm/llvm-project/issues/65820
// affecting armeabi-v7a. Skip armeabi-v7a when invoked with
// -Pskip-armeabi-v7a (e.g., ./gradlew build -Pskip-armeabi-v7a).
if (project.hasProperty("skip-armeabi-v7a")) {
abiFilters += listOf("arm64-v8a", "x86_64", "x86")
}
}
externalNativeBuild {
cmake {
cppFlags += listOf()
arguments += listOf()
}
}
}
buildTypes {
release {
isMinifyEnabled = false
proguardFiles(
getDefaultProguardFile("proguard-android-optimize.txt"),
"proguard-rules.pro"
)
}
}
compileOptions {
sourceCompatibility = JavaVersion.VERSION_1_8
targetCompatibility = JavaVersion.VERSION_1_8
}
kotlinOptions {
jvmTarget = "1.8"
}
buildFeatures {
compose = true
}
composeOptions {
kotlinCompilerExtensionVersion = "1.5.1"
}
packaging {
resources {
excludes += "/META-INF/{AL2.0,LGPL2.1}"
}
}
externalNativeBuild {
cmake {
path = file("src/main/cpp/CMakeLists.txt")
version = "3.22.1"
}
}
}
dependencies {
implementation("androidx.core:core-ktx:1.12.0")
implementation("androidx.lifecycle:lifecycle-runtime-ktx:2.6.2")
implementation("androidx.activity:activity-compose:1.8.2")
implementation(platform("androidx.compose:compose-bom:2023.08.00"))
implementation("androidx.compose.ui:ui")
implementation("androidx.compose.ui:ui-graphics")
implementation("androidx.compose.ui:ui-tooling-preview")
implementation("androidx.compose.material3:material3")
testImplementation("junit:junit:4.13.2")
androidTestImplementation("androidx.test.ext:junit:1.1.5")
androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
androidTestImplementation(platform("androidx.compose:compose-bom:2023.08.00"))
androidTestImplementation("androidx.compose.ui:ui-test-junit4")
debugImplementation("androidx.compose.ui:ui-tooling")
debugImplementation("androidx.compose.ui:ui-test-manifest")
}

View File

@@ -1,21 +0,0 @@
# Add project specific ProGuard rules here.
# You can control the set of applied configuration files using the
# proguardFiles setting in build.gradle.
#
# For more details, see
# http://developer.android.com/guide/developing/tools/proguard.html
# If your project uses WebView with JS, uncomment the following
# and specify the fully qualified class name to the JavaScript interface
# class:
#-keepclassmembers class fqcn.of.javascript.interface.for.webview {
# public *;
#}
# Uncomment this to preserve the line number information for
# debugging stack traces.
#-keepattributes SourceFile,LineNumberTable
# If you keep the line number information, uncomment this to
# hide the original source file name.
#-renamesourcefileattribute SourceFile

View File

@@ -1,30 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<manifest xmlns:android="http://schemas.android.com/apk/res/android"
xmlns:tools="http://schemas.android.com/tools">
<uses-permission android:name="android.permission.INTERNET" />
<application
android:allowBackup="true"
android:dataExtractionRules="@xml/data_extraction_rules"
android:fullBackupContent="@xml/backup_rules"
android:icon="@mipmap/ic_launcher"
android:label="@string/app_name"
android:roundIcon="@mipmap/ic_launcher_round"
android:supportsRtl="true"
android:theme="@style/Theme.LlamaAndroid"
>
<activity
android:name=".MainActivity"
android:exported="true"
android:theme="@style/Theme.LlamaAndroid">
<intent-filter>
<action android:name="android.intent.action.MAIN" />
<category android:name="android.intent.category.LAUNCHER" />
</intent-filter>
</activity>
</application>
</manifest>

View File

@@ -1,50 +0,0 @@
# For more information about using CMake with Android Studio, read the
# documentation: https://d.android.com/studio/projects/add-native-code.html.
# For more examples on how to use CMake, see https://github.com/android/ndk-samples.
# Sets the minimum CMake version required for this project.
cmake_minimum_required(VERSION 3.22.1)
# Declares the project name. The project name can be accessed via ${ PROJECT_NAME},
# Since this is the top level CMakeLists.txt, the project name is also accessible
# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level
# build script scope).
project("llama-android")
include(FetchContent)
FetchContent_Declare(
llama
GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
GIT_TAG master
)
# Also provides "common"
FetchContent_MakeAvailable(llama)
# Creates and names a library, sets it as either STATIC
# or SHARED, and provides the relative paths to its source code.
# You can define multiple libraries, and CMake builds them for you.
# Gradle automatically packages shared libraries with your APK.
#
# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define
# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
# is preferred for the same purpose.
#
# In order to load a library into your app from Java/Kotlin, you must call
# System.loadLibrary() and pass the name of the library defined here;
# for GameActivity/NativeActivity derived applications, the same library name must be
# used in the AndroidManifest.xml file.
add_library(${CMAKE_PROJECT_NAME} SHARED
# List C/C++ source files with relative paths to this CMakeLists.txt.
llama-android.cpp)
# Specifies libraries CMake should link to your target library. You
# can link libraries from various origins, such as libraries defined in this
# build script, prebuilt third-party libraries, or Android system libraries.
target_link_libraries(${CMAKE_PROJECT_NAME}
# List libraries link to the target library
llama
common
android
log)

View File

@@ -1,394 +0,0 @@
#include <android/log.h>
#include <jni.h>
#include <iomanip>
#include <math.h>
#include <string>
#include <unistd.h>
#include "llama.h"
#include "common/common.h"
// Write C++ code here.
//
// Do not forget to dynamically load the C++ library into your application.
//
// For instance,
//
// In MainActivity.java:
// static {
// System.loadLibrary("llama-android");
// }
//
// Or, in MainActivity.kt:
// companion object {
// init {
// System.loadLibrary("llama-android")
// }
// }
#define TAG "llama-android.cpp"
#define LOGi(...) __android_log_print(ANDROID_LOG_INFO, TAG, __VA_ARGS__)
#define LOGe(...) __android_log_print(ANDROID_LOG_ERROR, TAG, __VA_ARGS__)
jclass la_int_var;
jmethodID la_int_var_value;
jmethodID la_int_var_inc;
static void log_callback(ggml_log_level level, const char * fmt, void * data) {
if (level == GGML_LOG_LEVEL_ERROR) __android_log_print(ANDROID_LOG_ERROR, TAG, fmt, data);
else if (level == GGML_LOG_LEVEL_INFO) __android_log_print(ANDROID_LOG_INFO, TAG, fmt, data);
else if (level == GGML_LOG_LEVEL_WARN) __android_log_print(ANDROID_LOG_WARN, TAG, fmt, data);
else __android_log_print(ANDROID_LOG_DEFAULT, TAG, fmt, data);
}
extern "C"
JNIEXPORT jlong JNICALL
Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) {
llama_model_params model_params = llama_model_default_params();
auto path_to_model = env->GetStringUTFChars(filename, 0);
LOGi("Loading model from %s", path_to_model);
auto model = llama_load_model_from_file(path_to_model, model_params);
env->ReleaseStringUTFChars(filename, path_to_model);
if (!model) {
LOGe("load_model() failed");
env->ThrowNew(env->FindClass("java/lang/IllegalStateException"), "load_model() failed");
return 0;
}
return reinterpret_cast<jlong>(model);
}
extern "C"
JNIEXPORT void JNICALL
Java_com_example_llama_Llm_free_1model(JNIEnv *, jobject, jlong model) {
llama_free_model(reinterpret_cast<llama_model *>(model));
}
extern "C"
JNIEXPORT jlong JNICALL
Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) {
auto model = reinterpret_cast<llama_model *>(jmodel);
if (!model) {
LOGe("new_context(): model cannot be null");
env->ThrowNew(env->FindClass("java/lang/IllegalArgumentException"), "Model cannot be null");
return 0;
}
int n_threads = std::max(1, std::min(8, (int) sysconf(_SC_NPROCESSORS_ONLN) - 2));
LOGi("Using %d threads", n_threads);
llama_context_params ctx_params = llama_context_default_params();
ctx_params.seed = 1234;
ctx_params.n_ctx = 2048;
ctx_params.n_threads = n_threads;
ctx_params.n_threads_batch = n_threads;
llama_context * context = llama_new_context_with_model(model, ctx_params);
if (!context) {
LOGe("llama_new_context_with_model() returned null)");
env->ThrowNew(env->FindClass("java/lang/IllegalStateException"),
"llama_new_context_with_model() returned null)");
return 0;
}
return reinterpret_cast<jlong>(context);
}
extern "C"
JNIEXPORT void JNICALL
Java_com_example_llama_Llm_free_1context(JNIEnv *, jobject, jlong context) {
llama_free(reinterpret_cast<llama_context *>(context));
}
extern "C"
JNIEXPORT void JNICALL
Java_com_example_llama_Llm_backend_1free(JNIEnv *, jobject) {
llama_backend_free();
}
extern "C"
JNIEXPORT void JNICALL
Java_com_example_llama_Llm_log_1to_1android(JNIEnv *, jobject) {
llama_log_set(log_callback, NULL);
}
extern "C"
JNIEXPORT jstring JNICALL
Java_com_example_llama_Llm_bench_1model(
JNIEnv *env,
jobject,
jlong context_pointer,
jlong model_pointer,
jlong batch_pointer,
jint pp,
jint tg,
jint pl,
jint nr
) {
auto pp_avg = 0.0;
auto tg_avg = 0.0;
auto pp_std = 0.0;
auto tg_std = 0.0;
const auto context = reinterpret_cast<llama_context *>(context_pointer);
const auto model = reinterpret_cast<llama_model *>(model_pointer);
const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
const int n_ctx = llama_n_ctx(context);
LOGi("n_ctx = %d", n_ctx);
int i, j;
int nri;
for (nri = 0; nri < nr; nri++) {
LOGi("Benchmark prompt processing (pp)");
llama_batch_clear(*batch);
const int n_tokens = pp;
for (i = 0; i < n_tokens; i++) {
llama_batch_add(*batch, 0, i, { 0 }, false);
}
batch->logits[batch->n_tokens - 1] = true;
llama_kv_cache_clear(context);
const auto t_pp_start = ggml_time_us();
if (llama_decode(context, *batch) != 0) {
LOGi("llama_decode() failed during prompt processing");
}
const auto t_pp_end = ggml_time_us();
// bench text generation
LOGi("Benchmark text generation (tg)");
llama_kv_cache_clear(context);
const auto t_tg_start = ggml_time_us();
for (i = 0; i < tg; i++) {
llama_batch_clear(*batch);
for (j = 0; j < pl; j++) {
llama_batch_add(*batch, 0, i, { j }, true);
}
LOGi("llama_decode() text generation: %d", i);
if (llama_decode(context, *batch) != 0) {
LOGi("llama_decode() failed during text generation");
}
}
const auto t_tg_end = ggml_time_us();
llama_kv_cache_clear(context);
const auto t_pp = double(t_pp_end - t_pp_start) / 1000000.0;
const auto t_tg = double(t_tg_end - t_tg_start) / 1000000.0;
const auto speed_pp = double(pp) / t_pp;
const auto speed_tg = double(pl * tg) / t_tg;
pp_avg += speed_pp;
tg_avg += speed_tg;
pp_std += speed_pp * speed_pp;
tg_std += speed_tg * speed_tg;
LOGi("pp %f t/s, tg %f t/s", speed_pp, speed_tg);
}
pp_avg /= double(nr);
tg_avg /= double(nr);
if (nr > 1) {
pp_std = sqrt(pp_std / double(nr - 1) - pp_avg * pp_avg * double(nr) / double(nr - 1));
tg_std = sqrt(tg_std / double(nr - 1) - tg_avg * tg_avg * double(nr) / double(nr - 1));
} else {
pp_std = 0;
tg_std = 0;
}
char model_desc[128];
llama_model_desc(model, model_desc, sizeof(model_desc));
const auto model_size = double(llama_model_size(model)) / 1024.0 / 1024.0 / 1024.0;
const auto model_n_params = double(llama_model_n_params(model)) / 1e9;
const auto backend = "(Android)"; // TODO: What should this be?
std::stringstream result;
result << std::setprecision(2);
result << "| model | size | params | backend | test | t/s |\n";
result << "| --- | --- | --- | --- | --- | --- |\n";
result << "| " << model_desc << " | " << model_size << "GiB | " << model_n_params << "B | " << backend << " | pp " << pp << " | " << pp_avg << " ± " << pp_std << " |\n";
result << "| " << model_desc << " | " << model_size << "GiB | " << model_n_params << "B | " << backend << " | tg " << tg << " | " << tg_avg << " ± " << tg_std << " |\n";
return env->NewStringUTF(result.str().c_str());
}
extern "C"
JNIEXPORT void JNICALL
Java_com_example_llama_Llm_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
}
extern "C"
JNIEXPORT jlong JNICALL
Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
// Source: Copy of llama.cpp:llama_batch_init but heap-allocated.
llama_batch *batch = new llama_batch {
0,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
0,
0,
0,
};
if (embd) {
batch->embd = (float *) malloc(sizeof(float) * n_tokens * embd);
} else {
batch->token = (llama_token *) malloc(sizeof(llama_token) * n_tokens);
}
batch->pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens);
batch->n_seq_id = (int32_t *) malloc(sizeof(int32_t) * n_tokens);
batch->seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * n_tokens);
for (int i = 0; i < n_tokens; ++i) {
batch->seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max);
}
batch->logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens);
return reinterpret_cast<jlong>(batch);
}
extern "C"
JNIEXPORT void JNICALL
Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject, jboolean numa) {
llama_backend_init(numa);
}
extern "C"
JNIEXPORT jstring JNICALL
Java_com_example_llama_Llm_system_1info(JNIEnv *env, jobject) {
return env->NewStringUTF(llama_print_system_info());
}
extern "C"
JNIEXPORT jint JNICALL
Java_com_example_llama_Llm_completion_1init(
JNIEnv *env,
jobject,
jlong context_pointer,
jlong batch_pointer,
jstring jtext,
jint n_len
) {
const auto text = env->GetStringUTFChars(jtext, 0);
const auto context = reinterpret_cast<llama_context *>(context_pointer);
const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
const auto tokens_list = llama_tokenize(context, text, 1);
auto n_ctx = llama_n_ctx(context);
auto n_kv_req = tokens_list.size() + (n_len - tokens_list.size());
LOGi("n_len = %d, n_ctx = %d, n_kv_req = %d", n_len, n_ctx, n_kv_req);
if (n_kv_req > n_ctx) {
LOGe("error: n_kv_req > n_ctx, the required KV cache size is not big enough");
}
for (auto id : tokens_list) {
LOGi("%s", llama_token_to_piece(context, id).c_str());
}
llama_batch_clear(*batch);
// evaluate the initial prompt
for (auto i = 0; i < tokens_list.size(); i++) {
llama_batch_add(*batch, tokens_list[i], i, { 0 }, false);
}
// llama_decode will output logits only for the last token of the prompt
batch->logits[batch->n_tokens - 1] = true;
if (llama_decode(context, *batch) != 0) {
LOGe("llama_decode() failed");
}
env->ReleaseStringUTFChars(jtext, text);
return batch->n_tokens;
}
extern "C"
JNIEXPORT jstring JNICALL
Java_com_example_llama_Llm_completion_1loop(
JNIEnv * env,
jobject,
jlong context_pointer,
jlong batch_pointer,
jint n_len,
jobject intvar_ncur
) {
const auto context = reinterpret_cast<llama_context *>(context_pointer);
const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
const auto model = llama_get_model(context);
if (!la_int_var) la_int_var = env->GetObjectClass(intvar_ncur);
if (!la_int_var_value) la_int_var_value = env->GetMethodID(la_int_var, "getValue", "()I");
if (!la_int_var_inc) la_int_var_inc = env->GetMethodID(la_int_var, "inc", "()V");
auto n_vocab = llama_n_vocab(model);
auto logits = llama_get_logits_ith(context, batch->n_tokens - 1);
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
// sample the most likely token
const auto new_token_id = llama_sample_token_greedy(context, &candidates_p);
const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
return env->NewStringUTF("");
}
auto new_token_chars = llama_token_to_piece(context, new_token_id);
LOGi("new_token_chars: `%s`", new_token_chars.c_str());
auto new_token = env->NewStringUTF(new_token_chars.c_str());
llama_batch_clear(*batch);
llama_batch_add(*batch, new_token_id, n_cur, { 0 }, true);
env->CallVoidMethod(intvar_ncur, la_int_var_inc);
if (llama_decode(context, *batch) != 0) {
LOGe("llama_decode() returned null");
}
return new_token;
}
extern "C"
JNIEXPORT void JNICALL
Java_com_example_llama_Llm_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
llama_kv_cache_clear(reinterpret_cast<llama_context *>(context));
}

View File

@@ -1,119 +0,0 @@
package com.example.llama
import android.app.DownloadManager
import android.net.Uri
import android.util.Log
import androidx.compose.material3.Button
import androidx.compose.material3.Text
import androidx.compose.runtime.Composable
import androidx.compose.runtime.getValue
import androidx.compose.runtime.mutableDoubleStateOf
import androidx.compose.runtime.mutableStateOf
import androidx.compose.runtime.remember
import androidx.compose.runtime.rememberCoroutineScope
import androidx.compose.runtime.setValue
import androidx.core.database.getLongOrNull
import androidx.core.net.toUri
import kotlinx.coroutines.delay
import kotlinx.coroutines.launch
import java.io.File
data class Downloadable(val name: String, val source: Uri, val destination: File) {
companion object {
@JvmStatic
private val tag: String? = this::class.qualifiedName
sealed interface State
data object Ready: State
data class Downloading(val id: Long): State
data class Downloaded(val downloadable: Downloadable): State
data class Error(val message: String): State
@JvmStatic
@Composable
fun Button(viewModel: MainViewModel, dm: DownloadManager, item: Downloadable) {
var status: State by remember {
mutableStateOf(
if (item.destination.exists()) Downloaded(item)
else Ready
)
}
var progress by remember { mutableDoubleStateOf(0.0) }
val coroutineScope = rememberCoroutineScope()
suspend fun waitForDownload(result: Downloading, item: Downloadable): State {
while (true) {
val cursor = dm.query(DownloadManager.Query().setFilterById(result.id))
if (cursor == null) {
Log.e(tag, "dm.query() returned null")
return Error("dm.query() returned null")
}
if (!cursor.moveToFirst() || cursor.count < 1) {
cursor.close()
Log.i(tag, "cursor.moveToFirst() returned false or cursor.count < 1, download canceled?")
return Ready
}
val pix = cursor.getColumnIndex(DownloadManager.COLUMN_BYTES_DOWNLOADED_SO_FAR)
val tix = cursor.getColumnIndex(DownloadManager.COLUMN_TOTAL_SIZE_BYTES)
val sofar = cursor.getLongOrNull(pix) ?: 0
val total = cursor.getLongOrNull(tix) ?: 1
cursor.close()
if (sofar == total) {
return Downloaded(item)
}
progress = (sofar * 1.0) / total
delay(1000L)
}
}
fun onClick() {
when (val s = status) {
is Downloaded -> {
viewModel.load(item.destination.path)
}
is Downloading -> {
coroutineScope.launch {
status = waitForDownload(s, item)
}
}
else -> {
item.destination.delete()
val request = DownloadManager.Request(item.source).apply {
setTitle("Downloading model")
setDescription("Downloading model: ${item.name}")
setAllowedNetworkTypes(DownloadManager.Request.NETWORK_WIFI)
setDestinationUri(item.destination.toUri())
}
viewModel.log("Saving ${item.name} to ${item.destination.path}")
Log.i(tag, "Saving ${item.name} to ${item.destination.path}")
val id = dm.enqueue(request)
status = Downloading(id)
onClick()
}
}
}
Button(onClick = { onClick() }, enabled = status !is Downloading) {
when (status) {
is Downloading -> Text(text = "Downloading ${(progress * 100).toInt()}%")
is Downloaded -> Text("Load ${item.name}")
is Ready -> Text("Download ${item.name}")
is Error -> Text("Download ${item.name}")
}
}
}
}
}

View File

@@ -1,172 +0,0 @@
package com.example.llama
import android.util.Log
import kotlinx.coroutines.CoroutineDispatcher
import kotlinx.coroutines.asCoroutineDispatcher
import kotlinx.coroutines.flow.Flow
import kotlinx.coroutines.flow.flow
import kotlinx.coroutines.flow.flowOn
import kotlinx.coroutines.withContext
import java.util.concurrent.Executors
import kotlin.concurrent.thread
class Llm {
private val tag: String? = this::class.simpleName
private val threadLocalState: ThreadLocal<State> = ThreadLocal.withInitial { State.Idle }
private val runLoop: CoroutineDispatcher = Executors.newSingleThreadExecutor {
thread(start = false, name = "Llm-RunLoop") {
Log.d(tag, "Dedicated thread for native code: ${Thread.currentThread().name}")
// No-op if called more than once.
System.loadLibrary("llama-android")
// Set llama log handler to Android
log_to_android()
backend_init(false)
Log.d(tag, system_info())
it.run()
}.apply {
uncaughtExceptionHandler = Thread.UncaughtExceptionHandler { _, exception: Throwable ->
Log.e(tag, "Unhandled exception", exception)
}
}
}.asCoroutineDispatcher()
private val nlen: Int = 64
private external fun log_to_android()
private external fun load_model(filename: String): Long
private external fun free_model(model: Long)
private external fun new_context(model: Long): Long
private external fun free_context(context: Long)
private external fun backend_init(numa: Boolean)
private external fun backend_free()
private external fun free_batch(batch: Long)
private external fun new_batch(nTokens: Int, embd: Int, nSeqMax: Int): Long
private external fun bench_model(
context: Long,
model: Long,
batch: Long,
pp: Int,
tg: Int,
pl: Int,
nr: Int
): String
private external fun system_info(): String
private external fun completion_init(
context: Long,
batch: Long,
text: String,
nLen: Int
): Int
private external fun completion_loop(
context: Long,
batch: Long,
nLen: Int,
ncur: IntVar
): String
private external fun kv_cache_clear(context: Long)
suspend fun bench(pp: Int, tg: Int, pl: Int, nr: Int = 1): String {
return withContext(runLoop) {
when (val state = threadLocalState.get()) {
is State.Loaded -> {
Log.d(tag, "bench(): $state")
bench_model(state.context, state.model, state.batch, pp, tg, pl, nr)
}
else -> throw IllegalStateException("No model loaded")
}
}
}
suspend fun load(pathToModel: String) {
withContext(runLoop) {
when (threadLocalState.get()) {
is State.Idle -> {
val model = load_model(pathToModel)
if (model == 0L) throw IllegalStateException("load_model() failed")
val context = new_context(model)
if (context == 0L) throw IllegalStateException("new_context() failed")
val batch = new_batch(512, 0, 1)
if (batch == 0L) throw IllegalStateException("new_batch() failed")
Log.i(tag, "Loaded model $pathToModel")
threadLocalState.set(State.Loaded(model, context, batch))
}
else -> throw IllegalStateException("Model already loaded")
}
}
}
fun send(message: String): Flow<String> = flow {
when (val state = threadLocalState.get()) {
is State.Loaded -> {
val ncur = IntVar(completion_init(state.context, state.batch, message, nlen))
while (ncur.value <= nlen) {
val str = completion_loop(state.context, state.batch, nlen, ncur)
if (str.isEmpty()) {
break
}
emit(str)
}
kv_cache_clear(state.context)
}
else -> {}
}
}.flowOn(runLoop)
/**
* Unloads the model and frees resources.
*
* This is a no-op if there's no model loaded.
*/
suspend fun unload() {
withContext(runLoop) {
when (val state = threadLocalState.get()) {
is State.Loaded -> {
free_context(state.context)
free_model(state.model)
free_batch(state.batch)
threadLocalState.set(State.Idle)
}
else -> {}
}
}
}
companion object {
private class IntVar(value: Int) {
@Volatile
var value: Int = value
private set
fun inc() {
synchronized(this) {
value += 1
}
}
}
private sealed interface State {
data object Idle: State
data class Loaded(val model: Long, val context: Long, val batch: Long): State
}
// Enforce only one instance of Llm.
private val _instance: Llm = Llm()
fun instance(): Llm = _instance
}
}

View File

@@ -1,154 +0,0 @@
package com.example.llama
import android.app.ActivityManager
import android.app.DownloadManager
import android.content.ClipData
import android.content.ClipboardManager
import android.net.Uri
import android.os.Bundle
import android.os.StrictMode
import android.os.StrictMode.VmPolicy
import android.text.format.Formatter
import androidx.activity.ComponentActivity
import androidx.activity.compose.setContent
import androidx.activity.viewModels
import androidx.compose.foundation.layout.Box
import androidx.compose.foundation.layout.Column
import androidx.compose.foundation.layout.Row
import androidx.compose.foundation.layout.fillMaxSize
import androidx.compose.foundation.layout.padding
import androidx.compose.foundation.lazy.LazyColumn
import androidx.compose.foundation.lazy.items
import androidx.compose.foundation.lazy.rememberLazyListState
import androidx.compose.material3.Button
import androidx.compose.material3.LocalContentColor
import androidx.compose.material3.MaterialTheme
import androidx.compose.material3.OutlinedTextField
import androidx.compose.material3.Surface
import androidx.compose.material3.Text
import androidx.compose.runtime.Composable
import androidx.compose.ui.Modifier
import androidx.compose.ui.unit.dp
import androidx.core.content.getSystemService
import com.example.llama.ui.theme.LlamaAndroidTheme
import java.io.File
class MainActivity(
activityManager: ActivityManager? = null,
downloadManager: DownloadManager? = null,
clipboardManager: ClipboardManager? = null,
): ComponentActivity() {
private val tag: String? = this::class.simpleName
private val activityManager by lazy { activityManager ?: getSystemService<ActivityManager>()!! }
private val downloadManager by lazy { downloadManager ?: getSystemService<DownloadManager>()!! }
private val clipboardManager by lazy { clipboardManager ?: getSystemService<ClipboardManager>()!! }
private val viewModel: MainViewModel by viewModels()
// Get a MemoryInfo object for the device's current memory status.
private fun availableMemory(): ActivityManager.MemoryInfo {
return ActivityManager.MemoryInfo().also { memoryInfo ->
activityManager.getMemoryInfo(memoryInfo)
}
}
override fun onCreate(savedInstanceState: Bundle?) {
super.onCreate(savedInstanceState)
StrictMode.setVmPolicy(
VmPolicy.Builder(StrictMode.getVmPolicy())
.detectLeakedClosableObjects()
.build()
)
val free = Formatter.formatFileSize(this, availableMemory().availMem)
val total = Formatter.formatFileSize(this, availableMemory().totalMem)
viewModel.log("Current memory: $free / $total")
viewModel.log("Downloads directory: ${getExternalFilesDir(null)}")
val extFilesDir = getExternalFilesDir(null)
val models = listOf(
Downloadable(
"Phi-2 7B (Q4_0, 1.6 GiB)",
Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true"),
File(extFilesDir, "phi-2-q4_0.gguf"),
),
Downloadable(
"TinyLlama 1.1B (f16, 2.2 GiB)",
Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true"),
File(extFilesDir, "tinyllama-1.1-f16.gguf"),
),
Downloadable(
"Phi 2 DPO (Q3_K_M, 1.48 GiB)",
Uri.parse("https://huggingface.co/TheBloke/phi-2-dpo-GGUF/resolve/main/phi-2-dpo.Q3_K_M.gguf?download=true"),
File(extFilesDir, "phi-2-dpo.Q3_K_M.gguf")
),
)
setContent {
LlamaAndroidTheme {
// A surface container using the 'background' color from the theme
Surface(
modifier = Modifier.fillMaxSize(),
color = MaterialTheme.colorScheme.background
) {
MainCompose(
viewModel,
clipboardManager,
downloadManager,
models,
)
}
}
}
}
}
@Composable
fun MainCompose(
viewModel: MainViewModel,
clipboard: ClipboardManager,
dm: DownloadManager,
models: List<Downloadable>
) {
Column {
val scrollState = rememberLazyListState()
Box(modifier = Modifier.weight(1f)) {
LazyColumn(state = scrollState) {
items(viewModel.messages) {
Text(
it,
style = MaterialTheme.typography.bodyLarge.copy(color = LocalContentColor.current),
modifier = Modifier.padding(16.dp)
)
}
}
}
OutlinedTextField(
value = viewModel.message,
onValueChange = { viewModel.updateMessage(it) },
label = { Text("Message") },
)
Row {
Button({ viewModel.send() }) { Text("Send") }
Button({ viewModel.bench(8, 4, 1) }) { Text("Bench") }
Button({ viewModel.clear() }) { Text("Clear") }
Button({
viewModel.messages.joinToString("\n").let {
clipboard.setPrimaryClip(ClipData.newPlainText("", it))
}
}) { Text("Copy") }
}
Column {
for (model in models) {
Downloadable.Button(viewModel, dm, model)
}
}
}
}

View File

@@ -1,104 +0,0 @@
package com.example.llama
import android.util.Log
import androidx.compose.runtime.getValue
import androidx.compose.runtime.mutableStateOf
import androidx.compose.runtime.setValue
import androidx.lifecycle.ViewModel
import androidx.lifecycle.viewModelScope
import kotlinx.coroutines.flow.catch
import kotlinx.coroutines.launch
class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
companion object {
@JvmStatic
private val NanosPerSecond = 1_000_000_000.0
}
private val tag: String? = this::class.simpleName
var messages by mutableStateOf(listOf("Initializing..."))
private set
var message by mutableStateOf("")
private set
override fun onCleared() {
super.onCleared()
viewModelScope.launch {
try {
llm.unload()
} catch (exc: IllegalStateException) {
messages += exc.message!!
}
}
}
fun send() {
val text = message
message = ""
// Add to messages console.
messages += text
messages += ""
viewModelScope.launch {
llm.send(text)
.catch {
Log.e(tag, "send() failed", it)
messages += it.message!!
}
.collect { messages = messages.dropLast(1) + (messages.last() + it) }
}
}
fun bench(pp: Int, tg: Int, pl: Int, nr: Int = 1) {
viewModelScope.launch {
try {
val start = System.nanoTime()
val warmupResult = llm.bench(pp, tg, pl, nr)
val end = System.nanoTime()
messages += warmupResult
val warmup = (end - start).toDouble() / NanosPerSecond
messages += "Warm up time: $warmup seconds, please wait..."
if (warmup > 5.0) {
messages += "Warm up took too long, aborting benchmark"
return@launch
}
messages += llm.bench(512, 128, 1, 3)
} catch (exc: IllegalStateException) {
Log.e(tag, "bench() failed", exc)
messages += exc.message!!
}
}
}
fun load(pathToModel: String) {
viewModelScope.launch {
try {
llm.load(pathToModel)
messages += "Loaded $pathToModel"
} catch (exc: IllegalStateException) {
Log.e(tag, "load() failed", exc)
messages += exc.message!!
}
}
}
fun updateMessage(newMessage: String) {
message = newMessage
}
fun clear() {
messages = listOf()
}
fun log(message: String) {
messages += message
}
}

View File

@@ -1,11 +0,0 @@
package com.example.llama.ui.theme
import androidx.compose.ui.graphics.Color
val Purple80 = Color(0xFFD0BCFF)
val PurpleGrey80 = Color(0xFFCCC2DC)
val Pink80 = Color(0xFFEFB8C8)
val Purple40 = Color(0xFF6650a4)
val PurpleGrey40 = Color(0xFF625b71)
val Pink40 = Color(0xFF7D5260)

View File

@@ -1,70 +0,0 @@
package com.example.llama.ui.theme
import android.app.Activity
import android.os.Build
import androidx.compose.foundation.isSystemInDarkTheme
import androidx.compose.material3.MaterialTheme
import androidx.compose.material3.darkColorScheme
import androidx.compose.material3.dynamicDarkColorScheme
import androidx.compose.material3.dynamicLightColorScheme
import androidx.compose.material3.lightColorScheme
import androidx.compose.runtime.Composable
import androidx.compose.runtime.SideEffect
import androidx.compose.ui.graphics.toArgb
import androidx.compose.ui.platform.LocalContext
import androidx.compose.ui.platform.LocalView
import androidx.core.view.WindowCompat
private val DarkColorScheme = darkColorScheme(
primary = Purple80,
secondary = PurpleGrey80,
tertiary = Pink80
)
private val LightColorScheme = lightColorScheme(
primary = Purple40,
secondary = PurpleGrey40,
tertiary = Pink40
/* Other default colors to override
background = Color(0xFFFFFBFE),
surface = Color(0xFFFFFBFE),
onPrimary = Color.White,
onSecondary = Color.White,
onTertiary = Color.White,
onBackground = Color(0xFF1C1B1F),
onSurface = Color(0xFF1C1B1F),
*/
)
@Composable
fun LlamaAndroidTheme(
darkTheme: Boolean = isSystemInDarkTheme(),
// Dynamic color is available on Android 12+
dynamicColor: Boolean = true,
content: @Composable () -> Unit
) {
val colorScheme = when {
dynamicColor && Build.VERSION.SDK_INT >= Build.VERSION_CODES.S -> {
val context = LocalContext.current
if (darkTheme) dynamicDarkColorScheme(context) else dynamicLightColorScheme(context)
}
darkTheme -> DarkColorScheme
else -> LightColorScheme
}
val view = LocalView.current
if (!view.isInEditMode) {
SideEffect {
val window = (view.context as Activity).window
window.statusBarColor = colorScheme.primary.toArgb()
WindowCompat.getInsetsController(window, view).isAppearanceLightStatusBars = darkTheme
}
}
MaterialTheme(
colorScheme = colorScheme,
typography = Typography,
content = content
)
}

View File

@@ -1,34 +0,0 @@
package com.example.llama.ui.theme
import androidx.compose.material3.Typography
import androidx.compose.ui.text.TextStyle
import androidx.compose.ui.text.font.FontFamily
import androidx.compose.ui.text.font.FontWeight
import androidx.compose.ui.unit.sp
// Set of Material typography styles to start with
val Typography = Typography(
bodyLarge = TextStyle(
fontFamily = FontFamily.Default,
fontWeight = FontWeight.Normal,
fontSize = 16.sp,
lineHeight = 24.sp,
letterSpacing = 0.5.sp
)
/* Other default text styles to override
titleLarge = TextStyle(
fontFamily = FontFamily.Default,
fontWeight = FontWeight.Normal,
fontSize = 22.sp,
lineHeight = 28.sp,
letterSpacing = 0.sp
),
labelSmall = TextStyle(
fontFamily = FontFamily.Default,
fontWeight = FontWeight.Medium,
fontSize = 11.sp,
lineHeight = 16.sp,
letterSpacing = 0.5.sp
)
*/
)

View File

@@ -1,170 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<vector xmlns:android="http://schemas.android.com/apk/res/android"
android:width="108dp"
android:height="108dp"
android:viewportWidth="108"
android:viewportHeight="108">
<path
android:fillColor="#3DDC84"
android:pathData="M0,0h108v108h-108z" />
<path
android:fillColor="#00000000"
android:pathData="M9,0L9,108"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M19,0L19,108"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M29,0L29,108"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M39,0L39,108"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M49,0L49,108"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M59,0L59,108"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M69,0L69,108"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M79,0L79,108"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M89,0L89,108"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M99,0L99,108"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M0,9L108,9"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M0,19L108,19"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M0,29L108,29"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M0,39L108,39"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M0,49L108,49"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M0,59L108,59"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M0,69L108,69"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M0,79L108,79"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M0,89L108,89"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M0,99L108,99"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M19,29L89,29"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M19,39L89,39"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M19,49L89,49"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M19,59L89,59"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M19,69L89,69"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M19,79L89,79"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M29,19L29,89"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M39,19L39,89"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M49,19L49,89"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M59,19L59,89"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M69,19L69,89"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M79,19L79,89"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
</vector>

View File

@@ -1,30 +0,0 @@
<vector xmlns:android="http://schemas.android.com/apk/res/android"
xmlns:aapt="http://schemas.android.com/aapt"
android:width="108dp"
android:height="108dp"
android:viewportWidth="108"
android:viewportHeight="108">
<path android:pathData="M31,63.928c0,0 6.4,-11 12.1,-13.1c7.2,-2.6 26,-1.4 26,-1.4l38.1,38.1L107,108.928l-32,-1L31,63.928z">
<aapt:attr name="android:fillColor">
<gradient
android:endX="85.84757"
android:endY="92.4963"
android:startX="42.9492"
android:startY="49.59793"
android:type="linear">
<item
android:color="#44000000"
android:offset="0.0" />
<item
android:color="#00000000"
android:offset="1.0" />
</gradient>
</aapt:attr>
</path>
<path
android:fillColor="#FFFFFF"
android:fillType="nonZero"
android:pathData="M65.3,45.828l3.8,-6.6c0.2,-0.4 0.1,-0.9 -0.3,-1.1c-0.4,-0.2 -0.9,-0.1 -1.1,0.3l-3.9,6.7c-6.3,-2.8 -13.4,-2.8 -19.7,0l-3.9,-6.7c-0.2,-0.4 -0.7,-0.5 -1.1,-0.3C38.8,38.328 38.7,38.828 38.9,39.228l3.8,6.6C36.2,49.428 31.7,56.028 31,63.928h46C76.3,56.028 71.8,49.428 65.3,45.828zM43.4,57.328c-0.8,0 -1.5,-0.5 -1.8,-1.2c-0.3,-0.7 -0.1,-1.5 0.4,-2.1c0.5,-0.5 1.4,-0.7 2.1,-0.4c0.7,0.3 1.2,1 1.2,1.8C45.3,56.528 44.5,57.328 43.4,57.328L43.4,57.328zM64.6,57.328c-0.8,0 -1.5,-0.5 -1.8,-1.2s-0.1,-1.5 0.4,-2.1c0.5,-0.5 1.4,-0.7 2.1,-0.4c0.7,0.3 1.2,1 1.2,1.8C66.5,56.528 65.6,57.328 64.6,57.328L64.6,57.328z"
android:strokeWidth="1"
android:strokeColor="#00000000" />
</vector>

View File

@@ -1,6 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<adaptive-icon xmlns:android="http://schemas.android.com/apk/res/android">
<background android:drawable="@drawable/ic_launcher_background" />
<foreground android:drawable="@drawable/ic_launcher_foreground" />
<monochrome android:drawable="@drawable/ic_launcher_foreground" />
</adaptive-icon>

View File

@@ -1,6 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<adaptive-icon xmlns:android="http://schemas.android.com/apk/res/android">
<background android:drawable="@drawable/ic_launcher_background" />
<foreground android:drawable="@drawable/ic_launcher_foreground" />
<monochrome android:drawable="@drawable/ic_launcher_foreground" />
</adaptive-icon>

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.4 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.8 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 982 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.7 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.9 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.8 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.8 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 5.8 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.8 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 7.6 KiB

View File

@@ -1,10 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<resources>
<color name="purple_200">#FFBB86FC</color>
<color name="purple_500">#FF6200EE</color>
<color name="purple_700">#FF3700B3</color>
<color name="teal_200">#FF03DAC5</color>
<color name="teal_700">#FF018786</color>
<color name="black">#FF000000</color>
<color name="white">#FFFFFFFF</color>
</resources>

View File

@@ -1,3 +0,0 @@
<resources>
<string name="app_name">LlamaAndroid</string>
</resources>

View File

@@ -1,5 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<resources>
<style name="Theme.LlamaAndroid" parent="android:Theme.Material.Light.NoActionBar" />
</resources>

View File

@@ -1,13 +0,0 @@
<?xml version="1.0" encoding="utf-8"?><!--
Sample backup rules file; uncomment and customize as necessary.
See https://developer.android.com/guide/topics/data/autobackup
for details.
Note: This file is ignored for devices older that API 31
See https://developer.android.com/about/versions/12/backup-restore
-->
<full-backup-content>
<!--
<include domain="sharedpref" path="."/>
<exclude domain="sharedpref" path="device.xml"/>
-->
</full-backup-content>

View File

@@ -1,19 +0,0 @@
<?xml version="1.0" encoding="utf-8"?><!--
Sample data extraction rules file; uncomment and customize as necessary.
See https://developer.android.com/about/versions/12/backup-restore#xml-changes
for details.
-->
<data-extraction-rules>
<cloud-backup>
<!-- TODO: Use <include> and <exclude> to control what is backed up.
<include .../>
<exclude .../>
-->
</cloud-backup>
<!--
<device-transfer>
<include .../>
<exclude .../>
</device-transfer>
-->
</data-extraction-rules>

View File

@@ -1,5 +0,0 @@
// Top-level build file where you can add configuration options common to all sub-projects/modules.
plugins {
id("com.android.application") version "8.2.0" apply false
id("org.jetbrains.kotlin.android") version "1.9.0" apply false
}

View File

@@ -1,23 +0,0 @@
# Project-wide Gradle settings.
# IDE (e.g. Android Studio) users:
# Gradle settings configured through the IDE *will override*
# any settings specified in this file.
# For more details on how to configure your build environment visit
# http://www.gradle.org/docs/current/userguide/build_environment.html
# Specifies the JVM arguments used for the daemon process.
# The setting is particularly useful for tweaking memory settings.
org.gradle.jvmargs=-Xmx2048m -Dfile.encoding=UTF-8
# When configured, Gradle will run in incubating parallel mode.
# This option should only be used with decoupled projects. More details, visit
# http://www.gradle.org/docs/current/userguide/multi_project_builds.html#sec:decoupled_projects
# org.gradle.parallel=true
# AndroidX package structure to make it clearer which packages are bundled with the
# Android operating system, and which are packaged with your app's APK
# https://developer.android.com/topic/libraries/support-library/androidx-rn
android.useAndroidX=true
# Kotlin code style for this project: "official" or "obsolete":
kotlin.code.style=official
# Enables namespacing of each library's R class so that its R class includes only the
# resources declared in the library itself and none from the library's dependencies,
# thereby reducing the size of the R class for that library
android.nonTransitiveRClass=true

View File

@@ -1,6 +0,0 @@
#Thu Dec 21 14:31:09 AEDT 2023
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-8.2-bin.zip
zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists

View File

@@ -1,185 +0,0 @@
#!/usr/bin/env sh
#
# Copyright 2015 the original author or authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
##############################################################################
##
## Gradle start up script for UN*X
##
##############################################################################
# Attempt to set APP_HOME
# Resolve links: $0 may be a link
PRG="$0"
# Need this for relative symlinks.
while [ -h "$PRG" ] ; do
ls=`ls -ld "$PRG"`
link=`expr "$ls" : '.*-> \(.*\)$'`
if expr "$link" : '/.*' > /dev/null; then
PRG="$link"
else
PRG=`dirname "$PRG"`"/$link"
fi
done
SAVED="`pwd`"
cd "`dirname \"$PRG\"`/" >/dev/null
APP_HOME="`pwd -P`"
cd "$SAVED" >/dev/null
APP_NAME="Gradle"
APP_BASE_NAME=`basename "$0"`
# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
# Use the maximum available, or set MAX_FD != -1 to use that value.
MAX_FD="maximum"
warn () {
echo "$*"
}
die () {
echo
echo "$*"
echo
exit 1
}
# OS specific support (must be 'true' or 'false').
cygwin=false
msys=false
darwin=false
nonstop=false
case "`uname`" in
CYGWIN* )
cygwin=true
;;
Darwin* )
darwin=true
;;
MINGW* )
msys=true
;;
NONSTOP* )
nonstop=true
;;
esac
CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
# Determine the Java command to use to start the JVM.
if [ -n "$JAVA_HOME" ] ; then
if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
# IBM's JDK on AIX uses strange locations for the executables
JAVACMD="$JAVA_HOME/jre/sh/java"
else
JAVACMD="$JAVA_HOME/bin/java"
fi
if [ ! -x "$JAVACMD" ] ; then
die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
fi
else
JAVACMD="java"
which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
fi
# Increase the maximum file descriptors if we can.
if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
MAX_FD_LIMIT=`ulimit -H -n`
if [ $? -eq 0 ] ; then
if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
MAX_FD="$MAX_FD_LIMIT"
fi
ulimit -n $MAX_FD
if [ $? -ne 0 ] ; then
warn "Could not set maximum file descriptor limit: $MAX_FD"
fi
else
warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
fi
fi
# For Darwin, add options to specify how the application appears in the dock
if $darwin; then
GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
fi
# For Cygwin or MSYS, switch paths to Windows format before running java
if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then
APP_HOME=`cygpath --path --mixed "$APP_HOME"`
CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
JAVACMD=`cygpath --unix "$JAVACMD"`
# We build the pattern for arguments to be converted via cygpath
ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
SEP=""
for dir in $ROOTDIRSRAW ; do
ROOTDIRS="$ROOTDIRS$SEP$dir"
SEP="|"
done
OURCYGPATTERN="(^($ROOTDIRS))"
# Add a user-defined pattern to the cygpath arguments
if [ "$GRADLE_CYGPATTERN" != "" ] ; then
OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
fi
# Now convert the arguments - kludge to limit ourselves to /bin/sh
i=0
for arg in "$@" ; do
CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
else
eval `echo args$i`="\"$arg\""
fi
i=`expr $i + 1`
done
case $i in
0) set -- ;;
1) set -- "$args0" ;;
2) set -- "$args0" "$args1" ;;
3) set -- "$args0" "$args1" "$args2" ;;
4) set -- "$args0" "$args1" "$args2" "$args3" ;;
5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
esac
fi
# Escape application args
save () {
for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
echo " "
}
APP_ARGS=`save "$@"`
# Collect all arguments for the java command, following the shell quoting and substitution rules
eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
exec "$JAVACMD" "$@"

View File

@@ -1,17 +0,0 @@
pluginManagement {
repositories {
google()
mavenCentral()
gradlePluginPortal()
}
}
dependencyResolutionManagement {
repositoriesMode.set(RepositoriesMode.FAIL_ON_PROJECT_REPOS)
repositories {
google()
mavenCentral()
}
}
rootProject.name = "LlamaAndroid"
include(":app")

View File

@@ -1,12 +1,7 @@
# llama.cpp/examples/llama.swiftui
# llama.swiftui
Local inference of llama.cpp on an iPhone. This is a sample app that can be used as a starting
point for more advanced projects.
For usage instructions and performance stats, check the following discussion: https://github.com/ggerganov/llama.cpp/discussions/4508
![image](https://github.com/ggerganov/llama.cpp/assets/1991296/2b40284f-8421-47a2-b634-74eece09a299)
Video demonstration:
Local inference of llama.cpp on an iPhone.
So far I only tested with starcoder 1B model, but it can most likely handle 7B models as well.
https://github.com/bachittle/llama.cpp/assets/39804642/e290827a-4edb-4093-9642-2a5e399ec545

View File

@@ -1,5 +1,6 @@
import Foundation
import llama
// import llama
enum LlamaError: Error {
case couldNotInitializeContext
@@ -158,7 +159,7 @@ actor LlamaContext {
new_token_id = llama_sample_token_greedy(context, &candidates_p)
}
if new_token_id == llama_token_eos(model) || n_cur == n_len {
if new_token_id == llama_token_eos(context) || n_cur == n_len {
print("\n")
let new_token_str = String(cString: temporary_invalid_cchars + [0])
temporary_invalid_cchars.removeAll()

View File

@@ -0,0 +1,5 @@
//
// Use this file to import your target's public headers that you would like to expose to Swift.
//
#import "llama.h"

View File

@@ -7,34 +7,51 @@
objects = {
/* Begin PBXBuildFile section */
542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 542376072B0D9BFB008E6A1C /* ggml-quants.c */; settings = {COMPILER_FLAGS = "-O3"; }; };
5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 5423760A2B0D9C4B008E6A1C /* ggml-backend.c */; settings = {COMPILER_FLAGS = "-O3"; }; };
542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 549479C82AC9E10B00E0F78B /* ggml-metal.metal */; };
542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09B2AC8723900A8AEE9 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE -DGGML_USE_METAL -DGGML_USE_K_QUANTS -O3"; }; };
542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */; settings = {COMPILER_FLAGS = "-O3"; }; };
542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 542EA0A12AC8729100A8AEE9 /* llama.cpp */; settings = {COMPILER_FLAGS = "-DGGML_USE_K_QUANTS -DGGML_USE_METAL -O3"; }; };
549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; };
79E1D9CD2B4CD16E005F8E46 /* InputButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 79E1D9CC2B4CD16E005F8E46 /* InputButton.swift */; };
549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */ = {isa = PBXBuildFile; fileRef = 549479C52AC9E0F200E0F78B /* ggml-metal.m */; settings = {COMPILER_FLAGS = "-fno-objc-arc -DGGML_SWIFT -DGGML_USE_METAL -O3"; }; };
7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */; };
8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */; };
8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83782AC328BD0096AF73 /* ContentView.swift */; };
8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837A2AC328BE0096AF73 /* Assets.xcassets */; };
8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */; };
8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 8A39BE092AC7601000BFEB40 /* Accelerate.framework */; };
8A3F84242AC4C891005E2EE8 /* models in Resources */ = {isa = PBXBuildFile; fileRef = 8A3F84232AC4C891005E2EE8 /* models */; };
8A907F332AC7138A006146EA /* LibLlama.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A907F322AC7134E006146EA /* LibLlama.swift */; };
8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */; };
DF810E132B4A5BA200301144 /* llama in Frameworks */ = {isa = PBXBuildFile; productRef = DF810E122B4A5BA200301144 /* llama */; };
F1FE20E22B465ECA00B45541 /* LoadCustomButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */; };
/* End PBXBuildFile section */
/* Begin PBXFileReference section */
542376062B0D9BEA008E6A1C /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-quants.h"; path = "../../ggml-quants.h"; sourceTree = "<group>"; };
542376072B0D9BFB008E6A1C /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../ggml-quants.c"; sourceTree = "<group>"; };
542376092B0D9C40008E6A1C /* ggml-backend.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-backend.h"; path = "../../ggml-backend.h"; sourceTree = "<group>"; };
5423760A2B0D9C4B008E6A1C /* ggml-backend.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-backend.c"; path = "../../ggml-backend.c"; sourceTree = "<group>"; };
542EA09B2AC8723900A8AEE9 /* ggml.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = ggml.c; path = ../../ggml.c; sourceTree = "<group>"; };
542EA09C2AC8723900A8AEE9 /* ggml.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ggml.h; path = ../../ggml.h; sourceTree = "<group>"; };
542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-alloc.h"; path = "../../ggml-alloc.h"; sourceTree = "<group>"; };
542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-alloc.c"; path = "../../ggml-alloc.c"; sourceTree = "<group>"; };
542EA0A12AC8729100A8AEE9 /* llama.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = llama.cpp; path = ../../llama.cpp; sourceTree = "<group>"; };
542EA0A22AC8729100A8AEE9 /* llama.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = llama.h; path = ../../llama.h; sourceTree = "<group>"; };
549479C52AC9E0F200E0F78B /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "ggml-metal.m"; path = "../../ggml-metal.m"; sourceTree = "<group>"; };
549479C62AC9E0F200E0F78B /* ggml-metal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-metal.h"; path = "../../ggml-metal.h"; sourceTree = "<group>"; };
549479C82AC9E10B00E0F78B /* ggml-metal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; name = "ggml-metal.metal"; path = "../../ggml-metal.metal"; sourceTree = "<group>"; };
549479CA2AC9E16000E0F78B /* Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = System/Library/Frameworks/Metal.framework; sourceTree = SDKROOT; };
79E1D9CC2B4CD16E005F8E46 /* InputButton.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = InputButton.swift; sourceTree = "<group>"; };
7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = DownloadButton.swift; sourceTree = "<group>"; };
8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "bridging-header.h"; sourceTree = "<group>"; };
8A1C83732AC328BD0096AF73 /* llama.swiftui.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = llama.swiftui.app; sourceTree = BUILT_PRODUCTS_DIR; };
8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = llama_swiftuiApp.swift; sourceTree = "<group>"; };
8A1C83782AC328BD0096AF73 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; };
8A1C837A2AC328BE0096AF73 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = "Preview Assets.xcassets"; sourceTree = "<group>"; };
8A39BE092AC7601000BFEB40 /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = System/Library/Frameworks/Accelerate.framework; sourceTree = SDKROOT; };
8A3F84232AC4C891005E2EE8 /* models */ = {isa = PBXFileReference; lastKnownFileType = folder; name = models; path = llama.swiftui/Resources/models; sourceTree = "<group>"; };
8A907F322AC7134E006146EA /* LibLlama.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibLlama.swift; sourceTree = "<group>"; };
8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaState.swift; sourceTree = "<group>"; };
DF2D2FE72B4A59BE00FCB72D /* llama.cpp */ = {isa = PBXFileReference; lastKnownFileType = wrapper; name = llama.cpp; path = ../..; sourceTree = "<group>"; };
F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LoadCustomButton.swift; sourceTree = "<group>"; };
/* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */
@@ -42,7 +59,6 @@
isa = PBXFrameworksBuildPhase;
buildActionMask = 2147483647;
files = (
DF810E132B4A5BA200301144 /* llama in Frameworks */,
549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */,
8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */,
);
@@ -51,10 +67,30 @@
/* End PBXFrameworksBuildPhase section */
/* Begin PBXGroup section */
8A08D1F62AC7383900FE6CD4 /* llama.cpp */ = {
isa = PBXGroup;
children = (
5423760A2B0D9C4B008E6A1C /* ggml-backend.c */,
542376092B0D9C40008E6A1C /* ggml-backend.h */,
542376062B0D9BEA008E6A1C /* ggml-quants.h */,
542376072B0D9BFB008E6A1C /* ggml-quants.c */,
549479C82AC9E10B00E0F78B /* ggml-metal.metal */,
549479C62AC9E0F200E0F78B /* ggml-metal.h */,
549479C52AC9E0F200E0F78B /* ggml-metal.m */,
542EA09B2AC8723900A8AEE9 /* ggml.c */,
542EA09C2AC8723900A8AEE9 /* ggml.h */,
542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */,
542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */,
542EA0A12AC8729100A8AEE9 /* llama.cpp */,
542EA0A22AC8729100A8AEE9 /* llama.h */,
);
name = llama.cpp;
sourceTree = "<group>";
};
8A1C836A2AC328BD0096AF73 = {
isa = PBXGroup;
children = (
DF2D2FE72B4A59BE00FCB72D /* llama.cpp */,
8A08D1F62AC7383900FE6CD4 /* llama.cpp */,
8A907F312AC7134E006146EA /* llama.cpp.swift */,
8A3F84232AC4C891005E2EE8 /* models */,
8A1C83752AC328BD0096AF73 /* llama.swiftui */,
@@ -79,10 +115,19 @@
8A9F7C4A2AC332BF008AE1EA /* UI */,
8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */,
8A1C837A2AC328BE0096AF73 /* Assets.xcassets */,
8A1C837C2AC328BE0096AF73 /* Preview Content */,
);
path = llama.swiftui;
sourceTree = "<group>";
};
8A1C837C2AC328BE0096AF73 /* Preview Content */ = {
isa = PBXGroup;
children = (
8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */,
);
path = "Preview Content";
sourceTree = "<group>";
};
8A39BE082AC7601000BFEB40 /* Frameworks */ = {
isa = PBXGroup;
children = (
@@ -110,6 +155,7 @@
8A907F312AC7134E006146EA /* llama.cpp.swift */ = {
isa = PBXGroup;
children = (
8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */,
8A907F322AC7134E006146EA /* LibLlama.swift */,
);
path = llama.cpp.swift;
@@ -120,8 +166,6 @@
children = (
7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */,
8A1C83782AC328BD0096AF73 /* ContentView.swift */,
F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */,
79E1D9CC2B4CD16E005F8E46 /* InputButton.swift */,
);
path = UI;
sourceTree = "<group>";
@@ -151,7 +195,6 @@
);
name = llama.swiftui;
packageProductDependencies = (
DF810E122B4A5BA200301144 /* llama */,
);
productName = llama.swiftui;
productReference = 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */;
@@ -198,7 +241,9 @@
isa = PBXResourcesBuildPhase;
buildActionMask = 2147483647;
files = (
542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */,
8A3F84242AC4C891005E2EE8 /* models in Resources */,
8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */,
8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */,
);
runOnlyForDeploymentPostprocessing = 0;
@@ -210,13 +255,17 @@
isa = PBXSourcesBuildPhase;
buildActionMask = 2147483647;
files = (
F1FE20E22B465ECA00B45541 /* LoadCustomButton.swift in Sources */,
542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */,
549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */,
542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */,
8A907F332AC7138A006146EA /* LibLlama.swift in Sources */,
542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */,
8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */,
8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */,
8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */,
7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */,
79E1D9CD2B4CD16E005F8E46 /* InputButton.swift in Sources */,
542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */,
5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
@@ -346,10 +395,12 @@
isa = XCBuildConfiguration;
buildSettings = {
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
CLANG_ENABLE_MODULES = YES;
CODE_SIGN_STYLE = Automatic;
CURRENT_PROJECT_VERSION = 1;
DEVELOPMENT_TEAM = K5UQJPP73A;
DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\"";
DEVELOPMENT_TEAM = STLSG3FG8Q;
ENABLE_PREVIEWS = YES;
GENERATE_INFOPLIST_FILE = YES;
INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
@@ -365,12 +416,11 @@
MARKETING_VERSION = 1.0;
PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift";
PRODUCT_NAME = "$(TARGET_NAME)";
SUPPORTED_PLATFORMS = "iphoneos iphonesimulator xros xrsimulator";
SUPPORTS_XR_DESIGNED_FOR_IPHONE_IPAD = NO;
SWIFT_EMIT_LOC_STRINGS = YES;
SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h";
SWIFT_OPTIMIZATION_LEVEL = "-Onone";
SWIFT_VERSION = 5.0;
TARGETED_DEVICE_FAMILY = "1,2,7";
TARGETED_DEVICE_FAMILY = "1,2";
};
name = Debug;
};
@@ -378,10 +428,12 @@
isa = XCBuildConfiguration;
buildSettings = {
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
CLANG_ENABLE_MODULES = YES;
CODE_SIGN_STYLE = Automatic;
CURRENT_PROJECT_VERSION = 1;
DEVELOPMENT_TEAM = K5UQJPP73A;
DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\"";
DEVELOPMENT_TEAM = STLSG3FG8Q;
ENABLE_PREVIEWS = YES;
GENERATE_INFOPLIST_FILE = YES;
INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
@@ -397,11 +449,10 @@
MARKETING_VERSION = 1.0;
PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift";
PRODUCT_NAME = "$(TARGET_NAME)";
SUPPORTED_PLATFORMS = "iphoneos iphonesimulator xros xrsimulator";
SUPPORTS_XR_DESIGNED_FOR_IPHONE_IPAD = NO;
SWIFT_EMIT_LOC_STRINGS = YES;
SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h";
SWIFT_VERSION = 5.0;
TARGETED_DEVICE_FAMILY = "1,2,7";
TARGETED_DEVICE_FAMILY = "1,2";
};
name = Release;
};
@@ -427,13 +478,6 @@
defaultConfigurationName = Release;
};
/* End XCConfigurationList section */
/* Begin XCSwiftPackageProductDependency section */
DF810E122B4A5BA200301144 /* llama */ = {
isa = XCSwiftPackageProductDependency;
productName = llama;
};
/* End XCSwiftPackageProductDependency section */
};
rootObject = 8A1C836B2AC328BD0096AF73 /* Project object */;
}

View File

@@ -0,0 +1,11 @@
{
"colors" : [
{
"idiom" : "universal"
}
],
"info" : {
"author" : "xcode",
"version" : 1
}
}

View File

@@ -1,20 +1,9 @@
import Foundation
struct Model: Identifiable {
var id = UUID()
var name: String
var url: String
var filename: String
var status: String?
}
@MainActor
class LlamaState: ObservableObject {
@Published var messageLog = ""
@Published var cacheCleared = false
@Published var downloadedModels: [Model] = []
@Published var undownloadedModels: [Model] = []
let NS_PER_S = 1_000_000_000.0
private var llamaContext: LlamaContext?
private var defaultModelUrl: URL? {
@@ -23,130 +12,37 @@ class LlamaState: ObservableObject {
}
init() {
loadModelsFromDisk()
loadDefaultModels()
}
private func loadModelsFromDisk() {
do {
let documentsURL = getDocumentsDirectory()
let modelURLs = try FileManager.default.contentsOfDirectory(at: documentsURL, includingPropertiesForKeys: nil, options: [.skipsHiddenFiles, .skipsSubdirectoryDescendants])
for modelURL in modelURLs {
let modelName = modelURL.deletingPathExtension().lastPathComponent
downloadedModels.append(Model(name: modelName, url: "", filename: modelURL.lastPathComponent, status: "downloaded"))
}
} catch {
print("Error loading models from disk: \(error)")
}
}
private func loadDefaultModels() {
do {
try loadModel(modelUrl: defaultModelUrl)
} catch {
messageLog += "Error!\n"
}
for model in defaultModels {
let fileURL = getDocumentsDirectory().appendingPathComponent(model.filename)
if FileManager.default.fileExists(atPath: fileURL.path) {
} else {
var undownloadedModel = model
undownloadedModel.status = "download"
undownloadedModels.append(undownloadedModel)
}
}
}
func getDocumentsDirectory() -> URL {
let paths = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)
return paths[0]
}
private let defaultModels: [Model] = [
Model(name: "TinyLlama-1.1B (Q4_0, 0.6 GiB)",url: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true",filename: "tinyllama-1.1b-1t-openorca.Q4_0.gguf", status: "download"),
Model(
name: "TinyLlama-1.1B Chat (Q8_0, 1.1 GiB)",
url: "https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q8_0.gguf?download=true",
filename: "tinyllama-1.1b-chat-v1.0.Q8_0.gguf", status: "download"
),
Model(
name: "TinyLlama-1.1B (F16, 2.2 GiB)",
url: "https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true",
filename: "tinyllama-1.1b-f16.gguf", status: "download"
),
Model(
name: "Phi-2.7B (Q4_0, 1.6 GiB)",
url: "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true",
filename: "phi-2-q4_0.gguf", status: "download"
),
Model(
name: "Phi-2.7B (Q8_0, 2.8 GiB)",
url: "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q8_0.gguf?download=true",
filename: "phi-2-q8_0.gguf", status: "download"
),
Model(
name: "Mistral-7B-v0.1 (Q4_0, 3.8 GiB)",
url: "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF/resolve/main/mistral-7b-v0.1.Q4_0.gguf?download=true",
filename: "mistral-7b-v0.1.Q4_0.gguf", status: "download"
),
Model(
name: "OpenHermes-2.5-Mistral-7B (Q3_K_M, 3.52 GiB)",
url: "https://huggingface.co/TheBloke/OpenHermes-2.5-Mistral-7B-GGUF/resolve/main/openhermes-2.5-mistral-7b.Q3_K_M.gguf?download=true",
filename: "openhermes-2.5-mistral-7b.Q3_K_M.gguf", status: "download"
)
]
func loadModel(modelUrl: URL?) throws {
messageLog += "Loading model...\n"
if let modelUrl {
messageLog += "Loading model...\n"
llamaContext = try LlamaContext.create_context(path: modelUrl.path())
messageLog += "Loaded model \(modelUrl.lastPathComponent)\n"
// Assuming that the model is successfully loaded, update the downloaded models
updateDownloadedModels(modelName: modelUrl.lastPathComponent, status: "downloaded")
} else {
messageLog += "Load a model from the list below\n"
messageLog += "Could not locate model\n"
}
}
private func updateDownloadedModels(modelName: String, status: String) {
undownloadedModels.removeAll { $0.name == modelName }
}
func complete(text: String) async {
guard let llamaContext else {
return
}
let t_start = DispatchTime.now().uptimeNanoseconds
await llamaContext.completion_init(text: text)
let t_heat_end = DispatchTime.now().uptimeNanoseconds
let t_heat = Double(t_heat_end - t_start) / NS_PER_S
messageLog += "\(text)"
while await llamaContext.n_cur < llamaContext.n_len {
while await llamaContext.n_cur <= llamaContext.n_len {
let result = await llamaContext.completion_loop()
messageLog += "\(result)"
}
let t_end = DispatchTime.now().uptimeNanoseconds
let t_generation = Double(t_end - t_heat_end) / NS_PER_S
let tokens_per_second = Double(await llamaContext.n_len) / t_generation
await llamaContext.clear()
messageLog += """
\n
Done
Heat up took \(t_heat)s
Generated \(tokens_per_second) t/s\n
"""
messageLog += "\n\ndone\n"
}
func bench() async {
@@ -160,10 +56,10 @@ class LlamaState: ObservableObject {
messageLog += await llamaContext.model_info() + "\n"
let t_start = DispatchTime.now().uptimeNanoseconds
let _ = await llamaContext.bench(pp: 8, tg: 4, pl: 1) // heat up
await llamaContext.bench(pp: 8, tg: 4, pl: 1) // heat up
let t_end = DispatchTime.now().uptimeNanoseconds
let t_heat = Double(t_end - t_start) / NS_PER_S
let t_heat = Double(t_end - t_start) / 1_000_000_000.0
messageLog += "Heat up time: \(t_heat) seconds, please wait...\n"
// if more than 5 seconds, then we're probably running on a slow device

View File

@@ -0,0 +1,6 @@
{
"info" : {
"author" : "xcode",
"version" : 1
}
}

View File

@@ -2,57 +2,138 @@ import SwiftUI
struct ContentView: View {
@StateObject var llamaState = LlamaState()
@State private var multiLineText = ""
@State private var showingHelp = false // To track if Help Sheet should be shown
private static func cleanupModelCaches() {
// Delete all models (*.gguf)
let fileManager = FileManager.default
let documentsUrl = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0]
do {
let fileURLs = try fileManager.contentsOfDirectory(at: documentsUrl, includingPropertiesForKeys: nil)
for fileURL in fileURLs {
if fileURL.pathExtension == "gguf" {
try fileManager.removeItem(at: fileURL)
}
}
} catch {
print("Error while enumerating files \(documentsUrl.path): \(error.localizedDescription)")
}
}
var body: some View {
NavigationView {
VStack {
ScrollView(.vertical, showsIndicators: true) {
Text(llamaState.messageLog)
.font(.system(size: 12))
.frame(maxWidth: .infinity, alignment: .leading)
.padding()
.onTapGesture {
UIApplication.shared.sendAction(#selector(UIResponder.resignFirstResponder), to: nil, from: nil, for: nil)
}
}
TextEditor(text: $multiLineText)
.frame(height: 80)
.padding()
.border(Color.gray, width: 0.5)
HStack {
Button("Send") {
sendText()
}
Button("Bench") {
bench()
}
Button("Clear") {
clear()
}
Button("Copy") {
UIPasteboard.general.string = llamaState.messageLog
}
}
.buttonStyle(.bordered)
VStack {
ScrollView(.vertical, showsIndicators: true) {
Text(llamaState.messageLog)
.font(.system(size: 12))
.frame(maxWidth: .infinity, alignment: .leading)
.padding()
NavigationLink(destination: DrawerView(llamaState: llamaState)) {
Text("View Models")
.onTapGesture {
UIApplication.shared.sendAction(#selector(UIResponder.resignFirstResponder), to: nil, from: nil, for: nil)
}
.padding()
}
.padding()
.navigationBarTitle("Model Settings", displayMode: .inline)
TextEditor(text: $multiLineText)
.frame(height: 80)
.padding()
.border(Color.gray, width: 0.5)
HStack {
Button("Send") {
sendText()
}
.padding(8)
.background(Color.blue)
.foregroundColor(.white)
.cornerRadius(8)
Button("Bench") {
bench()
}
.padding(8)
.background(Color.blue)
.foregroundColor(.white)
.cornerRadius(8)
Button("Clear") {
clear()
}
.padding(8)
.background(Color.blue)
.foregroundColor(.white)
.cornerRadius(8)
Button("Copy") {
UIPasteboard.general.string = llamaState.messageLog
}
.padding(8)
.background(Color.blue)
.foregroundColor(.white)
.cornerRadius(8)
}
VStack {
DownloadButton(
llamaState: llamaState,
modelName: "TinyLlama-1.1B (Q4_0, 0.6 GiB)",
modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true",
filename: "tinyllama-1.1b-1t-openorca.Q4_0.gguf"
)
.font(.system(size: 12))
.padding(.top, 4)
.frame(maxWidth: .infinity, alignment: .leading)
DownloadButton(
llamaState: llamaState,
modelName: "TinyLlama-1.1B (Q8_0, 1.1 GiB)",
modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q8_0.gguf?download=true",
filename: "tinyllama-1.1b-1t-openorca.Q8_0.gguf"
)
.font(.system(size: 12))
DownloadButton(
llamaState: llamaState,
modelName: "TinyLlama-1.1B (F16, 2.2 GiB)",
modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true",
filename: "tinyllama-1.1b-f16.gguf"
)
.font(.system(size: 12))
.frame(maxWidth: .infinity, alignment: .leading)
DownloadButton(
llamaState: llamaState,
modelName: "Phi-2.7B (Q4_0, 1.6 GiB)",
modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true",
filename: "phi-2-q4_0.gguf"
)
.font(.system(size: 12))
DownloadButton(
llamaState: llamaState,
modelName: "Phi-2.7B (Q8_0, 2.8 GiB)",
modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q8_0.gguf?download=true",
filename: "phi-2-q8_0.gguf"
)
.font(.system(size: 12))
.frame(maxWidth: .infinity, alignment: .leading)
DownloadButton(
llamaState: llamaState,
modelName: "Mistral-7B-v0.1 (Q4_0, 3.8 GiB)",
modelUrl: "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF/resolve/main/mistral-7b-v0.1.Q4_0.gguf?download=true",
filename: "mistral-7b-v0.1.Q4_0.gguf"
)
.font(.system(size: 12))
Button("Clear downloaded models") {
ContentView.cleanupModelCaches()
llamaState.cacheCleared = true
}
.padding(8)
.font(.system(size: 12))
}
}
.padding()
}
func sendText() {
@@ -73,73 +154,8 @@ struct ContentView: View {
await llamaState.clear()
}
}
struct DrawerView: View {
@ObservedObject var llamaState: LlamaState
@State private var showingHelp = false
func delete(at offsets: IndexSet) {
offsets.forEach { offset in
let model = llamaState.downloadedModels[offset]
let fileURL = getDocumentsDirectory().appendingPathComponent(model.filename)
do {
try FileManager.default.removeItem(at: fileURL)
} catch {
print("Error deleting file: \(error)")
}
}
// Remove models from downloadedModels array
llamaState.downloadedModels.remove(atOffsets: offsets)
}
func getDocumentsDirectory() -> URL {
let paths = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)
return paths[0]
}
var body: some View {
List {
Section(header: Text("Download Models From Hugging Face")) {
HStack {
InputButton(llamaState: llamaState)
}
}
Section(header: Text("Downloaded Models")) {
ForEach(llamaState.downloadedModels) { model in
DownloadButton(llamaState: llamaState, modelName: model.name, modelUrl: model.url, filename: model.filename)
}
.onDelete(perform: delete)
}
Section(header: Text("Default Models")) {
ForEach(llamaState.undownloadedModels) { model in
DownloadButton(llamaState: llamaState, modelName: model.name, modelUrl: model.url, filename: model.filename)
}
}
}
.listStyle(GroupedListStyle())
.navigationBarTitle("Model Settings", displayMode: .inline).toolbar {
ToolbarItem(placement: .navigationBarTrailing) {
Button("Help") {
showingHelp = true
}
}
}.sheet(isPresented: $showingHelp) { // Sheet for help modal
VStack(alignment: .leading) {
VStack(alignment: .leading) {
Text("1. Make sure the model is in GGUF Format")
.padding()
Text("2. Copy the download link of the quantized model")
.padding()
}
Spacer()
}
}
}
}
}
struct ContentView_Previews: PreviewProvider {
static var previews: some View {
ContentView()
}
}
//#Preview {
// ContentView()
//}

View File

@@ -53,8 +53,6 @@ struct DownloadButton: View {
llamaState.cacheCleared = false
let model = Model(name: modelName, url: modelUrl, filename: filename, status: "downloaded")
llamaState.downloadedModels.append(model)
status = "downloaded"
}
} catch let err {
@@ -95,7 +93,7 @@ struct DownloadButton: View {
print("Error: \(err.localizedDescription)")
}
}) {
Text("Load \(modelName)")
Text("\(modelName) (Downloaded)")
}
} else {
Text("Unknown status")

Some files were not shown because too many files have changed in this diff Show More