Compare commits

..

4 Commits

Author SHA1 Message Date
Georgi Gerganov
608f449880 swift : fix build
ggml-ci
2024-02-23 19:02:09 +02:00
Georgi Gerganov
fff1e8a54a batched.swift : fix build
ggml-ci
2024-02-23 16:15:37 +02:00
Georgi Gerganov
8772658b11 ggml : add I32 <-> F32 conversion
ggml-ci
2024-02-23 14:25:05 +02:00
Georgi Gerganov
fc775366f1 llama : switch to floating-point token positions
ggml-ci
2024-02-23 12:34:16 +02:00
261 changed files with 74345 additions and 95087 deletions

View File

@@ -12,7 +12,6 @@ Checks: >
-readability-implicit-bool-conversion,
-readability-magic-numbers,
-readability-uppercase-literal-suffix,
-readability-simplify-boolean-expr,
clang-analyzer-*,
-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
performance-*,

View File

@@ -26,8 +26,8 @@ COPY . .
# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable CUDA
ENV LLAMA_CUDA=1
# Enable cuBLAS
ENV LLAMA_CUBLAS=1
RUN make

View File

@@ -12,7 +12,7 @@
# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
# It is up to the user to install the correct vendor-specific support.
Name: llama.cpp-cuda
Name: llama.cpp-cublas
Version: %( date "+%%Y%%m%%d" )
Release: 1%{?dist}
Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
@@ -32,16 +32,16 @@ CPU inference for Meta's Lllama2 models using default options.
%setup -n llama.cpp-master
%build
make -j LLAMA_CUDA=1
make -j LLAMA_CUBLAS=1
%install
mkdir -p %{buildroot}%{_bindir}/
cp -p main %{buildroot}%{_bindir}/llamacppcuda
cp -p server %{buildroot}%{_bindir}/llamacppcudaserver
cp -p simple %{buildroot}%{_bindir}/llamacppcudasimple
cp -p main %{buildroot}%{_bindir}/llamacppcublas
cp -p server %{buildroot}%{_bindir}/llamacppcublasserver
cp -p simple %{buildroot}%{_bindir}/llamacppcublassimple
mkdir -p %{buildroot}/usr/lib/systemd/system
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacuda.service
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacublas.service
[Unit]
Description=Llama.cpp server, CPU only (no GPU support in this build).
After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
@@ -49,7 +49,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t
[Service]
Type=simple
EnvironmentFile=/etc/sysconfig/llama
ExecStart=/usr/bin/llamacppcudaserver $LLAMA_ARGS
ExecStart=/usr/bin/llamacppcublasserver $LLAMA_ARGS
ExecReload=/bin/kill -s HUP $MAINPID
Restart=never
@@ -67,10 +67,10 @@ rm -rf %{buildroot}
rm -rf %{_builddir}/*
%files
%{_bindir}/llamacppcuda
%{_bindir}/llamacppcudaserver
%{_bindir}/llamacppcudasimple
/usr/lib/systemd/system/llamacuda.service
%{_bindir}/llamacppcublas
%{_bindir}/llamacppcublasserver
%{_bindir}/llamacppcublassimple
/usr/lib/systemd/system/llamacublas.service
%config /etc/sysconfig/llama
%pre

View File

@@ -20,8 +20,8 @@ COPY . .
# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable CUDA
ENV LLAMA_CUDA=1
# Enable cuBLAS
ENV LLAMA_CUBLAS=1
RUN make

View File

@@ -1,17 +1,15 @@
{
lib,
glibc,
config,
stdenv,
mkShell,
runCommand,
cmake,
ninja,
pkg-config,
git,
python3,
mpi,
blas,
openblas, # TODO: Use the generic `blas` so users could switch between alternative implementations
cudaPackages,
darwin,
rocmPackages,
@@ -24,7 +22,7 @@
useOpenCL
useRocm
useVulkan
] && blas.meta.available,
],
useCuda ? config.cudaSupport,
useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
useMpi ? false, # Increases the runtime closure size by ~700M
@@ -32,12 +30,6 @@
useRocm ? config.rocmSupport,
useVulkan ? false,
llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
# It's necessary to consistently use backendStdenv when building with CUDA support,
# otherwise we get libstdc++ errors downstream.
effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
enableStatic ? effectiveStdenv.hostPlatform.isStatic,
precompileMetalShaders ? false
}@inputs:
let
@@ -49,7 +41,10 @@ let
versionOlder
;
# It's necessary to consistently use backendStdenv when building with CUDA support,
# otherwise we get libstdc++ errors downstream.
stdenv = throw "Use effectiveStdenv instead";
effectiveStdenv = if useCuda then cudaPackages.backendStdenv else inputs.stdenv;
suffices =
lib.optionals useBlas [ "BLAS" ]
@@ -67,15 +62,10 @@ let
strings.optionalString (suffices != [ ])
", accelerated with ${strings.concatStringsSep ", " suffices}";
executableSuffix = effectiveStdenv.hostPlatform.extensions.executable;
# TODO: package the Python in this repository in a Nix-like way.
# It'd be nice to migrate to buildPythonPackage, as well as ensure this repo
# is PEP 517-compatible, and ensure the correct .dist-info is generated.
# https://peps.python.org/pep-0517/
#
# TODO: Package up each Python script or service appropriately, by making
# them into "entrypoints"
llama-python = python3.withPackages (
ps: [
ps.numpy
@@ -94,11 +84,6 @@ let
]
);
xcrunHost = runCommand "xcrunHost" {} ''
mkdir -p $out/bin
ln -s /usr/bin/xcrun $out/bin
'';
# apple_sdk is supposed to choose sane defaults, no need to handle isAarch64
# separately
darwinBuildInputs =
@@ -162,17 +147,12 @@ effectiveStdenv.mkDerivation (
postPatch = ''
substituteInPlace ./ggml-metal.m \
--replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
substituteInPlace ./ggml-metal.m \
--replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
'';
# With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015,
# `default.metallib` may be compiled with Metal compiler from XCode
# and we need to escape sandbox on MacOS to access Metal compiler.
# `xcrun` is used find the path of the Metal compiler, which is varible
# and not on $PATH
# see https://github.com/ggerganov/llama.cpp/pull/6118 for discussion
__noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders;
# TODO: Package up each Python script or service appropriately.
# If we were to migrate to buildPythonPackage and prepare the `pyproject.toml`,
# we could make those *.py into setuptools' entrypoints
substituteInPlace ./*.py --replace "/usr/bin/env python" "${llama-python}/bin/python"
'';
nativeBuildInputs =
[
@@ -187,11 +167,6 @@ effectiveStdenv.mkDerivation (
# TODO: Replace with autoAddDriverRunpath
# once https://github.com/NixOS/nixpkgs/pull/275241 has been merged
cudaPackages.autoAddOpenGLRunpathHook
]
++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [
glibc.static
] ++ optionals (effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders) [
xcrunHost
];
buildInputs =
@@ -200,23 +175,21 @@ effectiveStdenv.mkDerivation (
++ optionals useMpi [ mpi ]
++ optionals useOpenCL [ clblast ]
++ optionals useRocm rocmBuildInputs
++ optionals useBlas [ blas ]
++ optionals useVulkan vulkanBuildInputs;
cmakeFlags =
[
(cmakeBool "LLAMA_NATIVE" false)
(cmakeBool "LLAMA_BUILD_SERVER" true)
(cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
(cmakeBool "BUILD_SHARED_LIBS" true)
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
(cmakeBool "LLAMA_BLAS" useBlas)
(cmakeBool "LLAMA_CLBLAST" useOpenCL)
(cmakeBool "LLAMA_CUDA" useCuda)
(cmakeBool "LLAMA_CUBLAS" useCuda)
(cmakeBool "LLAMA_HIPBLAS" useRocm)
(cmakeBool "LLAMA_METAL" useMetalKit)
(cmakeBool "LLAMA_MPI" useMpi)
(cmakeBool "LLAMA_VULKAN" useVulkan)
(cmakeBool "LLAMA_STATIC" enableStatic)
]
++ optionals useCuda [
(
@@ -236,16 +209,14 @@ effectiveStdenv.mkDerivation (
# Should likely use `rocmPackages.clr.gpuTargets`.
"-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
]
++ optionals useMetalKit [
(lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
(cmakeBool "LLAMA_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
];
++ optionals useMetalKit [ (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1") ]
++ optionals useBlas [ (lib.cmakeFeature "LLAMA_BLAS_VENDOR" "OpenBLAS") ];
# TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
# if they haven't been added yet.
postInstall = ''
mv $out/bin/main${executableSuffix} $out/bin/llama${executableSuffix}
mv $out/bin/server${executableSuffix} $out/bin/llama-server${executableSuffix}
mv $out/bin/main $out/bin/llama
mv $out/bin/server $out/bin/llama-server
mkdir -p $out/include
cp $src/llama.h $out/include/
'';

View File

@@ -7,7 +7,7 @@
}:
let
optionalInt = cond: x: if cond then x else 0;
optionalInt = cond: x: if cond then x else 0;
in
singularity-tools.buildImage rec {
inherit (llama-cpp) name;

View File

@@ -20,8 +20,8 @@ COPY . .
# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable CUDA
ENV LLAMA_CUDA=1
# Enable cuBLAS
ENV LLAMA_CUBLAS=1
RUN make

View File

@@ -7,5 +7,3 @@ assignees: ''
---
Please include information about your system, the steps to reproduce the bug, and the version of llama.cpp that you are using. If possible, please provide a minimal code example that reproduces the bug.
If the bug concerns the server, please try to reproduce it first using the [server test scenario framework](https://github.com/ggerganov/llama.cpp/tree/master/examples/server/tests).

View File

@@ -1,280 +0,0 @@
# Benchmark
name: Benchmark
on:
workflow_dispatch:
inputs:
gpu-series:
description: 'Azure GPU series to run with'
required: true
type: choice
options:
- Standard_NC4as_T4_v3
- Standard_NC24ads_A100_v4
- Standard_NC80adis_H100_v5
sha:
description: 'Commit SHA1 to build'
required: false
type: string
duration:
description: 'Duration of the bench'
type: string
default: 10m
push:
branches:
- master
paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
pull_request:
types: [opened, synchronize, reopened]
paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
schedule:
- cron: '04 2 * * *'
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
bench-server-baseline:
runs-on: Standard_NC4as_T4_v3
env:
RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it
N_USERS: 8
DURATION: 10m
if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.head_ref == 'master' || github.ref_name == 'master' || github.event.push.ref == 'refs/heads/master' }}
steps:
- name: Clone
id: checkout
uses: actions/checkout@v3
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Install python env
id: pipenv
run: |
cd examples/server/bench
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
- name: Prometheus
id: install_prometheus
run: |
wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
tar xzf prometheus*.tar.gz --strip-components=1
./prometheus --config.file=examples/server/bench/prometheus.yml &
while ! nc -z localhost 9090; do
sleep 0.1
done
- name: Install k6
id: k6_installation
run: |
cd examples/server/bench
wget --quiet https://github.com/grafana/k6/releases/download/v0.49.0/k6-v0.49.0-linux-amd64.tar.gz
tar xzf k6*.tar.gz --strip-components=1
- name: Build
id: cmake_build
run: |
set -eux
mkdir build
cd build
cmake .. \
-DLLAMA_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DLLAMA_CURL=ON \
-DLLAMA_CUBLAS=ON \
-DCUDAToolkit_ROOT=/usr/local/cuda \
-DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
-DCMAKE_CUDA_ARCHITECTURES=75 \
-DLLAMA_FATAL_WARNINGS=OFF \
-DLLAMA_ALL_WARNINGS=OFF \
-DCMAKE_BUILD_TYPE=Release;
cmake --build . --config Release -j $(nproc) --target server
- name: Download the dataset
id: download_dataset
run: |
cd examples/server/bench
wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
- name: Server bench
id: server_bench
run: |
set -eux
cd examples/server/bench
source venv/bin/activate
BENCH_K6_BIN_PATH=./k6 python bench.py \
--runner-label ${{ env.RUNNER_LABEL }} \
--name ${{ github.job }} \
--branch ${{ github.head_ref || github.ref_name }} \
--commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
--scenario script.js \
--duration ${{ github.event.inputs.duration || env.DURATION }} \
--hf-repo ggml-org/models \
--hf-file phi-2/ggml-model-q4_0.gguf \
--model-path-prefix /models \
--parallel ${{ env.N_USERS }} \
-ngl 33 \
--batch-size 2048 \
--ubatch-size 256 \
--ctx-size 16384 \
--n-prompts 1000 \
--max-prompt-tokens 1024 \
--max-tokens 2048
cat results.github.env >> $GITHUB_ENV
# Remove dataset as we do not want it in the artefact
rm ShareGPT_V3_unfiltered_cleaned_split.json
- uses: actions/upload-artifact@v4
with:
name: benchmark-results
compression-level: 9
path: |
examples/server/bench/*.jpg
examples/server/bench/*.json
examples/server/bench/*.log
- name: Commit status
uses: Sibz/github-status-action@v1
continue-on-error: true # If not authorized on external repo
with:
authToken: ${{secrets.GITHUB_TOKEN}}
sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
context: bench-server-baseline
description: |
${{ env.BENCH_RESULTS }}
state: 'success'
- name: Upload benchmark images
uses: devicons/public-upload-to-imgur@v2.2.2
continue-on-error: true # Important as it looks unstable: 503
id: imgur_step
with:
client_id: ${{secrets.IMGUR_CLIENT_ID}}
path: |
examples/server/bench/prompt_tokens_seconds.jpg
examples/server/bench/predicted_tokens_seconds.jpg
examples/server/bench/kv_cache_usage_ratio.jpg
examples/server/bench/requests_processing.jpg
- name: Extract mermaid
id: set_mermaid
run: |
set -eux
cd examples/server/bench
PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
echo "EOF" >> $GITHUB_ENV
PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid)
echo "PREDICTED_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV
echo "EOF" >> $GITHUB_ENV
KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid)
echo "KV_CACHE_USAGE_RATIO<<EOF" >> $GITHUB_ENV
echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV
echo "EOF" >> $GITHUB_ENV
REQUESTS_PROCESSING=$(cat requests_processing.mermaid)
echo "REQUESTS_PROCESSING<<EOF" >> $GITHUB_ENV
echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV
echo "EOF" >> $GITHUB_ENV
- name: Extract image url
id: extract_image_url
continue-on-error: true
run: |
set -eux
echo "IMAGE_O=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV
echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV
echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV
echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV
- name: Comment PR
uses: mshick/add-pr-comment@v2
id: comment_pr
if: ${{ github.event.pull_request != '' }}
with:
message-id: bench-${{ github.job }}-${{ env.RUNNER_LABEL }}
message: |
📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
- Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
- HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(90)=${{ env.HTTP_REQ_DURATION_P_90_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
- Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_TOKENS_AVG }}tk/s p(90)=${{ env.LLAMACPP_PROMPT_TOKENS_P_90_ }}tk/s **total=${{ env.LLAMACPP_PROMPT_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
- Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(90)=${{ env.LLAMACPP_TOKENS_SECOND_P_90_ }}tk/s **total=${{ env.LLAMACPP_COMPLETION_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
- ${{ env.BENCH_GRAPH_XLABEL }}
<details>
<summary>Time series</summary>
<p align="center">
<img width="100%" height="100%" src="${{ env.IMAGE_O }}" alt="prompt_tokens_seconds" />
<details>
<summary>More</summary>
```mermaid
${{ env.PROMPT_TOKENS_SECONDS }}
```
</details>
<img width="100%" height="100%" src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds"/>
<details>
<summary>More</summary>
```mermaid
${{ env.PREDICTED_TOKENS_SECONDS }}
```
</details>
</p>
<details>
<summary>Details</summary>
<p align="center">
<img width="100%" height="100%" src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" />
<details>
<summary>More</summary>
```mermaid
${{ env.KV_CACHE_USAGE_RATIO }}
```
</details>
<img width="100%" height="100%" src="${{ env.IMAGE_3 }}" alt="requests_processing"/>
<details>
<summary>More</summary>
```mermaid
${{ env.REQUESTS_PROCESSING }}
```
</details>
</p>
</details>
</details>

View File

@@ -15,133 +15,14 @@ on:
types: [opened, synchronize, reopened]
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
GGML_NLOOP: 3
GGML_N_THREADS: 1
jobs:
macOS-latest-cmake-arm64:
runs-on: macos-14
steps:
- name: Clone
id: checkout
uses: actions/checkout@v3
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update
- name: Build
id: cmake_build
run: |
sysctl -a
mkdir build
cd build
cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test
id: cmake_test
run: |
cd build
ctest -L main --verbose --timeout 900
- name: Determine tag name
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
else
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi
- name: Pack artifacts
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp LICENSE ./build/bin/
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v3
with:
path: |
llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
macOS-latest-cmake-x64:
runs-on: macos-latest
steps:
- name: Clone
id: checkout
uses: actions/checkout@v3
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update
- name: Build
id: cmake_build
run: |
sysctl -a
mkdir build
cd build
cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test
id: cmake_test
run: |
cd build
ctest -L main --verbose --timeout 900
- name: Determine tag name
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
else
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi
- name: Pack artifacts
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp LICENSE ./build/bin/
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v3
with:
path: |
llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
ubuntu-focal-make:
runs-on: ubuntu-20.04
env:
LLAMA_NODE_AVAILABLE: true
LLAMA_PYTHON_AVAILABLE: true
steps:
- name: Clone
@@ -154,14 +35,6 @@ jobs:
sudo apt-get update
sudo apt-get install build-essential gcc-8
- uses: actions/setup-node@v4
with:
node-version: "20"
- uses: actions/setup-python@v4
with:
python-version: "3.11"
- name: Build
id: make_build
env:
@@ -175,28 +48,6 @@ jobs:
CC=gcc-8 make tests -j $(nproc)
make test -j $(nproc)
ubuntu-focal-make-curl:
runs-on: ubuntu-20.04
steps:
- name: Clone
id: checkout
uses: actions/checkout@v3
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential gcc-8 libcurl4-openssl-dev
- name: Build
id: make_build
env:
LLAMA_FATAL_WARNINGS: 1
LLAMA_CURL: 1
run: |
CC=gcc-8 make -j $(nproc)
ubuntu-latest-cmake:
runs-on: ubuntu-latest
@@ -225,51 +76,40 @@ jobs:
cd build
ctest -L main --verbose --timeout 900
- name: Test llama2c conversion
id: llama2c_test
ubuntu-latest-cmake-sanitizer:
runs-on: ubuntu-latest
continue-on-error: true
strategy:
matrix:
sanitizer: [ADDRESS, THREAD, UNDEFINED]
build_type: [Debug, Release]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v3
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential
- name: Build
id: cmake_build
run: |
mkdir build
cd build
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
- name: Test
id: cmake_test
run: |
cd build
echo "Fetch tokenizer"
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
echo "Fetch llama2c model"
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
./bin/convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
./bin/main -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
# ubuntu-latest-cmake-sanitizer:
# runs-on: ubuntu-latest
#
# continue-on-error: true
#
# strategy:
# matrix:
# sanitizer: [ADDRESS, THREAD, UNDEFINED]
# build_type: [Debug, Release]
#
# steps:
# - name: Clone
# id: checkout
# uses: actions/checkout@v3
#
# - name: Dependencies
# id: depends
# run: |
# sudo apt-get update
# sudo apt-get install build-essential
#
# - name: Build
# id: cmake_build
# run: |
# mkdir build
# cd build
# cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
# cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
#
# - name: Test
# id: cmake_test
# run: |
# cd build
# ctest -L main --verbose --timeout 900
ctest -L main --verbose --timeout 900
ubuntu-latest-cmake-mpi:
runs-on: ubuntu-latest
@@ -305,28 +145,6 @@ jobs:
cd build
ctest -L main --verbose
ubuntu-22-cmake-vulkan:
runs-on: ubuntu-22.04
steps:
- name: Clone
id: checkout
uses: actions/checkout@v3
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential libvulkan-dev
- name: Build
id: cmake_build
run: |
mkdir build
cd build
cmake -DLLAMA_VULKAN=ON ..
cmake --build . --config Release -j $(nproc)
ubuntu-22-cmake-sycl:
runs-on: ubuntu-22.04
@@ -493,7 +311,6 @@ jobs:
mkdir build
cd build
cmake -G Xcode .. \
-DLLAMA_METAL_EMBED_LIBRARY=ON \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_SERVER=OFF \
@@ -522,7 +339,6 @@ jobs:
mkdir build
cd build
cmake -G Xcode .. \
-DLLAMA_METAL_EMBED_LIBRARY=ON \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_SERVER=OFF \
@@ -587,8 +403,6 @@ jobs:
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
- build: 'vulkan'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
- build: 'arm64'
defines: '-A ARM64 -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
steps:
- name: Clone
@@ -684,7 +498,7 @@ jobs:
- name: Test
id: cmake_test
# not all machines have native AVX-512
if: ${{ matrix.build != 'arm64' && matrix.build != 'clblast' && matrix.build != 'kompute' && matrix.build != 'vulkan' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }}
if: ${{ matrix.build != 'clblast' && matrix.build != 'kompute' && matrix.build != 'vulkan' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }}
run: |
cd build
ctest -L main -C Release --verbose --timeout 900
@@ -728,13 +542,13 @@ jobs:
path: |
llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip
windows-latest-cmake-cuda:
windows-latest-cmake-cublas:
runs-on: windows-latest
strategy:
matrix:
cuda: ['12.2.0', '11.7.1']
build: ['cuda']
build: ['cublas']
steps:
- name: Clone
@@ -755,7 +569,7 @@ jobs:
run: |
mkdir build
cd build
cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=ON
cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
- name: Determine tag name
@@ -800,7 +614,6 @@ jobs:
windows-latest-cmake-sycl:
runs-on: windows-latest
defaults:
run:
shell: bash
@@ -809,6 +622,7 @@ jobs:
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/62641e01-1e8d-4ace-91d6-ae03f7f8a71f/w_BaseKit_p_2024.0.0.49563_offline.exe
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel
steps:
- name: Clone
id: checkout
@@ -823,32 +637,6 @@ jobs:
id: cmake_build
run: examples/sycl/win-build-sycl.bat
- name: Determine tag name
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
else
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi
- name: Pack artifacts
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v3
with:
path: |
llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
ios-xcode-build:
runs-on: macos-latest
@@ -881,7 +669,8 @@ jobs:
run: |
cd examples/llama.android
./gradlew build --no-daemon
# Skip armeabi-v7a for now (https://github.com/llvm/llvm-project/issues/65820).
./gradlew build --no-daemon -Pskip-armeabi-v7a
# freeBSD-latest:
# runs-on: macos-12
@@ -911,9 +700,7 @@ jobs:
- macOS-latest-make
- macOS-latest-cmake
- windows-latest-cmake
- windows-latest-cmake-cuda
- macOS-latest-cmake-arm64
- macOS-latest-cmake-x64
- windows-latest-cmake-cublas
steps:
- name: Clone

View File

@@ -1,23 +0,0 @@
name: Close inactive issues
on:
schedule:
- cron: "42 0 * * *"
jobs:
close-issues:
runs-on: ubuntu-latest
permissions:
issues: write
pull-requests: write
steps:
- uses: actions/stale@v5
with:
exempt-issue-labels: "refactor,help wanted,good first issue,research"
days-before-issue-stale: 30
days-before-issue-close: 14
stale-issue-label: "stale"
close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale."
days-before-pr-stale: -1
days-before-pr-close: -1
operations-per-run: 10000
repo-token: ${{ secrets.GITHUB_TOKEN }}

View File

@@ -5,10 +5,6 @@ env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
run:
runs-on: ubuntu-20.04

View File

@@ -15,10 +15,6 @@ on:
branches:
- master
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
push_to_registry:
name: Push Docker image to Docker Hub

View File

@@ -14,10 +14,6 @@ on:
branches:
- master
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
editorconfig:
runs-on: ubuntu-latest

View File

@@ -17,10 +17,6 @@ on:
types: [opened, synchronize, reopened]
paths: ['**/*.nix', 'flake.lock']
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
nix-build-aarch64:
runs-on: ubuntu-latest

View File

@@ -8,10 +8,6 @@ on:
pull_request:
types: [opened, synchronize, reopened]
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
nix-eval:
strategy:

View File

@@ -3,23 +3,17 @@ name: Python check requirements.txt
on:
push:
paths:
- '.github/workflows/python-check-requirements.yml'
- 'scripts/check-requirements.sh'
- 'convert*.py'
- 'requirements.txt'
- 'requirements/*.txt'
pull_request:
paths:
- '.github/workflows/python-check-requirements.yml'
- 'scripts/check-requirements.sh'
- 'convert*.py'
- 'requirements.txt'
- 'requirements/*.txt'
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
python-check-requirements:
runs-on: ubuntu-latest
@@ -32,4 +26,4 @@ jobs:
with:
python-version: "3.11"
- name: Run check-requirements.sh script
run: bash scripts/check-requirements.sh
run: bash scripts/check-requirements.sh nocleanup

View File

@@ -2,10 +2,6 @@ name: flake8 Lint
on: [push, pull_request]
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
flake8-lint:
runs-on: ubuntu-latest

View File

@@ -1,154 +0,0 @@
# Server build and tests
name: Server
on:
workflow_dispatch: # allows manual triggering
inputs:
slow_tests:
description: 'Run slow tests'
required: true
type: boolean
push:
branches:
- master
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*']
pull_request:
types: [opened, synchronize, reopened]
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*']
schedule:
- cron: '0 0 * * *'
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
server:
runs-on: ubuntu-latest
strategy:
matrix:
# TODO: temporary disabled due to linux kernel issues
#sanitizer: [ADDRESS, THREAD, UNDEFINED]
sanitizer: [UNDEFINED]
build_type: [Debug]
include:
- build_type: Release
sanitizer: ""
fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
container:
image: ubuntu:latest
ports:
- 8888
options: --cpus 4
steps:
- name: Clone
id: checkout
uses: actions/checkout@v3
with:
fetch-depth: 0
- name: Dependencies
id: depends
run: |
apt-get update
apt-get -y install \
build-essential \
git \
cmake \
python3-pip \
wget \
language-pack-en \
libcurl4-openssl-dev
- name: Build
id: cmake_build
run: |
mkdir build
cd build
cmake .. \
-DLLAMA_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DLLAMA_CURL=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
cmake --build . --config ${{ matrix.build_type }} -j $(nproc) --target server
- name: Tests dependencies
id: test_dependencies
run: |
pip install -r examples/server/tests/requirements.txt
- name: Tests
id: server_integration_tests
if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
run: |
cd examples/server/tests
PORT=8888 ./tests.sh
- name: Slow tests
id: server_integration_tests_slow
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
run: |
cd examples/server/tests
PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow
server-windows:
runs-on: windows-latest
steps:
- name: Clone
id: checkout
uses: actions/checkout@v3
with:
fetch-depth: 0
- name: libCURL
id: get_libcurl
env:
CURL_VERSION: 8.6.0_6
run: |
curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-win64-mingw.zip"
mkdir $env:RUNNER_TEMP/libcurl
tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl
- name: Build
id: cmake_build
run: |
mkdir build
cd build
cmake .. -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server
- name: Python setup
id: setup_python
uses: actions/setup-python@v5
with:
python-version: '3.11'
- name: Tests dependencies
id: test_dependencies
run: |
pip install -r examples/server/tests/requirements.txt
- name: Copy Libcurl
id: prepare_libcurl
run: |
cp $env:RUNNER_TEMP/libcurl/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll
- name: Tests
id: server_integration_tests
if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
run: |
cd examples/server/tests
behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
- name: Slow tests
id: server_integration_tests_slow
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
run: |
cd examples/server/tests
behave.exe --stop --no-skipped --no-capture --tags slow

20
.github/workflows/tidy-post.yml vendored Normal file
View File

@@ -0,0 +1,20 @@
name: clang-tidy review post comments
on:
workflow_dispatch:
workflows: ["clang-tidy-review"]
types:
- completed
jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: ZedThree/clang-tidy-review/post@v0.13.0
# lgtm_comment_body, max_comments, and annotations need to be set on the posting workflow in a split setup
with:
# adjust options as necessary
lgtm_comment_body: ''
annotations: false
max_comments: 25

23
.github/workflows/tidy-review.yml vendored Normal file
View File

@@ -0,0 +1,23 @@
name: clang-tidy-review
on:
pull_request:
branches:
- master
jobs:
clang-tidy-review:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: ZedThree/clang-tidy-review@v0.13.0
id: review
with:
lgtm_comment_body: ''
build_dir: build
cmake_command: cmake . -B build -DCMAKE_EXPORT_COMPILE_COMMANDS=on
split_workflow: true
- uses: ZedThree/clang-tidy-review/upload@v0.13.0

View File

@@ -6,10 +6,6 @@ on:
branches:
- master
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
build:
strategy:

11
.gitignore vendored
View File

@@ -11,10 +11,7 @@
*.gcda
*.dot
*.bat
*.tmp
*.metallib
*.etag
*.lastModified
.DS_Store
.build/
.cache/
@@ -28,8 +25,6 @@
.vscode/
.idea/
ggml-metal-embed.metal
lcov-report/
gcovr-report/
@@ -50,8 +45,6 @@ models-mnt
/embedding
/gguf
/gguf-llama-simple
/gguf-split
/gritlm
/imatrix
/infill
/libllama.so
@@ -59,9 +52,6 @@ models-mnt
/llava-cli
/lookahead
/lookup
/lookup-create
/lookup-merge
/lookup-stats
/main
/metal
/passkey
@@ -77,7 +67,6 @@ models-mnt
/batched-bench
/export-lora
/finetune
/retrieval
/speculative
/parallel
/train-text-from-scratch

View File

@@ -89,8 +89,8 @@ endif()
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
option(LLAMA_BLAS "llama: use BLAS" OFF)
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
option(LLAMA_CUDA "llama: use CUDA" OFF)
option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF)
option(LLAMA_CUBLAS "llama: use CUDA" OFF)
#option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF)
option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF)
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
@@ -99,8 +99,6 @@ option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
"llama: max. batch size for using peer access")
option(LLAMA_CUDA_NO_PEER_COPY "llama: do not use peer to peer copies" OFF)
option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
@@ -118,9 +116,7 @@ option(LLAMA_MPI "llama: use MPI"
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
option(LLAMA_SYCL "llama: use SYCL" OFF)
option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
set(LLAMA_SYCL_TARGET "INTEL" CACHE STRING "llama: sycl target device")
option(LLAMA_CPU_HBM "llama: use memkind for CPU HBM" OFF)
set(LLAMA_SCHED_MAX_COPIES "4" CACHE STRING "llama: max input copies for pipeline parallelism")
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
@@ -150,8 +146,6 @@ set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)
include(CheckCXXCompilerFlag)
add_compile_definitions(GGML_SCHED_MAX_COPIES=${LLAMA_SCHED_MAX_COPIES})
# enable libstdc++ assertions for debug builds
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
add_compile_definitions($<$<CONFIG:Debug>:_GLIBCXX_ASSERTIONS>)
@@ -202,70 +196,63 @@ if (LLAMA_METAL)
add_compile_definitions(GGML_METAL_NDEBUG)
endif()
# copy ggml-common.h and ggml-metal.metal to bin directory
configure_file(ggml-common.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h COPYONLY)
# get full path to the file
#add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
# copy ggml-metal.metal to bin directory
configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
if (LLAMA_METAL_EMBED_LIBRARY)
enable_language(ASM)
add_compile_definitions(GGML_METAL_EMBED_LIBRARY)
set(METALLIB_COMMON "${CMAKE_CURRENT_SOURCE_DIR}/ggml-common.h")
set(METALLIB_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
set(METALLIB_SOURCE "${CMAKE_SOURCE_DIR}/ggml-metal.metal")
file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated")
# merge ggml-common.h and ggml-metal.metal into a single file
set(METALLIB_EMBED_ASM "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.s")
set(METALLIB_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.metal")
set(EMBED_METALLIB_ASSEMBLY "${CMAKE_BINARY_DIR}/autogenerated/ggml-embed-metallib.s")
add_custom_command(
OUTPUT ${METALLIB_EMBED_ASM}
COMMAND echo "Embedding Metal library"
COMMAND sed -e '/\#include \"ggml-common.h\"/r ${METALLIB_COMMON}' -e '/\#include \"ggml-common.h\"/d' < ${METALLIB_SOURCE} > ${METALLIB_SOURCE_EMBED}
COMMAND echo ".section __DATA,__ggml_metallib" > ${METALLIB_EMBED_ASM}
COMMAND echo ".globl _ggml_metallib_start" >> ${METALLIB_EMBED_ASM}
COMMAND echo "_ggml_metallib_start:" >> ${METALLIB_EMBED_ASM}
COMMAND echo ".incbin \\\"${METALLIB_SOURCE_EMBED}\\\"" >> ${METALLIB_EMBED_ASM}
COMMAND echo ".globl _ggml_metallib_end" >> ${METALLIB_EMBED_ASM}
COMMAND echo "_ggml_metallib_end:" >> ${METALLIB_EMBED_ASM}
DEPENDS ggml-metal.metal ggml-common.h
OUTPUT ${EMBED_METALLIB_ASSEMBLY}
COMMAND echo ".section __DATA,__ggml_metallib" > ${EMBED_METALLIB_ASSEMBLY}
COMMAND echo ".globl _ggml_metallib_start" >> ${EMBED_METALLIB_ASSEMBLY}
COMMAND echo "_ggml_metallib_start:" >> ${EMBED_METALLIB_ASSEMBLY}
COMMAND echo ".incbin \\\"${METALLIB_SOURCE}\\\"" >> ${EMBED_METALLIB_ASSEMBLY}
COMMAND echo ".globl _ggml_metallib_end" >> ${EMBED_METALLIB_ASSEMBLY}
COMMAND echo "_ggml_metallib_end:" >> ${EMBED_METALLIB_ASSEMBLY}
DEPENDS ${METALLIB_SOURCE}
COMMENT "Generate assembly for embedded Metal library"
)
set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${METALLIB_EMBED_ASM})
else()
if (LLAMA_METAL_SHADER_DEBUG)
# custom command to do the following:
# xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
# xcrun -sdk macosx metallib ggml-metal.air -o default.metallib
#
# note: this is the only way I found to disable fast-math in Metal. it's ugly, but at least it works
# disabling fast math is needed in order to pass tests/test-backend-ops
# note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1
# note: unfortunately, we have to call it default.metallib instead of ggml.metallib
# ref: https://github.com/ggerganov/whisper.cpp/issues/1720
set(XC_FLAGS -fno-fast-math -fno-inline -g)
else()
set(XC_FLAGS -O3)
set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${EMBED_METALLIB_ASSEMBLY})
endif()
if (LLAMA_METAL_SHADER_DEBUG)
# custom command to do the following:
# xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
# xcrun -sdk macosx metallib ggml-metal.air -o default.metallib
#
# note: this is the only way I found to disable fast-math in Metal. it's ugly, but at least it works
# disabling fast math is needed in order to pass tests/test-backend-ops
# note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1
# note: unfortunately, we have to call it default.metallib instead of ggml.metallib
# ref: https://github.com/ggerganov/whisper.cpp/issues/1720
set(XC_FLAGS -fno-fast-math -fno-inline -g)
if (LLAMA_QKK_64)
set(XC_FLAGS ${XC_FLAGS} -DQK_K=64)
endif()
add_custom_command(
OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
COMMAND xcrun -sdk macosx metallib ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h
COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal
DEPENDS ggml-metal.metal ggml-common.h
DEPENDS ggml-metal.metal
COMMENT "Compiling Metal kernels"
)
)
add_custom_target(
ggml-metal ALL
DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
)
endif() # LLAMA_METAL_EMBED_LIBRARY
)
endif()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
${FOUNDATION_LIBRARY}
@@ -360,25 +347,18 @@ if (LLAMA_QKK_64)
endif()
if (LLAMA_CUBLAS)
message(WARNING "LLAMA_CUBLAS is deprecated and will be removed in the future.\nUse LLAMA_CUDA instead")
set(LLAMA_CUDA ON)
endif()
if (LLAMA_CUDA)
cmake_minimum_required(VERSION 3.17)
find_package(CUDAToolkit)
if (CUDAToolkit_FOUND)
message(STATUS "CUDA found")
message(STATUS "cuBLAS found")
enable_language(CUDA)
set(GGML_HEADERS_CUDA ggml-cuda.h)
set(GGML_SOURCES_CUDA ggml-cuda.cu)
file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu")
list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")
add_compile_definitions(GGML_USE_CUDA)
add_compile_definitions(GGML_USE_CUBLAS)
if (LLAMA_CUDA_FORCE_DMMV)
add_compile_definitions(GGML_CUDA_FORCE_DMMV)
endif()
@@ -395,9 +375,6 @@ if (LLAMA_CUDA)
endif()
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})
if (LLAMA_CUDA_NO_PEER_COPY)
add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
endif()
if (LLAMA_STATIC)
if (WIN32)
@@ -427,7 +404,7 @@ if (LLAMA_CUDA)
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
else()
message(WARNING "CUDA not found")
message(WARNING "cuBLAS not found")
endif()
endif()
@@ -526,11 +503,9 @@ if (LLAMA_HIPBLAS)
message(STATUS "HIP and hipBLAS found")
set(GGML_HEADERS_ROCM ggml-cuda.h)
set(GGML_SOURCES_ROCM ggml-cuda.cu)
file(GLOB GGML_SOURCES_ROCM "ggml-cuda/*.cu")
list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu")
add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA)
add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
if (LLAMA_HIP_UMA)
add_compile_definitions(GGML_HIP_UMA)
@@ -544,15 +519,11 @@ if (LLAMA_HIPBLAS)
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
endif()
if (LLAMA_CUDA_NO_PEER_COPY)
add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
endif()
add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
if (LLAMA_STATIC)
message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
@@ -562,10 +533,6 @@ if (LLAMA_HIPBLAS)
endif()
if (LLAMA_SYCL)
if (NOT LLAMA_SYCL_TARGET MATCHES "^(INTEL|NVIDIA)$")
message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL or NVIDIA")
endif()
if ( NOT DEFINED ENV{ONEAPI_ROOT})
message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh")
endif()
@@ -587,9 +554,6 @@ if (LLAMA_SYCL)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
if (LLAMA_SYCL_TARGET STREQUAL "NVIDIA")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
endif()
set(GGML_HEADERS_SYCL ggml-sycl.h)
set(GGML_SOURCES_SYCL ggml-sycl.cpp)
@@ -597,11 +561,7 @@ if (LLAMA_SYCL)
if (WIN32)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl sycl7 OpenCL mkl_sycl_blas_dll.lib mkl_intel_ilp64_dll.lib mkl_sequential_dll.lib mkl_core_dll.lib)
else()
if (LLAMA_SYCL_TARGET STREQUAL "INTEL")
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
elseif (LLAMA_SYCL_TARGET STREQUAL "NVIDIA")
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl pthread m dl onemkl)
endif()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
endif()
endif()
@@ -835,7 +795,7 @@ endif()
set(CUDA_CXX_FLAGS "")
if (LLAMA_CUDA)
if (LLAMA_CUBLAS)
set(CUDA_FLAGS -use_fast_math)
if (LLAMA_FATAL_WARNINGS)
@@ -976,16 +936,10 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STR
list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
# Android armeabi-v7a
list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations)
else()
# Raspberry Pi 2
list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
endif()
# Raspberry Pi 2
list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
# Android arm64-v8a
# Raspberry Pi 3, 4, Zero 2 (32-bit)
list(APPEND ARCH_FLAGS -mno-unaligned-access)
endif()
@@ -1060,7 +1014,7 @@ endif()
add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
add_compile_options("$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
if (LLAMA_CUDA)
if (LLAMA_CUBLAS)
list(APPEND CUDA_CXX_FLAGS ${ARCH_FLAGS})
list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED) # pass host compiler flags as a single argument
if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")
@@ -1168,9 +1122,6 @@ endif()
add_library(llama
llama.cpp
llama.h
unicode.h
unicode.cpp
unicode-data.cpp
)
target_include_directories(llama PUBLIC .)
@@ -1266,12 +1217,6 @@ if (LLAMA_METAL)
GROUP_READ
WORLD_READ
DESTINATION ${CMAKE_INSTALL_BINDIR})
if (NOT LLAMA_METAL_EMBED_LIBRARY)
install(
FILES ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
DESTINATION ${CMAKE_INSTALL_BINDIR}
)
endif()
endif()
#

154
Makefile
View File

@@ -1,16 +1,15 @@
# Define the default target now so that it is always the first target
BUILD_TARGETS = \
main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
simple batched batched-bench save-load-state server gguf gguf-split llama-bench libllava.a llava-cli baby-llama beam-search \
retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o
# Binaries only useful for tests
TEST_TARGETS = \
tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease \
tests/test-json-schema-to-grammar
tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease
# Code coverage output files
COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -168,10 +167,6 @@ ifeq ($(UNAME_S),OpenBSD)
MK_CPPFLAGS += -D_BSD_SOURCE
endif
ifdef LLAMA_SCHED_MAX_COPIES
MK_CPPFLAGS += -DGGML_SCHED_MAX_COPIES=$(LLAMA_SCHED_MAX_COPIES)
endif
ifdef LLAMA_DEBUG
MK_CFLAGS += -O0 -g
MK_CXXFLAGS += -O0 -g
@@ -206,10 +201,6 @@ ifdef LLAMA_SERVER_VERBOSE
MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
endif
ifdef LLAMA_SERVER_SSL
MK_CPPFLAGS += -DCPPHTTPLIB_OPENSSL_SUPPORT
MK_LDFLAGS += -lssl -lcrypto
endif
ifdef LLAMA_CODE_COVERAGE
MK_CXXFLAGS += -fprofile-arcs -ftest-coverage -dumpbase ''
@@ -390,20 +381,9 @@ ifdef LLAMA_BLIS
endif # LLAMA_BLIS
ifdef LLAMA_CUBLAS
# LLAMA_CUBLAS is deprecated and will be removed in the future
LLAMA_CUDA := 1
endif
ifdef LLAMA_CUDA
ifneq ('', '$(wildcard /opt/cuda)')
CUDA_PATH ?= /opt/cuda
else
CUDA_PATH ?= /usr/local/cuda
endif
MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include -I/usr/local/cuda/targets/aarch64-linux/include
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/lib/wsl/lib
OBJS += ggml-cuda.o
OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
MK_NVCCFLAGS += -use_fast_math
ifdef LLAMA_FATAL_WARNINGS
MK_NVCCFLAGS += -Werror all-warnings
@@ -458,30 +438,19 @@ ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE
else
MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
ifdef LLAMA_CUDA_NO_PEER_COPY
MK_NVCCFLAGS += -DGGML_CUDA_NO_PEER_COPY
endif # LLAMA_CUDA_NO_PEER_COPY
#ifdef LLAMA_CUDA_CUBLAS
# MK_NVCCFLAGS += -DGGML_CUDA_CUBLAS
#endif # LLAMA_CUDA_CUBLAS
ifdef LLAMA_CUDA_CCBIN
MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
endif
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
ifdef JETSON_EOL_MODULE_DETECT
define NVCC_COMPILE
$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
endef # NVCC_COMPILE
$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
else
define NVCC_COMPILE
$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
endef # NVCC_COMPILE
endif # JETSON_EOL_MODULE_DETECT
ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
$(NVCC_COMPILE)
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
$(NVCC_COMPILE)
endif # LLAMA_CUDA
endif # LLAMA_CUBLAS
ifdef LLAMA_CLBLAST
@@ -527,6 +496,7 @@ ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
endif # LLAMA_VULKAN
ifdef LLAMA_HIPBLAS
ifeq ($(wildcard /opt/rocm),)
ROCM_PATH ?= /usr
GPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
@@ -538,7 +508,7 @@ ifdef LLAMA_HIPBLAS
LLAMA_CUDA_DMMV_X ?= 32
LLAMA_CUDA_MMV_Y ?= 1
LLAMA_CUDA_KQUANTS_ITER ?= 2
MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA
MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
ifdef LLAMA_HIP_UMA
MK_CPPFLAGS += -DGGML_HIP_UMA
endif # LLAMA_HIP_UMA
@@ -551,18 +521,9 @@ endif # LLAMA_HIP_UMA
ifdef LLAMA_CUDA_FORCE_DMMV
HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
endif # LLAMA_CUDA_FORCE_DMMV
ifdef LLAMA_CUDA_NO_PEER_COPY
HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY
endif # LLAMA_CUDA_NO_PEER_COPY
OBJS += ggml-cuda.o
OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
endif # LLAMA_HIPBLAS
ifdef LLAMA_METAL
@@ -579,20 +540,19 @@ endif
endif # LLAMA_METAL
ifdef LLAMA_METAL
ggml-metal.o: ggml-metal.m ggml-metal.h ggml.h
ggml-metal.o: ggml-metal.m ggml-metal.h
$(CC) $(CFLAGS) -c $< -o $@
ifdef LLAMA_METAL_EMBED_LIBRARY
ggml-metal-embed.o: ggml-metal.metal ggml-common.h
ggml-metal-embed.o: ggml-metal.metal
@echo "Embedding Metal library"
@sed -e '/#include "ggml-common.h"/r ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml-metal.metal > ggml-metal-embed.metal
$(eval TEMP_ASSEMBLY=$(shell mktemp))
@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
@echo ".incbin \"ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)
@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
@echo ".incbin \"$<\"" >> $(TEMP_ASSEMBLY)
@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
@$(AS) $(TEMP_ASSEMBLY) -o $@
@rm -f ${TEMP_ASSEMBLY}
endif
@@ -615,17 +575,12 @@ override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
# identify CUDA host compiler
ifdef LLAMA_CUDA
ifdef LLAMA_CUBLAS
GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
include scripts/get-flags.mk
CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
endif
ifdef LLAMA_CURL
override CXXFLAGS := $(CXXFLAGS) -DLLAMA_USE_CURL
override LDFLAGS := $(LDFLAGS) -lcurl
endif
#
# Print build information
#
@@ -640,9 +595,9 @@ $(info I NVCCFLAGS: $(NVCCFLAGS))
$(info I LDFLAGS: $(LDFLAGS))
$(info I CC: $(shell $(CC) --version | head -n 1))
$(info I CXX: $(shell $(CXX) --version | head -n 1))
ifdef LLAMA_CUDA
ifdef LLAMA_CUBLAS
$(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
CUDA_VERSION := $(shell nvcc --version | grep -oP 'release (\K[0-9]+\.[0-9])')
ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
ifndef CUDA_DOCKER_ARCH
ifndef CUDA_POWER_ARCH
@@ -650,16 +605,9 @@ $(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be exp
endif # CUDA_POWER_ARCH
endif # CUDA_DOCKER_ARCH
endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
endif # LLAMA_CUDA
endif # LLAMA_CUBLAS
$(info )
ifdef LLAMA_CUBLAS
$(info !!!!)
$(info LLAMA_CUBLAS is deprecated and will be removed in the future. Use LLAMA_CUDA instead.)
$(info !!!!)
$(info )
endif
#
# Build library
#
@@ -673,18 +621,12 @@ ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
$(CC) $(CFLAGS) -c $< -o $@
ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h
ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h
$(CC) $(CFLAGS) -c $< -o $@
unicode.o: unicode.cpp unicode.h
$(CXX) $(CXXFLAGS) -c $< -o $@
OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o
unicode-data.o: unicode-data.cpp unicode-data.h
$(CXX) $(CXXFLAGS) -c $< -o $@
OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@
COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
@@ -702,15 +644,9 @@ console.o: common/console.cpp common/console.h
grammar-parser.o: common/grammar-parser.cpp common/grammar-parser.h
$(CXX) $(CXXFLAGS) -c $< -o $@
json-schema-to-grammar.o: common/json-schema-to-grammar.cpp common/json-schema-to-grammar.h
$(CXX) $(CXXFLAGS) -c $< -o $@
train.o: common/train.cpp common/train.h
$(CXX) $(CXXFLAGS) -c $< -o $@
ngram-cache.o: common/ngram-cache.cpp common/ngram-cache.h
$(CXX) $(CXXFLAGS) -c $< -o $@
libllama.so: llama.o ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
@@ -718,8 +654,7 @@ libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS)
ar rcs libllama.a llama.o ggml.o $(OBJS) $(COMMON_DEPS)
clean:
rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
rm -vrf ggml-cuda/*.o
rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
find examples pocs -type f -name "*.o" -delete
#
@@ -780,26 +715,19 @@ embedding: examples/embedding/embedding.cpp ggml.o llama.o $(C
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
gritlm: examples/gritlm/gritlm.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp json-schema-to-grammar.o common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h examples/llava/llava.h examples/llava/llava.cpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h %.hpp $< examples/llava/clip.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) -o $@ $(LDFLAGS) $(LWINSOCK2)
gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -837,10 +765,6 @@ export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
retrieval: examples/retrieval/retrieval.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -853,15 +777,9 @@ lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-create.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp) -o lookup-create $(LDFLAGS)
$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-merge.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp) -o lookup-merge $(LDFLAGS)
$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-stats.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp) -o lookup-stats $(LDFLAGS)
passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
@@ -918,10 +836,6 @@ tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-json-schema-to-grammar: tests/test-json-schema-to-grammar.cpp json-schema-to-grammar.o ggml.o llama.o grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-grad0: tests/test-grad0.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

View File

@@ -31,8 +31,6 @@ let package = Package(
sources: [
"ggml.c",
"llama.cpp",
"unicode.cpp",
"unicode-data.cpp",
"ggml-alloc.c",
"ggml-backend.c",
"ggml-quants.c",

View File

@@ -1,9 +1,8 @@
# llama.cpp for SYCL
- [Background](#background)
- [News](#news)
- [OS](#os)
- [Supported Devices](#supported-devices)
- [Intel GPU](#intel-gpu)
- [Docker](#docker)
- [Linux](#linux)
- [Windows](#windows)
@@ -14,42 +13,17 @@
## Background
**SYCL** is a high-level parallel programming model designed to improve developers productivity writing code across various hardware accelerators such as CPUs, GPUs, and FPGAs. It is a single-source language designed for heterogeneous computing and based on standard C++17.
SYCL is a higher-level programming model to improve programming productivity on various hardware acceleratorssuch as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17.
**oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include:
oneAPI is a specification that is open and standards-based, supporting multiple architecture types including but not limited to GPU, CPU, and FPGA. The spec has both direct programming and API-based programming paradigms.
- **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers.
- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL - Math Kernel Library)*.
- **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs.
- **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.
Intel uses the SYCL as direct programming language to support CPU, GPUs and FPGAs.
### Llama.cpp + SYCL
This SYCL "backend" follows the same design found in other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, CLBlast etc..*. The oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.
To avoid to re-invent the wheel, this code refer other code paths in llama.cpp (like OpenBLAS, cuBLAS, CLBlast). We use a open-source tool [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) migrate to SYCL.
The llama.cpp SYCL backend supports:
- Intel GPUs.
- Nvidia GPUs.
The llama.cpp for SYCL is used to support Intel GPUs.
*Upcoming support: AMD GPUs*.
When targetting **Intel CPUs**, it is recommended to use llama.cpp for [x86_64](README.md#intel-onemkl) approach.
## News
- 2024.3
- A blog is published: **Run LLM on all Intel GPUs Using llama.cpp**: [intel.com](https://www.intel.com/content/www/us/en/developer/articles/technical/run-llm-on-all-gpus-using-llama-cpp-artical.html) or [medium.com](https://medium.com/@jianyu_neo/run-llm-on-all-intel-gpus-using-llama-cpp-fd2e2dcbd9bd).
- New base line is ready: [tag b2437](https://github.com/ggerganov/llama.cpp/tree/b2437).
- Support multiple cards: **--split-mode**: [none|layer]; not support [row], it's on developing.
- Support to assign main GPU by **--main-gpu**, replace $GGML_SYCL_DEVICE.
- Support detecting all GPUs with level-zero and same top **Max compute units**.
- Support OPs
- hardsigmoid
- hardswish
- pool2d
- 2024.1
- Create SYCL backend for Intel GPU.
- Support Windows build
For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
## OS
@@ -59,16 +33,9 @@ When targetting **Intel CPUs**, it is recommended to use llama.cpp for [x86_64]
|Windows|Support|Windows 11|
## Supported devices
## Intel GPU
### Intel GPUs
The oneAPI Math Kernel Library, which the oneAPI base-toolkit includes, supports intel GPUs. In order to make it "visible", simply run the following:
```sh
source /opt/intel/oneapi/setvars.sh
```
- **Tested devices**
### Verified
|Intel GPU| Status | Verified Model|
|-|-|-|
@@ -78,335 +45,259 @@ source /opt/intel/oneapi/setvars.sh
|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
|Intel iGPU| Support| iGPU in i5-1250P, i7-1260P, i7-1165G7|
*Notes:*
Note: If the EUs (Execution Unit) in iGPU is less than 80, the inference speed will be too slow to use.
- Device memory can be a limitation when running a large model on an intel GPU. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/main`.
### Memory
- Please make sure the GPU shared memory from the host is large enough to account for the model's size. For e.g. the *llama-2-7b.Q4_0* requires at least 8.0GB for integrated GPUs and 4.0GB for discrete GPUs.
The memory is a limitation to run LLM on GPUs.
- If the iGPU has less than 80 EUs *(Execution Unit)*, the inference speed will likely be too slow for practical use.
When run llama.cpp, there is print log to show the applied memory on GPU. You could know how much memory to be used in your case. Like `llm_load_tensors: buffer size = 3577.56 MiB`.
### Nvidia GPUs
The BLAS acceleration on Nvidia GPUs through oneAPI can be obtained using the Nvidia plugins for oneAPI and the cuBLAS backend of the upstream oneMKL library. Details and instructions on how to setup the runtime and library can be found in [this section](#i-setup-environment)
- **Tested devices**
|Nvidia GPU| Status | Verified Model|
|-|-|-|
|Ampere Series| Support| A100, A4000|
|Ampere Series *(Mobile)*| Support| RTX 40 Series|
*Notes:*
- Support for Nvidia targets through oneAPI is currently limited to Linux platforms.
- Please make sure the native oneAPI MKL *(dedicated to intel CPUs and GPUs)* is not "visible" at this stage to properly setup and use the built-from-source oneMKL with cuBLAS backend in llama.cpp for Nvidia GPUs.
For iGPU, please make sure the shared memory from host memory is enough. For llama-2-7b.Q4_0, recommend the host memory is 8GB+.
For dGPU, please make sure the device memory is enough. For llama-2-7b.Q4_0, recommend the device memory is 4GB+.
## Docker
The docker build option is currently limited to *intel GPU* targets.
### Build image
Note:
- Only docker on Linux is tested. Docker on WSL may not work.
- You may need to install Intel GPU driver on the host machine (See the [Linux](#linux) section to know how to do that)
### Build the image
You can choose between **F16** and **F32** build. F16 is faster for long-prompt inference.
```sh
# Using FP16
docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/main-intel.Dockerfile .
# For F16:
#docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/main-intel.Dockerfile .
# Or, for F32:
docker build -t llama-cpp-sycl -f .devops/main-intel.Dockerfile .
# Note: you can also use the ".devops/main-server.Dockerfile", which compiles the "server" example
```
*Notes*:
To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="LLAMA_SYCL_F16=ON"` argument from the previous command.
You can also use the `.devops/server-intel.Dockerfile`, which builds the *"server"* alternative.
### Run container
### Run
```sh
# First, find all the DRI cards
# Firstly, find all the DRI cards:
ls -la /dev/dri
# Then, pick the card that you want to use (here for e.g. /dev/dri/card1).
# Then, pick the card that you want to use.
# For example with "/dev/dri/card1"
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-sycl -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
```
*Notes:*
- Docker has been tested successfully on native Linux. WSL support has not been verified yet.
- You may need to install Intel GPU driver on the **host** machine *(Please refer to the [Linux configuration](#linux) for details)*.
## Linux
### I. Setup Environment
### Setup Environment
1. **Install GPU drivers**
1. Install Intel GPU driver.
- **Intel GPU**
a. Please install Intel GPU driver by official guide: [Install GPU Drivers](https://dgpu-docs.intel.com/driver/installation.html).
Intel data center GPUs drivers installation guide and download page can be found here: [Get intel dGPU Drivers](https://dgpu-docs.intel.com/driver/installation.html#ubuntu-install-steps).
Note: for iGPU, please install the client GPU driver.
*Note*: for client GPUs *(iGPU & Arc A-Series)*, please refer to the [client iGPU driver installation](https://dgpu-docs.intel.com/driver/client/overview.html).
Once installed, add the user(s) to the `video` and `render` groups.
b. Add user to group: video, render.
```sh
sudo usermod -aG render $USER
sudo usermod -aG video $USER
sudo usermod -aG render username
sudo usermod -aG video username
```
*Note*: logout/re-login for the changes to take effect.
Note: re-login to enable it.
Verify installation through `clinfo`:
c. Check
```sh
sudo apt install clinfo
sudo clinfo -l
```
Sample output:
Output (example):
```sh
```
Platform #0: Intel(R) OpenCL Graphics
`-- Device #0: Intel(R) Arc(TM) A770 Graphics
Platform #0: Intel(R) OpenCL HD Graphics
`-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
```
- **Nvidia GPU**
2. Install Intel® oneAPI Base toolkit.
In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cublas)-* are installed.
Installation can be verified by running the following:
```sh
nvidia-smi
```
Please make sure at least one CUDA device is available, which can be displayed like this *(here an A100-40GB Nvidia GPU)*:
```
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.54.03 Driver Version: 535.54.03 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA A100-PCIE-40GB On | 00000000:8D:00.0 Off | 0 |
| N/A 36C P0 57W / 250W | 4MiB / 40960MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+----------------------+----------------------+
```
a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
Recommend to install to default folder: **/opt/intel/oneapi**.
2. **Install Intel® oneAPI Base toolkit**
Following guide use the default folder as example. If you use other folder, please modify the following guide info with your folder.
- **Base installation**
b. Check
The base toolkit can be obtained from the official [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) page.
Please follow the instructions for downloading and installing the Toolkit for Linux, and preferably keep the default installation values unchanged, notably the installation path *(`/opt/intel/oneapi` by default)*.
Following guidelines/code snippets assume the default installation values. Otherwise, please make sure the necessary changes are reflected where applicable.
Upon a successful installation, SYCL is enabled for the available intel devices, along with relevant libraries such as oneAPI MKL for intel GPUs.
- **Adding support to Nvidia GPUs**
**oneAPI**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). User should also make sure the plugin version matches the installed base toolkit one *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup.
**oneMKL**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* do not contain the cuBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *cuBLAS* backend enabled is thus required to run it on Nvidia GPUs.
```sh
git clone https://github.com/oneapi-src/oneMKL
cd oneMKL
mkdir -p buildWithCublas && cd buildWithCublas
cmake ../ -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_CUBLAS_BACKEND=ON -DTARGET_DOMAINS=blas
make
```
3. **Verify installation and environment**
In order to check the available SYCL devices on the machine, please use the `sycl-ls` command.
```sh
source /opt/intel/oneapi/setvars.sh
sycl-ls
```
- **Intel GPU**
When targeting an intel GPU, the user should expect one or more level-zero devices among the available SYCL devices. Please make sure that at least one GPU is present, for instance [`ext_oneapi_level_zero:gpu:0`] in the sample output below:
There should be one or more level-zero devices. Please confirm that at least one GPU is present, like **[ext_oneapi_level_zero:gpu:0]**.
Output (example):
```
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50]
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
```
- **Nvidia GPU**
2. Build locally:
Similarly, user targetting Nvidia GPUs should expect at least one SYCL-CUDA device [`ext_oneapi_cuda:gpu`] as bellow:
```
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.12.0.12_195853.xmain-hotfix]
[opencl:cpu:1] Intel(R) OpenCL, Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz OpenCL 3.0 (Build 0) [2023.16.12.0.12_195853.xmain-hotfix]
[ext_oneapi_cuda:gpu:0] NVIDIA CUDA BACKEND, NVIDIA A100-PCIE-40GB 8.0 [CUDA 12.2]
```
Note:
- You can choose between **F16** and **F32** build. F16 is faster for long-prompt inference.
- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only.
### II. Build llama.cpp
#### Intel GPU
```sh
# Export relevant ENV variables
mkdir -p build
cd build
source /opt/intel/oneapi/setvars.sh
# Build LLAMA with MKL BLAS acceleration for intel GPU
mkdir -p build && cd build
# For FP16:
#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
# Option 1: Use FP16 for better performance in long-prompt inference
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
# Option 2: Use FP32 by default
# Or, for FP32:
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
# Build example/main only
#cmake --build . --config Release --target main
# Or, build all binary
cmake --build . --config Release -v
cd ..
```
#### Nvidia GPU
or
```sh
# Export relevant ENV variables
export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LD_LIBRARY_PATH
export LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LIBRARY_PATH
export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithCublas/include:$CPLUS_INCLUDE_DIR
export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
# Build LLAMA with Nvidia BLAS acceleration through SYCL
mkdir -p build && cd build
# Option 1: Use FP16 for better performance in long-prompt inference
cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
# Option 2: Use FP32 by default
cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
./examples/sycl/build.sh
```
### III. Run the inference
### Run
1. Retrieve and prepare model
1. Put model file to folder **models**
You can refer to the general [*Prepare and Quantize*](README#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example.
You could download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) as example.
2. Enable oneAPI running environment
```sh
```
source /opt/intel/oneapi/setvars.sh
```
3. List devices information
3. List device ID
Similar to the native `sycl-ls`, available SYCL devices can be queried as follow:
Run without parameter:
```sh
./build/bin/ls-sycl-device
# or running the "main" executable and look at the output log:
./build/bin/main
```
A example of such log in a system with 1 *intel CPU* and 1 *intel GPU* can look like the following:
Check the ID in startup log, like:
```
found 6 SYCL devices:
| | | |Compute |Max compute|Max work|Max sub| |
|ID| Device Type| Name|capability|units |group |group |Global mem size|
|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
| 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136|
| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216|
| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136|
| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216|
| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616|
| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616|
found 4 SYCL devices:
Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
```
|Attribute|Note|
|-|-|
|compute capability 1.3|Level-zero driver/runtime, recommended |
|compute capability 3.0|OpenCL driver/runtime, slower than level-zero in most cases|
|compute capability 1.3|Level-zero running time, recommended |
|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
4. Launch inference
4. Set device ID and execute llama.cpp
There are two device selection modes:
- Single device: Use one device target specified by the user.
- Multiple devices: Automatically select the devices with the same largest Max compute-units.
|Device selection|Parameter|
|-|-|
|Single device|--split-mode none --main-gpu DEVICE_ID |
|Multiple devices|--split-mode layer (default)|
Examples:
- Use device 0:
Set device ID = 0 by **GGML_SYCL_DEVICE=0**
```sh
ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
```
or run by script:
```sh
./examples/sycl/run_llama2.sh 0
```
- Use multiple devices:
```sh
ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
```
Otherwise, you can run the script:
```sh
./examples/sycl/run_llama2.sh
```
*Notes:*
Note:
- By default, `mmap` is used to read the model file. In some cases, it causes runtime hang issues. Please disable it by passing `--no-mmap` to the `/bin/main` if faced with the issue.
- Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follow:
- By default, mmap is used to read model file. In some cases, it leads to the hang issue. Recommend to use parameter **--no-mmap** to disable mmap() to skip this issue.
```sh
detect 1 SYCL GPUs: [0] with top Max compute units:512
5. Check the device ID in output
Like:
```
Or
```sh
use 1 SYCL GPUs: [0] with Max compute units:512
Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
```
## Windows
### I. Setup Environment
### Setup Environment
1. Install GPU driver
1. Install Intel GPU driver.
Intel GPU drivers instructions guide and download page can be found here: [Get intel GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html).
Please install Intel GPU driver by official guide: [Install GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html).
2. Install Visual Studio
Note: **The driver is mandatory for compute function**.
If you already have a recent version of Microsoft Visual Studio, you can skip this step. Otherwise, please refer to the official download page for [Microsoft Visual Studio](https://visualstudio.microsoft.com/).
2. Install Visual Studio.
3. Install Intel® oneAPI Base toolkit
Please install [Visual Studio](https://visualstudio.microsoft.com/) which impact oneAPI environment enabling in Windows.
The base toolkit can be obtained from the official [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) page.
3. Install Intel® oneAPI Base toolkit.
Please follow the instructions for downloading and installing the Toolkit for Windows, and preferably keep the default installation values unchanged, notably the installation path *(`C:\Program Files (x86)\Intel\oneAPI` by default)*.
a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
Following guidelines/code snippets assume the default installation values. Otherwise, please make sure the necessary changes are reflected where applicable.
Recommend to install to default folder: **C:\Program Files (x86)\Intel\oneAPI**.
Following guide uses the default folder as example. If you use other folder, please modify the following guide info with your folder.
b. Enable oneAPI running environment:
- Type "oneAPI" in the search bar, then open the `Intel oneAPI command prompt for Intel 64 for Visual Studio 2022` App.
- In Search, input 'oneAPI'.
- On the command prompt, enable the runtime environment with the following:
Search & open "Intel oneAPI command prompt for Intel 64 for Visual Studio 2022"
- In Run:
In CMD:
```
"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
```
c. Verify installation
c. Check GPU
In the oneAPI command line, run the following to print the available SYCL devices:
In oneAPI command line:
```
sycl-ls
```
There should be one or more *level-zero* GPU devices displayed as **[ext_oneapi_level_zero:gpu]**. Below is example of such output detecting an *intel Iris Xe* GPU as a Level-zero SYCL device:
There should be one or more level-zero devices. Please confirm that at least one GPU is present, like **[ext_oneapi_level_zero:gpu:0]**.
Output (example):
```
@@ -416,74 +307,96 @@ Output (example):
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Iris(R) Xe Graphics 1.3 [1.3.28044]
```
4. Install build tools
4. Install cmake & make
a. Download & install cmake for Windows: https://cmake.org/download/
b. Download & install mingw-w64 make for Windows provided by w64devkit
- Download the 1.19.0 version of [w64devkit](https://github.com/skeeto/w64devkit/releases/download/v1.19.0/w64devkit-1.19.0.zip).
- Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
- Extract `w64devkit` on your pc.
- Add the **bin** folder path in the Windows system PATH environment (for e.g. `C:\xxx\w64devkit\bin\`).
- Add the **bin** folder path in the Windows system PATH environment, like `C:\xxx\w64devkit\bin\`.
### II. Build llama.cpp
### Build locally:
On the oneAPI command line window, step into the llama.cpp main directory and run the following:
In oneAPI command line window:
```
mkdir -p build
cd build
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
:: for FP16
:: faster for long-prompt inference
:: cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
make
:: for FP32
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
:: build example/main only
:: make main
:: build all binary
make -j
cd ..
```
Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former instructions:
```sh
or
```
.\examples\sycl\win-build-sycl.bat
```
*Notes:*
Note:
- By default, calling `make` will build all target binary files. In case of a minimal experimental setup, the user can build the inference executable only through `make main`.
- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only.
### III. Run the inference
### Run
1. Retrieve and prepare model
1. Put model file to folder **models**
You can refer to the general [*Prepare and Quantize*](README#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example.
You could download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) as example.
2. Enable oneAPI running environment
On the oneAPI command line window, run the following and step into the llama.cpp directory:
- In Search, input 'oneAPI'.
Search & open "Intel oneAPI command prompt for Intel 64 for Visual Studio 2022"
- In Run:
In CMD:
```
"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
```
3. List devices information
3. List device ID
Similar to the native `sycl-ls`, available SYCL devices can be queried as follow:
Run without parameter:
```
build\bin\ls-sycl-device.exe
or
build\bin\main.exe
```
The output of this command in a system with 1 *intel CPU* and 1 *intel GPU* would look like the following:
Check the ID in startup log, like:
```
found 6 SYCL devices:
| | | |Compute |Max compute|Max work|Max sub| |
|ID| Device Type| Name|capability|units |group |group |Global mem size|
|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
| 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136|
| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216|
| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136|
| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216|
| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616|
| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616|
found 4 SYCL devices:
Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
```
@@ -492,33 +405,15 @@ found 6 SYCL devices:
|compute capability 1.3|Level-zero running time, recommended |
|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
4. Set device ID and execute llama.cpp
4. Launch inference
There are two device selection modes:
- Single device: Use one device assigned by user.
- Multiple devices: Automatically choose the devices with the same biggest Max compute units.
|Device selection|Parameter|
|-|-|
|Single device|--split-mode none --main-gpu DEVICE_ID |
|Multiple devices|--split-mode layer (default)|
Examples:
- Use device 0:
Set device ID = 0 by **set GGML_SYCL_DEVICE=0**
```
build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
set GGML_SYCL_DEVICE=0
build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0
```
- Use multiple devices:
```
build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
```
Otherwise, run the following wrapper script:
or run by script:
```
.\examples\sycl\win-run-llama2.bat
@@ -526,15 +421,14 @@ Otherwise, run the following wrapper script:
Note:
- By default, `mmap` is used to read the model file. In some cases, it causes runtime hang issues. Please disable it by passing `--no-mmap` to the `main.exe` if faced with the issue.
- Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follow:
- By default, mmap is used to read model file. In some cases, it leads to the hang issue. Recommend to use parameter **--no-mmap** to disable mmap() to skip this issue.
```sh
detect 1 SYCL GPUs: [0] with top Max compute units:512
5. Check the device ID in output
Like:
```
Or
```sh
use 1 SYCL GPUs: [0] with Max compute units:512
Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
```
## Environment Variable
@@ -543,55 +437,58 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|Name|Value|Function|
|-|-|-|
|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path.|
|LLAMA_SYCL_TARGET | INTEL *(default)* \| NVIDIA|Set the SYCL target device type.|
|LLAMA_SYCL_F16|OFF *(default)* \|ON *(optional)*|Enable FP16 build with SYCL code path.|
|CMAKE_C_COMPILER|icx|Set *icx* compiler for SYCL code path.|
|CMAKE_CXX_COMPILER|icpx *(Linux)*, icx *(Windows)*|Set `icpx/icx` compiler for SYCL code path.|
|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path. <br>For FP32/FP16, LLAMA_SYCL=ON is mandatory.|
|LLAMA_SYCL_F16|ON (optional)|Enable FP16 build with SYCL code path. Faster for long-prompt inference. <br>For FP32, not set it.|
|CMAKE_C_COMPILER|icx|Use icx compiler for SYCL code path|
|CMAKE_CXX_COMPILER|icpx (Linux), icx (Windows)|use icpx/icx for SYCL code path|
#### Running
#### Runtime
|Name|Value|Function|
|-|-|-|
|GGML_SYCL_DEVICE|0 (default) or 1|Set the device id used. Check the device ids by default running output|
|GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
|ZES_ENABLE_SYSMAN| 0 (default) or 1|Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer|
## Known Issues
## Known Issue
- Hanging during startup
- Hang during startup
llama.cpp uses *mmap* as the default mode for reading the model file and copying it to the GPU. In some systems, `memcpy` might behave abnormally and therefore hang.
llama.cpp use mmap as default way to read model file and copy to GPU. In some system, memcpy will be abnormal and block.
- **Solution**: add `--no-mmap` or `--mmap 0` flag to the `main` executable.
- `Split-mode:[row]` is not supported.
Solution: add **--no-mmap** or **--mmap 0**.
## Q&A
- Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
- Potential cause: Unavailable oneAPI installation or not set ENV variables.
- Solution: Install *oneAPI base toolkit* and enable its ENV through: `source /opt/intel/oneapi/setvars.sh`.
Miss to enable oneAPI running environment.
- General compiler error:
Install oneAPI base toolkit and enable it by: `source /opt/intel/oneapi/setvars.sh`.
- Remove build folder or try a clean-build.
- In Windows, no result, not error.
- I can **not** see `[ext_oneapi_level_zero:gpu]` afer installing the GPU driver on Linux.
Miss to enable oneAPI running environment.
Please double-check with `sudo sycl-ls`.
- Meet compile error.
If it's present in the list, please add video/render group to your user then **logout/login** or restart your system:
Remove folder **build** and try again.
- I can **not** see **[ext_oneapi_level_zero:gpu:0]** afer install GPU driver in Linux.
Please run **sudo sycl-ls**.
If you see it in result, please add video/render group to your ID:
```
sudo usermod -aG render $USER
sudo usermod -aG video $USER
sudo usermod -aG render username
sudo usermod -aG video username
```
Otherwise, please double-check the GPU driver installation steps.
### **GitHub contribution**:
Please add the **[SYCL]** prefix/tag in issues/PRs titles to help the SYCL-team check/address them without delay.
Then **relogin**.
If you do not see it, please check the installation GPU steps again.
## Todo
- Support row layer split for multiple card runs.
- Support multiple cards.

View File

@@ -8,22 +8,12 @@
Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
### Recent API changes
- [2024 Mar 26] Logits and embeddings API updated for compactness https://github.com/ggerganov/llama.cpp/pull/6122
- [2024 Mar 13] Add `llama_synchronize()` + `llama_context_params.n_ubatch` https://github.com/ggerganov/llama.cpp/pull/6017
- [2024 Mar 8] `llama_kv_cache_seq_rm()` returns a `bool` instead of `void`, and new `llama_n_seq_max()` returns the upper limit of acceptable `seq_id` in batches (relevant when dealing with multiple sequences) https://github.com/ggerganov/llama.cpp/pull/5328
- [2024 Mar 4] Embeddings API updated https://github.com/ggerganov/llama.cpp/pull/5796
- [2024 Mar 3] `struct llama_context_params` https://github.com/ggerganov/llama.cpp/pull/5849
### Hot topics
- Fix major bug in Metal batched inference https://github.com/ggerganov/llama.cpp/pull/6225
- Multi-GPU pipeline parallelizm support https://github.com/ggerganov/llama.cpp/pull/6017
- Looking for contributions to add Deepseek support: https://github.com/ggerganov/llama.cpp/issues/5981
- Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962
- Initial Mamba support has been added: https://github.com/ggerganov/llama.cpp/pull/5328
- Support loading sharded model, using `gguf-split` CLI https://github.com/ggerganov/llama.cpp/pull/6187
- Support for chat templates: [Wiki (contributions welcome)](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
- Support for Gemma models: https://github.com/ggerganov/llama.cpp/pull/5631
- Non-linear quantization IQ4_NL: https://github.com/ggerganov/llama.cpp/pull/5590
- Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216
----
@@ -114,21 +104,16 @@ Typically finetunes of the base models below are supported as well.
- [x] [InternLM2](https://huggingface.co/models?search=internlm2)
- [x] [CodeShell](https://github.com/WisdomShell/codeshell)
- [x] [Gemma](https://ai.google.dev/gemma)
- [x] [Mamba](https://github.com/state-spaces/mamba)
- [x] [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-v01)
**Multimodal models:**
- [x] [LLaVA 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e), [LLaVA 1.6 models](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2)
- [x] [LLaVA 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e)
- [x] [BakLLaVA](https://huggingface.co/models?search=SkunkworksAI/Bakllava)
- [x] [Obsidian](https://huggingface.co/NousResearch/Obsidian-3B-V0.5)
- [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V)
- [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)
- [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
**HTTP server**
[llama.cpp web server](./examples/server) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients.
**Bindings:**
@@ -137,7 +122,6 @@ Typically finetunes of the base models below are supported as well.
- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
- JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp)
- JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm)
- Typescript/Wasm (nicer API, available on npm): [ngxson/wllama](https://github.com/ngxson/wllama)
- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
- Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
- Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
@@ -148,7 +132,6 @@ Typically finetunes of the base models below are supported as well.
- Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
- Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
- Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
- PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326)
**UI:**
@@ -169,14 +152,9 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
- [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
- [RecurseChat](https://recurse.chat/) (proprietary)
- [semperai/amica](https://github.com/semperai/amica)
- [withcatai/catai](https://github.com/withcatai/catai)
- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)
- [Msty](https://msty.app) (proprietary)
- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
---
@@ -452,27 +430,30 @@ Building the program with BLAS support may lead to some performance improvements
Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.
- #### CUDA
- #### cuBLAS
This provides GPU acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
For Jetson user, if you have Jetson Orin, you can try this: [Offical Support](https://www.jetson-ai-lab.com/tutorial_text-generation.html). If you are using an old model(nano/TX2), need some additional operations before compiling.
- Using `make`:
```bash
make LLAMA_CUDA=1
make LLAMA_CUBLAS=1
```
- Using `CMake`:
```bash
mkdir build
cd build
cmake .. -DLLAMA_CUDA=ON
cmake .. -DLLAMA_CUBLAS=ON
cmake --build . --config Release
```
The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
<!---
| LLAMA_CUDA_CUBLAS | Boolean | false | Use cuBLAS instead of custom CUDA kernels for prompt processing. Faster for all quantization formats except for q4_0 and q8_0, especially for k-quants. Increases VRAM usage (700 MiB for 7b, 970 MiB for 13b, 1430 MiB for 33b). |
--->
| Option | Legal values | Default | Description |
|--------------------------------|------------------------|---------|-------------|
| LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
@@ -634,15 +615,6 @@ Building the program with BLAS support may lead to some performance improvements
- #### Vulkan
> [!WARNING]
>
> Vulkan support has been broken in https://github.com/ggerganov/llama.cpp/pull/6122
> due to relying on `GGML_OP_GET_ROWS` which is not yet properly supported by the Vulkan backend,
> but should be fixed relatively soon (possibly in https://github.com/ggerganov/llama.cpp/pull/6155
> (ref: https://github.com/ggerganov/llama.cpp/pull/6122#issuecomment-2015327635)).
>
> Meanwhile, if you want to use the Vulkan backend, you should use the commit right before the breaking change, https://github.com/ggerganov/llama.cpp/commit/55c1b2a3bbd470e9e2a3a0618b92cf64a885f806
**With docker**:
You don't need to install Vulkan SDK. It will be installed inside the container.
@@ -808,7 +780,7 @@ And after 4.45 hours, you will have the final perplexity.
### Interactive mode
If you want a more ChatGPT-like experience, you can run in interactive mode by passing `-i` as a parameter.
In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMA emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`.
In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMa emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`.
Here is an example of a few-shot interaction, invoked with the command
@@ -872,7 +844,7 @@ Sample run:
```
== Running in interactive mode. ==
- Press Ctrl+C to interject at any time.
- Press Return to return control to LLaMA.
- Press Return to return control to LLaMa.
- If you want to submit another line, end your input in '\'.
Below is an instruction that describes a task. Write a response that appropriately completes the request.
@@ -919,9 +891,6 @@ First, install the essential packages for termux:
pkg install clang wget git cmake
```
Second, obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake:
You can execute the following commands on your computer to avoid downloading the NDK to your mobile. Of course, you can also do this in Termux.
```
$ mkdir build-android
$ cd build-android
@@ -930,28 +899,7 @@ $ cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROI
$ make
```
Install [termux](https://termux.dev/) on your device and run `termux-setup-storage` to get access to your SD card.
Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission:
(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`)
```
$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/
$cd /data/data/com.termux/files/home/bin
$chmod +x ./*
```
Download model [llama-2-7b-chat.Q4_K_M.gguf](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/blob/main/llama-2-7b-chat.Q4_K_M.gguf), and push it to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/`
```
$mv /sdcard/llama.cpp/llama-2-7b-chat.Q4_K_M.gguf /data/data/com.termux/files/home/model/
```
Now, you can start chatting:
```
$cd /data/data/com.termux/files/home/bin
$./main -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml
```
Here is a demo of an interactive session running on Pixel 5 phone:
Finally, copy the `llama` binary and the model files to your device storage. Here is a demo of an interactive session running on Pixel 5 phone:
https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4

116
awq-py/README.md Normal file
View File

@@ -0,0 +1,116 @@
# AWQ: Activation-aware Weight Quantization for LLM - version apply to llamacpp
[[Paper](https://arxiv.org/abs/2306.00978)][[Original Repo](https://github.com/mit-han-lab/llm-awq)][[Easy-to-use Repo](https://github.com/casper-hansen/AutoAWQ)]
**Supported models:**
- [X] LLaMA
- [x] LLaMA 2
- [X] MPT
- [X] Mistral AI v0.1
- [ ] Bloom
- [ ] Mixtral MoE
**TODO:**
- [x] Update version work with both MPT and MPT-AWQ model
- [ ] Add OPT model
- [ ] Add Bloom model
- [ ] Add Mixtral MoE
- [ ] Support w3, w2
## Contents
- [Install](##Install)
- [Convert](##Convert)
- [Quantize](##Quantize)
- [Test](##Test)
- [Benchmark](##Benchmark)
- [Results](##Results)
## Install
Install requirements
```bash
pip install -r requirements.txt
```
Get the pre-computed AWQ search results for multiple model families, including LLaMA, LLaMA2, MPT, OPT
```bash
git clone https://huggingface.co/datasets/mit-han-lab/awq-model-zoo awq_cache
```
## Convert
Example for llama model
```bash
# For llama7b and llama2 models
python convert.py models/llama-7b/ --awq-path awq_cache/llama-7b-w4-g128.pt --outfile models/llama_7b_fp16.gguf
# For mistral and mpt models
python convert-hf-to-gguf.py models/mpt-7b/ --awq-path awq_cache/mpt-7b-w4-g128.pt --outfile models/mpt_7b_fp16.gguf
```
## Quantize
```bash
# We only benchmark and confirm the results on q4_0, q4_1, and q2_k types.
./quantize models/llama_7b_fp16.gguf models/llama_7b_q4_0.gguf q4_0
```
## Test
```bash
# For all models.
./build/bin/main -m models/llama_7b_q4_0.gguf -n 128 --prompt "Once upon a time"
```
## Benchmark
The perplexity measurements in table above are done against the `wikitext2` test dataset (https://paperswithcode.com/dataset/wikitext-2), with context length of 512.
```bash
# For llama and llama2, and mistral models.
./perplexity -m models/llama_7b_q4_0.gguf -f datasets/wikitext-2-raw/wiki.test.raw
```
## Results
Results are run on OpenBLAS (CPU) and CuBLAS (GPU) for fair comparison
We use three types of llamacpp quantization methods to work with our version, including q4_0, q4_1, and q2_k
### Llama 7B (Build with OpenBLAS)
| Model | Measure | F16 | Q4_0 | Q4_1 | Q2_K |
|-----------:|--------------|-------:|-------:|-------:|-------:|
|Llama 7B | perplexity | 5.9066 | 6.1214 | 6.0643 | 6.5808 |
|Llama 7B | file size | 12.9G | 3.5G | 3.9G | 2.7G |
|Llama 7B | bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |
|AWQ-LLama 7B| perplexity | 5.9175 | 6.0252 | 5.9987 | 6.3692 |
|AWQ-LLama 7B| file size | 12.9G | 3.5G | 3.9G | 2.7G |
|AWQ-LLama 7B| bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |
### Llama2 7B (Build with CuBLAS)
| Model | Measure | F16 | Q4_0 | Q4_1 | Q2_K |
|------------:|--------------|-------:|-------:|-------:|-------:|
|Llama2 7B | perplexity | 5.8664 | 6.0260 | 6.0656 | 6.4496 |
|Llama2 7B | file size | 12.9G | 3.5G | 3.9G | 2.7G |
|Llama2 7B | bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |
|AWQ-LLama2 7B| perplexity | 5.8801 | 6.0054 | 5.9849 | 6.3650 |
|AWQ-LLama2 7B| file size | 12.9G | 3.5G | 3.9G | 2.7G |
|AWQ-LLama2 7B| bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |
### Mistral 7B v0.1 (Build with CuBLAS)
| Model | Measure | F16 | Q4_0 | Q4_1 | Q2_K |
|-------------:|--------------|-------:|-------:|-------:|-------:|
|Mistral 7B | perplexity | 5.6931 | 5.8202 | 5.8268 | 6.1645 |
|Mistral 7B | file size | 14.5G | 4.1G | 4.5G | 3.1G |
|Mistral 7B | bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |
|AWQ-Mistral 7B| perplexity | 5.6934 | 5.8020 | 5.7691 | 6.0426 |
|AWQ-Mistral 7B| file size | 14.5G | 4.1G | 4.5G | 3.1G |
|AWQ-Mistral 7B| bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |
### MPT 7B (Build with OpenBLAS)
| Model | Measure | F16 | Q4_0 | Q4_1 | Q2_K |
|---------:|--------------|-------:|-------:|-------:|--------:|
|MPT 7B | perplexity | 8.4369 | 8.7956 | 8.6265 | 11.4913 |
|MPT 7B | file size | 13.7G | 3.9G | 4.3G | 2.8G |
|MPT 7B | bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |
|AWQ-MPT 7B| perplexity | 8.4944 | 8.7053 | 8.6750 | 10.2873|
|AWQ-MPT 7B| file size | 13.7G | 3.9G | 4.3G | 2.8G |
|AWQ-MPT 7B| bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |

254
awq-py/awq/apply_awq.py Normal file
View File

@@ -0,0 +1,254 @@
"""
Implements the AWQ for llama.cpp use cases.
Original paper: https://arxiv.org/abs/2306.00978
This code is based on versions of the AWQ implementation found in the following repositories:
* https://github.com/mit-han-lab/llm-awq
* https://github.com/casper-hansen/AutoAWQ
"""
import os
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoConfig
from transformers.models.bloom.modeling_bloom import BloomGelu
from transformers.models.llama.modeling_llama import LlamaRMSNorm
from transformers.activations import GELUActivation
class ScaledActivation(nn.Module):
"""
ScaledActivation module wraps an existing activation function and applies a
scale factor to its output.
Args:
module (nn.Module): The activation function to be scaled.
scales (torch.Tensor): A tensor of size (num_features,) containing the initial
scale factors for each feature.
Returns:
torch.Tensor: The scaled output of the activation function.
"""
def __init__(self, module, scales):
super().__init__()
self.act = module
self.scales = nn.Parameter(scales.data)
def forward(self, x):
return self.act(x) / self.scales.view(1, 1, -1).to(x.device)
def set_op_by_name(layer, name, new_module):
"""
Set the new module for given module's name.
Args:
layer (nn.Module): The layer in which to replace the submodule.
name (str): The path to the submodule to be replaced, using dot notation
to access nested modules.
new_module (nn.Module): The new module to replace the existing one.
"""
levels = name.split(".")
if len(levels) > 1:
mod_ = layer
for l_idx in range(len(levels) - 1):
if levels[l_idx].isdigit():
mod_ = mod_[int(levels[l_idx])]
else:
mod_ = getattr(mod_, levels[l_idx])
setattr(mod_, levels[-1], new_module)
else:
setattr(layer, name, new_module)
def get_op_by_name(module, op_name):
"""
Retrieves a submodule within a given layer based on its name.
Args:
module (nn.Module): The layer containing the submodule to find.
op_name (str): The name of the submodule.
Returns:
nn.Module: The requested submodule found within the given layer.
Raises:
ValueError: If the specified submodule cannot be found within the layer.
"""
for name, m in module.named_modules():
if name == op_name:
return m
raise ValueError(f"Cannot find op {op_name} in module {module}")
@torch.no_grad()
def scale_ln_fcs(ln, fcs, scales):
"""
Scales the weights of a LayerNorm and a list of fully-connected layers proportionally.
Args:
ln (nn.LayerNorm): The LayerNorm module to be scaled.
fcs (List[nn.Linear]): A list of fully-connected layers to be scaled.
scales (torch.Tensor): A 1D tensor of size (num_features,).
"""
if not isinstance(fcs, list):
fcs = [fcs]
scales = scales.to(ln.weight.device)
ln.weight.div_(scales)
if hasattr(ln, "bias") and ln.bias is not None:
ln.bias.div_(scales)
for fc in fcs:
fc.weight.mul_(scales.view(1, -1))
for p in ln.parameters():
assert torch.isnan(p).sum() == 0
for fc in fcs:
for p in fc.parameters():
assert torch.isnan(p).sum() == 0
@torch.no_grad()
def scale_fc_fc(fc1, fc2, scales):
"""
Scales the weights of two fully-connected layers in a specific pattern.
Args:
fc1 (nn.Linear): The first fully-connected layer to be scaled.
fc2 (nn.Linear): The second fully-connected layer to be scaled.
scales (torch.Tensor): A 1D tensor of size (num_features,).
"""
assert isinstance(fc1, nn.Linear)
assert isinstance(fc2, nn.Linear)
scales = scales.to(fc1.weight.device)
fc1.weight[-scales.size(0):].div_(scales.view(-1, 1))
if fc1.bias is not None:
fc1.bias.div_(scales.view(-1))
fc2.weight.mul_(scales.view(1, -1))
for p in fc1.parameters():
assert torch.isnan(p).sum() == 0
for p in fc2.parameters():
assert torch.isnan(p).sum() == 0
@torch.no_grad()
def scale_gelu_fc(gelu, fc, scales):
"""
Scales the weight of a GELU activation and a fully-connected layer proportionally.
Args:
gelu (Union[nn.GELU, BloomGelu, GELUActivation]): The GELU activation module to be scaled.
fc (nn.Linear): The fully-connected layer to be scaled.
scales (torch.Tensor): A 1D tensor of size (num_features,).
Raises:
TypeError: If the `gelu` module is not of type `nn.GELU`, `BloomGelu`, or `GELUActivation`.
TypeError: If the `fc` module is not of type `nn.Linear`.
"""
assert isinstance(gelu, (nn.GELU, BloomGelu, GELUActivation))
assert isinstance(fc, nn.Linear)
fc.weight.mul_(scales.view(1, -1).to(fc.weight.device))
for p in fc.parameters():
assert torch.isnan(p).sum() == 0
def apply_scale(module, scales_list, input_feat_dict=None):
"""
Applies different scaling strategies to layers based on their type and hierarchy within a given module.
Args:
module (nn.Module): The module containing the layers to be scaled.
scales_list (List[Tuple[str, List[str], torch.Tensor]]): A list of tuples containing:
* prev_op_name (str): The name of the preceding operation or module,
relative to which the layers to be scaled are located.
* layer_names (List[str]): A list of names of the layers to be scaled, relative to the preceding operation.
* scales (torch.Tensor): A 1D tensor of size (num_features,) containing the scaling factors for each feature.
input_feat_dict (Optional[Dict[str, torch.Tensor]]): A dictionary mapping layer names to their corresponding
input features (optional).
"""
for prev_op_name, layer_names, scales in scales_list:
prev_op = get_op_by_name(module, prev_op_name)
layers = [get_op_by_name(module, name) for name in layer_names]
prev_op.cuda()
for layer in layers:
layer.cuda()
scales.cuda()
if isinstance(prev_op, nn.Linear):
assert len(layers) == 1
scale_fc_fc(prev_op, layers[0], scales)
elif isinstance(prev_op, (nn.LayerNorm, LlamaRMSNorm)) or "rmsnorm" in str(prev_op.__class__).lower():
scale_ln_fcs(prev_op, layers, scales)
elif isinstance(prev_op, (nn.GELU, BloomGelu, GELUActivation)):
new_module = ScaledActivation(prev_op, scales)
set_op_by_name(module, prev_op_name, new_module)
scale_gelu_fc(prev_op, layers[0], scales)
else:
raise NotImplementedError(f"prev_op {type(prev_op)} not supported yet!")
# apply the scaling to input feat if given; prepare it for clipping
if input_feat_dict is not None:
for layer_name in layer_names:
inp = input_feat_dict[layer_name]
inp.div_(scales.view(1, -1).to(inp.device))
prev_op.cpu()
for layer in layers:
layer.cpu()
scales.cpu()
@torch.no_grad()
def apply_clip(module, clip_list):
"""
Applies element-wise clipping to the weight of a specific layer within a given module.
Args:
module (nn.Module): The module containing the layer to be clipped.
clip_list (List[Tuple[str, torch.Tensor]]): A list of tuples containing:
* name (str): The name of the layer to be clipped, relative to the root of the module.
* max_val (torch.Tensor): A 1D or 2D tensor defining the upper bound for each element of the layer's weight.
"""
for name, max_val in clip_list:
layer = get_op_by_name(module, name)
layer.cuda()
max_val = max_val.to(layer.weight.device)
org_shape = layer.weight.shape
layer.weight.data = layer.weight.data.reshape(*max_val.shape[:2], -1)
layer.weight.data = torch.clamp(layer.weight.data, -max_val, max_val)
layer.weight.data = layer.weight.data.reshape(org_shape)
layer.cpu()
def add_scale_weights(model_path, scale_path, tmp_path):
"""
Adds pre-computed Activation Weight Quantization (AWQ) results to a model,
including scaling factors and clipping bounds.
Args:
model_path (str): Path to the pre-trained model to be equipped with AWQ.
scale_path (str): Path to the AWQ scale factors (.pt file).
tmp_path (str): Path to the temporary directory where the equipped model will be saved.
"""
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_path, config=config, trust_remote_code=True
)
model.eval()
awq_results = torch.load(str(scale_path), map_location="cpu")
apply_scale(model, awq_results["scale"])
apply_clip(model, awq_results["clip"])
model.save_pretrained(str(tmp_path))
os.system(f"cp {str(model_path)}/tokenizer* {str(tmp_path)}")

2
awq-py/requirements.txt Normal file
View File

@@ -0,0 +1,2 @@
torch>=2.1.1
transformers>=4.32.0

View File

@@ -115,27 +115,24 @@ pub fn build(b: *std.build.Builder) !void {
const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
const unicode = make.obj("unicode", "unicode.cpp");
const unicode_data = make.obj("unicode-data", "unicode-data.cpp");
const llama = make.obj("llama", "llama.cpp");
const buildinfo = make.obj("common", "common/build-info.cpp");
const common = make.obj("common", "common/common.cpp");
const console = make.obj("console", "common/console.cpp");
const sampling = make.obj("sampling", "common/sampling.cpp");
const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
const json_schema_to_grammar = make.obj("json-schema-to-grammar", "common/json-schema-to-grammar.cpp");
const train = make.obj("train", "common/train.cpp");
const clip = make.obj("clip", "examples/llava/clip.cpp");
const llava = make.obj("llava", "examples/llava/llava.cpp");
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, sampling, console, grammar_parser });
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, train });
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, train });
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, console, grammar_parser });
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, sampling, grammar_parser, json_schema_to_grammar, clip, llava });
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip, llava });
if (server.target.isWindows()) {
server.linkSystemLibrary("ws2_32");
}

View File

@@ -40,13 +40,12 @@ if [ ! -z ${GG_BUILD_METAL} ]; then
fi
if [ ! -z ${GG_BUILD_CUDA} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUDA=1"
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1"
fi
if [ ! -z ${GG_BUILD_SYCL} ]; then
if [ -z ${ONEAPI_ROOT} ]; then
echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:"
echo "source /opt/intel/oneapi/setvars.sh"
echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:\n source /opt/intel/oneapi/setvars.sh"
exit 1
fi
@@ -273,19 +272,19 @@ function gg_run_open_llama_3b_v2 {
(time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
@@ -344,17 +343,17 @@ function gg_run_open_llama_3b_v2 {
python3 ../convert-lora-to-ggml.py ${path_lora}
# f16
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
# q8_0
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
# q8_0 + f16 lora-base
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
set +e
@@ -412,8 +411,8 @@ function gg_run_open_llama_7b_v2 {
set -e
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUBLAS=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert.py ${path_models}

View File

@@ -19,12 +19,7 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
endif()
endif()
if(EXISTS "${GIT_DIR}/index")
set(GIT_INDEX "${GIT_DIR}/index")
else()
message(WARNING "Git index not found in git repository.")
set(GIT_INDEX "")
endif()
set(GIT_INDEX "${GIT_DIR}/index")
else()
message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
set(GIT_INDEX "")
@@ -47,8 +42,6 @@ if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
set(TARGET json-schema-to-grammar)
add_library(${TARGET} OBJECT json-schema-to-grammar.cpp json-schema-to-grammar.h)
set(TARGET common)
@@ -62,28 +55,14 @@ add_library(${TARGET} STATIC
console.cpp
grammar-parser.h
grammar-parser.cpp
json.hpp
train.h
train.cpp
ngram-cache.h
ngram-cache.cpp
)
if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
set(LLAMA_COMMON_EXTRA_LIBS build_info)
# Use curl to download model url
if (LLAMA_CURL)
find_package(CURL REQUIRED)
add_definitions(-DLLAMA_USE_CURL)
include_directories(${CURL_INCLUDE_DIRS})
find_library(CURL_LIBRARY curl REQUIRED)
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
endif ()
target_include_directories(${TARGET} PUBLIC .)
target_compile_features(${TARGET} PUBLIC cxx_std_11)
target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama)
target_link_libraries(${TARGET} PRIVATE build_info PUBLIC llama)

File diff suppressed because it is too large Load Diff

View File

@@ -37,16 +37,13 @@ extern char const *LLAMA_COMMIT;
extern char const *LLAMA_COMPILER;
extern char const *LLAMA_BUILD_TARGET;
struct llama_control_vector_load_info;
int32_t get_num_physical_cores();
//
// CLI argument parsing
//
int32_t get_num_physical_cores();
struct gpt_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
uint32_t seed = -1; // RNG seed
int32_t n_threads = get_num_physical_cores();
int32_t n_threads_draft = -1;
@@ -54,17 +51,17 @@ struct gpt_params {
int32_t n_threads_batch_draft = -1;
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 512; // context size
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_draft = 5; // number of tokens to draft during speculative decoding
int32_t n_draft = 8; // number of tokens to draft during speculative decoding
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
int32_t n_parallel = 1; // number of parallel sequences to decode
int32_t n_sequences = 1; // number of sequences to decode
float p_accept = 0.5f; // speculative decoding accept probability
float p_split = 0.1f; // speculative decoding split probability
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
llama_split_mode split_mode = LLAMA_SPLIT_LAYER; // how to split the model across GPUs
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
int32_t n_beams = 0; // if non-zero then use beam search of given width.
@@ -78,32 +75,23 @@ struct gpt_params {
float yarn_beta_fast = 32.0f; // YaRN low correction dim
float yarn_beta_slow = 1.0f; // YaRN high correction dim
int32_t yarn_orig_ctx = 0; // YaRN original context length
float defrag_thold = -1.0f; // KV cache defragmentation threshold
ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
int32_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED;
ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
// // sampling parameters
struct llama_sampling_params sparams;
std::string model = "models/7B/ggml-model-f16.gguf"; // model path
std::string model_draft = ""; // draft model for speculative decoding
std::string model_alias = "unknown"; // model alias
std::string model_url = ""; // model url to download
std::string hf_repo = ""; // HF repo
std::string hf_file = ""; // HF file
std::string prompt = "";
std::string prompt_file = ""; // store the external prompt file name
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
std::string input_prefix = ""; // string to prefix user inputs with
std::string input_suffix = ""; // string to suffix user inputs with
std::string model = "models/7B/ggml-model-f16.gguf"; // model path
std::string model_draft = ""; // draft model for speculative decoding
std::string model_alias = "unknown"; // model alias
std::string prompt = "";
std::string prompt_file = ""; // store the external prompt file name
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
std::string input_prefix = ""; // string to prefix user inputs with
std::string input_suffix = ""; // string to suffix user inputs with
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
std::string logdir = ""; // directory in which to save YAML log files
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
std::string logits_file = ""; // file for saving *all* logits
std::string logdir = ""; // directory in which to save YAML log files
std::string logits_file = ""; // file for saving *all* logits
std::vector<llama_model_kv_override> kv_overrides;
@@ -111,11 +99,6 @@ struct gpt_params {
std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
std::string lora_base = ""; // base model path for the lora adapter
std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
int32_t control_vector_layer_start = -1; // layer range for control vector
int32_t control_vector_layer_end = -1; // layer range for control vector
int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
// (which is more convenient to use for plotting)
@@ -131,6 +114,7 @@ struct gpt_params {
bool kl_divergence = false; // compute KL-divergence
bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
bool random_prompt = false; // do not randomize prompt if none provided
bool use_color = false; // use color to distinguish generations and inputs
bool interactive = false; // interactive mode
@@ -143,7 +127,7 @@ struct gpt_params {
bool interactive_first = false; // wait for user input immediately
bool multiline_input = false; // reverse the usage of `\`
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
bool cont_batching = true; // insert new sequences for decoding on-the-fly
bool cont_batching = false; // insert new sequences for decoding on-the-fly
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool ignore_eos = false; // ignore generated EOS tokens
@@ -171,8 +155,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
std::string get_system_info(const gpt_params & params);
std::string gpt_random_prompt(std::mt19937 & rng);
@@ -198,9 +180,6 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const struct llama_model_params & params);
struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const struct llama_model_params & params);
// Batch utils
void llama_batch_clear(struct llama_batch & batch);
@@ -279,39 +258,3 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);
// Dump the KV cache view showing individual sequences in each cell (long output).
void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
//
// Embedding utils
//
void llama_embd_normalize(const float * inp, float * out, int n);
float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
//
// Control vector utils
//
struct llama_control_vector_data {
int n_embd;
// stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
std::vector<float> data;
};
struct llama_control_vector_load_info {
float strength;
std::string fname;
};
// Load control vectors, scale each by strength, and add them together.
// On error, returns {-1, empty}
llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);
//
// Split utils
//
static const char * const LLM_KV_SPLIT_NO = "split.no";
static const char * const LLM_KV_SPLIT_COUNT = "split.count";
static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";

View File

@@ -278,22 +278,6 @@ namespace grammar_parser {
while (*pos) {
pos = parse_rule(state, pos);
}
// Validate the state to ensure that all rules are defined
for (const auto & rule : state.rules) {
for (const auto & elem : rule) {
if (elem.type == LLAMA_GRETYPE_RULE_REF) {
// Ensure that the rule at that location exists
if (elem.value >= state.rules.size() || state.rules[elem.value].empty()) {
// Get the name of the rule that is missing
for (const auto & kv : state.symbol_ids) {
if (kv.second == elem.value) {
throw std::runtime_error("Undefined rule identifier '" + kv.first + "'");
}
}
}
}
}
}
return state;
} catch (const std::exception & err) {
fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what());

View File

@@ -1,721 +0,0 @@
#include "json-schema-to-grammar.h"
#include <algorithm>
#include <fstream>
#include <map>
#include <regex>
#include <sstream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
using json = nlohmann::ordered_json;
const std::string SPACE_RULE = "\" \"?";
std::unordered_map<std::string, std::string> PRIMITIVE_RULES = {
{"boolean", "(\"true\" | \"false\") space"},
{"number", "(\"-\"? ([0-9] | [1-9] [0-9]*)) (\".\" [0-9]+)? ([eE] [-+]? [0-9]+)? space"},
{"integer", "(\"-\"? ([0-9] | [1-9] [0-9]*)) space"},
{"value", "object | array | string | number | boolean"},
{"object", "\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space"},
{"array", "\"[\" space ( value (\",\" space value)* )? \"]\" space"},
{"uuid", "\"\\\"\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] \"\\\"\" space"},
{"string", " \"\\\"\" (\n"
" [^\"\\\\] |\n"
" \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])\n"
" )* \"\\\"\" space"},
{"null", "\"null\" space"}
};
std::vector<std::string> OBJECT_RULE_NAMES = {"object", "array", "string", "number", "boolean", "null", "value"};
std::unordered_map<std::string, std::string> DATE_RULES = {
{"date", "[0-9] [0-9] [0-9] [0-9] \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )"},
{"time", "([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9] [0-9] [0-9] )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )"},
{"date-time", "date \"T\" time"},
{"date-string", "\"\\\"\" date \"\\\"\" space"},
{"time-string", "\"\\\"\" time \"\\\"\" space"},
{"date-time-string", "\"\\\"\" date-time \"\\\"\" space"}
};
static bool is_reserved_name(const std::string & name) {
static std::unordered_set<std::string> RESERVED_NAMES;
if (RESERVED_NAMES.empty()) {
RESERVED_NAMES.insert("root");
for (const auto &p : PRIMITIVE_RULES) RESERVED_NAMES.insert(p.first);
for (const auto &p : DATE_RULES) RESERVED_NAMES.insert(p.first);
}
return RESERVED_NAMES.find(name) != RESERVED_NAMES.end();
}
std::regex INVALID_RULE_CHARS_RE("[^a-zA-Z0-9-]+");
std::regex GRAMMAR_LITERAL_ESCAPE_RE("[\r\n\"]");
std::regex GRAMMAR_RANGE_LITERAL_ESCAPE_RE("[\r\n\"\\]\\-\\\\]");
std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
{'\r', "\\r"}, {'\n', "\\n"}, {'"', "\\\""}, {'-', "\\-"}, {']', "\\]"}
};
std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};
template <typename Iterator>
std::string join(Iterator begin, Iterator end, const std::string & separator) {
std::ostringstream result;
if (begin != end) {
result << *begin;
for (Iterator it = begin + 1; it != end; ++it) {
result << separator << *it;
}
}
return result.str();
}
static std::vector<std::string> split(const std::string & str, const std::string & delimiter) {
std::vector<std::string> tokens;
size_t start = 0;
size_t end = str.find(delimiter);
while (end != std::string::npos) {
tokens.push_back(str.substr(start, end - start));
start = end + delimiter.length();
end = str.find(delimiter, start);
}
tokens.push_back(str.substr(start));
return tokens;
}
static std::string repeat(const std::string & str, size_t n) {
if (n == 0) {
return "";
}
std::string result;
result.reserve(str.length() * n);
for (size_t i = 0; i < n; ++i) {
result += str;
}
return result;
}
static std::string replacePattern(const std::string & input, const std::regex & regex, const std::function<std::string(const std::smatch &)> & replacement) {
std::smatch match;
std::string result;
std::string::const_iterator searchStart(input.cbegin());
std::string::const_iterator searchEnd(input.cend());
while (std::regex_search(searchStart, searchEnd, match, regex)) {
result.append(searchStart, searchStart + match.position());
result.append(replacement(match));
searchStart = match.suffix().first;
}
result.append(searchStart, searchEnd);
return result;
}
static std::string format_literal(const std::string & literal) {
std::string escaped = replacePattern(literal, GRAMMAR_LITERAL_ESCAPE_RE, [&](const std::smatch & match) {
char c = match.str()[0];
return GRAMMAR_LITERAL_ESCAPES.at(c);
});
return "\"" + escaped + "\"";
}
class SchemaConverter {
private:
std::function<json(const std::string &)> _fetch_json;
bool _dotall;
std::map<std::string, std::string> _rules;
std::unordered_map<std::string, json> _refs;
std::unordered_set<std::string> _refs_being_resolved;
std::vector<std::string> _errors;
std::vector<std::string> _warnings;
std::string _add_rule(const std::string & name, const std::string & rule) {
std::string esc_name = regex_replace(name, INVALID_RULE_CHARS_RE, "-");
if (_rules.find(esc_name) == _rules.end() || _rules[esc_name] == rule) {
_rules[esc_name] = rule;
return esc_name;
} else {
int i = 0;
while (_rules.find(esc_name + std::to_string(i)) != _rules.end() && _rules[esc_name + std::to_string(i)] != rule) {
i++;
}
std::string key = esc_name + std::to_string(i);
_rules[key] = rule;
return key;
}
}
std::string _generate_union_rule(const std::string & name, const std::vector<json> & alt_schemas) {
std::vector<std::string> rules;
for (size_t i = 0; i < alt_schemas.size(); i++) {
rules.push_back(visit(alt_schemas[i], name + (name.empty() ? "alternative-" : "-") + std::to_string(i)));
}
return join(rules.begin(), rules.end(), " | ");
}
std::string _visit_pattern(const std::string & pattern, const std::string & name) {
if (!(pattern.front() == '^' && pattern.back() == '$')) {
_errors.push_back("Pattern must start with '^' and end with '$'");
return "";
}
std::string sub_pattern = pattern.substr(1, pattern.length() - 2);
std::unordered_map<std::string, std::string> sub_rule_ids;
size_t i = 0;
size_t length = sub_pattern.length();
using literal_or_rule = std::pair<std::string, bool>;
auto to_rule = [&](const literal_or_rule & ls) {
auto is_literal = ls.second;
auto s = ls.first;
return is_literal ? "\"" + s + "\"" : s;
};
std::function<literal_or_rule()> transform = [&]() -> literal_or_rule {
size_t start = i;
std::vector<literal_or_rule> seq;
auto get_dot = [&]() {
std::string rule;
if (_dotall) {
rule = "[\\U00000000-\\U0010FFFF]";
} else {
rule = "[\\U00000000-\\x09\\x0B\\x0C\\x0E-\\U0010FFFF]";
}
return _add_rule("dot", rule);
};
// Joins the sequence, merging consecutive literals together.
auto join_seq = [&]() {
std::vector<literal_or_rule> ret;
std::string literal;
auto flush_literal = [&]() {
if (literal.empty()) {
return false;
}
ret.push_back(std::make_pair(literal, true));
literal.clear();
return true;
};
for (const auto & item : seq) {
auto is_literal = item.second;
if (is_literal) {
literal += item.first;
} else {
flush_literal();
ret.push_back(item);
}
}
flush_literal();
std::vector<std::string> results;
for (const auto & item : ret) {
results.push_back(to_rule(item));
}
return std::make_pair(join(results.begin(), results.end(), " "), false);
};
while (i < length) {
char c = sub_pattern[i];
if (c == '.') {
seq.push_back(std::make_pair(get_dot(), false));
i++;
} else if (c == '(') {
i++;
if (i < length) {
if (sub_pattern[i] == '?') {
_warnings.push_back("Unsupported pattern syntax");
}
}
seq.push_back(std::make_pair("(" + to_rule(transform()) + ")", false));
} else if (c == ')') {
i++;
if (start > 0 && sub_pattern[start - 1] != '(') {
_errors.push_back("Unbalanced parentheses");
}
return join_seq();
} else if (c == '[') {
std::string square_brackets = std::string(1, c);
i++;
while (i < length && sub_pattern[i] != ']') {
if (sub_pattern[i] == '\\') {
square_brackets += sub_pattern.substr(i, 2);
i += 2;
} else {
square_brackets += sub_pattern[i];
i++;
}
}
if (i >= length) {
_errors.push_back("Unbalanced square brackets");
}
square_brackets += ']';
i++;
seq.push_back(std::make_pair(square_brackets, false));
} else if (c == '|') {
seq.push_back(std::make_pair("|", false));
i++;
} else if (c == '*' || c == '+' || c == '?') {
seq.back() = std::make_pair(to_rule(seq.back()) + c, false);
i++;
} else if (c == '{') {
std::string curly_brackets = std::string(1, c);
i++;
while (i < length && sub_pattern[i] != '}') {
curly_brackets += sub_pattern[i];
i++;
}
if (i >= length) {
_errors.push_back("Unbalanced curly brackets");
}
curly_brackets += '}';
i++;
auto nums = split(curly_brackets.substr(1, curly_brackets.length() - 2), ",");
int min_times = 0;
int max_times = std::numeric_limits<int>::max();
try {
if (nums.size() == 1) {
min_times = max_times = std::stoi(nums[0]);
} else if (nums.size() != 2) {
_errors.push_back("Wrong number of values in curly brackets");
} else {
if (!nums[0].empty()) {
min_times = std::stoi(nums[0]);
}
if (!nums[1].empty()) {
max_times = std::stoi(nums[1]);
}
}
} catch (const std::invalid_argument & e) {
_errors.push_back("Invalid number in curly brackets");
return std::make_pair("", false);
}
auto &last = seq.back();
auto &sub = last.first;
auto sub_is_literal = last.second;
if (min_times == 0 && max_times == std::numeric_limits<int>::max()) {
sub += "*";
} else if (min_times == 0 && max_times == 1) {
sub += "?";
} else if (min_times == 1 && max_times == std::numeric_limits<int>::max()) {
sub += "+";
} else {
if (!sub_is_literal) {
std::string & sub_id = sub_rule_ids[sub];
if (sub_id.empty()) {
sub_id = _add_rule(name + "-" + std::to_string(sub_rule_ids.size()), sub);
}
sub = sub_id;
}
std::string result;
if (sub_is_literal && min_times > 0) {
result = "\"" + repeat(sub.substr(1, sub.length() - 2), min_times) + "\"";
} else {
for (int j = 0; j < min_times; j++) {
if (j > 0) {
result += " ";
}
result += sub;
}
}
if (min_times > 0 && min_times < max_times) {
result += " ";
}
if (max_times == std::numeric_limits<int>::max()) {
result += sub + "*";
} else {
for (int j = min_times; j < max_times; j++) {
if (j > min_times) {
result += " ";
}
result += sub + "?";
}
}
seq.back().first = result;
seq.back().second = false;
}
} else {
std::string literal;
auto is_non_literal = [&](char c) {
return NON_LITERAL_SET.find(c) != NON_LITERAL_SET.end();
};
while (i < length) {
if (sub_pattern[i] == '\\' && i < length - 1) {
char next = sub_pattern[i + 1];
if (ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS.find(next) != ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS.end()) {
i++;
literal += sub_pattern[i];
i++;
} else {
literal += sub_pattern.substr(i, 2);
i += 2;
}
} else if (sub_pattern[i] == '"') {
literal += "\\\"";
i++;
} else if (!is_non_literal(sub_pattern[i]) &&
(i == length - 1 || literal.empty() || sub_pattern[i + 1] == '.' || !is_non_literal(sub_pattern[i + 1]))) {
literal += sub_pattern[i];
i++;
} else {
break;
}
}
if (!literal.empty()) {
seq.push_back(std::make_pair(literal, true));
}
}
}
return join_seq();
};
return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space");
}
std::string _resolve_ref(const std::string & ref) {
std::string ref_name = ref.substr(ref.find_last_of('/') + 1);
if (_rules.find(ref_name) == _rules.end() && _refs_being_resolved.find(ref) == _refs_being_resolved.end()) {
_refs_being_resolved.insert(ref);
json resolved = _refs[ref];
ref_name = visit(resolved, ref_name);
_refs_being_resolved.erase(ref);
}
return ref_name;
}
std::string _build_object_rule(
const std::vector<std::pair<std::string, json>> & properties,
const std::unordered_set<std::string> & required,
const std::string & name,
const json & additional_properties)
{
std::vector<std::string> required_props;
std::vector<std::string> optional_props;
std::unordered_map<std::string, std::string> prop_kv_rule_names;
for (const auto & kv : properties) {
const auto &prop_name = kv.first;
const auto &prop_schema = kv.second;
std::string prop_rule_name = visit(prop_schema, name + (name.empty() ? "" : "-") + prop_name);
prop_kv_rule_names[prop_name] = _add_rule(
name + (name.empty() ? "" : "-") + prop_name + "-kv",
format_literal(json(prop_name).dump()) + " space \":\" space " + prop_rule_name
);
if (required.find(prop_name) != required.end()) {
required_props.push_back(prop_name);
} else {
optional_props.push_back(prop_name);
}
}
if (additional_properties.is_object() || (additional_properties.is_boolean() && additional_properties.get<bool>())) {
std::string sub_name = name + (name.empty() ? "" : "-") + "additional";
std::string value_rule = visit(additional_properties.is_object() ? additional_properties : json::object(), sub_name + "-value");
std::string kv_rule = _add_rule(sub_name + "-kv", _add_rule("string", PRIMITIVE_RULES.at("string")) + " \":\" space " + value_rule);
prop_kv_rule_names["*"] = kv_rule;
optional_props.push_back("*");
}
std::string rule = "\"{\" space ";
for (size_t i = 0; i < required_props.size(); i++) {
if (i > 0) {
rule += " \",\" space ";
}
rule += prop_kv_rule_names[required_props[i]];
}
if (!optional_props.empty()) {
rule += " (";
if (!required_props.empty()) {
rule += " \",\" space ( ";
}
std::function<std::string(const std::vector<std::string> &, bool)> get_recursive_refs = [&](const std::vector<std::string> & ks, bool first_is_optional) {
std::string res;
if (ks.empty()) {
return res;
}
std::string k = ks[0];
std::string kv_rule_name = prop_kv_rule_names[k];
if (k == "*") {
res = _add_rule(
name + (name.empty() ? "" : "-") + "additional-kvs",
kv_rule_name + " ( \",\" space " + kv_rule_name + " )*"
);
} else if (first_is_optional) {
res = "( \",\" space " + kv_rule_name + " )?";
} else {
res = kv_rule_name;
}
if (ks.size() > 1) {
res += " " + _add_rule(
name + (name.empty() ? "" : "-") + k + "-rest",
get_recursive_refs(std::vector<std::string>(ks.begin() + 1, ks.end()), true)
);
}
return res;
};
for (size_t i = 0; i < optional_props.size(); i++) {
if (i > 0) {
rule += " | ";
}
rule += get_recursive_refs(std::vector<std::string>(optional_props.begin() + i, optional_props.end()), false);
}
if (!required_props.empty()) {
rule += " )";
}
rule += " )?";
}
rule += " \"}\" space";
return rule;
}
public:
SchemaConverter(
const std::function<json(const std::string &)> & fetch_json,
bool dotall)
: _fetch_json(fetch_json), _dotall(dotall)
{
_rules["space"] = SPACE_RULE;
}
void resolve_refs(json & schema, const std::string & url) {
/*
* Resolves all $ref fields in the given schema, fetching any remote schemas,
* replacing each $ref with absolute reference URL and populates _refs with the
* respective referenced (sub)schema dictionaries.
*/
std::function<void(json &)> visit_refs = [&](json & n) {
if (n.is_array()) {
for (auto & x : n) {
visit_refs(x);
}
} else if (n.is_object()) {
if (n.contains("$ref")) {
std::string ref = n["$ref"];
if (_refs.find(ref) == _refs.end()) {
json target;
if (ref.find("https://") == 0) {
std::string base_url = ref.substr(0, ref.find('#'));
auto it = _refs.find(base_url);
if (it != _refs.end()) {
target = it->second;
} else {
// Fetch the referenced schema and resolve its refs
auto referenced = _fetch_json(ref);
resolve_refs(referenced, base_url);
_refs[base_url] = referenced;
}
if (ref.find('#') == std::string::npos || ref.substr(ref.find('#') + 1).empty()) {
return;
}
} else if (ref.find("#/") == 0) {
target = schema;
n["$ref"] = url + ref;
ref = url + ref;
} else {
_errors.push_back("Unsupported ref: " + ref);
return;
}
std::string pointer = ref.substr(ref.find('#') + 1);
std::vector<std::string> tokens = split(pointer, "/");
for (size_t i = 1; i < tokens.size(); ++i) {
std::string sel = tokens[i];
if (target.is_null() || !target.contains(sel)) {
_errors.push_back("Error resolving ref " + ref + ": " + sel + " not in " + target.dump());
return;
}
target = target[sel];
}
_refs[ref] = target;
}
} else {
for (auto & kv : n.items()) {
visit_refs(kv.value());
}
}
}
};
visit_refs(schema);
}
std::string _generate_constant_rule(const json & value) {
return format_literal(value.dump());
}
std::string visit(const json & schema, const std::string & name) {
json schema_type = schema.contains("type") ? schema["type"] : json();
std::string schema_format = schema.contains("format") ? schema["format"].get<std::string>() : "";
std::string rule_name = is_reserved_name(name) ? name + "-" : name.empty() ? "root" : name;
if (schema.contains("$ref")) {
return _add_rule(rule_name, _resolve_ref(schema["$ref"]));
} else if (schema.contains("oneOf") || schema.contains("anyOf")) {
std::vector<json> alt_schemas = schema.contains("oneOf") ? schema["oneOf"].get<std::vector<json>>() : schema["anyOf"].get<std::vector<json>>();
return _add_rule(rule_name, _generate_union_rule(name, alt_schemas));
} else if (schema_type.is_array()) {
std::vector<json> schema_types;
for (const auto & t : schema_type) {
schema_types.push_back({{"type", t}});
}
return _add_rule(rule_name, _generate_union_rule(name, schema_types));
} else if (schema.contains("const")) {
return _add_rule(rule_name, _generate_constant_rule(schema["const"]));
} else if (schema.contains("enum")) {
std::vector<std::string> enum_values;
for (const auto & v : schema["enum"]) {
enum_values.push_back(_generate_constant_rule(v));
}
return _add_rule(rule_name, join(enum_values.begin(), enum_values.end(), " | "));
} else if ((schema_type.is_null() || schema_type == "object")
&& (schema.contains("properties") ||
(schema.contains("additionalProperties") && schema["additionalProperties"] != true))) {
std::unordered_set<std::string> required;
if (schema.contains("required") && schema["required"].is_array()) {
for (const auto & item : schema["required"]) {
if (item.is_string()) {
required.insert(item.get<std::string>());
}
}
}
std::vector<std::pair<std::string, json>> properties;
if (schema.contains("properties")) {
for (const auto & prop : schema["properties"].items()) {
properties.emplace_back(prop.key(), prop.value());
}
}
return _add_rule(rule_name,
_build_object_rule(
properties, required, name,
schema.contains("additionalProperties") ? schema["additionalProperties"] : json()));
} else if ((schema_type.is_null() || schema_type == "object") && schema.contains("allOf")) {
std::unordered_set<std::string> required;
std::vector<std::pair<std::string, json>> properties;
std::string hybrid_name = name;
std::function<void(const json &, bool)> add_component = [&](const json & comp_schema, bool is_required) {
if (comp_schema.contains("$ref")) {
add_component(_refs[comp_schema["$ref"]], is_required);
} else if (comp_schema.contains("properties")) {
for (const auto & prop : comp_schema["properties"].items()) {
properties.emplace_back(prop.key(), prop.value());
if (is_required) {
required.insert(prop.key());
}
}
} else {
// todo warning
}
};
for (auto & t : schema["allOf"]) {
if (t.contains("anyOf")) {
for (auto & tt : t["anyOf"]) {
add_component(tt, false);
}
} else {
add_component(t, true);
}
}
return _add_rule(rule_name, _build_object_rule(properties, required, hybrid_name, json()));
} else if ((schema_type.is_null() || schema_type == "array") && (schema.contains("items") || schema.contains("prefixItems"))) {
json items = schema.contains("items") ? schema["items"] : schema["prefixItems"];
if (items.is_array()) {
std::string rule = "\"[\" space ";
for (size_t i = 0; i < items.size(); i++) {
if (i > 0) {
rule += " \",\" space ";
}
rule += visit(items[i], name + (name.empty() ? "" : "-") + "tuple-" + std::to_string(i));
}
rule += " \"]\" space";
return _add_rule(rule_name, rule);
} else {
std::string item_rule_name = visit(items, name + (name.empty() ? "" : "-") + "item");
std::string list_item_operator = "( \",\" space " + item_rule_name + " )";
std::string successive_items;
int min_items = schema.contains("minItems") ? schema["minItems"].get<int>() : 0;
json max_items_json = schema.contains("maxItems") ? schema["maxItems"] : json();
int max_items = max_items_json.is_number_integer() ? max_items_json.get<int>() : -1;
if (min_items > 0) {
successive_items += repeat(list_item_operator, min_items - 1);
min_items--;
}
if (max_items >= 0 && max_items > min_items) {
successive_items += repeat(list_item_operator + "?", max_items - min_items - 1);
} else {
successive_items += list_item_operator + "*";
}
std::string rule;
if (min_items == 0) {
rule = "\"[\" space ( " + item_rule_name + " " + successive_items + " )? \"]\" space";
} else {
rule = "\"[\" space " + item_rule_name + " " + successive_items + " \"]\" space";
}
return _add_rule(rule_name, rule);
}
} else if ((schema_type.is_null() || schema_type == "string") && schema.contains("pattern")) {
return _visit_pattern(schema["pattern"], rule_name);
} else if ((schema_type.is_null() || schema_type == "string") && std::regex_match(schema_format, std::regex("^uuid[1-5]?$"))) {
return _add_rule(rule_name == "root" ? "root" : schema_format, PRIMITIVE_RULES.at("uuid"));
} else if ((schema_type.is_null() || schema_type == "string") && DATE_RULES.find(schema_format) != DATE_RULES.end()) {
for (const auto & kv : DATE_RULES) {
_add_rule(kv.first, kv.second);
}
return schema_format + "-string";
} else if (schema.empty() || schema_type == "object") {
for (const auto & n : OBJECT_RULE_NAMES) {
_add_rule(n, PRIMITIVE_RULES.at(n));
}
return _add_rule(rule_name, "object");
} else {
if (!schema_type.is_string() || PRIMITIVE_RULES.find(schema_type.get<std::string>()) == PRIMITIVE_RULES.end()) {
_errors.push_back("Unrecognized schema: " + schema.dump());
return "";
}
// TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
return _add_rule(rule_name == "root" ? "root" : schema_type.get<std::string>(), PRIMITIVE_RULES.at(schema_type.get<std::string>()));
}
}
void check_errors() {
if (!_errors.empty()) {
throw std::runtime_error("JSON schema conversion failed:\n" + join(_errors.begin(), _errors.end(), "\n"));
}
if (!_warnings.empty()) {
fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", join(_warnings.begin(), _warnings.end(), "; ").c_str());
}
}
std::string format_grammar() {
std::stringstream ss;
for (const auto & kv : _rules) {
ss << kv.first << " ::= " << kv.second << std::endl;
}
return ss.str();
}
};
std::string json_schema_to_grammar(const json & schema) {
SchemaConverter converter([](const std::string &) { return json::object(); }, /* dotall= */ false);
auto copy = schema;
converter.resolve_refs(copy, "input");
converter.visit(copy, "");
converter.check_errors();
return converter.format_grammar();
}

View File

@@ -1,4 +0,0 @@
#pragma once
#include "json.hpp"
std::string json_schema_to_grammar(const nlohmann::ordered_json& schema);

View File

@@ -234,7 +234,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
// INTERNAL, DO NOT USE
// USE LOG() INSTEAD
//
#if !defined(_MSC_VER) or defined(__INTEL_LLVM_COMPILER)
#ifndef _MSC_VER
#define LOG_IMPL(str, ...) \
do { \
if (LOG_TARGET != nullptr) \
@@ -257,7 +257,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
// INTERNAL, DO NOT USE
// USE LOG_TEE() INSTEAD
//
#if !defined(_MSC_VER) or defined(__INTEL_LLVM_COMPILER)
#ifndef _MSC_VER
#define LOG_TEE_IMPL(str, ...) \
do { \
if (LOG_TARGET != nullptr) \
@@ -297,7 +297,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
#ifndef _MSC_VER
#define LOG(...) LOG_IMPL(__VA_ARGS__, "")
#else
#define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "")
#define LOG(str, ...) LOG_IMPL("%s" str, "", __VA_ARGS__, "")
#endif
// Main TEE macro.
@@ -311,7 +311,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
#ifndef _MSC_VER
#define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
#else
#define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "")
#define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "")
#endif
// LOG macro variants with auto endline.
@@ -319,8 +319,8 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
#define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
#define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
#else
#define LOGLN(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "\n")
#define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "\n")
#define LOGLN(str, ...) LOG_IMPL("%s" str, "", __VA_ARGS__, "\n")
#define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "\n")
#endif
// INTERNAL, DO NOT USE
@@ -566,7 +566,6 @@ inline void log_print_usage()
printf(" --log-new Create a separate new log file on start. "
"Each log file will have unique name: \"<name>.<ID>.log\"\n");
printf(" --log-append Don't truncate the old log file.\n");
printf("\n");
}
#define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv)

View File

@@ -1,282 +0,0 @@
#include "ngram-cache.h"
#include "common.h"
#include "log.h"
#include <cstdint>
#include <fstream>
void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
std::vector<llama_token> & inp, int nnew, bool print_progress) {
const int64_t t_start_ms = ggml_time_ms();
const int64_t inp_size = inp.size();
const int64_t n_todo = inp_size * (ngram_max - ngram_min + 1);
int64_t n_done = 0;
for (int64_t ngram_size = ngram_min; ngram_size <= ngram_max; ++ngram_size) {
const int64_t i_start = std::max(inp_size - nnew, ngram_size);
for (int64_t i = i_start; i < inp_size; ++i) {
const int64_t ngram_start = i - ngram_size;
llama_ngram ngram(&inp[ngram_start], ngram_size);
const llama_token token = inp[i];
llama_ngram_cache::iterator part_it = ngram_cache.find(ngram);
if (part_it == ngram_cache.end()) {
llama_ngram_cache_part part;
part.emplace(token, 1);
ngram_cache.emplace(ngram, part);
} else {
llama_ngram_cache_part::iterator token_count_it = part_it->second.find(token);
if (token_count_it == part_it->second.end()) {
part_it->second.emplace(token, 1);
} else {
token_count_it->second++;
}
}
++n_done;
if (print_progress && n_done % 10000000 == 0) {
const int64_t t_now_ms = ggml_time_ms();
const int64_t eta_ms = (inp_size*(ngram_max-ngram_min+1) - n_done) * (t_now_ms - t_start_ms) / n_done;
const int64_t eta_min = eta_ms / (60*1000);
const int64_t eta_s = (eta_ms - 60*1000*eta_min) / 1000;
fprintf(stderr, "%s: %" PRId64 "/%" PRId64 " done, ETA: %02" PRId64 ":%02" PRId64 "\n", __func__, n_done, n_todo, eta_min, eta_s);
}
}
}
}
// Helper function to get a token from the combined, speculative sequence of inp and draft.
static llama_token get_token(const std::vector<llama_token> & inp, const std::vector<llama_token> & draft, const size_t i) {
return i < inp.size() ? inp[i] : draft[1 + i - inp.size()];
}
// If sample size or percentage are below these thresholds the draft is aborted early:
constexpr int draft_min_sample_size_lax[LLAMA_NGRAM_MAX] = { 2, 2, 1, 1};
constexpr int draft_min_percent_lax[LLAMA_NGRAM_MAX] = {66, 50, 50, 50};
constexpr int draft_min_sample_size_strict[LLAMA_NGRAM_MAX] = { 4, 3, 2, 2};
constexpr int draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66};
// Helper function that tries to draft a token from only the static ngram cache:
static llama_token try_draft(llama_ngram_cache & nc_static, const llama_ngram ngram_static) {
llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
if (part_static_it == nc_static.end()) {
return -1;
}
const llama_ngram_cache_part part_static = part_static_it->second;
int max_count_static = 0;
int sum_count_static = 0;
llama_token max_token = -1;
for (std::pair<llama_token, int> token_count_static : part_static) {
const llama_token token = token_count_static.first;
const int32_t count_static = token_count_static.second;
if (count_static > max_count_static) {
max_token = token;
max_count_static = count_static;
}
sum_count_static += count_static;
}
if (sum_count_static < draft_min_sample_size_lax[LLAMA_NGRAM_STATIC-1]) {
return -1;
}
if (100*max_count_static < draft_min_percent_lax[LLAMA_NGRAM_STATIC-1]*sum_count_static) {
return -1;
}
return max_token;
}
// Try to draft a token from primary cache (context/dynamic), validate with static cache:
static llama_token try_draft(
llama_ngram_cache & nc_primary, const std::vector<llama_ngram> & ngrams_primary, llama_ngram_cache_part & part_static,
const int * min_sample_size, const int * min_percent) {
llama_token drafted_token = -1;
for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) {
const llama_ngram ngram_primary = ngrams_primary[i];
llama_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
if (part_primary_it == nc_primary.end()) {
continue;
}
const llama_ngram_cache_part part_primary = part_primary_it->second;
int max_count_primary = 0;
int max_count_static = 0;
int sum_count_primary = 0;
llama_token max_token = -1;
for (std::pair<llama_token, int> token_count_primary : part_primary) {
const llama_token token = token_count_primary.first;
llama_ngram_cache_part::iterator token_count_static_it = part_static.find(token);
const int32_t count_primary = token_count_primary.second;
const int32_t count_static = token_count_static_it != part_static.end() ? 100*token_count_static_it->second : 1;
if (count_primary*count_static > max_count_primary*max_count_static) {
max_token = token;
max_count_primary = count_primary;
max_count_static = count_static;
}
sum_count_primary += count_primary;
}
if (sum_count_primary < min_sample_size[i]) {
continue;
}
if (100*max_count_primary < min_percent[i]*sum_count_primary) {
continue;;
}
drafted_token = max_token;
}
return drafted_token;
}
void llama_ngram_cache_draft(
std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static
) {
GGML_ASSERT(draft.size() == 1);
const int inp_size = inp.size();
if (inp_size < LLAMA_NGRAM_STATIC) {
return;
}
while ((int) draft.size()-1 < n_draft) {
llama_token drafted_token = -1;
const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1;
llama_ngram ngram_static;
for (int j = ngram_start_static; j < ngram_start_static + LLAMA_NGRAM_STATIC; ++j) {
ngram_static.tokens[j-ngram_start_static] = get_token(inp, draft, j);
}
llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
llama_ngram_cache_part part_static;
if (part_static_it != nc_static.end()) {
part_static = part_static_it->second;
}
// cd = context + dynamic
std::vector<llama_ngram> ngrams_cd;
for (int ngram_size_cd = ngram_min; ngram_size_cd <= ngram_max; ++ngram_size_cd) {
const int ngram_start_cd = inp_size-ngram_size_cd + draft.size()-1;
llama_ngram ngram_cd;
for (int j = ngram_start_cd; j < ngram_start_cd + ngram_size_cd; ++j) {
ngram_cd.tokens[j-ngram_start_cd] = get_token(inp, draft, j);
}
ngrams_cd.push_back(ngram_cd);
}
if (drafted_token == -1) {
drafted_token = try_draft(nc_context, ngrams_cd, part_static, draft_min_sample_size_lax, draft_min_percent_lax);
}
if (drafted_token == -1) {
drafted_token = try_draft(nc_dynamic, ngrams_cd, part_static, draft_min_sample_size_strict, draft_min_percent_strict);
}
if (drafted_token == -1) {
drafted_token = try_draft(nc_static, ngram_static);
}
if (drafted_token == -1) {
break;
}
LOG(" - draft candidate: token=%d\n", drafted_token);
draft.push_back(drafted_token);
}
}
void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename) {
std::ofstream file_out(filename, std::ios::binary);
for (std::pair<llama_ngram, llama_ngram_cache_part> item : ngram_cache) {
const llama_ngram ngram = item.first;
llama_ngram_cache_part token_counts = item.second;
GGML_ASSERT(!token_counts.empty());
const int32_t ntokens = token_counts.size();
GGML_ASSERT(ntokens > 0);
file_out.write(reinterpret_cast<const char *>(&ngram), sizeof(llama_ngram));
file_out.write(reinterpret_cast<const char *>(&ntokens), sizeof(int32_t));
for (std::pair<llama_token, int32_t> item2 : token_counts) {
const llama_token token = item2.first;
const int32_t count = item2.second;
GGML_ASSERT(count > 0);
file_out.write(reinterpret_cast<const char *>(&token), sizeof(llama_token));
file_out.write(reinterpret_cast<const char *>(&count), sizeof(int32_t));
}
}
}
llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
std::ifstream hashmap_file(filename, std::ios::binary);
if (!hashmap_file) {
throw std::ifstream::failure("Unable to open file " + filename);
}
llama_ngram_cache ngram_cache;
llama_ngram ngram;
int32_t ntokens;
llama_token token;
int32_t count;
char * ngramc = reinterpret_cast<char*>(&ngram);
char * ntokensc = reinterpret_cast<char*>(&ntokens);
char * tokenc = reinterpret_cast<char*>(&token);
char * countc = reinterpret_cast<char*>(&count);
while(hashmap_file.read(ngramc, sizeof(llama_ngram))) {
GGML_ASSERT(!hashmap_file.eof());
GGML_ASSERT(hashmap_file.read(ntokensc, sizeof(int32_t)));
GGML_ASSERT(ntokens > 0);
llama_ngram_cache_part token_counts;
for (int i = 0; i < ntokens; ++i) {
GGML_ASSERT(!hashmap_file.eof());
GGML_ASSERT(hashmap_file.read(tokenc, sizeof(llama_token)));
GGML_ASSERT(!hashmap_file.eof());
GGML_ASSERT(hashmap_file.read(countc, sizeof(int32_t)));
GGML_ASSERT(count > 0);
token_counts.emplace(token, count);
}
ngram_cache.emplace(ngram, token_counts);
}
GGML_ASSERT(hashmap_file.eof());
return ngram_cache;
}
void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add) {
for (std::pair<llama_ngram, llama_ngram_cache_part> ngram_part : ngram_cache_add) {
const llama_ngram ngram = ngram_part.first;
llama_ngram_cache_part part = ngram_part.second;
llama_ngram_cache::iterator part_merged_it = ngram_cache_target.find(ngram);
if (part_merged_it == ngram_cache_target.end()) {
ngram_cache_target.emplace(ngram, part);
continue;
}
for (std::pair<llama_token, int32_t> token_count : part) {
const llama_token token = token_count.first;
const int32_t count = token_count.second;
GGML_ASSERT(count > 0);
llama_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second.find(token);
if (token_count_merged_it == part_merged_it->second.end()) {
part_merged_it->second.emplace(token, count);
continue;
}
token_count_merged_it->second += count;
}
}
}

View File

@@ -1,94 +0,0 @@
#pragma once
#include "llama.h"
#include <unordered_map>
#include <string>
#include <vector>
#define LLAMA_NGRAM_MIN 1
#define LLAMA_NGRAM_MAX 4
#define LLAMA_NGRAM_STATIC 2
// Data structures to map n-grams to empirical token probabilities:
struct llama_ngram {
llama_token tokens[LLAMA_NGRAM_MAX];
llama_ngram() {
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
tokens[i] = -1;
}
}
llama_ngram(const llama_token * input, const int ngram_size) {
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
tokens[i] = i < ngram_size ? input[i] : -1;
}
}
bool operator==(const llama_ngram & other) const {
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
if (tokens[i] != other.tokens[i]) {
return false;
}
}
return true;
}
};
struct llama_ngram_hash_function {
size_t operator()(const llama_ngram & ngram) const {
size_t hash = 0;
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
hash ^= std::hash<llama_token>{}(ngram.tokens[i]);
}
return hash;
}
};
// token -> number of times token has been seen
typedef std::unordered_map<llama_token, int32_t> llama_ngram_cache_part;
// n-gram -> empirical distribution of following tokens
typedef std::unordered_map<llama_ngram, llama_ngram_cache_part, llama_ngram_hash_function> llama_ngram_cache;
// Update an ngram cache with tokens.
// ngram_cache: the cache to modify.
// ngram_min/ngram_max: the min/max size of the ngrams to extract from inp_data.
// inp_data: the token sequence with which to update ngram_cache.
// nnew: how many new tokens have been appended to inp_data since the last call to this function.
// print_progress: whether to print progress to stderr.
//
// In order to get correct results inp_data can ONLY BE APPENDED TO.
// Changes in the middle need a complete rebuild.
void llama_ngram_cache_update(
llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> & inp_data, int nnew, bool print_progress);
// Try to draft tokens from ngram caches.
// inp: the tokens generated so far.
// draft: the token sequence to draft. Expected to initially contain the previously sampled token.
// n_draft: maximum number of tokens to add to draft.
// ngram_min/gram_max: the min/max size of the ngrams in nc_context and nc_dynamic.
// nc_context: ngram cache based on current context.
// nc_dynamic: ngram cache based on previous user generations.
// nc_static: ngram cache generated from a large text corpus, used for validation.
void llama_ngram_cache_draft(
std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static);
// Save an ngram cache to a file.
// ngram_cache: the ngram cache to save.
// filename: the path under which to save the ngram cache.
void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename);
// Load an ngram cache saved with llama_ngram_cache_save.
// filename: the path from which to load the ngram cache.
// returns: an ngram cache containing the information saved to filename.
llama_ngram_cache llama_ngram_cache_load(std::string & filename);
// Merge two ngram caches.
// ngram_cache_target: the ngram cache to which to add the information from ngram_cache_add.
// ngram_cache_add: the ngram cache to add to ngram_cache_target.
void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add);

View File

@@ -17,13 +17,6 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
return nullptr;
}
// Ensure that there is a "root" node.
if (result->parsed_grammar.symbol_ids.find("root") == result->parsed_grammar.symbol_ids.end()) {
fprintf(stderr, "%s: grammar does not contain a 'root' symbol\n", __func__);
delete result;
return nullptr;
}
std::vector<const llama_grammar_element *> grammar_rules(result->parsed_grammar.c_rules());
result->grammar = llama_grammar_init(
@@ -168,110 +161,32 @@ static llama_token llama_sampling_sample_impl(
bool is_resampling) { // Add a parameter to indicate if we are resampling
const llama_sampling_params & params = ctx_sampling->params;
const float temp = params.temp;
const int mirostat = params.mirostat;
const float mirostat_tau = params.mirostat_tau;
const float mirostat_eta = params.mirostat_eta;
std::vector<float> original_logits;
auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, !is_resampling, &original_logits);
if (!is_resampling) {
GGML_ASSERT(!original_logits.empty());
}
llama_token id = 0;
// Get a pointer to the logits
float * logits = llama_get_logits_ith(ctx_main, idx);
if (temp < 0.0) {
// greedy sampling, with probs
llama_sample_softmax(ctx_main, &cur_p);
id = cur_p.data[0].id;
} else if (temp == 0.0) {
// greedy sampling, no probs
id = llama_sample_token_greedy(ctx_main, &cur_p);
} else {
if (mirostat == 1) {
const int mirostat_m = 100;
llama_sample_temp(ctx_main, &cur_p, temp);
id = llama_sample_token_mirostat(ctx_main, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &ctx_sampling->mirostat_mu);
} else if (mirostat == 2) {
llama_sample_temp(ctx_main, &cur_p, temp);
id = llama_sample_token_mirostat_v2(ctx_main, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu);
} else {
// temperature sampling
size_t min_keep = std::max(1, params.min_keep);
sampler_queue(ctx_main, params, cur_p, min_keep);
id = llama_sample_token(ctx_main, &cur_p);
//{
// const int n_top = 10;
// LOG("top %d candidates:\n", n_top);
// for (int i = 0; i < n_top; i++) {
// const llama_token id = cur_p.data[i].id;
// (void)id; // To avoid a warning that id is unused when logging is disabled.
// LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx_main, id).c_str(), cur_p.data[i].p);
// }
//}
//LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str());
}
}
if (ctx_sampling->grammar != NULL && !is_resampling) {
// Create an array with a single token data element for the sampled id
llama_token_data single_token_data = {id, logits[id], 0.0f};
llama_token_data_array single_token_data_array = { &single_token_data, 1, false };
// Apply grammar constraints to the single token
llama_sample_grammar(ctx_main, &single_token_data_array, ctx_sampling->grammar);
// Check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY
bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
// If the token is not valid according to the grammar, perform resampling
if (!is_valid) {
LOG("Resampling because token %d: '%s' does not meet grammar rules\n", id, llama_token_to_piece(ctx_main, id).c_str());
// Restore logits from the copy
std::copy(original_logits.begin(), original_logits.end(), logits);
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, true); // Pass true for is_resampling
}
}
return id;
}
static llama_token_data_array llama_sampling_prepare_impl(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,
const int idx,
bool apply_grammar,
std::vector<float> * original_logits) {
const llama_sampling_params & params = ctx_sampling->params;
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
const float temp = params.temp;
const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
const float penalty_repeat = params.penalty_repeat;
const float penalty_freq = params.penalty_freq;
const float penalty_present = params.penalty_present;
const int mirostat = params.mirostat;
const float mirostat_tau = params.mirostat_tau;
const float mirostat_eta = params.mirostat_eta;
const bool penalize_nl = params.penalize_nl;
auto & prev = ctx_sampling->prev;
auto & cur = ctx_sampling->cur;
llama_token id = 0;
// Get a pointer to the logits
float * logits = llama_get_logits_ith(ctx_main, idx);
if (apply_grammar && original_logits != NULL) {
// Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this.
*original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))};
// Declare original_logits at the beginning of the function scope
std::vector<float> original_logits;
if (!is_resampling) {
// Only make a copy of the original logits if we are not in the resampling phase, not sure if I actually have to do this.
original_logits = std::vector<float>(logits, logits + llama_n_vocab(llama_get_model(ctx_main)));
}
// apply params.logit_bias map
@@ -312,12 +227,72 @@ static llama_token_data_array llama_sampling_prepare_impl(
}
}
// apply grammar checks before sampling logic
if (apply_grammar && ctx_sampling->grammar != NULL) {
// If we are in the resampling phase, apply grammar checks before sampling logic
if (is_resampling && ctx_sampling->grammar != NULL) {
llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
}
return cur_p;
if (temp < 0.0) {
// greedy sampling, with probs
llama_sample_softmax(ctx_main, &cur_p);
id = cur_p.data[0].id;
} else if (temp == 0.0) {
// greedy sampling, no probs
id = llama_sample_token_greedy(ctx_main, &cur_p);
} else {
if (mirostat == 1) {
const int mirostat_m = 100;
llama_sample_temp(ctx_main, &cur_p, temp);
id = llama_sample_token_mirostat(ctx_main, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &ctx_sampling->mirostat_mu);
} else if (mirostat == 2) {
llama_sample_temp(ctx_main, &cur_p, temp);
id = llama_sample_token_mirostat_v2(ctx_main, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu);
} else {
// temperature sampling
size_t min_keep = std::max(1, params.min_keep);
sampler_queue(ctx_main, params, cur_p, min_keep);
id = llama_sample_token(ctx_main, &cur_p);
//{
// const int n_top = 10;
// LOG("top %d candidates:\n", n_top);
// for (int i = 0; i < n_top; i++) {
// const llama_token id = cur_p.data[i].id;
// (void)id; // To avoid a warning that id is unused when logging is disabled.
// LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx_main, id).c_str(), cur_p.data[i].p);
// }
//}
LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str());
}
}
if (ctx_sampling->grammar != NULL && !is_resampling) {
// Create an array with a single token data element for the sampled id
llama_token_data single_token_data = {id, logits[id], 0.0f};
llama_token_data_array single_token_data_array = { &single_token_data, 1, false };
// Apply grammar constraints to the single token
llama_sample_grammar(ctx_main, &single_token_data_array, ctx_sampling->grammar);
// Check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY
bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
// If the token is not valid according to the grammar, perform resampling
if (!is_valid) {
LOG("Resampling because token %d: '%s' does not meet grammar rules\n", id, llama_token_to_piece(ctx_main, id).c_str());
// Restore logits from the copy
std::copy(original_logits.begin(), original_logits.end(), logits);
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, true); // Pass true for is_resampling
}
}
return id;
}
llama_token llama_sampling_sample(
@@ -329,16 +304,6 @@ llama_token llama_sampling_sample(
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, false);
}
llama_token_data_array llama_sampling_prepare(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,
const int idx,
bool apply_grammar,
std::vector<float> * original_logits) {
return llama_sampling_prepare_impl(ctx_sampling,ctx_main, ctx_cfg, idx, apply_grammar, original_logits);
}
void llama_sampling_accept(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,

View File

@@ -32,13 +32,13 @@ typedef struct llama_sampling_params {
float dynatemp_range = 0.00f; // 0.0 = disabled
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
float penalty_repeat = 1.00f; // 1.0 = disabled
float penalty_repeat = 1.10f; // 1.0 = disabled
float penalty_freq = 0.00f; // 0.0 = disabled
float penalty_present = 0.00f; // 0.0 = disabled
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
float mirostat_tau = 5.00f; // target entropy
float mirostat_eta = 0.10f; // learning rate
bool penalize_nl = false; // consider newlines as a repeatable token
bool penalize_nl = true; // consider newlines as a repeatable token
std::vector<llama_sampler_type> samplers_sequence = {
llama_sampler_type::TOP_K,
@@ -131,15 +131,6 @@ llama_token llama_sampling_sample(
struct llama_context * ctx_cfg,
int idx = 0);
// Prepares and adjusts the set of token candidates for sampling based on penalties, biases, and sampling parameters.
llama_token_data_array llama_sampling_prepare(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,
int idx = 0,
bool apply_grammar = true,
std::vector<float> * original_logits = nullptr);
void llama_sampling_accept(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,

View File

@@ -31,7 +31,7 @@ struct train_state * init_train_state() {
state->opt = new struct ggml_opt_context;
state->opt->ctx = NULL;
state->opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
state->opt->params = ggml_opt_default_params(GGML_OPT_ADAM);
state->opt->params.graph_size = LLAMA_TRAIN_MAX_NODES;
state->opt->loss_after = 0.0f;
@@ -556,7 +556,7 @@ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_g
std::string opt_type;
GGUF_GET_KEY(fctx, opt_type, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_OPTIMIZER_TYPE);
if (opt_type == LLM_KV_OPTIMIZER_TYPE_ADAM) {
opt->params.type = GGML_OPT_TYPE_ADAM;
opt->params.type = GGML_OPT_ADAM;
GGUF_GET_KEY(fctx, opt->adam.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS);
GGUF_GET_KEY(fctx, opt->adam.fx_prev, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS);
@@ -568,7 +568,7 @@ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_g
copy_tensor_by_name(opt->adam.v, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS);
copy_tensor_by_name(opt->adam.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES);
} else if (opt_type == LLM_KV_OPTIMIZER_TYPE_LBFGS) {
opt->params.type = GGML_OPT_TYPE_LBFGS;
opt->params.type = GGML_OPT_LBFGS;
GGUF_GET_KEY(fctx, opt->params.lbfgs.m, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT);
GGUF_GET_KEY(fctx, opt->lbfgs.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS);
@@ -603,7 +603,7 @@ void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context *
gguf_set_val_bool(fctx, LLM_KV_OPTIMIZER_JUST_INITIALIZED, opt->just_initialized);
switch (opt->params.type) {
case GGML_OPT_TYPE_ADAM:
case GGML_OPT_ADAM:
{
gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM);
gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, opt->adam.fx_best);
@@ -622,7 +622,7 @@ void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context *
gguf_add_tensor(fctx, opt->adam.pf);
}
} break;
case GGML_OPT_TYPE_LBFGS:
case GGML_OPT_LBFGS:
{
gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS);
gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, opt->params.lbfgs.m);

View File

@@ -8,10 +8,9 @@ import json
import os
import re
import sys
from abc import ABC, abstractmethod
from enum import IntEnum
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterator, Sequence, TypeVar, cast
from typing import TYPE_CHECKING, Any, ContextManager, Iterator, Sequence, cast
import numpy as np
import torch
@@ -23,7 +22,7 @@ if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf
from convert import LlamaHfVocab
from convert import HfVocab
###### MODEL DEFINITIONS ######
@@ -37,12 +36,7 @@ class SentencePieceTokenTypes(IntEnum):
BYTE = 6
AnyModel = TypeVar("AnyModel", bound="type[Model]")
class Model(ABC):
_model_classes: dict[str, type[Model]] = {}
class Model:
def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool):
self.dir_model = dir_model
self.ftype = ftype
@@ -53,14 +47,10 @@ class Model(ABC):
self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin")
self.part_names = self._get_part_names()
self.hparams = Model.load_hparams(self.dir_model)
self.model_arch = self._get_model_architecture()
self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False)
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
@property
@abstractmethod
def model_arch(self) -> gguf.MODEL_ARCH:
pass
def find_hparam(self, keys: Sequence[str], optional: bool = False) -> Any:
key = next((k for k in keys if k in self.hparams), None)
if key is not None:
@@ -93,42 +83,29 @@ class Model(ABC):
if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
self.gguf_writer.add_context_length(n_ctx)
print(f"gguf: context length = {n_ctx}")
n_embd = self.find_hparam(["hidden_size", "n_embd"])
self.gguf_writer.add_embedding_length(n_embd)
print(f"gguf: embedding length = {n_embd}")
if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None:
self.gguf_writer.add_feed_forward_length(n_ff)
print(f"gguf: feed forward length = {n_ff}")
n_head = self.find_hparam(["num_attention_heads", "n_head"])
self.gguf_writer.add_head_count(n_head)
print(f"gguf: head count = {n_head}")
if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
self.gguf_writer.add_head_count_kv(n_head_kv)
print(f"gguf: key-value head count = {n_head_kv}")
if (rope_theta := self.hparams.get("rope_theta")) is not None:
self.gguf_writer.add_rope_freq_base(rope_theta)
print(f"gguf: rope theta = {rope_theta}")
if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
print(f"gguf: rms norm epsilon = {f_rms_eps}")
if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None:
if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon"], optional=True)) is not None:
self.gguf_writer.add_layer_norm_eps(f_norm_eps)
print(f"gguf: layer norm epsilon = {f_norm_eps}")
if (n_experts := self.hparams.get("num_local_experts")) is not None:
self.gguf_writer.add_expert_count(n_experts)
print(f"gguf: expert count = {n_experts}")
if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
self.gguf_writer.add_expert_used_count(n_experts_used)
print(f"gguf: experts used count = {n_experts_used}")
self.gguf_writer.add_file_type(self.ftype)
print(f"gguf: file type = {self.ftype}")
def write_tensors(self):
block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
@@ -197,22 +174,53 @@ class Model(ABC):
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
return json.load(f)
@classmethod
def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
assert names
def func(modelcls: type[Model]):
for name in names:
cls._model_classes[name] = modelcls
return modelcls
return func
@classmethod
def from_model_architecture(cls, arch):
try:
return cls._model_classes[arch]
except KeyError:
raise NotImplementedError(f'Architecture {arch!r} not supported!') from None
@staticmethod
def from_model_architecture(model_architecture):
if model_architecture == "GPTNeoXForCausalLM":
return GPTNeoXModel
if model_architecture == "BloomForCausalLM":
return BloomModel
if model_architecture == "MPTForCausalLM":
return MPTModel
if model_architecture in ("BaichuanForCausalLM", "BaiChuanForCausalLM"):
return BaichuanModel
if model_architecture in ("FalconForCausalLM", "RWForCausalLM"):
return FalconModel
if model_architecture == "GPTBigCodeForCausalLM":
return StarCoderModel
if model_architecture == "GPTRefactForCausalLM":
return RefactModel
if model_architecture == "PersimmonForCausalLM":
return PersimmonModel
if model_architecture in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
return StableLMModel
if model_architecture == "QWenLMHeadModel":
return QwenModel
if model_architecture == "Qwen2ForCausalLM":
return Model
if model_architecture == "MixtralForCausalLM":
return MixtralModel
if model_architecture == "GPT2LMHeadModel":
return GPT2Model
if model_architecture == "PhiForCausalLM":
return Phi2Model
if model_architecture == "PlamoForCausalLM":
return PlamoModel
if model_architecture == "CodeShellForCausalLM":
return CodeShellModel
if model_architecture == "OrionForCausalLM":
return OrionModel
if model_architecture == "InternLM2ForCausalLM":
return InternLM2Model
if model_architecture == "MiniCPMForCausalLM":
return MiniCPMModel
if model_architecture == "BertModel":
return BertModel
if model_architecture == "NomicBertModel":
return NomicBertModel
if model_architecture == "GemmaForCausalLM":
return GemmaModel
return Model
def _is_model_safetensors(self) -> bool:
return Model.count_model_parts(self.dir_model, ".safetensors") > 0
@@ -227,10 +235,59 @@ class Model(ABC):
return ("pytorch_model.bin",)
return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1))
def _get_model_architecture(self) -> gguf.MODEL_ARCH:
arch = self.hparams["architectures"][0]
if arch == "GPTNeoXForCausalLM":
return gguf.MODEL_ARCH.GPTNEOX
if arch == "BloomForCausalLM":
return gguf.MODEL_ARCH.BLOOM
if arch == "MPTForCausalLM":
return gguf.MODEL_ARCH.MPT
if arch in ("BaichuanForCausalLM", "BaiChuanForCausalLM"):
return gguf.MODEL_ARCH.BAICHUAN
if arch in ("FalconForCausalLM", "RWForCausalLM"):
return gguf.MODEL_ARCH.FALCON
if arch == "GPTBigCodeForCausalLM":
return gguf.MODEL_ARCH.STARCODER
if arch == "GPTRefactForCausalLM":
return gguf.MODEL_ARCH.REFACT
if arch == "PersimmonForCausalLM":
return gguf.MODEL_ARCH.PERSIMMON
if arch in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
return gguf.MODEL_ARCH.STABLELM
if arch == "QWenLMHeadModel":
return gguf.MODEL_ARCH.QWEN
if arch == "Qwen2ForCausalLM":
return gguf.MODEL_ARCH.QWEN2
if arch == "MixtralForCausalLM":
return gguf.MODEL_ARCH.LLAMA
if arch == "GPT2LMHeadModel":
return gguf.MODEL_ARCH.GPT2
if arch == "PhiForCausalLM":
return gguf.MODEL_ARCH.PHI2
if arch == "PlamoForCausalLM":
return gguf.MODEL_ARCH.PLAMO
if arch == "CodeShellForCausalLM":
return gguf.MODEL_ARCH.CODESHELL
if arch == "OrionForCausalLM":
return gguf.MODEL_ARCH.ORION
if arch == "InternLM2ForCausalLM":
return gguf.MODEL_ARCH.INTERNLM2
if arch == "MiniCPMForCausalLM":
return gguf.MODEL_ARCH.MINICPM
if arch == "BertModel":
return gguf.MODEL_ARCH.BERT
if arch == "NomicBertModel":
return gguf.MODEL_ARCH.NOMIC_BERT
if arch == "GemmaForCausalLM":
return gguf.MODEL_ARCH.GEMMA
raise NotImplementedError(f'Architecture "{arch}" not supported!')
def _set_vocab_gpt2(self):
dir_model = self.dir_model
hparams = self.hparams
tokens: list[str] = []
tokens: list[bytearray] = []
toktypes: list[int] = []
from transformers import AutoTokenizer
@@ -243,7 +300,8 @@ class Model(ABC):
for i in range(vocab_size):
if i not in reverse_vocab:
tokens.append(f"[PAD{i}]")
pad_token = f"[PAD{i}]".encode('utf-8')
tokens.append(bytearray(pad_token))
toktypes.append(gguf.TokenType.USER_DEFINED)
elif reverse_vocab[i] in added_vocab:
tokens.append(reverse_vocab[i])
@@ -265,7 +323,7 @@ class Model(ABC):
def _set_vocab_qwen(self):
dir_model = self.dir_model
hparams = self.hparams
tokens: list[str] = []
tokens: list[bytearray] = []
toktypes: list[int] = []
from transformers import AutoTokenizer
@@ -290,7 +348,8 @@ class Model(ABC):
for i in range(vocab_size):
if i not in reverse_vocab:
tokens.append(f"[PAD{i}]")
pad_token = f"[PAD{i}]".encode("utf-8")
tokens.append(bytearray(pad_token))
toktypes.append(gguf.TokenType.USER_DEFINED)
elif reverse_vocab[i] in added_vocab:
tokens.append(reverse_vocab[i])
@@ -329,7 +388,7 @@ class Model(ABC):
tokenizer = SentencePieceProcessor(str(tokenizer_path))
vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
for token_id in range(tokenizer.vocab_size()):
for token_id in range(vocab_size):
piece = tokenizer.id_to_piece(token_id)
text = piece.encode("utf-8")
score = tokenizer.get_score(token_id)
@@ -354,13 +413,9 @@ class Model(ABC):
added_tokens_json = json.load(f)
for key in added_tokens_json:
key = key.encode("utf-8")
if key not in tokens:
tokens.append(key)
scores.append(-1000.0)
toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
assert len(tokens) == vocab_size
tokens.append(key.encode("utf-8"))
scores.append(-1000.0)
toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
self.gguf_writer.add_tokenizer_model("llama")
self.gguf_writer.add_token_list(tokens)
@@ -370,8 +425,12 @@ class Model(ABC):
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
special_vocab.add_to_gguf(self.gguf_writer)
def _set_vocab_llama_hf(self):
vocab = LlamaHfVocab(self.dir_model)
def _set_vocab_hf(self):
path = self.dir_model
added_tokens_path = self.dir_model
vocab = HfVocab(
path, added_tokens_path if added_tokens_path.exists() else None
)
tokens = []
scores = []
toktypes = []
@@ -392,10 +451,7 @@ class Model(ABC):
special_vocab.add_to_gguf(self.gguf_writer)
@Model.register("GPTNeoXForCausalLM")
class GPTNeoXModel(Model):
model_arch = gguf.MODEL_ARCH.GPTNEOX
def set_gguf_parameters(self):
block_count = self.hparams["num_hidden_layers"]
@@ -412,10 +468,7 @@ class GPTNeoXModel(Model):
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
@Model.register("BloomForCausalLM")
class BloomModel(Model):
model_arch = gguf.MODEL_ARCH.BLOOM
def set_gguf_parameters(self):
self.gguf_writer.add_name("Bloom")
n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
@@ -507,10 +560,7 @@ class BloomModel(Model):
print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")
@Model.register("MPTForCausalLM")
class MPTModel(Model):
model_arch = gguf.MODEL_ARCH.MPT
def set_gguf_parameters(self):
block_count = self.hparams["n_layers"]
self.gguf_writer.add_name(self.dir_model.name)
@@ -573,10 +623,7 @@ class MPTModel(Model):
self.gguf_writer.add_tensor(new_name, data)
@Model.register("OrionForCausalLM")
class OrionModel(Model):
model_arch = gguf.MODEL_ARCH.ORION
def set_vocab(self):
self._set_vocab_sentencepiece()
@@ -655,10 +702,7 @@ class OrionModel(Model):
self.gguf_writer.add_tensor(new_name, data)
@Model.register("BaichuanForCausalLM", "BaiChuanForCausalLM")
class BaichuanModel(Model):
model_arch = gguf.MODEL_ARCH.BAICHUAN
def set_vocab(self):
self._set_vocab_sentencepiece()
@@ -773,10 +817,7 @@ class BaichuanModel(Model):
return weights[r * n_part:r * n_part + r, ...]
@Model.register("FalconForCausalLM", "RWForCausalLM")
class FalconModel(Model):
model_arch = gguf.MODEL_ARCH.FALCON
def set_gguf_parameters(self):
block_count = self.hparams.get("num_hidden_layers")
if block_count is None:
@@ -869,10 +910,7 @@ class FalconModel(Model):
self.gguf_writer.add_tensor(new_name, data)
@Model.register("GPTBigCodeForCausalLM")
class StarCoderModel(Model):
model_arch = gguf.MODEL_ARCH.STARCODER
def set_gguf_parameters(self):
block_count = self.hparams["n_layer"]
@@ -887,10 +925,7 @@ class StarCoderModel(Model):
self.gguf_writer.add_file_type(self.ftype)
@Model.register("GPTRefactForCausalLM")
class RefactModel(Model):
model_arch = gguf.MODEL_ARCH.REFACT
def set_gguf_parameters(self):
hidden_dim = self.hparams["n_embd"]
inner_dim = 4 * hidden_dim
@@ -974,10 +1009,7 @@ class RefactModel(Model):
self.gguf_writer.add_tensor(new_name, data)
@Model.register("PersimmonForCausalLM")
class PersimmonModel(Model):
model_arch = gguf.MODEL_ARCH.PERSIMMON
def set_gguf_parameters(self):
block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
head_count = self.hparams["num_attention_heads"]
@@ -1025,10 +1057,7 @@ class PersimmonModel(Model):
self.gguf_writer.add_tensor(new_name, data)
@Model.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM")
class StableLMModel(Model):
model_arch = gguf.MODEL_ARCH.STABLELM
def set_vocab(self):
if (self.dir_model / "tokenizer.json").is_file():
self._set_vocab_gpt2()
@@ -1045,40 +1074,18 @@ class StableLMModel(Model):
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
self.gguf_writer.add_block_count(block_count)
self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"])
self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
self.gguf_writer.add_rope_dimension_count(int(hparams["rope_pct"] * (hparams["hidden_size"] // hparams["num_attention_heads"])))
self.gguf_writer.add_head_count(hparams["num_attention_heads"])
self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"]))
self.gguf_writer.add_layer_norm_eps(1e-5)
@Model.register("MixtralForCausalLM")
class MixtralModel(Model):
model_arch = gguf.MODEL_ARCH.LLAMA
def set_vocab(self):
self._set_vocab_sentencepiece()
@Model.register("GrokForCausalLM")
class GrokModel(Model):
model_arch = gguf.MODEL_ARCH.GROK
def set_vocab(self):
self._set_vocab_sentencepiece()
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_name("Grok")
@Model.register("MiniCPMForCausalLM")
class MiniCPMModel(Model):
model_arch = gguf.MODEL_ARCH.MINICPM
def set_gguf_parameters(self):
block_count = self.hparams["num_hidden_layers"]
self.gguf_writer.add_name("MiniCPM")
@@ -1093,7 +1100,7 @@ class MiniCPMModel(Model):
self.gguf_writer.add_file_type(self.ftype)
def set_vocab(self):
self._set_vocab_llama_hf()
self._set_vocab_hf()
def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
if n_kv_head is not None and n_head != n_kv_head:
@@ -1155,10 +1162,7 @@ class MiniCPMModel(Model):
self.gguf_writer.add_tensor(new_name, data)
@Model.register("QWenLMHeadModel")
class QwenModel(Model):
model_arch = gguf.MODEL_ARCH.QWEN
@staticmethod
def token_bytes_to_string(b):
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
@@ -1238,15 +1242,7 @@ class QwenModel(Model):
self.gguf_writer.add_tensor(new_name, data)
@Model.register("Qwen2ForCausalLM")
class Qwen2Model(Model):
model_arch = gguf.MODEL_ARCH.QWEN2
@Model.register("GPT2LMHeadModel")
class GPT2Model(Model):
model_arch = gguf.MODEL_ARCH.GPT2
def set_gguf_parameters(self):
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_block_count(self.hparams["n_layer"])
@@ -1308,10 +1304,7 @@ class GPT2Model(Model):
self.gguf_writer.add_tensor("output.weight", data)
@Model.register("PhiForCausalLM")
class Phi2Model(Model):
model_arch = gguf.MODEL_ARCH.PHI2
def set_gguf_parameters(self):
block_count = self.find_hparam(["num_hidden_layers", "n_layer"])
@@ -1333,10 +1326,7 @@ class Phi2Model(Model):
self.gguf_writer.add_add_bos_token(False)
@Model.register("PlamoForCausalLM")
class PlamoModel(Model):
model_arch = gguf.MODEL_ARCH.PLAMO
def set_vocab(self):
self._set_vocab_sentencepiece()
@@ -1415,10 +1405,7 @@ class PlamoModel(Model):
self.gguf_writer.add_tensor(new_name, data)
@Model.register("CodeShellForCausalLM")
class CodeShellModel(Model):
model_arch = gguf.MODEL_ARCH.CODESHELL
def set_gguf_parameters(self):
block_count = self.hparams["n_layer"]
@@ -1483,10 +1470,7 @@ class CodeShellModel(Model):
print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")
@Model.register("InternLM2ForCausalLM")
class InternLM2Model(Model):
model_arch = gguf.MODEL_ARCH.INTERNLM2
def set_vocab(self):
# (TODO): Is there a better way?
# Copy from _set_vocab_sentencepiece, The only difference is that we will treat the character
@@ -1658,10 +1642,7 @@ in chat mode so that the conversation can end normally.")
self.post_write_tensors(tensor_map, name, data_torch)
@Model.register("BertModel", "CamembertModel")
class BertModel(Model):
model_arch = gguf.MODEL_ARCH.BERT
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.vocab_size = None
@@ -1671,17 +1652,16 @@ class BertModel(Model):
self.gguf_writer.add_causal_attention(False)
# get pooling path
with open(self.dir_model / "modules.json", encoding="utf-8") as f:
modules = json.load(f)
pooling_path = None
module_path = self.dir_model / "modules.json"
if module_path.is_file():
with open(module_path, encoding="utf-8") as f:
modules = json.load(f)
for mod in modules:
if mod["type"] == "sentence_transformers.models.Pooling":
pooling_path = mod["path"]
break
for mod in modules:
if mod["type"] == "sentence_transformers.models.Pooling":
pooling_path = mod["path"]
break
# get pooling type
pooling_type = gguf.PoolingType.NONE
if pooling_path is not None:
with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f:
pooling = json.load(f)
@@ -1691,11 +1671,15 @@ class BertModel(Model):
pooling_type = gguf.PoolingType.CLS
else:
raise NotImplementedError("Only MEAN and CLS pooling types supported")
self.gguf_writer.add_pooling_type(pooling_type)
self.gguf_writer.add_pooling_type(pooling_type.value)
def set_vocab(self):
path = self.dir_model
added_tokens_path = self.dir_model if self.dir_model.exists() else None
# use huggingface vocab to get all tokens
vocab = LlamaHfVocab(self.dir_model, ignore_nonllama=True)
vocab = HfVocab(path, added_tokens_path)
tokens, scores, toktypes = zip(*vocab.all_tokens())
assert len(tokens) == vocab.vocab_size
self.vocab_size = vocab.vocab_size
@@ -1764,10 +1748,7 @@ class BertModel(Model):
self.gguf_writer.add_tensor(new_name, data)
@Model.register("NomicBertModel")
class NomicBertModel(BertModel):
model_arch = gguf.MODEL_ARCH.NOMIC_BERT
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
@@ -1804,10 +1785,7 @@ class NomicBertModel(BertModel):
yield name, data
@Model.register("GemmaForCausalLM")
class GemmaModel(Model):
model_arch = gguf.MODEL_ARCH.GEMMA
def set_vocab(self):
self._set_vocab_sentencepiece()
@@ -1825,22 +1803,22 @@ class GemmaModel(Model):
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
self.gguf_writer.add_key_length(hparams["head_dim"])
self.gguf_writer.add_value_length(hparams["head_dim"])
self.gguf_writer.add_file_type(self.ftype)
def write_tensors(self):
block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
for name, data_torch in self.get_tensors():
# ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
if name.endswith("norm.weight"):
data_torch = data_torch + 1
old_dtype = data_torch.dtype
# convert any unsupported data types to float32
if data_torch.dtype not in (torch.float16, torch.float32):
data_torch = data_torch.to(torch.float32)
# ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
if name.endswith("norm.weight"):
data_torch = data_torch + 1
data = data_torch.squeeze().numpy()
# map tensor names
@@ -1863,146 +1841,6 @@ class GemmaModel(Model):
self.gguf_writer.add_tensor(new_name, data)
@Model.register("Starcoder2ForCausalLM")
class StarCoder2Model(Model):
model_arch = gguf.MODEL_ARCH.STARCODER2
@Model.register("MambaForCausalLM", "MambaLMHeadModel")
class MambaModel(Model):
model_arch = gguf.MODEL_ARCH.MAMBA
def set_vocab(self):
vocab_size = self.hparams["vocab_size"]
# Round vocab size to next multiple of 8
pad_vocab = self.hparams.get("pad_vocab_size_multiple", 8)
# pad using ceiling division
# ref: https://stackoverflow.com/a/17511341/22827863
vocab_size = -(vocab_size // -pad_vocab) * pad_vocab
self.hparams["vocab_size"] = vocab_size
if (self.dir_model / "tokenizer.json").is_file():
self._set_vocab_gpt2()
else:
# Use the GPT-NeoX tokenizer when no tokenizer files are present
tokenizer_path = Path(sys.path[0]) / "models" / "ggml-vocab-gpt-neox.gguf"
print(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'")
neox_reader = gguf.GGUFReader(tokenizer_path, "r")
field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL)
self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]))
field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST)
self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
field = neox_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE)
self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
field = neox_reader.get_field(gguf.Keys.Tokenizer.MERGES)
self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])
field = neox_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)
self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0])
field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)
self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0])
field = neox_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)
self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0])
def set_gguf_parameters(self):
d_model = self.find_hparam(["hidden_size", "d_model"])
d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4
d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model
d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16
# ceiling division
# ref: https://stackoverflow.com/a/17511341/22827863
# ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58
dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16)
rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5
# Fail early for models which don't have a block expansion factor of 2
assert d_inner == 2 * d_model
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
self.gguf_writer.add_embedding_length(d_model)
self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading
self.gguf_writer.add_block_count(self.hparams["n_layer"])
self.gguf_writer.add_ssm_conv_kernel(d_conv)
self.gguf_writer.add_ssm_inner_size(d_inner)
self.gguf_writer.add_ssm_state_size(d_state)
self.gguf_writer.add_ssm_time_step_rank(dt_rank)
self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
self.gguf_writer.add_file_type(self.ftype)
def write_tensors(self):
block_count = self.hparams["n_layer"]
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
tok_embd = None
tok_embd_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.TOKEN_EMBD] + ".weight"
output_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.OUTPUT] + ".weight"
for name, data_torch in self.get_tensors():
old_dtype = data_torch.dtype
# convert any unsupported data types to float32
if data_torch.dtype not in (torch.float16, torch.float32):
data_torch = data_torch.to(torch.float32)
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None:
print(f"Can not map tensor {name!r}")
sys.exit()
if name.endswith(".A_log"):
print("A_log --> A ==> " + new_name)
data_torch = -torch.exp(data_torch)
# assuming token_embd.weight is seen before output.weight
if tok_embd is not None and new_name == output_name:
if torch.equal(tok_embd, data_torch):
print(f"{output_name} is equivalent to {tok_embd_name}, omitting")
continue
if new_name == tok_embd_name:
tok_embd = data_torch
data = data_torch.squeeze().numpy()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if self.ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert big float32 2-dim weight tensors to float16
if self.ftype == 1 and data_dtype == np.float32 and new_name.removesuffix(".weight").endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2:
data = data.astype(np.float16)
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data)
@Model.register("CohereForCausalLM")
class CommandR2Model(Model):
model_arch = gguf.MODEL_ARCH.COMMAND_R
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# max_position_embeddings = 8192 in config.json but model was actually
# trained on 128k context length
self.hparams["max_position_embeddings"] = self.hparams["model_max_length"]
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_logit_scale(self.hparams["logit_scale"])
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
###### CONVERSION LOGIC ######

View File

@@ -373,7 +373,7 @@ def handle_metadata(cfg, hp):
raise ValueError('Unable to load metadata')
vocab_path = Path(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir)
vocab_factory = convert.VocabFactory(vocab_path)
vocab, special_vocab = vocab_factory.load_vocab(cfg.vocabtype.split(","), cfg.model_metadata_dir)
vocab, special_vocab = vocab_factory.load_vocab(cfg.vocabtype, cfg.model_metadata_dir)
convert.check_vocab_size(params, vocab)
return params, vocab, special_vocab
@@ -398,8 +398,8 @@ def handle_args():
help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
parser.add_argument("--vocab-dir", type=Path,
help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
parser.add_argument("--vocabtype", default="spm,hfft",
help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm,hfft)")
parser.add_argument("--vocabtype", choices=["spm", "bpe"], default="spm",
help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)")
return parser.parse_args()

View File

@@ -106,12 +106,12 @@ def main():
tensor_map = gguf.get_tensor_name_map(arch, block_count)
print(tensor_map)
for name in tensors.keys():
data_torch = tensors[name]
data = tensors[name]
if name.endswith(".self_attention.rotary_emb.inv_freq"):
continue
old_dtype = data_torch.dtype
old_dtype = data.dtype
# TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
data = data_torch.to(torch.float32).squeeze().numpy()
data = data.to(torch.float32).squeeze().numpy()
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
if new_name is None:
print("Can not map tensor '" + name + "'")

View File

@@ -16,14 +16,13 @@ import re
import signal
import struct
import sys
import textwrap
import time
import zipfile
from abc import ABC, abstractmethod
from abc import ABCMeta, abstractmethod
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable
from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar
import numpy as np
from sentencepiece import SentencePieceProcessor
@@ -44,9 +43,6 @@ ARCH = gguf.MODEL_ARCH.LLAMA
DEFAULT_CONCURRENCY = 8
ADDED_TOKENS_FILE = 'added_tokens.json'
FAST_TOKENIZER_FILE = 'tokenizer.json'
#
# data types
#
@@ -192,10 +188,8 @@ class Params:
n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
if n_layer < 1:
msg = """\
failed to guess 'n_layer'. This model is unknown or unsupported.
Suggestion: provide 'config.json' of the model in the same directory containing model files."""
raise KeyError(textwrap.dedent(msg))
raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
"Suggestion: provide 'config.json' of the model in the same directory containing model files.")
n_head = n_embd // 128 # guessed
n_mult = 256 # guessed
@@ -217,8 +211,7 @@ class Params:
@staticmethod
def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
with open(config_path) as f:
config = json.load(f)
config = json.load(open(config_path))
rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
rope_scaling = config.get("rope_scaling")
@@ -240,10 +233,8 @@ class Params:
elif "max_position_embeddings" in config:
n_ctx = config["max_position_embeddings"]
else:
msg = """\
failed to guess 'n_ctx'. This model is unknown or unsupported.
Suggestion: provide 'config.json' of the model in the same directory containing model files."""
raise KeyError(textwrap.dedent(msg))
raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
"Suggestion: provide 'config.json' of the model in the same directory containing model files.")
n_experts = None
n_experts_used = None
@@ -274,8 +265,7 @@ class Params:
# {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1}
@staticmethod
def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
with open(config_path) as f:
config = json.load(f)
config = json.load(open(config_path))
n_experts = None
n_experts_used = None
@@ -341,86 +331,44 @@ class Params:
# vocab
#
@runtime_checkable
class BaseVocab(Protocol):
tokenizer_model: ClassVar[str]
name: ClassVar[str]
class NoVocab(BaseVocab):
tokenizer_model = "no_vocab"
name = "no_vocab"
def __repr__(self) -> str:
return "<NoVocab for a model without integrated vocabulary>"
@runtime_checkable
class Vocab(BaseVocab, Protocol):
vocab_size: int
added_tokens_dict: dict[str, int]
added_tokens_list: list[str]
fname_tokenizer: Path
def __init__(self, base_path: Path): ...
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: ...
class BpeVocab(Vocab):
tokenizer_model = "gpt2"
name = "bpe"
def __init__(self, base_path: Path):
added_tokens: dict[str, int] = {}
if (fname_tokenizer := base_path / 'vocab.json').exists():
# "slow" tokenizer
with open(fname_tokenizer, encoding="utf-8") as f:
self.vocab = json.load(f)
try:
# FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
added_tokens = json.load(f)
except FileNotFoundError:
pass
class BpeVocab:
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
if isinstance(self.bpe_tokenizer.get('model'), dict):
self.vocab = self.bpe_tokenizer["model"]["vocab"]
else:
# "fast" tokenizer
fname_tokenizer = base_path / FAST_TOKENIZER_FILE
self.vocab = self.bpe_tokenizer
added_tokens: dict[str, int]
if fname_added_tokens is not None:
# FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
else:
# Fall back to trying to find the added tokens in tokenizer.json
tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json'
if not tokenizer_json_file.is_file():
added_tokens = {}
else:
tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8"))
added_tokens = dict(
(item['content'], item['id'])
for item in tokenizer_json.get('added_tokens', [])
# Added tokens here can be duplicates of the main vocabulary.
if item['content'] not in self.bpe_tokenizer)
# if this fails, FileNotFoundError propagates to caller
with open(fname_tokenizer, encoding="utf-8") as f:
tokenizer_json = json.load(f)
tokenizer_model: dict[str, Any] = tokenizer_json['model']
if (
tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False)
or tokenizer_json['decoder']['type'] != 'ByteLevel'
):
raise FileNotFoundError('Cannot find GPT-2 BPE tokenizer')
self.vocab = tokenizer_model["vocab"]
if (added := tokenizer_json.get('added_tokens')) is not None:
# Added tokens here can be duplicates of the main vocabulary.
added_tokens = {item['content']: item['id']
for item in added
if item['content'] not in self.vocab}
vocab_size = len(self.vocab)
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
actual_ids = sorted(added_tokens.values())
vocab_size: int = len(self.vocab)
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
actual_ids = sorted(added_tokens.values())
if expected_ids != actual_ids:
expected_end_id = vocab_size + len(actual_ids) - 1
raise ValueError(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range "
f"{vocab_size} - {expected_end_id}; got {actual_ids}")
raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}")
items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
self.added_tokens_dict = added_tokens
self.added_tokens_list = [text for (text, idx) in items]
self.vocab_size_base = vocab_size
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
self.vocab_size_base: int = vocab_size
self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
self.fname_tokenizer = fname_tokenizer
self.fname_added_tokens = fname_added_tokens
def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
@@ -441,25 +389,16 @@ class BpeVocab(Vocab):
return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
class SentencePieceVocab(Vocab):
tokenizer_model = "llama"
name = "spm"
def __init__(self, base_path: Path):
added_tokens: dict[str, int] = {}
if (fname_tokenizer := base_path / 'tokenizer.model').exists():
# normal location
try:
with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
added_tokens = json.load(f)
except FileNotFoundError:
pass
elif not (fname_tokenizer := base_path.parent / 'tokenizer.model').exists():
# not found in alternate location either
raise FileNotFoundError('Cannot find tokenizer.model')
class SentencePieceVocab:
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
vocab_size = self.sentencepiece_tokenizer.vocab_size()
added_tokens: dict[str, int]
if fname_added_tokens is not None:
added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
else:
added_tokens = {}
vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
@@ -469,17 +408,18 @@ class SentencePieceVocab(Vocab):
raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
# Token pieces that were added to the base vocabulary.
self.added_tokens_dict = added_tokens
self.added_tokens_dict = added_tokens
self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
self.vocab_size_base = vocab_size
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
self.fname_tokenizer = fname_tokenizer
self.fname_added_tokens = fname_added_tokens
def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
tokenizer = self.sentencepiece_tokenizer
for i in range(tokenizer.vocab_size()):
piece = tokenizer.id_to_piece(i)
text = piece.encode("utf-8")
text: bytes = piece.encode("utf-8")
score: float = tokenizer.get_score(i)
toktype = gguf.TokenType.NORMAL
@@ -512,42 +452,24 @@ class SentencePieceVocab(Vocab):
return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
class LlamaHfVocab(Vocab):
tokenizer_model = "llama"
name = "hfft"
def __init__(self, base_path: Path, ignore_nonllama: bool = False):
fname_tokenizer = base_path / FAST_TOKENIZER_FILE
# if this fails, FileNotFoundError propagates to caller
with open(fname_tokenizer, encoding='utf-8') as f:
tokenizer_json = json.load(f)
# pre-check so we know if we need transformers
tokenizer_model: dict[str, Any] = tokenizer_json['model']
if ignore_nonllama:
pass # workaround incorrect use of this class for WordPiece
elif (
tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
or tokenizer_json['decoder']['type'] != 'Sequence'
):
raise FileNotFoundError('Cannot find Llama BPE tokenizer')
class HfVocab:
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None) -> None:
try:
from transformers import AutoTokenizer
except ImportError as e:
raise ImportError(
"To use LlamaHfVocab, please install the `transformers` package. "
"To use HfVocab, please install the `transformers` package. "
"You can install it with `pip install transformers`."
) from e
print("fname_tokenizer:", fname_tokenizer)
# Allow the tokenizer to default to slow or fast versions.
# Explicitly set tokenizer to use local paths.
self.tokenizer = AutoTokenizer.from_pretrained(
base_path,
cache_dir=base_path,
fname_tokenizer,
cache_dir=fname_tokenizer,
local_files_only=True,
)
assert self.tokenizer.is_fast # assume tokenizer.json is used
# Initialize lists and dictionaries for added tokens
self.added_tokens_list = []
@@ -575,7 +497,8 @@ class LlamaHfVocab(Vocab):
self.vocab_size_base = self.tokenizer.vocab_size
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
self.fname_tokenizer = fname_tokenizer
self.fname_tokenizer = fname_tokenizer
self.fname_added_tokens = fname_added_tokens
def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
reverse_vocab = {
@@ -627,7 +550,10 @@ class LlamaHfVocab(Vocab):
yield from self.added_tokens()
def __repr__(self) -> str:
return f"<LlamaHfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
return f"<HfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab"
#
@@ -645,7 +571,7 @@ def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
.reshape(weights.shape))
class Tensor(ABC):
class Tensor(metaclass=ABCMeta):
data_type: DataType
@abstractmethod
@@ -667,7 +593,7 @@ def bf16_to_fp32(bf16_arr: np.ndarray[Any, np.dtype[np.uint16]]) -> NDArray:
class UnquantizedTensor(Tensor):
def __init__(self, ndarray: NDArray):
def __init__(self, ndarray: NDArray) -> None:
assert isinstance(ndarray, np.ndarray)
self.ndarray = ndarray
self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]
@@ -746,7 +672,7 @@ class ModelPlus:
model: LazyModel
paths: list[Path] # Where this was read from.
format: Literal['ggml', 'torch', 'safetensors', 'none']
vocab: BaseVocab | None # For GGML models (which have vocab built in), the vocab.
vocab: Vocab | None # For GGML models (which have vocab built in), the vocab.
def merge_sharded(models: list[LazyModel]) -> LazyModel:
@@ -755,7 +681,7 @@ def merge_sharded(models: list[LazyModel]) -> LazyModel:
names = {name: None for model in models for name in model}
def convert(name: str) -> LazyTensor:
lazy_tensors = [model[name] for model in models]
lazy_tensors: list[LazyTensor] = [model[name] for model in models]
if len(lazy_tensors) == 1:
# only one file; don't go through this procedure since there might
# be quantized tensors
@@ -776,7 +702,7 @@ def merge_sharded(models: list[LazyModel]) -> LazyModel:
def load() -> UnquantizedTensor:
ndarrays = [load_unquantized(tensor) for tensor in lazy_tensors]
concatenated = np.concatenate(ndarrays, axis=axis)
concatenated: NDArray = np.concatenate(ndarrays, axis=axis)
return UnquantizedTensor(concatenated)
description = 'concatenated[[' + '] | ['.join(lt.description for lt in lazy_tensors) + ']]'
return LazyTensor(load, concatenated_shape, lazy_tensors[0].data_type, description)
@@ -864,10 +790,10 @@ class LazyUnpickler(pickle.Unpickler):
def load(offset: int, elm_count: int) -> NDArray:
dtype = data_type.dtype
with self.zip_file.open(info) as fp:
fp.seek(offset * dtype.itemsize)
size = elm_count * dtype.itemsize
data = fp.read(size)
fp = self.zip_file.open(info)
fp.seek(offset * dtype.itemsize)
size = elm_count * dtype.itemsize
data = fp.read(size)
assert len(data) == size
return np.frombuffer(data, dtype)
description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
@@ -888,7 +814,7 @@ class LazyUnpickler(pickle.Unpickler):
def rebuild_from_type_v2(func, new_type, args, state):
return func(*args)
CLASSES = {
CLASSES: dict[tuple[str, str], Any] = {
# getattr used here as a workaround for mypy not being smart enough to determine
# the staticmethods have a __func__ attribute.
('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
@@ -947,7 +873,7 @@ def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus:
def must_read(fp: IO[bytes], length: int) -> bytes:
ret = fp.read(length)
if len(ret) < length:
raise EOFError("unexpectedly reached end of file")
raise Exception("unexpectedly reached end of file")
return ret
@@ -1005,15 +931,12 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
yield result
def check_vocab_size(params: Params, vocab: BaseVocab, pad_vocab: bool = False) -> None:
def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None:
# Handle special case where the model's vocab size is not set
if params.n_vocab == -1:
raise ValueError(
"The model's vocab size is set to -1 in params.json. Please update it manually."
+ (f" Maybe {vocab.vocab_size}?" if isinstance(vocab, Vocab) else ""),
f"The model's vocab size is set to -1 in params.json. Please update it manually. Maybe {vocab.vocab_size}?"
)
if not isinstance(vocab, Vocab):
return # model has no vocab
# Check for a vocab size mismatch
if params.n_vocab == vocab.vocab_size:
@@ -1037,11 +960,11 @@ def check_vocab_size(params: Params, vocab: BaseVocab, pad_vocab: bool = False)
if vocab.vocab_size < params.n_vocab:
msg += " Add the --pad-vocab option and try again."
raise ValueError(msg)
raise Exception(msg)
class OutputFile:
def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
def add_meta_arch(self, params: Params) -> None:
@@ -1054,7 +977,6 @@ class OutputFile:
name = str(params.path_model.parent).split('/')[-1]
self.gguf.add_name (name)
self.gguf.add_vocab_size (params.n_vocab)
self.gguf.add_context_length (params.n_ctx)
self.gguf.add_embedding_length (params.n_embd)
self.gguf.add_block_count (params.n_layer)
@@ -1091,6 +1013,20 @@ class OutputFile:
if params.ftype is not None:
self.gguf.add_file_type(params.ftype)
def handle_tokenizer_model(self, vocab: Vocab) -> str:
# Map the vocab types to the supported tokenizer models
tokenizer_model = {
SentencePieceVocab: "llama",
HfVocab: "llama",
BpeVocab: "gpt2",
}.get(type(vocab))
# Block if vocab type is not predefined
if tokenizer_model is None:
raise ValueError("Unknown vocab type: Not supported")
return tokenizer_model
def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]:
tokens = []
scores = []
@@ -1107,8 +1043,11 @@ class OutputFile:
return tokens, scores, toktypes
def add_meta_vocab(self, vocab: Vocab) -> None:
# Handle the tokenizer model
tokenizer_model = self.handle_tokenizer_model(vocab)
# Ensure that tokenizer_model is added to the GGUF model
self.gguf.add_tokenizer_model(vocab.tokenizer_model)
self.gguf.add_tokenizer_model(tokenizer_model)
# Extract model vocabulary for model conversion
tokens, scores, toktypes = self.extract_vocabulary_from_model(vocab)
@@ -1135,26 +1074,6 @@ class OutputFile:
def write_tensor_info(self) -> None:
self.gguf.write_ti_data_to_file()
def write_tensor_data(self, ftype: GGMLFileType, model: LazyModel, concurrency: int) -> None:
ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency=concurrency)
if ftype == GGMLFileType.MostlyQ8_0:
ndarrays = bounded_parallel_map(
OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency,
use_processpool_executor=True,
)
else:
ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
start = time.time()
for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
elapsed = time.time() - start
size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
padi = len(str(len(model)))
print(
f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
)
self.gguf.write_tensor_data(ndarray)
def close(self) -> None:
self.gguf.close()
@@ -1163,7 +1082,7 @@ class OutputFile:
fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
) -> None:
check_vocab_size(params, vocab, pad_vocab=pad_vocab)
check_vocab_size(params, vocab, pad_vocab = pad_vocab)
of = OutputFile(fname_out, endianess=endianess)
@@ -1191,7 +1110,7 @@ class OutputFile:
@staticmethod
def write_all(
fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab,
concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
pad_vocab: bool = False,
) -> None:
@@ -1201,11 +1120,8 @@ class OutputFile:
# meta data
of.add_meta_arch(params)
if isinstance(vocab, Vocab):
of.add_meta_vocab(vocab)
of.add_meta_special_vocab(svocab)
else: # NoVocab
of.gguf.add_tokenizer_model(vocab.tokenizer_model)
of.add_meta_vocab(vocab)
of.add_meta_special_vocab(svocab)
# tensor info
for name, lazy_tensor in model.items():
@@ -1215,7 +1131,24 @@ class OutputFile:
of.write_tensor_info()
# tensor data
of.write_tensor_data(ftype, model, concurrency)
ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency)
if ftype == GGMLFileType.MostlyQ8_0:
ndarrays = bounded_parallel_map(
OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency,
use_processpool_executor=True,
)
else:
ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
start = time.time()
for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
elapsed = time.time() - start
size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
padi = len(str(len(model)))
print(
f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
)
of.gguf.write_tensor_data(ndarray)
of.close()
@@ -1223,16 +1156,16 @@ class OutputFile:
def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) + ".weight"].data_type
if output_type_str == "f32" or (output_type_str is None and wq_type in (DT_F32, DT_BF16)):
if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
return GGMLFileType.AllF32
if output_type_str == "f16" or (output_type_str is None and wq_type == DT_F16):
if output_type_str == "f16" or (output_type_str is None and wq_type in (DT_F16, DT_BF16)):
return GGMLFileType.MostlyF16
if output_type_str == "q8_0":
return GGMLFileType.MostlyQ8_0
name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()}
raise ValueError(f"Unexpected combination of types: {name_to_type}")
raise Exception(f"Unexpected combination of types: {name_to_type}")
def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
@@ -1242,7 +1175,7 @@ def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyM
def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) -> LazyModel:
tmap = gguf.TensorNameMap(ARCH, params.n_layer)
should_skip = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))
should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))
tmp = model
@@ -1269,7 +1202,8 @@ def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) ->
if skip_unknown:
print(f"Unexpected tensor name: {name} - skipping")
continue
raise ValueError(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)")
else:
raise Exception(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)")
if tensor_type in should_skip:
print(f"skipping tensor {name_new}")
@@ -1286,7 +1220,7 @@ def nth_multifile_path(path: Path, n: int) -> Path | None:
the nth path in the model.
'''
# Support the following patterns:
patterns = [
patterns: list[tuple[str, str]] = [
# - x.00.pth, x.01.pth, etc.
(r'\.[0-9]{2}\.pth$', f'.{n:02}.pth'),
# - x-00001-of-00002.bin, x-00002-of-00002.bin, etc.
@@ -1332,9 +1266,9 @@ def load_some_model(path: Path) -> ModelPlus:
globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
files = [file for glob in globs for file in path.glob(glob)]
if not files:
raise FileNotFoundError(f"Can't find model in directory {path}")
raise Exception(f"Can't find model in directory {path}")
if len(files) > 1:
raise ValueError(f"Found multiple models in {path}, not sure which to pick: {files}")
raise Exception(f"Found multiple models in {path}, not sure which to pick: {files}")
path = files[0]
paths = find_multifile_paths(path)
@@ -1348,14 +1282,39 @@ def load_some_model(path: Path) -> ModelPlus:
class VocabFactory:
_VOCAB_CLASSES: list[type[Vocab]] = [SentencePieceVocab, BpeVocab, LlamaHfVocab]
def __init__(self, path: Path):
self.path = path
self.files: dict[str, Path | None] = {
"tokenizer.model": None,
"vocab.json": None,
"tokenizer.json": None,
}
self._detect_files()
def _create_special_vocab(self, vocab: BaseVocab, model_parent_path: Path) -> gguf.SpecialVocab:
load_merges = vocab.name == "bpe"
n_vocab = vocab.vocab_size if isinstance(vocab, Vocab) else None
def _detect_files(self):
for file in self.files.keys():
file_path = self.path / file
parent_file_path = self.path.parent / file
if file_path.exists():
self.files[file] = file_path
elif parent_file_path.exists():
self.files[file] = parent_file_path
print(f"Found vocab files: {self.files}")
def _select_file(self, vocabtype: str | None) -> Path:
if vocabtype in ["spm", "bpe"]:
for file_key in self.files.keys():
if (file := self.files[file_key]) is not None:
return file
raise FileNotFoundError(f"{vocabtype} vocab not found.")
if vocabtype == "hfft":
# For Hugging Face Fast Tokenizer, return the directory path instead of a specific file
return self.path
raise ValueError(f"Unsupported vocabulary type {vocabtype}")
def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab:
load_merges = vocabtype == "bpe"
n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None
return gguf.SpecialVocab(
model_parent_path,
load_merges=load_merges,
@@ -1363,36 +1322,30 @@ class VocabFactory:
n_vocab=n_vocab,
)
def _create_vocab_by_path(self, vocab_types: list[str]) -> Vocab:
vocab_classes: dict[str, type[Vocab]] = {cls.name: cls for cls in self._VOCAB_CLASSES}
selected_vocabs: dict[str, type[Vocab]] = {}
for vtype in vocab_types:
try:
selected_vocabs[vtype] = vocab_classes[vtype]
except KeyError:
raise ValueError(f"Unsupported vocabulary type {vtype}") from None
def load_vocab(self, vocabtype: str, model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
path = self._select_file(vocabtype)
print(f"Loading vocab file '{path}', type '{vocabtype}'")
for vtype, cls in selected_vocabs.items():
try:
vocab = cls(self.path)
break
except FileNotFoundError:
pass # ignore unavailable tokenizers
added_tokens_path = path.parent / "added_tokens.json"
vocab: Vocab
if vocabtype == "bpe":
vocab = BpeVocab(
path, added_tokens_path if added_tokens_path.exists() else None
)
elif vocabtype == "spm":
vocab = SentencePieceVocab(
path, added_tokens_path if added_tokens_path.exists() else None
)
elif vocabtype == "hfft":
vocab = HfVocab(
path, added_tokens_path if added_tokens_path.exists() else None
)
else:
raise FileNotFoundError(f"Could not find a tokenizer matching any of {vocab_types}")
print(f"Loaded vocab file {vocab.fname_tokenizer!r}, type {vocab.name!r}")
return vocab
def load_vocab(self, vocab_types: list[str] | None, model_parent_path: Path) -> tuple[BaseVocab, gguf.SpecialVocab]:
vocab: BaseVocab
if vocab_types is None:
vocab = NoVocab()
else:
vocab = self._create_vocab_by_path(vocab_types)
raise ValueError(f"Unsupported vocabulary type {vocabtype}")
# FIXME: Respect --vocab-dir?
special_vocab = self._create_special_vocab(
vocab,
vocabtype,
model_parent_path,
)
return vocab, special_vocab
@@ -1426,14 +1379,15 @@ def main(args_in: list[str] | None = None) -> None:
if np.uint32(1) == np.uint32(1).newbyteorder("<"):
# We currently only support Q8_0 output on little endian systems.
output_choices.append("q8_0")
parser = argparse.ArgumentParser(description="Convert a LLaMA model to a GGML compatible file")
vocab_types = ["spm", "bpe", "hfft"]
parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
parser.add_argument("--awq-path", type=Path, help="Path to scale awq cache file", default=None)
parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
parser.add_argument("--no-vocab", action="store_true", help="store model without the vocab")
parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
parser.add_argument("--vocab-type", help="vocab types to try in order, choose from 'spm', 'bpe', 'hfft' (default: spm,hfft)", default="spm,hfft")
parser.add_argument("--vocab-type", choices=vocab_types, help="The vocabulary format used to define the tokenizer model (default: spm)", default="spm")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
@@ -1443,8 +1397,18 @@ def main(args_in: list[str] | None = None) -> None:
parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")
args = parser.parse_args(args_in)
if args.no_vocab and args.vocab_only:
raise ValueError("--vocab-only does not make sense with --no-vocab")
if args.awq_path:
sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
from awq.apply_awq import add_scale_weights # type: ignore[import-not-found]
tmp_model_path = args.model / "weighted_model"
if tmp_model_path.is_dir():
print(f"{tmp_model_path} exists as a weighted model.")
else:
tmp_model_path.mkdir(parents=True, exist_ok=True)
print("Saving new weighted model ...")
add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path))
print(f"Saved weighted model at {tmp_model_path}.")
args.model = tmp_model_path
if args.dump_single:
model_plus = lazy_load_file(args.model)
@@ -1466,12 +1430,10 @@ def main(args_in: list[str] | None = None) -> None:
params = Params.load(model_plus)
if params.n_ctx == -1:
if args.ctx is None:
msg = """\
The model doesn't have a context size, and you didn't specify one with --ctx
Please specify one with --ctx:
- LLaMA v1: --ctx 2048
- LLaMA v2: --ctx 4096"""
parser.error(textwrap.dedent(msg))
raise Exception("The model doesn't have a context size, and you didn't specify one with --ctx\n"
"Please specify one with --ctx:\n"
" - LLaMA v1: --ctx 2048\n"
" - LLaMA v2: --ctx 4096\n")
params.n_ctx = args.ctx
if args.outtype:
@@ -1486,11 +1448,9 @@ def main(args_in: list[str] | None = None) -> None:
model_parent_path = model_plus.paths[0].parent
vocab_path = Path(args.vocab_dir or args.model or model_parent_path)
vocab_factory = VocabFactory(vocab_path)
vocab_types = None if args.no_vocab else args.vocab_type.split(",")
vocab, special_vocab = vocab_factory.load_vocab(vocab_types, model_parent_path)
vocab, special_vocab = vocab_factory.load_vocab(args.vocab_type, model_parent_path)
if args.vocab_only:
assert isinstance(vocab, Vocab)
if not args.outfile:
raise ValueError("need --outfile if using --vocab-only")
outfile = args.outfile
@@ -1499,7 +1459,7 @@ def main(args_in: list[str] | None = None) -> None:
print(f"Wrote {outfile}")
return
if model_plus.vocab is not None and args.vocab_dir is None and not args.no_vocab:
if model_plus.vocab is not None and args.vocab_dir is None:
vocab = model_plus.vocab
print(f"Vocab info: {vocab}")

View File

@@ -1,7 +1,7 @@
# Token generation performance troubleshooting
## Verifying that the model is running on the GPU with CUDA
Make sure you compiled llama with the correct env variables according to [this guide](../README.md#CUDA), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
## Verifying that the model is running on the GPU with cuBLAS
Make sure you compiled llama with the correct env variables according to [this guide](../README.md#cublas), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
```shell
./main -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
```

View File

@@ -20,8 +20,6 @@ else()
add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(embedding)
add_subdirectory(finetune)
add_subdirectory(gritlm)
add_subdirectory(gguf-split)
add_subdirectory(infill)
add_subdirectory(llama-bench)
add_subdirectory(llava)
@@ -34,7 +32,6 @@ else()
add_subdirectory(perplexity)
add_subdirectory(quantize)
add_subdirectory(quantize-stats)
add_subdirectory(retrieval)
add_subdirectory(save-load-state)
add_subdirectory(simple)
add_subdirectory(passkey)

View File

@@ -1015,9 +1015,9 @@ static struct ggml_tensor * forward_lora(
struct ggml_tensor * kc = kv_self.k;
struct ggml_tensor * vc = kv_self.v;
struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, N);
{
int * data = (int *) KQ_pos->data;
float * data = (float *) KQ_pos->data;
for (int i = 0; i < N; ++i) {
data[i] = n_past + i;
}
@@ -1547,7 +1547,7 @@ int main(int argc, char ** argv) {
float error_before_opt = ggml_get_f32_1d(e, 0);
struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_TYPE_LBFGS);
struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS);
opt_params_lbfgs.print_forward_graph = false;
opt_params_lbfgs.print_backward_graph = false;
opt_params_lbfgs.lbfgs.n_iter = 16;

View File

@@ -32,15 +32,16 @@ int main(int argc, char ** argv) {
gpt_params params;
if (argc == 1 || argv[1][0] == '-') {
printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>\n" , argv[0]);
printf(" <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n");
printf(" example: %s ggml-model-f16.gguf 2048 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
printf(" example: %s ggml-model-f16.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
return 1 ;
}
int n_kv_max = 2048;
int is_pp_shared = 0;
int n_gpu_layers = 0;
int mmq = 0;
std::vector<int> n_pp = { 128, 256, 512, 1024, 2048, 3584, 7680, };
std::vector<int> n_tg = { 128, 256, };
@@ -64,15 +65,19 @@ int main(int argc, char ** argv) {
}
if (argc >= 6) {
n_pp = parse_list(argv[5]);
mmq = std::atoi(argv[5]);
}
if (argc >= 7) {
n_tg = parse_list(argv[6]);
n_pp = parse_list(argv[6]);
}
if (argc >= 8) {
n_pl = parse_list(argv[7]);
n_tg = parse_list(argv[7]);
}
if (argc >= 9) {
n_pl = parse_list(argv[8]);
}
// init LLM
@@ -101,13 +106,11 @@ int main(int argc, char ** argv) {
ctx_params.seed = 1234;
ctx_params.n_ctx = n_kv_max;
ctx_params.n_batch = 512;
ctx_params.mul_mat_q = mmq;
ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
// ensure enough sequences are available
ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end());
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
if (ctx == NULL) {
@@ -138,8 +141,6 @@ int main(int argc, char ** argv) {
LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
return false;
}
llama_synchronize(ctx);
}
return true;
@@ -158,7 +159,7 @@ int main(int argc, char ** argv) {
}
LOG_TEE("\n");
LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq, ctx_params.n_threads, ctx_params.n_threads_batch);
LOG_TEE("\n");
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
@@ -179,10 +180,10 @@ int main(int argc, char ** argv) {
llama_batch_clear(batch);
for (int i = 0; i < pp; ++i) {
for (int j = 0; j < (is_pp_shared ? 1 : pl); ++j) {
llama_batch_add(batch, 0, i, { j }, false);
}
const int n_tokens = is_pp_shared ? pp : pl*pp;
for (int i = 0; i < n_tokens; ++i) {
llama_batch_add(batch, 0, i, { 0 }, false);
}
batch.logits[batch.n_tokens - 1] = true;
@@ -197,7 +198,7 @@ int main(int argc, char ** argv) {
if (is_pp_shared) {
for (int32_t i = 1; i < pl; ++i) {
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
llama_kv_cache_seq_cp(ctx, 0, i, 0, pp);
}
}

View File

@@ -79,7 +79,7 @@ batch.n_tokens = Int32(tokens.count)
for (i, token) in tokens.enumerated() {
batch.token[i] = token
batch.pos[i] = Int32(i)
batch.pos[i] = llama_pos(i)
batch.n_seq_id[i] = 1
// batch.seq_id[i][0] = 0
// TODO: is this the proper way to do this?
@@ -98,7 +98,7 @@ if llama_decode(context, batch) != 0 {
}
for i in 1 ..< n_parallel {
llama_kv_cache_seq_cp(context, 0, Int32(i), 0, batch.n_tokens)
llama_kv_cache_seq_cp(context, 0, Int32(i), 0, llama_pos(batch.n_tokens))
}
if n_parallel > 1 {
@@ -125,8 +125,8 @@ while n_cur <= n_len {
continue
}
var n_vocab = llama_n_vocab(model)
var logits = llama_get_logits_ith(context, i_batch[i])
let n_vocab = llama_n_vocab(model)
let logits = llama_get_logits_ith(context, i_batch[i])
var candidates: [llama_token_data] = .init(repeating: llama_token_data(), count: Int(n_vocab))
@@ -173,7 +173,7 @@ while n_cur <= n_len {
// push this new token for next evaluation
batch.token[Int(batch.n_tokens)] = new_token_id
batch.pos[Int(batch.n_tokens)] = n_cur
batch.pos[Int(batch.n_tokens)] = llama_pos(n_cur)
batch.n_seq_id[Int(batch.n_tokens)] = 1
if let seq_id = batch.seq_id[Int(batch.n_tokens)] {
seq_id[0] = Int32(i)

View File

@@ -48,8 +48,6 @@ int main(int argc, char ** argv) {
params.prompt = "Hello my name is";
}
process_escapes(params.prompt);
// init LLM
llama_backend_init();
@@ -80,9 +78,8 @@ int main(int argc, char ** argv) {
llama_context_params ctx_params = llama_context_default_params();
ctx_params.seed = 1234;
ctx_params.n_ctx = n_kv_req;
ctx_params.n_ctx = n_kv_req;
ctx_params.n_batch = std::max(n_len, n_parallel);
ctx_params.n_seq_max = n_parallel;
ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
@@ -135,7 +132,7 @@ int main(int argc, char ** argv) {
// assign the system KV cache to all parallel sequences
// this way, the parallel sequences will "reuse" the prompt tokens without having to copy them
for (int32_t i = 1; i < n_parallel; ++i) {
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
llama_kv_cache_seq_cp(ctx, 0, i, 0, batch.n_tokens);
}
if (n_parallel > 1) {

View File

@@ -189,10 +189,12 @@ int main(int argc, char ** argv) {
int32_t nelements = sizex*sizey;
std::vector<int64_t> hist_cur(1 << 4, 0);
// Set up a the benchmark matrices
// printf("Creating new tensor q11 & Running quantize\n");
struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements/m11->ne[0], m11->ne[0], nullptr);
ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements/m11->ne[0], m11->ne[0], hist_cur.data(), nullptr);
// Set up a the compute graph
// printf("Creating new tensor q31\n");
@@ -205,7 +207,7 @@ int main(int argc, char ** argv) {
// Set up a second graph computation to make sure we override the CPU cache lines
// printf("Creating new tensor q12 & Running quantize\n");
struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements/m12->ne[0], m12->ne[0], nullptr);
ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements/m12->ne[0], m12->ne[0], hist_cur.data(), nullptr);
// printf("Creating new tensor q32\n");
struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);

View File

@@ -21,8 +21,6 @@ An example command using a model from [karpathy/tinyllamas](https://huggingface.
`$ ./convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin`
Note: The vocabulary for `stories260K.bin` should be its own tokenizer `tok512.bin` found in [karpathy/tinyllamas/stories260K](https://huggingface.co/karpathy/tinyllamas/tree/main/stories260K).
Now you can use the model with a command like:
`$ ./main -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`

View File

@@ -1,7 +1,6 @@
#include "ggml.h"
#include "llama.h"
#include "common.h"
#include "log.h"
#include <unordered_map>
#include <vector>
@@ -79,101 +78,111 @@ typedef struct {
struct TransformerWeights {
// token embedding table
std::vector<float> token_embedding_table; // (vocab_size, dim)
float* token_embedding_table; // (vocab_size, dim)
// weights for rmsnorms
std::vector<float> rms_att_weight; // (layer, dim) rmsnorm weights
std::vector<float> rms_ffn_weight; // (layer, dim)
float* rms_att_weight; // (layer, dim) rmsnorm weights
float* rms_ffn_weight; // (layer, dim)
// weights for matmuls
std::vector<float> wq; // (layer, dim, dim)
std::vector<float> wk; // (layer, dim, dim)
std::vector<float> wv; // (layer, dim, dim)
std::vector<float> wo; // (layer, dim, dim)
float* wq; // (layer, dim, dim)
float* wk; // (layer, dim, dim)
float* wv; // (layer, dim, dim)
float* wo; // (layer, dim, dim)
// weights for ffn
std::vector<float> w1; // (layer, hidden_dim, dim)
std::vector<float> w2; // (layer, dim, hidden_dim)
std::vector<float> w3; // (layer, hidden_dim, dim)
float* w1; // (layer, hidden_dim, dim)
float* w2; // (layer, dim, hidden_dim)
float* w3; // (layer, hidden_dim, dim)
// final rmsnorm
std::vector<float> rms_final_weight; // (dim,)
float* rms_final_weight; // (dim,)
// freq_cis for RoPE relatively positional embeddings
// std::vector<float> freq_cis_real; // (seq_len, dim/2)
// std::vector<float> freq_cis_imag; // (seq_len, dim/2)
// float* freq_cis_real; // (seq_len, dim/2)
// float* freq_cis_imag; // (seq_len, dim/2)
// (optional) classifier weights for the logits, on the last layer
std::vector<float> wcls;
float* wcls;
~TransformerWeights() {
delete[] token_embedding_table;
delete[] rms_att_weight;
delete[] rms_ffn_weight;
delete[] wq;
delete[] wk;
delete[] wv;
delete[] wo;
delete[] w1;
delete[] w2;
delete[] w3;
delete[] rms_final_weight;
delete[] wcls;
}
};
static void alloc_weights(TransformerWeights * w, const Config * p, bool shared_weights) {
const int n_multiqueries = p->n_kv_heads <= 0 || p->n_kv_heads >= p->n_heads ? 1 : p->n_heads / p->n_kv_heads;
try {
w->token_embedding_table.resize(p->vocab_size * p->dim);
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
static void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
// we calloc instead of malloc to keep valgrind happy
w->token_embedding_table = new float[p->vocab_size * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
w->rms_att_weight.resize(p->n_layers * p->dim);
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
w->rms_att_weight = new float[p->n_layers * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
w->rms_ffn_weight.resize(p->n_layers * p->dim);
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
w->rms_ffn_weight = new float[p->n_layers * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
w->wq.resize(p->n_layers * p->dim * p->dim);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
w->wq = new float[p->n_layers * p->dim * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
w->wk.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
w->wk = new float[p->n_layers * p->dim * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
w->wv.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
w->wv = new float[p->n_layers * p->dim * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
w->wo.resize(p->n_layers * p->dim * p->dim);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
w->wo = new float[p->n_layers * p->dim * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
w->w1.resize(p->n_layers * p->hidden_dim * p->dim);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
w->w1 = new float[p->n_layers * p->hidden_dim * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
w->w2.resize(p->n_layers * p->hidden_dim * p->dim);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
w->w2 = new float[p->n_layers * p->hidden_dim * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
w->w3.resize(p->n_layers * p->hidden_dim * p->dim);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
w->w3 = new float[p->n_layers * p->hidden_dim * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
w->rms_final_weight.resize(p->dim);
LOG("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
w->rms_final_weight = new float[p->dim]();
printf("[%s:AK] Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
if (shared_weights) {
w->wcls = {};
} else {
w->wcls.resize(p->vocab_size * p->dim);
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
}
}
catch (std::length_error &) {
die("Invalid configuration. Failed to allocate memory for weights");
if (shared_weights) {
w->wcls = NULL;
} else {
w->wcls = new float[p->vocab_size * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
}
}
static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FILE * f, bool shared_weights) {
if (fread(w->token_embedding_table.data(), sizeof(float), w->token_embedding_table.size(), f) != w->token_embedding_table.size()) return 1;
if (fread(w->rms_att_weight.data(), sizeof(float), w->rms_att_weight.size(), f) != w->rms_att_weight.size()) return 1;
if (fread(w->wq.data(), sizeof(float), w->wq.size(), f) != w->wq.size()) return 1;
if (fread(w->wk.data(), sizeof(float), w->wk.size(), f) != w->wk.size()) return 1;
if (fread(w->wv.data(), sizeof(float), w->wv.size(), f) != w->wv.size()) return 1;
if (fread(w->wo.data(), sizeof(float), w->wo.size(), f) != w->wo.size()) return 1;
if (fread(w->rms_ffn_weight.data(), sizeof(float), w->rms_ffn_weight.size(), f) != w->rms_ffn_weight.size()) return 1;
if (fread(w->w1.data(), sizeof(float), w->w1.size(), f) != w->w1.size()) return 1;
if (fread(w->w2.data(), sizeof(float), w->w2.size(), f) != w->w2.size()) return 1;
if (fread(w->w3.data(), sizeof(float), w->w3.size(), f) != w->w3.size()) return 1;
if (fread(w->rms_final_weight.data(), sizeof(float), w->rms_final_weight.size(), f) != w->rms_final_weight.size()) return 1;
static int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) {
if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
if (fread(w->wk, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
if (fread(w->wv, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
if (fread(w->wo, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
if (fread(w->rms_ffn_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
if (fread(w->w1, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
if (fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->hidden_dim * p->dim)) return 1;
if (fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
if (fread(w->rms_final_weight, sizeof(float), p->dim, f) != static_cast<size_t>(p->dim)) return 1;
// Skip freq_cis_real & freq_cis_imag
int head_size = p->dim / p->n_heads;
fseek(f, p->seq_len * head_size * sizeof(float), SEEK_CUR);
if (!shared_weights && fread(w->wcls.data(), sizeof(float), w->wcls.size(), f) != w->wcls.size()) return 1;
if (!shared_weights && fread(w->wcls, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
// Check we didn't forget to read anything
auto curr = ftell(f);
fseek(f, 0, SEEK_END);
auto end = ftell(f);
if (curr != end) {
LOG("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end);
printf("Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", curr, end);
return 1;
}
@@ -181,20 +190,20 @@ static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FIL
}
static void print_sample_weights(TransformerWeights *w){
LOG("----- Quick print of first of the weight vales of all the variables\n");
LOG("%f\n", w->token_embedding_table[0]);
LOG("%f\n", w->rms_att_weight[0]);
LOG("%f\n", w->rms_ffn_weight[0]);
printf("----- Quick print of first of the weight vales of all the variables\n");
printf("%f\n", w->token_embedding_table[0]);
printf("%f\n", w->rms_att_weight[0]);
printf("%f\n", w->rms_ffn_weight[0]);
LOG("%f\n", w->wq[0]);
LOG("%f\n", w->wk[0]);
LOG("%f\n", w->wv[0]);
LOG("%f\n", w->wo[0]);
LOG("%f\n", w->w1[0]);
LOG("%f\n", w->w2[0]);
LOG("%f\n", w->w3[0]);
LOG("%f\n", w->rms_att_weight[0]);
if (!w->wcls.empty()) LOG("%f\n", w->wcls[0]);
printf("%f\n", w->wq[0]);
printf("%f\n", w->wk[0]);
printf("%f\n", w->wv[0]);
printf("%f\n", w->wo[0]);
printf("%f\n", w->w1[0]);
printf("%f\n", w->w2[0]);
printf("%f\n", w->w3[0]);
printf("%f\n", w->rms_att_weight[0]);
if (w->wcls) printf("%f\n", w->wcls[0]);
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -216,16 +225,14 @@ struct llama_vocab {
};
struct my_llama_hparams {
uint32_t n_vocab = 32000;
uint32_t n_ctx = 512; // this is provided as user input?
uint32_t n_embd = 4096;
uint32_t n_ff = 11008;
uint32_t n_mult = 4;
uint32_t n_head = 32;
uint32_t n_head_kv = 32;
uint32_t n_layer = 32;
uint32_t n_rot = 64;
uint32_t n_vocab = 32000;
uint32_t n_ctx = 512; // this is provided as user input?
uint32_t n_embd = 4096;
uint32_t n_ff = 11008;
uint32_t n_mult = 4;
uint32_t n_head = 32;
uint32_t n_layer = 32;
uint32_t n_rot = 64;
bool operator!=(const my_llama_hparams& other) const {
return memcmp(this, &other, sizeof(my_llama_hparams));
}
@@ -318,30 +325,14 @@ struct train_params {
};
static void print_params(struct my_llama_hparams * params) {
LOG("%s: n_vocab: %u\n", __func__, params->n_vocab);
LOG("%s: n_ctx: %u\n", __func__, params->n_ctx);
LOG("%s: n_embd: %u\n", __func__, params->n_embd);
LOG("%s: n_mult: %u\n", __func__, params->n_mult);
LOG("%s: n_head: %u\n", __func__, params->n_head);
LOG("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
LOG("%s: n_ff: %u\n", __func__, params->n_ff);
LOG("%s: n_layer: %u\n", __func__, params->n_layer);
LOG("%s: n_rot: %u\n", __func__, params->n_rot);
}
static void print_tensor_info(const struct ggml_context * ctx) {
for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
LOG("%s: Allocating ", __func__);
int64_t total = 1;
int i = 0;
for (; i < ggml_n_dims(t); ++i) {
if (i > 0) LOG("x ");
LOG("[%" PRId64 "] ", t->ne[i]);
total *= t->ne[i];
}
if (i > 1) LOG("= [%" PRId64 "] ", total);
LOG("float space for %s\n", ggml_get_name(t));
}
printf("%s: n_vocab: %u\n", __func__, params->n_vocab);
printf("%s: n_ctx: %u\n", __func__, params->n_ctx);
printf("%s: n_embd: %u\n", __func__, params->n_embd);
printf("%s: n_mult: %u\n", __func__, params->n_mult);
printf("%s: n_head: %u\n", __func__, params->n_head);
printf("%s: n_ff: %u\n", __func__, params->n_ff);
printf("%s: n_layer: %u\n", __func__, params->n_layer);
printf("%s: n_rot: %u\n", __func__, params->n_rot);
}
static void init_model(struct my_llama_model * model) {
@@ -351,8 +342,6 @@ static void init_model(struct my_llama_model * model) {
const uint32_t n_layer = hparams.n_layer;
const uint32_t n_vocab = hparams.n_vocab;
const uint32_t n_multiqueries = hparams.n_head_kv <= 0 || hparams.n_head_kv >= hparams.n_head ? 1 : hparams.n_head / hparams.n_head_kv;
const uint32_t n_ff = hparams.n_ff;
struct ggml_context * ctx = model->ctx;
@@ -361,8 +350,25 @@ static void init_model(struct my_llama_model * model) {
model->train_tokens = 0;
model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
printf("[%s:GG] Allocating [%u] x [%u] = [%u] float space for model->tok_embeddings\n",__func__,n_embd , n_vocab, n_embd * n_vocab);
model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
printf("[%s:GG] Allocating [%u] float space for model->norm\n",__func__,n_embd);
model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab);
// printing the per-layer allocations here so we dont print in the for loop.
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wq for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wk for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wv for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wo for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
printf("[%s:GG] Allocating [%u] float space for layer.ffn_norm for [%u] layers\n",__func__,n_embd, n_layer);
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w1 for [%u] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w2 for [%u] layers\n",__func__, n_embd, n_ff, n_ff * n_embd, n_layer);
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w3 for [%u] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
ggml_set_name(model->tok_embeddings, "tok_embeddings.weight");
ggml_set_name(model->norm, "norm.weight");
@@ -377,8 +383,8 @@ static void init_model(struct my_llama_model * model) {
layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd / n_multiqueries);
layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd / n_multiqueries);
layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
@@ -400,8 +406,6 @@ static void init_model(struct my_llama_model * model) {
ggml_format_name(layer.w2, "%s.feed_forward.w2.weight", layers_i.c_str());
ggml_format_name(layer.w3, "%s.feed_forward.w3.weight", layers_i.c_str());
}
print_tensor_info(ctx);
}
static float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
@@ -417,9 +421,9 @@ static int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
static void print_row(struct ggml_tensor * probs, int i) {
for (int k = 0; k < probs->ne[0]; ++k) {
float p = get_f32_2d(probs, k, i);
LOG(" %f", p);
printf(" %f", p);
}
LOG("\n");
printf("\n");
}
static void print_matrix(struct ggml_tensor * probs) {
@@ -427,12 +431,33 @@ static void print_matrix(struct ggml_tensor * probs) {
for (int i = 0; i < probs->ne[1]; ++i) {
for (int k = 0; k < probs->ne[0]; ++k) {
float p = get_f32_2d(probs, k, i);
LOG(" %.2f", p);
printf(" %.2f", p);
}
LOG("\n");
printf("\n");
}
}
#ifdef __GNUC__
#ifdef __MINGW32__
__attribute__((format(gnu_printf, 1, 2)))
#else
__attribute__((format(printf, 1, 2)))
#endif
#endif
static std::string format(const char * fmt, ...) {
va_list ap, ap2;
va_start(ap, fmt);
va_copy(ap2, ap);
int size = vsnprintf(NULL, 0, fmt, ap);
GGML_ASSERT(size >= 0 && size < INT_MAX);
std::vector<char> buf(size + 1);
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
GGML_ASSERT(size2 == size);
va_end(ap2);
va_end(ap);
return std::string(buf.data(), size);
}
struct llama_file {
// use FILE * so we don't have to re-open the file to mmap
FILE * fp;
@@ -524,9 +549,8 @@ static std::string llama_escape_whitespaces(const std::string & text) {
return out.str();
}
static void load_vocab(const char * filename, const Config * config, struct llama_vocab * vocab) {
static void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
if (is_ggml_file(filename)) {
LOG("%s: Loading vocabulary from gguf file %s\n", __func__, filename);
struct ggml_context * ctx_data = NULL;
struct gguf_init_params params = {
@@ -554,9 +578,6 @@ static void load_vocab(const char * filename, const Config * config, struct llam
const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
if (n_vocab != static_cast<uint32_t>(config->vocab_size)) {
die_fmt("vocab size mismatch: (gguf) %u != (llama2c) %d", n_vocab, config->vocab_size);
}
vocab->id_to_token.resize(n_vocab);
@@ -574,7 +595,7 @@ static void load_vocab(const char * filename, const Config * config, struct llam
gguf_free(ctx);
} else {
// assume llama2.c vocabulary
LOG("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
printf("Assuming llama2.c vocabulary since %s is not a gguf file\n", filename);
llama_file file(filename, "rb");
if (!file.fp) {
die_fmt("%s: %s", strerror(errno), filename);
@@ -617,15 +638,38 @@ static void load_vocab(const char * filename, const Config * config, struct llam
}
static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
int size = 1;
for (int dim = 0; dim < ggml_n_dims(gg_weights); ++dim) {
size *= gg_weights->ne[dim];
}
for (int ct = 0; ct < size; ++ct) {
int64_t i0 = 0; int64_t i1 = 0;
int64_t i2 = 0; int64_t i3 = 0;
ggml_unravel_index(gg_weights, ct, &i0, &i1, &i2, &i3);
ggml_set_f32_nd(gg_weights, i0, i1, i2, i3, karpathy_weights[ct]);
int ct;
switch (ggml_n_dims(gg_weights)) {
case 1:
ct = 0;
for (int i0 = 0; i0 < gg_weights->ne[0]; i0++){
float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0]);
*ptr = karpathy_weights[ct];
ct++;
}
break;
case 2:
ct = 0;
for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) {
for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) {
float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1]);
*ptr = karpathy_weights[ct];
ct++;
}
}
break;
case 3:
ct = 0;
for (int i2 = 0; i2 < gg_weights->ne[2]; i2++) {
for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) {
for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) {
float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1] + i2*gg_weights->nb[2]);
*ptr = karpathy_weights[ct];
ct++;
}
}
}
break;
}
}
@@ -635,18 +679,16 @@ static void save_as_llama_model(
// convert AK weights into GG weights one by one.
// w->token_embedding_table -> model->tok_embeddings
// float* -> struct ggml_tensor
convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table.data());
convert_weights_ak_to_gg(model->output, !w->wcls.empty() ? w->wcls.data() : w->token_embedding_table.data());
convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table);
convert_weights_ak_to_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table);
convert_weights_ak_to_gg(model->norm, w->rms_final_weight.data());
convert_weights_ak_to_gg(model->norm, w->rms_final_weight);
//print_row(model->norm, 0);
// for rms-att-weight
int row_length = model->hparams.n_embd;
int n_ff = model->hparams.n_ff;
const uint32_t n_multiqueries = model->hparams.n_head_kv <= 0 || model->hparams.n_head_kv >= model->hparams.n_head ? 1 : model->hparams.n_head / model->hparams.n_head_kv;
for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
auto & layer = model->layers[i];
// 1d
@@ -655,10 +697,9 @@ static void save_as_llama_model(
// from 3d matrix layer x dim x dim to 2d matrix dim x dim
convert_weights_ak_to_gg(layer.wq , &w->wq[i*row_length*row_length]);
convert_weights_ak_to_gg(layer.wk , &w->wk[i*row_length*row_length]);
convert_weights_ak_to_gg(layer.wv , &w->wv[i*row_length*row_length]);
convert_weights_ak_to_gg(layer.wo , &w->wo[i*row_length*row_length]);
// from 3d matrix layer x dim x dim to 2d matrix dim x dim / n_multiqueries
convert_weights_ak_to_gg(layer.wk , &w->wk[i*row_length*row_length/n_multiqueries]);
convert_weights_ak_to_gg(layer.wv , &w->wv[i*row_length*row_length/n_multiqueries]);
convert_weights_ak_to_gg(layer.w1 , &w->w1[i*row_length*n_ff]);
convert_weights_ak_to_gg(layer.w2 , &w->w2[i*n_ff*row_length]);
@@ -695,8 +736,8 @@ static void save_as_llama_model(
gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd);
gguf_set_val_u32(ctx, KV_FEED_FORWARD_LENGTH, model->hparams.n_ff);
gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head);
gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head);
gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, model->hparams.n_head_kv);
// n_head_kv is optional, default to n_head
// gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, ...);
gguf_set_val_u32(ctx, KV_BLOCK_COUNT, model->hparams.n_layer);
gguf_set_val_u32(ctx, KV_ROPE_DIMENSION_COUNT, model->hparams.n_rot);
gguf_set_val_f32(ctx, KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f);
@@ -748,12 +789,12 @@ static void save_as_llama_model(
static struct train_params get_default_train_params() {
struct train_params params;
params.fn_vocab_model = "models/7B/ggml-model-f16.gguf";
params.fn_vocab_model = "models/7B/ggml-model-f16.gguf";
params.fn_llama2c_output_model = "ak_llama_model.bin";
params.fn_train_data = "shakespeare.txt";
params.fn_checkpoint_in = "checkpoint.bin";
params.fn_checkpoint_out = "checkpoint.bin";
params.fn_model_out = "ggml-checkpoint-f32.bin";
params.fn_train_data = "shakespeare.txt";
params.fn_checkpoint_in = "checkpoint.bin";
params.fn_checkpoint_out = "checkpoint.bin";
params.fn_model_out = "ggml-checkpoint-f32.bin";
params.seed = -1;
@@ -788,8 +829,8 @@ static struct train_params get_default_train_params() {
params.adam_alpha = 1e-3f;
params.adam_decay = 1e-3f;
params.mem_model_gb = 2;
params.mem_compute_gb = 24;
params.mem_model_gb = 2;
params.mem_compute_gb = 24;
params.mem_compute0_gb = 8;
params.mem_compute1_gb = 2;
@@ -875,30 +916,19 @@ int main(int argc, char ** argv) {
if (!params_parse(argc, argv, &params)) {
return 1;
}
log_set_target(stdout);
Config config;
TransformerWeights weights = {};
{
LOG("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model);
FILE * file = fopen(params.fn_llama2c_model, "rb");
if (!file) {
LOG("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model);
return 1;
}
FILE *file = fopen(params.fn_llama2c_model, "rb");
if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; }
// read in the config header
if (fread(&config, sizeof(Config), 1, file) != 1) {
LOG("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model);
return 1;
}
if(fread(&config, sizeof(Config), 1, file) != 1) { return 1; }
auto shared_weights = config.vocab_size > 0;
config.vocab_size = abs(config.vocab_size);
// read in the Transformer weights
alloc_weights(&weights, &config, shared_weights);
if (checkpoint_init_weights(&weights, &config, file, shared_weights)) {
LOG("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model);
return 1;
}
malloc_weights(&weights, &config, shared_weights);
if(checkpoint_init_weights(&weights, &config, file, shared_weights)) { return 1; }
fclose(file);
}
@@ -906,18 +936,15 @@ int main(int argc, char ** argv) {
load_vocab(params.fn_vocab_model, &config, &vocab);
struct my_llama_model model;
model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx);
model.hparams.n_ctx = params.n_ctx;
model.hparams.n_embd = config.dim; //params.n_embd;
model.hparams.n_ff = config.hidden_dim;
model.hparams.n_mult = 32;//params.n_mult;
model.hparams.n_head = config.n_heads; //params.n_head;
model.hparams.n_head_kv = config.n_kv_heads;
model.hparams.n_layer = config.n_layers; //params.n_layer;
model.hparams.n_rot = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head);
model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx);
model.hparams.n_ctx = params.n_ctx;
model.hparams.n_embd = config.dim; //params.n_embd;
model.hparams.n_ff = config.hidden_dim;
model.hparams.n_mult = 32;//params.n_mult;
model.hparams.n_head = config.n_heads; //params.n_head;
model.hparams.n_layer = config.n_layers; //params.n_layer;
model.hparams.n_rot = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head);
print_params(&model.hparams);
struct ggml_init_params lcparams;
lcparams.mem_size = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb);
lcparams.mem_buffer = NULL;
@@ -929,7 +956,7 @@ int main(int argc, char ** argv) {
model.name = basename(params.fn_llama2c_model);
save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model);
LOG("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model);
printf("Saving llama.c model file %s in ggml format at %s\n", params.fn_llama2c_model, params.fn_llama2c_output_model);
ggml_free(model.ctx);
return 0;

View File

@@ -19,7 +19,18 @@ static std::vector<std::string> split_lines(const std::string & s) {
static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
for (size_t i = 0; i < tokens.size(); i++) {
llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
llama_batch_add(batch, tokens[i], i, { seq_id }, false);
}
}
static void normalize(float * vec, float * out, int n) {
float norm = 0;
for (int i = 0; i < n; i++) {
norm += vec[i] * vec[i];
}
norm = sqrt(norm);
for (int i = 0; i < n; i++) {
out[i] = vec[i] / norm;
}
}
@@ -33,23 +44,11 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
fprintf(stderr, "%s : failed to decode\n", __func__);
}
for (int i = 0; i < batch.n_tokens; i++) {
if (!batch.logits[i]) {
continue;
}
// try to get sequence embeddings - supported only when pooling_type is not NONE
const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
if (embd == NULL) {
embd = llama_get_embeddings_ith(ctx, i);
if (embd == NULL) {
fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
continue;
}
}
float * out = output + batch.seq_id[i][0] * n_embd;
llama_embd_normalize(embd, out, n_embd);
// normalize on copy
for (int k = 0; k < n_seq; k++) {
float * emb = llama_get_embeddings_ith(ctx, k);
float * out = output + k * n_embd;
normalize(emb, out, n_embd);
}
}
@@ -61,8 +60,6 @@ int main(int argc, char ** argv) {
}
params.embedding = true;
// For non-causal models, batch size must be equal to ubatch size
params.n_ubatch = params.n_batch;
print_build_info();
@@ -109,27 +106,18 @@ int main(int argc, char ** argv) {
// max batch size
const uint64_t n_batch = params.n_batch;
GGML_ASSERT(params.n_batch >= params.n_ctx);
GGML_ASSERT(params.n_batch == params.n_ctx);
// tokenize the prompts and trim
std::vector<std::vector<int32_t>> inputs;
for (const auto & prompt : prompts) {
auto inp = ::llama_tokenize(ctx, prompt, true, false);
auto inp = ::llama_tokenize(ctx, prompt, true);
if (inp.size() > n_batch) {
fprintf(stderr, "%s: error: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
__func__, (long long int) inp.size(), (long long int) n_batch);
return 1;
inp.resize(n_batch);
}
inputs.push_back(inp);
}
// add eos if not present
for (auto & inp : inputs) {
if (inp.empty() || inp.back() != llama_token_eos(model)) {
inp.push_back(llama_token_eos(model));
}
}
// tokenization stats
if (params.verbose_prompt) {
for (int i = 0; i < (int) inputs.size(); i++) {
@@ -144,7 +132,7 @@ int main(int argc, char ** argv) {
// initialize batch
const int n_prompts = prompts.size();
struct llama_batch batch = llama_batch_init(n_batch, 0, 1);
struct llama_batch batch = llama_batch_init(n_batch, 0, n_prompts);
// allocate output
const int n_embd = llama_n_embd(model);
@@ -157,7 +145,6 @@ int main(int argc, char ** argv) {
for (int k = 0; k < n_prompts; k++) {
// clamp to n_batch tokens
auto & inp = inputs[k];
const uint64_t n_toks = inp.size();
// encode if at capacity
@@ -178,28 +165,15 @@ int main(int argc, char ** argv) {
float * out = emb + p * n_embd;
batch_decode(ctx, batch, out, s, n_embd);
// print the first part of the embeddings or for a single prompt, the full embedding
fprintf(stdout, "\n");
for (int j = 0; j < n_prompts; j++) {
fprintf(stdout, "embedding %d: ", j);
for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
}
fprintf(stdout, "\n");
}
// print cosine similarity matrix
if (n_prompts > 1) {
fprintf(stdout, "\n");
printf("cosine similarity matrix:\n\n");
for (int i = 0; i < n_prompts; i++) {
for (int j = 0; j < n_prompts; j++) {
float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
fprintf(stdout, "%6.2f ", sim);
}
fprintf(stdout, "\n");
// print first 3 embeddings
for (int j = 0; j < std::min(3, n_prompts); j++) {
fprintf(stderr, "embedding %d: ", j);
for (int i = 0; i < n_embd; i++) {
fprintf(stderr, "%f ", emb[j * n_embd + i]);
}
fprintf(stderr, "\n\n");
}
fprintf(stderr, "\n");
// clean up
llama_print_timings(ctx);

View File

@@ -554,7 +554,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
};
// KQ_pos - contains the positions
struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, N);
ggml_set_input(KQ_pos);
// rope has so much parameters that we make a custom function for it
@@ -743,7 +743,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
// set KQ_pos
{
int * data = (int *) KQ_pos->data;
float * data = (float *) KQ_pos->data;
for (int i = 0; i < N; ++i) {
data[i] = n_past + i;
}
@@ -1531,7 +1531,7 @@ int main(int argc, char ** argv) {
lora.hparams.n_rank_output = n_rank_output;
// set opt params from command line
opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
opt->params = ggml_opt_default_params(GGML_OPT_ADAM);
opt->params.print_forward_graph = false;
opt->params.print_backward_graph = false;
opt->params.graph_size = LLAMA_TRAIN_MAX_NODES;

View File

@@ -1,5 +0,0 @@
set(TARGET gguf-split)
add_executable(${TARGET} gguf-split.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View File

@@ -1,9 +0,0 @@
## GGUF split Example
CLI to split / merge GGUF files.
**Command line options:**
- `--split`: split GGUF to multiple GGUF, default operation.
- `--split-max-tensors`: maximum tensors in each split: default(128)
- `--merge`: merge multiple GGUF to a single GGUF.

View File

@@ -1,468 +0,0 @@
#include "llama.h"
#include "common.h"
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <fstream>
#include <string>
#include <vector>
#include <stdio.h>
#include <string.h>
#include <climits>
#include <stdexcept>
#if defined(_WIN32)
#include <windows.h>
#ifndef PATH_MAX
#define PATH_MAX MAX_PATH
#endif
#include <io.h>
#endif
enum split_operation : uint8_t {
SPLIT_OP_SPLIT,
SPLIT_OP_MERGE,
};
struct split_params {
split_operation operation = SPLIT_OP_SPLIT;
int n_split_tensors = 128;
std::string input;
std::string output;
};
static void split_print_usage(const char * executable) {
const split_params default_params;
printf("\n");
printf("usage: %s [options] GGUF_IN GGUF_OUT\n", executable);
printf("\n");
printf("Apply a GGUF operation on IN to OUT.");
printf("\n");
printf("options:\n");
printf(" -h, --help show this help message and exit\n");
printf(" --version show version and build info\n");
printf(" --split split GGUF to multiple GGUF (default)\n");
printf(" --split-max-tensors max tensors in each split: default(%d)\n", default_params.n_split_tensors);
printf(" --merge merge multiple GGUF to a single GGUF\n");
printf("\n");
}
static bool split_params_parse_ex(int argc, const char ** argv, split_params & params) {
std::string arg;
const std::string arg_prefix = "--";
bool invalid_param = false;
int arg_idx = 1;
for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
arg = argv[arg_idx];
if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
std::replace(arg.begin(), arg.end(), '_', '-');
}
bool arg_found = false;
if (arg == "-h" || arg == "--help") {
split_print_usage(argv[0]);
exit(0);
}
if (arg == "--version") {
fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
exit(0);
}
if (arg == "--merge") {
arg_found = true;
params.operation = SPLIT_OP_MERGE;
}
if (arg == "--split") {
arg_found = true;
params.operation = SPLIT_OP_SPLIT;
}
if (arg == "--split-max-tensors") {
if (++arg_idx >= argc) {
invalid_param = true;
break;
}
arg_found = true;
params.n_split_tensors = atoi(argv[arg_idx]);
}
if (!arg_found) {
throw std::invalid_argument("error: unknown argument: " + arg);
}
}
if (invalid_param) {
throw std::invalid_argument("error: invalid parameter for argument: " + arg);
}
if (argc - arg_idx < 2) {
printf("%s: bad arguments\n", argv[0]);
split_print_usage(argv[0]);
return false;
}
params.input = argv[arg_idx++];
params.output = argv[arg_idx++];
return true;
}
static bool split_params_parse(int argc, const char ** argv, split_params & params) {
bool result = true;
try {
if (!split_params_parse_ex(argc, argv, params)) {
split_print_usage(argv[0]);
exit(EXIT_FAILURE);
}
}
catch (const std::invalid_argument & ex) {
fprintf(stderr, "%s\n", ex.what());
split_print_usage(argv[0]);
exit(EXIT_FAILURE);
}
return result;
}
static void zeros(std::ofstream & file, size_t n) {
char zero = 0;
for (size_t i = 0; i < n; ++i) {
file.write(&zero, 1);
}
}
struct split_strategy {
const split_params params;
std::ifstream & f_input;
struct gguf_context * ctx_gguf;
struct ggml_context * ctx_meta = NULL;
const int n_tensors;
const int n_split;
int i_split = 0;
int i_tensor = 0;
std::vector<uint8_t> read_data;
struct gguf_context * ctx_out;
std::ofstream fout;
split_strategy(const split_params & params,
std::ifstream & f_input,
struct gguf_context * ctx_gguf,
struct ggml_context * ctx_meta) :
params(params),
f_input(f_input),
ctx_gguf(ctx_gguf),
ctx_meta(ctx_meta),
n_tensors(gguf_get_n_tensors(ctx_gguf)),
n_split(std::ceil(1. * n_tensors / params.n_split_tensors)) {
}
bool should_split() const {
return i_tensor < n_tensors && i_tensor % params.n_split_tensors == 0;
}
void split_start() {
ctx_out = gguf_init_empty();
// Save all metadata in first split only
if (i_split == 0) {
gguf_set_kv(ctx_out, ctx_gguf);
}
gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_NO, i_split);
gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_COUNT, n_split);
gguf_set_val_i32(ctx_out, LLM_KV_SPLIT_TENSORS_COUNT, n_tensors);
// populate the original tensors, so we get an initial metadata
for (int i = i_split * params.n_split_tensors; i < n_tensors && i < (i_split + 1) * params.n_split_tensors; ++i) {
struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
gguf_add_tensor(ctx_out, meta);
}
char split_path[PATH_MAX] = {0};
llama_split_path(split_path, sizeof(split_path), params.output.c_str(), i_split, n_split);
fprintf(stderr, "%s: %s ...", __func__, split_path);
fout = std::ofstream(split_path, std::ios::binary);
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
auto meta_size = gguf_get_meta_size(ctx_out);
// placeholder for the meta data
::zeros(fout, meta_size);
i_split++;
}
void next_tensor() {
const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor);
struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name);
auto n_bytes = ggml_nbytes(t);
if (read_data.size() < n_bytes) {
read_data.resize(n_bytes);
}
auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor);
f_input.seekg(offset);
f_input.read((char *)read_data.data(), n_bytes);
t->data = read_data.data();
// write tensor data + padding
fout.write((const char *)t->data, n_bytes);
zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes);
i_tensor++;
}
void split_end() {
// go back to beginning of file and write the updated metadata
fout.seekp(0);
std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
gguf_get_meta_data(ctx_out, data.data());
fout.write((const char *)data.data(), data.size());
fout.close();
gguf_free(ctx_out);
fprintf(stderr, "\033[3Ddone\n");
}
};
static void gguf_split(const split_params & split_params) {
struct ggml_context * ctx_meta = NULL;
struct gguf_init_params params = {
/*.no_alloc = */ true,
/*.ctx = */ &ctx_meta,
};
std::ifstream f_input(split_params.input.c_str(), std::ios::binary);
if (!f_input.is_open()) {
fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_params.input.c_str());
exit(EXIT_FAILURE);
}
auto * ctx_gguf = gguf_init_from_file(split_params.input.c_str(), params);
if (!ctx_gguf) {
fprintf(stderr, "%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str());
exit(EXIT_FAILURE);
}
split_strategy strategy(split_params, f_input, ctx_gguf, ctx_meta);
char first_split_path[PATH_MAX] = {0};
llama_split_path(first_split_path, sizeof(first_split_path),
split_params.output.c_str(), strategy.i_split, strategy.n_split);
fprintf(stderr, "%s: %s -> %s (%d tensors per file)\n",
__func__, split_params.input.c_str(),
first_split_path,
split_params.n_split_tensors);
strategy.split_start();
while (strategy.i_tensor < strategy.n_tensors) {
strategy.next_tensor();
if (strategy.should_split()) {
strategy.split_end();
strategy.split_start();
}
}
strategy.split_end();
gguf_free(ctx_gguf);
f_input.close();
fprintf(stderr, "%s: %d gguf split written with a total of %d tensors.\n",
__func__, strategy.n_split, strategy.n_tensors);
}
static void gguf_merge(const split_params & split_params) {
fprintf(stderr, "%s: %s -> %s\n",
__func__, split_params.input.c_str(),
split_params.output.c_str());
int n_split = 1;
int total_tensors = 0;
auto * ctx_out = gguf_init_empty();
std::ofstream fout(split_params.output.c_str(), std::ios::binary);
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
std::vector<uint8_t> read_data;
std::vector<ggml_context *> ctx_metas;
std::vector<gguf_context *> ctx_ggufs;
char split_path[PATH_MAX] = {0};
strncpy(split_path, split_params.input.c_str(), sizeof(split_path) - 1);
char split_prefix[PATH_MAX] = {0};
// First pass to find KV and tensors metadata
for (int i_split = 0; i_split < n_split; i_split++) {
struct ggml_context * ctx_meta = NULL;
struct gguf_init_params params = {
/*.no_alloc = */ true,
/*.ctx = */ &ctx_meta,
};
if (i_split > 0) {
llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split);
}
fprintf(stderr, "%s: reading metadata %s ...", __func__, split_path);
auto * ctx_gguf = gguf_init_from_file(split_path, params);
if (!ctx_gguf) {
fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str());
exit(EXIT_FAILURE);
}
ctx_ggufs.push_back(ctx_gguf);
ctx_metas.push_back(ctx_meta);
if (i_split == 0) {
auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
if (key_n_split < 0) {
fprintf(stderr,
"\n%s: input file does not contain %s metadata\n",
__func__,
LLM_KV_SPLIT_COUNT);
gguf_free(ctx_gguf);
ggml_free(ctx_meta);
gguf_free(ctx_out);
fout.close();
exit(EXIT_FAILURE);
}
n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
if (n_split < 1) {
fprintf(stderr,
"\n%s: input file does not contain a valid split count %d\n",
__func__,
n_split);
gguf_free(ctx_gguf);
ggml_free(ctx_meta);
gguf_free(ctx_out);
fout.close();
exit(EXIT_FAILURE);
}
// Verify the file naming and extract split_prefix
if (!llama_split_prefix(split_prefix, sizeof (split_prefix), split_path, i_split, n_split)) {
fprintf(stderr, "\n%s: unexpected input file name: %s"
" i_split=%d"
" n_split=%d\n", __func__,
split_path, i_split, n_split);
gguf_free(ctx_gguf);
ggml_free(ctx_meta);
gguf_free(ctx_out);
fout.close();
exit(EXIT_FAILURE);
}
// Do not trigger merge if we try to merge again the output
gguf_set_val_u16(ctx_gguf, LLM_KV_SPLIT_COUNT, 0);
// Set metadata from the first split
gguf_set_kv(ctx_out, ctx_gguf);
}
auto n_tensors = gguf_get_n_tensors(ctx_gguf);
for (int i_tensor = 0; i_tensor < n_tensors; i_tensor++) {
const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor);
struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name);
gguf_add_tensor(ctx_out, t);
}
total_tensors += n_tensors;
fprintf(stderr, "\033[3Ddone\n");
}
// placeholder for the meta data
{
auto meta_size = gguf_get_meta_size(ctx_out);
::zeros(fout, meta_size);
}
// Write tensors data
for (int i_split = 0; i_split < n_split; i_split++) {
llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split);
std::ifstream f_input(split_path, std::ios::binary);
if (!f_input.is_open()) {
fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_path);
for (uint32_t i = 0; i < ctx_ggufs.size(); i++) {
gguf_free(ctx_ggufs[i]);
ggml_free(ctx_metas[i]);
}
gguf_free(ctx_out);
fout.close();
exit(EXIT_FAILURE);
}
fprintf(stderr, "%s: writing tensors %s ...", __func__, split_path);
auto * ctx_gguf = ctx_ggufs[i_split];
auto * ctx_meta = ctx_metas[i_split];
auto n_tensors = gguf_get_n_tensors(ctx_gguf);
for (int i_tensor = 0; i_tensor < n_tensors; i_tensor++) {
const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor);
struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name);
auto n_bytes = ggml_nbytes(t);
if (read_data.size() < n_bytes) {
read_data.resize(n_bytes);
}
auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor);
f_input.seekg(offset);
f_input.read((char *)read_data.data(), n_bytes);
// write tensor data + padding
fout.write((const char *)read_data.data(), n_bytes);
zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes);
}
gguf_free(ctx_gguf);
ggml_free(ctx_meta);
f_input.close();
fprintf(stderr, "\033[3Ddone\n");
}
{
// go back to beginning of file and write the updated metadata
fout.seekp(0);
std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
gguf_get_meta_data(ctx_out, data.data());
fout.write((const char *)data.data(), data.size());
fout.close();
gguf_free(ctx_out);
}
fprintf(stderr, "%s: %s merged from %d split with %d tensors.\n",
__func__, split_params.output.c_str(), n_split, total_tensors);
}
int main(int argc, const char ** argv) {
if (argc < 3) {
split_print_usage(argv[0]);
}
split_params params;
split_params_parse(argc, argv, params);
switch (params.operation) {
case SPLIT_OP_SPLIT: gguf_split(params);
break;
case SPLIT_OP_MERGE: gguf_merge(params);
break;
default: split_print_usage(argv[0]);
exit(EXIT_FAILURE);
}
return 0;
}

View File

@@ -211,7 +211,6 @@ static bool gguf_ex_read_1(const std::string & fname) {
for (int j = 0; j < ggml_nelements(cur); ++j) {
if (data[j] != 100 + i) {
fprintf(stderr, "%s: tensor[%d]: data[%d] = %f\n", __func__, i, j, data[j]);
gguf_free(ctx);
return false;
}
}

View File

@@ -1,5 +0,0 @@
set(TARGET gritlm)
add_executable(${TARGET} gritlm.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View File

@@ -1,62 +0,0 @@
## Generative Representational Instruction Tuning (GRIT) Example
[gritlm] a model which can generate embeddings as well as "normal" text
generation depending on the instructions in the prompt.
* Paper: https://arxiv.org/pdf/2402.09906.pdf
### Retrieval-Augmented Generation (RAG) use case
One use case for `gritlm` is to use it with RAG. If we recall how RAG works is
that we take documents that we want to use as context, to ground the large
language model (LLM), and we create token embeddings for them. We then store
these token embeddings in a vector database.
When we perform a query, prompt the LLM, we will first create token embeddings
for the query and then search the vector database to retrieve the most
similar vectors, and return those documents so they can be passed to the LLM as
context. Then the query and the context will be passed to the LLM which will
have to _again_ create token embeddings for the query. But because gritlm is used
the first query can be cached and the second query tokenization generation does
not have to be performed at all.
### Running the example
Download a Grit model:
```console
$ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf
```
Run the example using the downloaded model:
```console
$ ./gritlm -m gritlm-7b_q4_1.gguf
Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605
Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103
Cosine similarity between "Generative Representational Instruction Tuning" and "A purely peer-to-peer version of electronic cash w" is: 0.112
Cosine similarity between "Generative Representational Instruction Tuning" and "All text-based language problems can be reduced to" is: 0.547
Oh, brave adventurer, who dared to climb
The lofty peak of Mt. Fuji in the night,
When shadows lurk and ghosts do roam,
And darkness reigns, a fearsome sight.
Thou didst set out, with heart aglow,
To conquer this mountain, so high,
And reach the summit, where the stars do glow,
And the moon shines bright, up in the sky.
Through the mist and fog, thou didst press on,
With steadfast courage, and a steadfast will,
Through the darkness, thou didst not be gone,
But didst climb on, with a steadfast skill.
At last, thou didst reach the summit's crest,
And gazed upon the world below,
And saw the beauty of the night's best,
And felt the peace, that only nature knows.
Oh, brave adventurer, who dared to climb
The lofty peak of Mt. Fuji in the night,
Thou art a hero, in the eyes of all,
For thou didst conquer this mountain, so bright.
```
[gritlm]: https://github.com/ContextualAI/gritlm

View File

@@ -1,215 +0,0 @@
#include "common.h"
#include "llama.h"
#include <string>
#include <vector>
// #define GRIT_DEBUG
static std::vector<std::vector<float>> encode(llama_context * ctx, const std::vector<std::string> & sentences, const std::string & instruction) {
std::vector<std::vector<float>> result;
const llama_model * mdl = llama_get_model(ctx);
llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1);
for (uint64_t i = 0; i < sentences.size(); i++) {
llama_batch_clear(batch);
const std::string input_string = instruction + sentences[i];
std::vector<llama_token> inputs = llama_tokenize(mdl, input_string, true, false);
const int32_t n_toks = inputs.size();
// GritLM seems to have EOS = ""
// https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L18
// inputs.push_back(llama_token_eos(mdl));
// we want to ignore instruction tokens for mean pooling
const int32_t n_inst = llama_tokenize(mdl, instruction, true, false).size();
#ifdef GRIT_DEBUG
// debug tokens - should be matching as referenced in the GritLM sample
std::for_each(inputs.begin(), inputs.end(), [&ctx](llama_token t) {
std::printf("[%u:%s]", t, llama_token_to_piece(ctx, t).c_str());
});
std::printf("\n");
#endif
// add input to batch (this increments n_tokens)
for (int32_t j = 0; j < n_toks; j++) {
llama_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst);
}
// clear previous kv_cache values (irrelevant for embeddings)
llama_kv_cache_clear(ctx);
llama_set_causal_attn(ctx, false);
// run model
llama_decode(ctx, batch);
// get embedding dimensions
uint64_t n_embd = llama_n_embd(mdl);
// allocate embedding output
std::vector<float> emb_unorm(n_embd, 0.0f);
// sum up all token embeddings
for (int32_t k = n_inst; k < n_toks; k++) {
float * emb = llama_get_embeddings_ith(ctx, k);
for (uint64_t j = 0; j < n_embd; j++) {
emb_unorm[j] += emb[j];
}
}
// divide by number of tokens (mean pooling)
{
const uint64_t n_sent = n_toks - n_inst;
for (uint64_t j = 0; j < n_embd; j++) {
emb_unorm[j] /= n_sent;
}
}
std::vector<float> emb_norm(emb_unorm.size());
llama_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd);
result.push_back(emb_norm);
#ifdef GRIT_DEBUG
// print out emb_norm
std::printf("embedding %ld: ", i);
for (uint64_t j = 0; j < n_embd; j++) {
std::printf("%.5f ", emb_norm[j]);
}
std::printf("\n\n");
#endif
}
llama_batch_free(batch);
return result;
}
static std::string generate(llama_context * ctx, const std::string & prompt, bool stream) {
std::string result;
const llama_model * mdl = llama_get_model(ctx);
llama_token eos_token = llama_token_eos(mdl);
llama_kv_cache_clear(ctx);
llama_set_causal_attn(ctx, true);
llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);
std::vector<llama_token> inputs = llama_tokenize(mdl, prompt, false, true);
int32_t i_current_token = 0;
while (true) {
llama_batch_clear(bat);
auto n_inputs = (int32_t)inputs.size();
for (int32_t i = 0; i < n_inputs; i++) {
llama_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1);
}
inputs.clear();
llama_decode(ctx, bat);
auto logits = llama_get_logits_ith(ctx, bat.n_tokens - 1);
auto candidates = std::vector<llama_token_data>(llama_n_vocab(mdl));
auto n_candidates = (int32_t)candidates.size();
for (int32_t token = 0; token < n_candidates; token++) {
candidates[token] = llama_token_data{ token, logits[token], 0.0f };
}
auto candidates_p = llama_token_data_array{ candidates.data(), candidates.size(), false };
llama_token token = llama_sample_token_greedy(ctx, &candidates_p);
if (token == eos_token) {
break;
}
std::string piece = llama_token_to_piece(ctx, token);
if (stream) {
std::printf("%s", piece.c_str());
std::fflush(stdout);
}
inputs.push_back(token);
result += piece;
}
if (stream) {
std::printf("\n");
}
llama_batch_free(bat);
return result;
}
static std::string gritlm_instruction(const std::string & instruction) {
return !instruction.empty() ? "<|user|>\n" + instruction + "\n<|embed|>\n" : "<|embed|>\n";
}
int main(int argc, char * argv[]) {
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
return 1;
}
llama_model_params mparams = llama_model_params_from_gpt_params(params);
llama_context_params cparams = llama_context_params_from_gpt_params(params);
llama_backend_init();
llama_model * mdl = llama_load_model_from_file(params.model.c_str(), mparams);
// create new context - set to embedding mode
cparams.embeddings = true;
llama_context * ctx = llama_new_context_with_model(mdl, cparams);
// ### Embedding/Representation ###
// samples taken from: https://github.com/ContextualAI/gritlm#basic
{
const std::string instruction = "Given a scientific paper title, retrieve the paper's abstract";
const std::vector<std::string> queries = {
"Bitcoin: A Peer-to-Peer Electronic Cash System",
"Generative Representational Instruction Tuning",
};
const std::vector<std::string> documents = {
"A purely peer-to-peer version of electronic cash would allow online payments to be sent directly from one party to another without going through a financial institution. Digital signatures provide part of the solution, but the main benefits are lost if a trusted third party is still required to prevent double-spending. We propose a solution to the double-spending problem using a peer-to-peer network. The network timestamps transactions by hashing them into an ongoing chain of hash-based proof-of-work, forming a record that cannot be changed without redoing the proof-of-work. The longest chain not only serves as proof of the sequence of events witnessed, but proof that it came from the largest pool of CPU power. As long as a majority of CPU power is controlled by nodes that are not cooperating to attack the network, they'll generate the longest chain and outpace attackers. The network itself requires minimal structure. Messages are broadcast on a best effort basis, and nodes can leave and rejoin the network at will, accepting the longest proof-of-work chain as proof of what happened while they were gone.",
"All text-based language problems can be reduced to either generation or embedding. Current models only perform well at one or the other. We introduce generative representational instruction tuning (GRIT) whereby a large language model is trained to handle both generative and embedding tasks by distinguishing between them through instructions. Compared to other open models, our resulting GritLM 7B sets a new state of the art on the Massive Text Embedding Benchmark (MTEB) and outperforms all models up to its size on a range of generative tasks. By scaling up further, GritLM 8X7B outperforms all open generative language models that we tried while still being among the best embedding models. Notably, we find that GRIT matches training on only generative or embedding data, thus we can unify both at no performance loss. Among other benefits, the unification via GRIT speeds up Retrieval-Augmented Generation (RAG) by > 60% for long documents, by no longer requiring separate retrieval and generation models. Models, code, etc. are freely available at https://github.com/ContextualAI/gritlm.",
};
// No need to add instruction for retrieval documents
const std::vector<std::vector<float>> d_rep = encode(ctx, documents, gritlm_instruction(""));
const std::vector<std::vector<float>> q_rep = encode(ctx, queries, gritlm_instruction(instruction));
const int n_embd = llama_n_embd(mdl);
const float cosine_sim_q0_d0 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd);
const float cosine_sim_q0_d1 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd);
const float cosine_sim_q1_d0 = llama_embd_similarity_cos(q_rep[1].data(), d_rep[0].data(), n_embd);
const float cosine_sim_q1_d1 = llama_embd_similarity_cos(q_rep[1].data(), d_rep[1].data(), n_embd);
std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[0].c_str(), cosine_sim_q0_d0);
std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[1].c_str(), cosine_sim_q0_d1);
std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[1].c_str(), documents[0].c_str(), cosine_sim_q1_d0);
std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[1].c_str(), documents[1].c_str(), cosine_sim_q1_d1);
}
// ### Generation ###
// GritLM models are not finetuned with system prompts, as you can just include system-like instructions together with your user instruction
{
const std::string prompt = "<|user|>\nPlease write me a poem about my recent hike of Mt. Fuji at midnight in the style of Shakespeare.\n<|assistant|>\n";
std::string response = generate(ctx, prompt, true);
}
llama_free(ctx);
llama_free_model(mdl);
llama_backend_free();
return 0;
}

View File

@@ -22,7 +22,7 @@ For faster computation, make sure to use GPU offloading via the `-ngl` argument
## Example
```bash
LLAMA_CUDA=1 make -j
LLAMA_CUBLAS=1 make -j
# generate importance matrix (imatrix.dat)
./imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99

View File

@@ -50,31 +50,11 @@ private:
void keep_imatrix(int ncall) const;
};
// remove any prefix and suffixes from the name
// CUDA0#blk.0.attn_k.weight#0 => blk.0.attn_k.weight
static std::string filter_tensor_name(const char * name) {
std::string wname;
const char * p = strchr(name, '#');
if (p != NULL) {
p = p + 1;
const char * q = strchr(p, '#');
if (q != NULL) {
wname = std::string(p, q - p);
} else {
wname = p;
}
} else {
wname = name;
}
return wname;
}
bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
GGML_UNUSED(user_data);
const struct ggml_tensor * src0 = t->src[0];
const struct ggml_tensor * src1 = t->src[1];
std::string wname = filter_tensor_name(src0->name);
// when ask is true, the scheduler wants to know if we are interested in data from this tensor
// if we return true, a follow-up call will be made with ask=false in which we can do the actual collection
@@ -82,7 +62,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications
if (t->op != GGML_OP_MUL_MAT) return false;
if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
if (!(wname.substr(0, 4) == "blk." || (m_params.collect_output_weight && wname == "output.weight"))) return false;
if (!(strncmp(src0->name, "blk.", 4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return false;
return true;
}
@@ -114,13 +94,12 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
// this is necessary to guarantee equal number of "ncall" for each tensor
for (int ex = 0; ex < n_as; ++ex) {
src0 = t->src[2 + ex];
wname = filter_tensor_name(src0->name);
auto& e = m_stats[wname];
auto& e = m_stats[src0->name];
if (e.values.empty()) {
e.values.resize(src1->ne[0], 0);
}
else if (e.values.size() != (size_t)src1->ne[0]) {
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]);
exit(1); //GGML_ASSERT(false);
}
// NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger
@@ -128,7 +107,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
//if (idx == t->src[0]->ne[0] - 1) ++e.ncall;
++e.ncall;
if (m_params.verbosity > 1) {
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
}
for (int row = 0; row < (int)src1->ne[1]; ++row) {
const int excur = m_ids[row*n_as + idx];
@@ -150,17 +129,17 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
}
}
} else {
auto& e = m_stats[wname];
auto& e = m_stats[src0->name];
if (e.values.empty()) {
e.values.resize(src1->ne[0], 0);
}
else if (e.values.size() != (size_t)src1->ne[0]) {
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]);
exit(1); //GGML_ASSERT(false);
}
++e.ncall;
if (m_params.verbosity > 1) {
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
}
for (int row = 0; row < (int)src1->ne[1]; ++row) {
const float * x = data + row * src1->ne[0];
@@ -424,7 +403,6 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
}
// TODO: use batch.logits to save computations instead of relying on logits_all == true
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return false;

View File

@@ -378,10 +378,10 @@ int main(int argc, char ** argv) {
if (params.interactive) {
const char *control_message;
if (params.multiline_input) {
control_message = " - To return control to LLaMA, end your input with '\\'.\n"
control_message = " - To return control to LLaMa, end your input with '\\'.\n"
" - To return control without starting a new line, end your input with '/'.\n";
} else {
control_message = " - Press Return to return control to LLaMA.\n"
control_message = " - Press Return to return control to LLaMa.\n"
" - To return control without starting a new line, end your input with '/'.\n"
" - If you want to submit another line, end your input with '\\'.\n";
}
@@ -447,8 +447,8 @@ int main(int argc, char ** argv) {
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
n_past, n_left, n_ctx, params.n_keep, n_discard);
llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
llama_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
n_past -= n_discard;

View File

@@ -1,74 +0,0 @@
# Usage:
#! ./server -m some-model.gguf &
#! pip install pydantic
#! python json-schema-pydantic-example.py
from pydantic import BaseModel, TypeAdapter
from annotated_types import MinLen
from typing import Annotated, List, Optional
import json, requests
if True:
def create_completion(*, response_model=None, endpoint="http://localhost:8080/v1/chat/completions", messages, **kwargs):
'''
Creates a chat completion using an OpenAI-compatible endpoint w/ JSON schema support
(llama.cpp server, llama-cpp-python, Anyscale / Together...)
The response_model param takes a type (+ supports Pydantic) and behaves just as w/ Instructor (see below)
'''
if response_model:
type_adapter = TypeAdapter(response_model)
schema = type_adapter.json_schema()
messages = [{
"role": "system",
"content": f"You respond in JSON format with the following schema: {json.dumps(schema, indent=2)}"
}] + messages
response_format={"type": "json_object", "schema": schema}
data = requests.post(endpoint, headers={"Content-Type": "application/json"},
json=dict(messages=messages, response_format=response_format, **kwargs)).json()
if 'error' in data:
raise Exception(data['error']['message'])
content = data["choices"][0]["message"]["content"]
return type_adapter.validate_json(content) if type_adapter else content
else:
# This alternative branch uses Instructor + OpenAI client lib.
# Instructor support streamed iterable responses, retry & more.
# (see https://python.useinstructor.com/)
#! pip install instructor openai
import instructor, openai
client = instructor.patch(
openai.OpenAI(api_key="123", base_url="http://localhost:8080"),
mode=instructor.Mode.JSON_SCHEMA)
create_completion = client.chat.completions.create
if __name__ == '__main__':
class QAPair(BaseModel):
question: str
concise_answer: str
justification: str
class PyramidalSummary(BaseModel):
title: str
summary: str
question_answers: Annotated[List[QAPair], MinLen(2)]
sub_sections: Optional[Annotated[List['PyramidalSummary'], MinLen(2)]]
print("# Summary\n", create_completion(
model="...",
response_model=PyramidalSummary,
messages=[{
"role": "user",
"content": f"""
You are a highly efficient corporate document summarizer.
Create a pyramidal summary of an imaginary internal document about our company processes
(starting high-level, going down to each sub sections).
Keep questions short, and answers even shorter (trivia / quizz style).
"""
}]))

View File

@@ -1,10 +1,8 @@
#!/usr/bin/env python3
import argparse
import itertools
import json
import re
import sys
from typing import Any, Dict, List, Set, Tuple, Union
# whitespace is constrained to a single space char to prevent model "running away" in
# whitespace. Also maybe improves generation quality?
@@ -14,54 +12,26 @@ PRIMITIVE_RULES = {
'boolean': '("true" | "false") space',
'number': '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space',
'integer': '("-"? ([0-9] | [1-9] [0-9]*)) space',
'value' : 'object | array | string | number | boolean',
'object' : '"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space',
'array' : '"[" space ( value ("," space value)* )? "]" space',
'uuid' : '"\\"" ' + ' "-" '.join('[0-9a-fA-F]' * n for n in [8, 4, 4, 4, 12]) + ' "\\"" space',
'string': r''' "\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space''',
)* "\"" space ''',
'null': '"null" space',
}
OBJECT_RULE_NAMES = ['object', 'array', 'string', 'number', 'boolean', 'null', 'value']
# TODO: support "uri", "email" string formats
DATE_RULES = {
'date' : '[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )',
'time' : '([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )',
'date-time': 'date "T" time',
'date-string': '"\\"" date "\\"" space',
'time-string': '"\\"" time "\\"" space',
'date-time-string': '"\\"" date-time "\\"" space',
}
RESERVED_NAMES = set(["root", *PRIMITIVE_RULES.keys(), *DATE_RULES.keys()])
INVALID_RULE_CHARS_RE = re.compile(r'[^a-zA-Z0-9-]+')
GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"]')
GRAMMAR_RANGE_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"\]\-\\]')
GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]'}
GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"'}
NON_LITERAL_SET = set('|.()[]{}*+?')
ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('[]()|{}*+?')
DATE_PATTERN = '[0-9]{4}-(0[1-9]|1[0-2])-([0-2][0-9]|3[0-1])'
TIME_PATTERN = '([01][0-9]|2[0-3])(:[0-5][0-9]){2}(\\.[0-9]{1,3})?(Z|[+-](([01][0-9]|2[0-3]):[0-5][0-9]))' # Cap millisecond precision w/ 3 digits
class SchemaConverter:
def __init__(self, *, prop_order, allow_fetch, dotall, raw_pattern):
def __init__(self, prop_order):
self._prop_order = prop_order
self._allow_fetch = allow_fetch
self._dotall = dotall
self._raw_pattern = raw_pattern
self._rules = {'space': SPACE_RULE}
self._refs = {}
self._refs_being_resolved = set()
def _format_literal(self, literal):
escaped = GRAMMAR_LITERAL_ESCAPE_RE.sub(
lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)), literal
lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)), json.dumps(literal)
)
return f'"{escaped}"'
@@ -71,420 +41,78 @@ class SchemaConverter:
key = esc_name
else:
i = 0
while f'{esc_name}{i}' in self._rules and self._rules[f'{esc_name}{i}'] != rule:
while f'{esc_name}{i}' in self._rules:
i += 1
key = f'{esc_name}{i}'
self._rules[key] = rule
return key
def resolve_refs(self, schema: dict, url: str):
'''
Resolves all $ref fields in the given schema, fetching any remote schemas,
replacing $ref with absolute reference URL and populating self._refs with the
respective referenced (sub)schema dictionaries.
'''
def visit(n: dict):
if isinstance(n, list):
return [visit(x) for x in n]
elif isinstance(n, dict):
ref = n.get('$ref')
if ref is not None and ref not in self._refs:
if ref.startswith('https://'):
assert self._allow_fetch, 'Fetching remote schemas is not allowed (use --allow-fetch for force)'
import requests
frag_split = ref.split('#')
base_url = frag_split[0]
target = self._refs.get(base_url)
if target is None:
target = self.resolve_refs(requests.get(ref).json(), base_url)
self._refs[base_url] = target
if len(frag_split) == 1 or frag_split[-1] == '':
return target
elif ref.startswith('#/'):
target = schema
ref = f'{url}{ref}'
n['$ref'] = ref
else:
raise ValueError(f'Unsupported ref {ref}')
for sel in ref.split('#')[-1].split('/')[1:]:
assert target is not None and sel in target, f'Error resolving ref {ref}: {sel} not in {target}'
target = target[sel]
self._refs[ref] = target
else:
for v in n.values():
visit(v)
return n
return visit(schema)
def _generate_union_rule(self, name, alt_schemas):
return ' | '.join((
self.visit(alt_schema, f'{name}{"-" if name else "alternative-"}{i}')
for i, alt_schema in enumerate(alt_schemas)
))
def _visit_pattern(self, pattern, name):
'''
Transforms a regular expression pattern into a GBNF rule.
Input: https://json-schema.org/understanding-json-schema/reference/regular_expressions
Output: https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md
Unsupported features: negative/positive lookaheads, greedy/non-greedy modifiers.
Mostly a 1:1 translation, except for {x} / {x,} / {x,y} quantifiers for which
we define sub-rules to keep the output lean.
'''
assert pattern.startswith('^') and pattern.endswith('$'), 'Pattern must start with "^" and end with "$"'
pattern = pattern[1:-1]
sub_rule_ids = {}
i = 0
length = len(pattern)
def to_rule(s: Tuple[str, bool]) -> str:
(txt, is_literal) = s
return "\"" + txt + "\"" if is_literal else txt
def transform() -> Tuple[str, bool]:
'''
Parse a unit at index i (advancing it), and return its string representation + whether it's a literal.
'''
nonlocal i
nonlocal pattern
nonlocal sub_rule_ids
start = i
# For each component of this sequence, store its string representation and whether it's a literal.
# We only need a flat structure here to apply repetition operators to the last item, and
# to merge literals at the and (we're parsing grouped ( sequences ) recursively and don't treat '|' specially
# (GBNF's syntax is luckily very close to regular expressions!)
seq: list[Tuple[str, bool]] = []
def get_dot():
if self._dotall:
rule = '[\\U00000000-\\U0010FFFF]'
else:
# Accept any character... except \n and \r line break chars (\x0A and \xOD)
rule = '[\\U00000000-\\x09\\x0B\\x0C\\x0E-\\U0010FFFF]'
return self._add_rule(f'dot', rule)
def join_seq():
nonlocal seq
ret = []
for is_literal, g in itertools.groupby(seq, lambda x: x[1]):
if is_literal:
ret.append((''.join(x[0] for x in g), True))
else:
ret.extend(g)
if len(ret) == 1:
return ret[0]
return (' '.join(to_rule(x) for x in seq), False)
while i < length:
c = pattern[i]
if c == '.':
seq.append((get_dot(), False))
i += 1
elif c == '(':
i += 1
if i < length:
assert pattern[i] != '?', f'Unsupported pattern syntax "{pattern[i]}" at index {i} of /{pattern}/'
seq.append((f'({to_rule(transform())})', False))
elif c == ')':
i += 1
assert start > 0 and pattern[start-1] == '(', f'Unbalanced parentheses; start = {start}, i = {i}, pattern = {pattern}'
return join_seq()
elif c == '[':
square_brackets = c
i += 1
while i < length and pattern[i] != ']':
if pattern[i] == '\\':
square_brackets += pattern[i:i+2]
i += 2
else:
square_brackets += pattern[i]
i += 1
assert i < length, f'Unbalanced square brackets; start = {start}, i = {i}, pattern = {pattern}'
square_brackets += ']'
i += 1
seq.append((square_brackets, False))
elif c == '|':
seq.append(('|', False))
i += 1
elif c in ('*', '+', '?'):
seq[-1] = (to_rule(seq[-1]) + c, False)
i += 1
elif c == '{':
curly_brackets = c
i += 1
while i < length and pattern[i] != '}':
curly_brackets += pattern[i]
i += 1
assert i < length, f'Unbalanced curly brackets; start = {start}, i = {i}, pattern = {pattern}'
curly_brackets += '}'
i += 1
nums = [s.strip() for s in curly_brackets[1:-1].split(',')]
min_times = 0
max_times = None
try:
if len(nums) == 1:
min_times = int(nums[0])
max_times = min_times
else:
assert len(nums) == 2
min_times = int(nums[0]) if nums[0] else 0
max_times = int(nums[1]) if nums[1] else None
except ValueError:
raise ValueError(f'Invalid quantifier {curly_brackets} in /{pattern}/')
(sub, sub_is_literal) = seq[-1]
if min_times == 0 and max_times is None:
seq[-1] = (f'{sub}*', False)
elif min_times == 0 and max_times == 1:
seq[-1] = (f'{sub}?', False)
elif min_times == 1 and max_times is None:
seq[-1] = (f'{sub}+', False)
else:
if not sub_is_literal:
id = sub_rule_ids.get(sub)
if id is None:
id = self._add_rule(f'{name}-{len(sub_rule_ids) + 1}', sub)
sub_rule_ids[sub] = id
sub = id
seq[-1] = (
' '.join(
([f'"{sub[1:-1] * min_times}"'] if sub_is_literal else [sub] * min_times) +
([f'{sub}?'] * (max_times - min_times) if max_times is not None else [f'{sub}*'])),
False
)
else:
literal = ''
while i < length:
if pattern[i] == '\\' and i < length - 1:
next = pattern[i + 1]
if next in ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS:
i += 1
literal += pattern[i]
i += 1
else:
literal += pattern[i:i+2]
i += 2
elif pattern[i] == '"' and not self._raw_pattern:
literal += '\\"'
i += 1
elif pattern[i] not in NON_LITERAL_SET and \
(i == length - 1 or literal == '' or pattern[i+1] == '.' or pattern[i+1] not in NON_LITERAL_SET):
literal += pattern[i]
i += 1
else:
break
if literal:
seq.append((literal, True))
return join_seq()
return self._add_rule(
name,
to_rule(transform()) if self._raw_pattern \
else "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space")
def _resolve_ref(self, ref):
ref_name = ref.split('/')[-1]
if ref_name not in self._rules and ref not in self._refs_being_resolved:
self._refs_being_resolved.add(ref)
resolved = self._refs[ref]
ref_name = self.visit(resolved, ref_name)
self._refs_being_resolved.remove(ref)
return ref_name
def _generate_constant_rule(self, value):
return self._format_literal(json.dumps(value))
def visit(self, schema, name):
schema_type = schema.get('type')
schema_format = schema.get('format')
rule_name = name + '-' if name in RESERVED_NAMES else name or 'root'
rule_name = name or 'root'
if (ref := schema.get('$ref')) is not None:
return self._add_rule(rule_name, self._resolve_ref(ref))
elif 'oneOf' in schema or 'anyOf' in schema:
return self._add_rule(rule_name, self._generate_union_rule(name, schema.get('oneOf') or schema['anyOf']))
elif isinstance(schema_type, list):
return self._add_rule(rule_name, self._generate_union_rule(name, [{'type': t} for t in schema_type]))
elif 'const' in schema:
return self._add_rule(rule_name, self._generate_constant_rule(schema['const']))
elif 'enum' in schema:
rule = ' | '.join((self._generate_constant_rule(v) for v in schema['enum']))
if 'oneOf' in schema or 'anyOf' in schema:
rule = ' | '.join((
self.visit(alt_schema, f'{name}{"-" if name else ""}{i}')
for i, alt_schema in enumerate(schema.get('oneOf') or schema['anyOf'])
))
return self._add_rule(rule_name, rule)
elif schema_type in (None, 'object') and \
('properties' in schema or \
('additionalProperties' in schema and schema['additionalProperties'] is not True)):
required = set(schema.get('required', []))
properties = list(schema.get('properties', {}).items())
return self._add_rule(rule_name, self._build_object_rule(properties, required, name, schema.get('additionalProperties')))
elif 'const' in schema:
return self._add_rule(rule_name, self._format_literal(schema['const']))
elif schema_type in (None, 'object') and 'allOf' in schema:
required = set()
properties = []
hybrid_name = name
def add_component(comp_schema, is_required):
if (ref := comp_schema.get('$ref')) is not None:
comp_schema = self._refs[ref]
elif 'enum' in schema:
rule = ' | '.join((self._format_literal(v) for v in schema['enum']))
return self._add_rule(rule_name, rule)
if 'properties' in comp_schema:
for prop_name, prop_schema in comp_schema['properties'].items():
properties.append((prop_name, prop_schema))
if is_required:
required.add(prop_name)
for t in schema['allOf']:
if 'anyOf' in t:
for tt in t['anyOf']:
add_component(tt, is_required=False)
else:
add_component(t, is_required=True)
return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=[]))
elif schema_type in (None, 'array') and ('items' in schema or 'prefixItems' in schema):
items = schema.get('items') or schema['prefixItems']
if isinstance(items, list):
return self._add_rule(
rule_name,
'"[" space ' +
' "," space '.join(
self.visit(item, f'{name}{"-" if name else ""}tuple-{i}')
for i, item in enumerate(items)) +
' "]" space')
else:
item_rule_name = self.visit(items, f'{name}{"-" if name else ""}item')
list_item_operator = f'( "," space {item_rule_name} )'
successive_items = ""
min_items = schema.get("minItems", 0)
max_items = schema.get("maxItems")
if min_items > 0:
successive_items = list_item_operator * (min_items - 1)
min_items -= 1
if max_items is not None and max_items > min_items:
successive_items += (list_item_operator + "?") * (max_items - min_items - 1)
else:
successive_items += list_item_operator + "*"
if min_items == 0:
rule = f'"[" space ( {item_rule_name} {successive_items} )? "]" space'
else:
rule = f'"[" space {item_rule_name} {successive_items} "]" space'
return self._add_rule(rule_name, rule)
elif schema_type in (None, 'string') and 'pattern' in schema:
return self._visit_pattern(schema['pattern'], rule_name)
elif schema_type in (None, 'string') and re.match(r'^uuid[1-5]?$', schema_format or ''):
return self._add_rule(
'root' if rule_name == 'root' else schema_format,
PRIMITIVE_RULES['uuid']
elif schema_type == 'object' and 'properties' in schema:
# TODO: `required` keyword
prop_order = self._prop_order
prop_pairs = sorted(
schema['properties'].items(),
# sort by position in prop_order (if specified) then by key
key=lambda kv: (prop_order.get(kv[0], len(prop_order)), kv[0]),
)
elif schema_type in (None, 'string') and schema_format in DATE_RULES:
for t, r in DATE_RULES.items():
self._add_rule(t, r)
return schema_format + '-string'
rule = '"{" space'
for i, (prop_name, prop_schema) in enumerate(prop_pairs):
prop_rule_name = self.visit(prop_schema, f'{name}{"-" if name else ""}{prop_name}')
if i > 0:
rule += ' "," space'
rule += fr' {self._format_literal(prop_name)} space ":" space {prop_rule_name}'
rule += ' "}" space'
elif (schema_type == 'object') or (len(schema) == 0):
for n in OBJECT_RULE_NAMES:
self._add_rule(n, PRIMITIVE_RULES[n])
return self._add_rule(rule_name, 'object')
return self._add_rule(rule_name, rule)
elif schema_type == 'array' and 'items' in schema:
# TODO `prefixItems` keyword
item_rule_name = self.visit(schema['items'], f'{name}{"-" if name else ""}item')
list_item_operator = f'("," space {item_rule_name})'
successive_items = ""
min_items = schema.get("minItems", 0)
if min_items > 0:
first_item = f"({item_rule_name})"
successive_items = list_item_operator * (min_items - 1)
min_items -= 1
else:
first_item = f"({item_rule_name})?"
max_items = schema.get("maxItems")
if max_items is not None and max_items > min_items:
successive_items += (list_item_operator + "?") * (max_items - min_items - 1)
else:
successive_items += list_item_operator + "*"
rule = f'"[" space {first_item} {successive_items} "]" space'
return self._add_rule(rule_name, rule)
else:
assert schema_type in PRIMITIVE_RULES, f'Unrecognized schema: {schema}'
# TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
return self._add_rule(
'root' if rule_name == 'root' else schema_type,
PRIMITIVE_RULES[schema_type]
)
def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], name: str, additional_properties: Union[bool, Any]):
prop_order = self._prop_order
# sort by position in prop_order (if specified) then by original order
sorted_props = [kv[0] for _, kv in sorted(enumerate(properties), key=lambda ikv: (prop_order.get(ikv[1][0], len(prop_order)), ikv[0]))]
prop_kv_rule_names = {}
for prop_name, prop_schema in properties:
prop_rule_name = self.visit(prop_schema, f'{name}{"-" if name else ""}{prop_name}')
prop_kv_rule_names[prop_name] = self._add_rule(
f'{name}{"-" if name else ""}{prop_name}-kv',
fr'{self._format_literal(json.dumps(prop_name))} space ":" space {prop_rule_name}'
)
required_props = [k for k in sorted_props if k in required]
optional_props = [k for k in sorted_props if k not in required]
if additional_properties == True or isinstance(additional_properties, dict):
sub_name = f'{name}{"-" if name else ""}additional'
value_rule = self.visit({} if additional_properties == True else additional_properties, f'{sub_name}-value')
prop_kv_rule_names["*"] = self._add_rule(
f'{sub_name}-kv',
self._add_rule('string', PRIMITIVE_RULES['string']) + f' ":" space {value_rule}'
)
optional_props.append("*")
rule = '"{" space '
rule += ' "," space '.join(prop_kv_rule_names[k] for k in required_props)
if optional_props:
rule += ' ('
if required_props:
rule += ' "," space ( '
def get_recursive_refs(ks, first_is_optional):
[k, *rest] = ks
kv_rule_name = prop_kv_rule_names[k]
if k == '*':
res = self._add_rule(
f'{name}{"-" if name else ""}additional-kvs',
f'{kv_rule_name} ( "," space ' + kv_rule_name + ' )*'
)
elif first_is_optional:
res = f'( "," space {kv_rule_name} )?'
else:
res = kv_rule_name
if len(rest) > 0:
res += ' ' + self._add_rule(
f'{name}{"-" if name else ""}{k}-rest',
get_recursive_refs(rest, first_is_optional=True)
)
return res
rule += ' | '.join(
get_recursive_refs(optional_props[i:], first_is_optional=False)
for i in range(len(optional_props))
)
if required_props:
rule += ' )'
rule += ' )?'
rule += ' "}" space'
return rule
def format_grammar(self):
return '\n'.join(
f'{name} ::= {rule}'
for name, rule in sorted(self._rules.items(), key=lambda kv: kv[0])
)
return '\n'.join((f'{name} ::= {rule}' for name, rule in self._rules.items()))
def main(args_in = None):
@@ -501,47 +129,16 @@ def main(args_in = None):
type=lambda s: s.split(','),
help='''
comma-separated property names defining the order of precedence for object properties;
properties not specified here are given lower precedence than those that are, and
are kept in their original order from the schema. Required properties are always
given precedence over optional properties.
properties not specified here are given lower precedence than those that are, and are
sorted alphabetically
'''
)
parser.add_argument(
'--allow-fetch',
action='store_true',
default=False,
help='Whether to allow fetching referenced schemas over HTTPS')
parser.add_argument(
'--dotall',
action='store_true',
default=False,
help='Whether to treat dot (".") as matching all chars including line breaks in regular expression patterns')
parser.add_argument(
'--raw-pattern',
action='store_true',
default=False,
help='Treats string patterns as raw patterns w/o quotes (or quote escapes)')
parser.add_argument('schema', help='file containing JSON schema ("-" for stdin)')
args = parser.parse_args(args_in)
if args.schema.startswith('https://'):
url = args.schema
import requests
schema = requests.get(url).json()
elif args.schema == '-':
url = 'stdin'
schema = json.load(sys.stdin)
else:
url = f'file://{args.schema}'
with open(args.schema) as f:
schema = json.load(f)
converter = SchemaConverter(
prop_order={name: idx for idx, name in enumerate(args.prop_order)},
allow_fetch=args.allow_fetch,
dotall=args.dotall,
raw_pattern=args.raw_pattern)
schema = converter.resolve_refs(schema, url)
schema = json.load(sys.stdin if args.schema == '-' else open(args.schema))
prop_order = {name: idx for idx, name in enumerate(args.prop_order)}
converter = SchemaConverter(prop_order)
converter.visit(schema, '')
print(converter.format_grammar())

View File

@@ -35,6 +35,7 @@ options:
-mg, --main-gpu <i> (default: 0)
-nkvo, --no-kv-offload <0|1> (default: 0)
-mmp, --mmap <0|1> (default: 1)
-mmq, --mul-mat-q <0|1> (default: 1)
-ts, --tensor_split <ts0/ts1/..> (default: 0)
-r, --repetitions <n> (default: 5)
-o, --output <csv|json|md|sql> (default: md)

View File

@@ -8,7 +8,6 @@
#include <cstdio>
#include <cstring>
#include <ctime>
#include <cstdlib>
#include <iterator>
#include <map>
#include <numeric>
@@ -104,7 +103,6 @@ static std::string get_cpu_info() {
}
}
}
fclose(f);
}
#endif
// TODO: other platforms
@@ -113,11 +111,11 @@ static std::string get_cpu_info() {
static std::string get_gpu_info() {
std::string id;
#ifdef GGML_USE_CUDA
int count = ggml_backend_cuda_get_device_count();
#ifdef GGML_USE_CUBLAS
int count = ggml_cuda_get_device_count();
for (int i = 0; i < count; i++) {
char buf[128];
ggml_backend_cuda_get_device_description(i, buf, sizeof(buf));
ggml_cuda_get_device_description(i, buf, sizeof(buf));
id += buf;
if (i < count - 1) {
id += "/";
@@ -125,15 +123,20 @@ static std::string get_gpu_info() {
}
#endif
#ifdef GGML_USE_SYCL
int count = ggml_backend_sycl_get_device_count();
for (int i = 0; i < count; i++) {
char buf[128];
ggml_sycl_get_device_description(i, buf, sizeof(buf));
id += buf;
if (i < count - 1) {
int device_list[GGML_SYCL_MAX_DEVICES];
ggml_sycl_get_gpu_list(device_list, GGML_SYCL_MAX_DEVICES);
for (int i = 0; i < GGML_SYCL_MAX_DEVICES; i++) {
if (device_list[i] >0 ){
char buf[128];
ggml_sycl_get_device_description(i, buf, sizeof(buf));
id += buf;
id += "/";
}
}
if (id.length() >2 ) {
id.pop_back();
}
#endif
// TODO: other backends
return id;
@@ -154,9 +157,9 @@ static const char * output_format_str(output_formats format) {
static const char * split_mode_str(llama_split_mode mode) {
switch (mode) {
case LLAMA_SPLIT_MODE_NONE: return "none";
case LLAMA_SPLIT_MODE_LAYER: return "layer";
case LLAMA_SPLIT_MODE_ROW: return "row";
case LLAMA_SPLIT_NONE: return "none";
case LLAMA_SPLIT_LAYER: return "layer";
case LLAMA_SPLIT_ROW: return "row";
default: GGML_ASSERT(!"invalid split mode");
}
}
@@ -166,7 +169,6 @@ struct cmd_params {
std::vector<int> n_prompt;
std::vector<int> n_gen;
std::vector<int> n_batch;
std::vector<int> n_ubatch;
std::vector<ggml_type> type_k;
std::vector<ggml_type> type_v;
std::vector<int> n_threads;
@@ -174,9 +176,9 @@ struct cmd_params {
std::vector<llama_split_mode> split_mode;
std::vector<int> main_gpu;
std::vector<bool> no_kv_offload;
std::vector<bool> mul_mat_q;
std::vector<std::vector<float>> tensor_split;
std::vector<bool> use_mmap;
std::vector<bool> embeddings;
int reps;
bool verbose;
output_formats output_format;
@@ -186,18 +188,17 @@ static const cmd_params cmd_params_defaults = {
/* model */ {"models/7B/ggml-model-q4_0.gguf"},
/* n_prompt */ {512},
/* n_gen */ {128},
/* n_batch */ {2048},
/* n_ubatch */ {512},
/* n_batch */ {512},
/* type_k */ {GGML_TYPE_F16},
/* type_v */ {GGML_TYPE_F16},
/* n_threads */ {get_num_physical_cores()},
/* n_gpu_layers */ {99},
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
/* split_mode */ {LLAMA_SPLIT_LAYER},
/* main_gpu */ {0},
/* no_kv_offload */ {false},
/* mul_mat_q */ {true},
/* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
/* use_mmap */ {true},
/* embeddings */ {false},
/* reps */ 5,
/* verbose */ false,
/* output_format */ MARKDOWN
@@ -212,7 +213,6 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
printf(" -ub N, --ubatch-size <n> (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
printf(" -ctk <t>, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
printf(" -ctv <t>, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
@@ -221,8 +221,8 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
printf(" -ts, --tensor_split <ts0/ts1/..> (default: 0)\n");
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
@@ -249,9 +249,6 @@ static ggml_type ggml_type_from_name(const std::string & s) {
if (s == "q5_1") {
return GGML_TYPE_Q5_1;
}
if (s == "iq4_nl") {
return GGML_TYPE_IQ4_NL;
}
return GGML_TYPE_COUNT;
}
@@ -305,13 +302,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
}
auto p = split<int>(argv[i], split_delim);
params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
} else if (arg == "-ub" || arg == "--ubatch-size") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split<int>(argv[i], split_delim);
params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end());
} else if (arg == "-ctk" || arg == "--cache-type-k") {
if (++i >= argc) {
invalid_param = true;
@@ -368,11 +358,11 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
for (const auto & m : p) {
llama_split_mode mode;
if (m == "none") {
mode = LLAMA_SPLIT_MODE_NONE;
mode = LLAMA_SPLIT_NONE;
} else if (m == "layer") {
mode = LLAMA_SPLIT_MODE_LAYER;
mode = LLAMA_SPLIT_LAYER;
} else if (m == "row") {
mode = LLAMA_SPLIT_MODE_ROW;
mode = LLAMA_SPLIT_ROW;
} else {
invalid_param = true;
break;
@@ -393,6 +383,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
}
auto p = split<bool>(argv[i], split_delim);
params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
} else if (arg == "-mmq" || arg == "--mul-mat-q") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split<bool>(argv[i], split_delim);
params.mul_mat_q.insert(params.mul_mat_q.end(), p.begin(), p.end());
} else if (arg == "-mmp" || arg == "--mmap") {
if (++i >= argc) {
invalid_param = true;
@@ -400,13 +397,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
}
auto p = split<bool>(argv[i], split_delim);
params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
} else if (arg == "-embd" || arg == "--embeddings") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split<bool>(argv[i], split_delim);
params.embeddings.insert(params.embeddings.end(), p.begin(), p.end());
} else if (arg == "-ts" || arg == "--tensor-split") {
if (++i >= argc) {
invalid_param = true;
@@ -470,16 +460,15 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; }
if (params.n_gen.empty()) { params.n_gen = cmd_params_defaults.n_gen; }
if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; }
if (params.n_ubatch.empty()) { params.n_ubatch = cmd_params_defaults.n_ubatch; }
if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; }
if (params.type_v.empty()) { params.type_v = cmd_params_defaults.type_v; }
if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
if (params.split_mode.empty()) { params.split_mode = cmd_params_defaults.split_mode; }
if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; }
if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
return params;
@@ -490,7 +479,6 @@ struct cmd_params_instance {
int n_prompt;
int n_gen;
int n_batch;
int n_ubatch;
ggml_type type_k;
ggml_type type_v;
int n_threads;
@@ -498,9 +486,9 @@ struct cmd_params_instance {
llama_split_mode split_mode;
int main_gpu;
bool no_kv_offload;
bool mul_mat_q;
std::vector<float> tensor_split;
bool use_mmap;
bool embeddings;
llama_model_params to_llama_mparams() const {
llama_model_params mparams = llama_model_default_params();
@@ -528,11 +516,10 @@ struct cmd_params_instance {
cparams.n_ctx = n_prompt + n_gen;
cparams.n_batch = n_batch;
cparams.n_ubatch = n_ubatch;
cparams.type_k = type_k;
cparams.type_v = type_v;
cparams.mul_mat_q = mul_mat_q;
cparams.offload_kqv = !no_kv_offload;
cparams.embeddings = embeddings;
return cparams;
}
@@ -548,11 +535,10 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
for (const auto & mg : params.main_gpu)
for (const auto & ts : params.tensor_split)
for (const auto & mmp : params.use_mmap)
for (const auto & embd : params.embeddings)
for (const auto & nb : params.n_batch)
for (const auto & nub : params.n_ubatch)
for (const auto & tk : params.type_k)
for (const auto & tv : params.type_v)
for (const auto & mmq : params.mul_mat_q)
for (const auto & nkvo : params.no_kv_offload)
for (const auto & nt : params.n_threads) {
for (const auto & n_prompt : params.n_prompt) {
@@ -564,7 +550,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .n_prompt = */ n_prompt,
/* .n_gen = */ 0,
/* .n_batch = */ nb,
/* .n_ubatch = */ nub,
/* .type_k = */ tk,
/* .type_v = */ tv,
/* .n_threads = */ nt,
@@ -572,9 +557,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .split_mode = */ sm,
/* .main_gpu = */ mg,
/* .no_kv_offload= */ nkvo,
/* .mul_mat_q = */ mmq,
/* .tensor_split = */ ts,
/* .use_mmap = */ mmp,
/* .embeddings = */ embd,
};
instances.push_back(instance);
}
@@ -588,7 +573,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .n_prompt = */ 0,
/* .n_gen = */ n_gen,
/* .n_batch = */ nb,
/* .n_ubatch = */ nub,
/* .type_k = */ tk,
/* .type_v = */ tv,
/* .n_threads = */ nt,
@@ -596,9 +580,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .split_mode = */ sm,
/* .main_gpu = */ mg,
/* .no_kv_offload= */ nkvo,
/* .mul_mat_q = */ mmq,
/* .tensor_split = */ ts,
/* .use_mmap = */ mmp,
/* .embeddings = */ embd,
};
instances.push_back(instance);
}
@@ -625,7 +609,6 @@ struct test {
uint64_t model_size;
uint64_t model_n_params;
int n_batch;
int n_ubatch;
int n_threads;
ggml_type type_k;
ggml_type type_v;
@@ -633,9 +616,9 @@ struct test {
llama_split_mode split_mode;
int main_gpu;
bool no_kv_offload;
bool mul_mat_q;
std::vector<float> tensor_split;
bool use_mmap;
bool embeddings;
int n_prompt;
int n_gen;
std::string test_time;
@@ -649,7 +632,6 @@ struct test {
model_size = llama_model_size(lmodel);
model_n_params = llama_model_n_params(lmodel);
n_batch = inst.n_batch;
n_ubatch = inst.n_ubatch;
n_threads = inst.n_threads;
type_k = inst.type_k;
type_v = inst.type_v;
@@ -657,9 +639,9 @@ struct test {
split_mode = inst.split_mode;
main_gpu = inst.main_gpu;
no_kv_offload = inst.no_kv_offload;
mul_mat_q = inst.mul_mat_q;
tensor_split = inst.tensor_split;
use_mmap = inst.use_mmap;
embeddings = inst.embeddings;
n_prompt = inst.n_prompt;
n_gen = inst.n_gen;
// RFC 3339 date-time format
@@ -728,11 +710,10 @@ struct test {
"cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas",
"cpu_info", "gpu_info",
"model_filename", "model_type", "model_size", "model_n_params",
"n_batch", "n_ubatch",
"n_threads", "type_k", "type_v",
"n_batch", "n_threads", "type_k", "type_v",
"n_gpu_layers", "split_mode",
"main_gpu", "no_kv_offload",
"tensor_split", "use_mmap", "embeddings",
"mul_mat_q", "tensor_split", "use_mmap",
"n_prompt", "n_gen", "test_time",
"avg_ns", "stddev_ns",
"avg_ts", "stddev_ts"
@@ -743,8 +724,7 @@ struct test {
enum field_type {STRING, BOOL, INT, FLOAT};
static field_type get_field_type(const std::string & field) {
if (field == "build_number" || field == "n_batch" || field == "n_ubatch" ||
field == "n_threads" ||
if (field == "build_number" || field == "n_batch" || field == "n_threads" ||
field == "model_size" || field == "model_n_params" ||
field == "n_gpu_layers" || field == "main_gpu" ||
field == "n_prompt" || field == "n_gen" ||
@@ -753,7 +733,7 @@ struct test {
}
if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" ||
field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
field == "use_mmap" || field == "embeddings") {
field == "mul_mat_q" || field == "use_mmap") {
return BOOL;
}
if (field == "avg_ts" || field == "stddev_ts") {
@@ -784,11 +764,10 @@ struct test {
std::to_string(metal), std::to_string(sycl), std::to_string(gpu_blas), std::to_string(blas),
cpu_info, gpu_info,
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
std::to_string(n_batch), std::to_string(n_ubatch),
std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
std::to_string(n_gpu_layers), split_mode_str(split_mode),
std::to_string(main_gpu), std::to_string(no_kv_offload),
tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
std::to_string(mul_mat_q), tensor_split_str, std::to_string(use_mmap),
std::to_string(n_prompt), std::to_string(n_gen), test_time,
std::to_string(avg_ns()), std::to_string(stdev_ns()),
std::to_string(avg_ts()), std::to_string(stdev_ts())
@@ -808,7 +787,7 @@ struct test {
const std::string test::build_commit = LLAMA_COMMIT;
const int test::build_number = LLAMA_BUILD_NUMBER;
const bool test::cuda = !!ggml_cpu_has_cuda();
const bool test::cuda = !!ggml_cpu_has_cublas();
const bool test::opencl = !!ggml_cpu_has_clblast();
const bool test::vulkan = !!ggml_cpu_has_vulkan();
const bool test::kompute = !!ggml_cpu_has_kompute();
@@ -952,15 +931,15 @@ struct markdown_printer : public printer {
if (field == "n_threads") {
return "threads";
}
if (field == "mul_mat_q") {
return "mmq";
}
if (field == "no_kv_offload") {
return "nkvo";
}
if (field == "use_mmap") {
return "mmap";
}
if (field == "embeddings") {
return "embd";
}
if (field == "tensor_split") {
return "ts";
}
@@ -983,9 +962,6 @@ struct markdown_printer : public printer {
if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
fields.emplace_back("n_batch");
}
if (params.n_ubatch.size() > 1 || params.n_ubatch != cmd_params_defaults.n_ubatch) {
fields.emplace_back("n_ubatch");
}
if (params.type_k.size() > 1 || params.type_k != cmd_params_defaults.type_k) {
fields.emplace_back("type_k");
}
@@ -998,6 +974,9 @@ struct markdown_printer : public printer {
if (params.split_mode.size() > 1 || params.split_mode != cmd_params_defaults.split_mode) {
fields.emplace_back("split_mode");
}
if (params.mul_mat_q.size() > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q) {
fields.emplace_back("mul_mat_q");
}
if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) {
fields.emplace_back("no_kv_offload");
}
@@ -1007,9 +986,6 @@ struct markdown_printer : public printer {
if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
fields.emplace_back("use_mmap");
}
if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) {
fields.emplace_back("embeddings");
}
fields.emplace_back("test");
fields.emplace_back("t/s");
@@ -1125,40 +1101,25 @@ struct sql_printer : public printer {
};
static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
llama_set_n_threads(ctx, n_threads, n_threads);
const llama_model * model = llama_get_model(ctx);
const int32_t n_vocab = llama_n_vocab(model);
std::vector<llama_token> tokens(n_batch);
std::vector<llama_token> tokens(n_batch, llama_token_bos(llama_get_model(ctx)));
int n_processed = 0;
llama_set_n_threads(ctx, n_threads, n_threads);
while (n_processed < n_prompt) {
int n_tokens = std::min(n_prompt - n_processed, n_batch);
tokens[0] = n_processed == 0 && llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;
for (int i = 1; i < n_tokens; i++) {
tokens[i] = std::rand() % n_vocab;
}
llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens, n_past + n_processed, 0));
n_processed += n_tokens;
}
llama_synchronize(ctx);
}
static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
llama_token token = llama_token_bos(llama_get_model(ctx));
llama_set_n_threads(ctx, n_threads, n_threads);
const llama_model * model = llama_get_model(ctx);
const int32_t n_vocab = llama_n_vocab(model);
llama_token token = llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;
for (int i = 0; i < n_gen; i++) {
llama_decode(ctx, llama_batch_get_one(&token, 1, n_past + i, 0));
llama_synchronize(ctx);
token = std::rand() % n_vocab;
}
}
@@ -1247,8 +1208,7 @@ int main(int argc, char ** argv) {
// warmup run
if (t.n_prompt > 0) {
//test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
test_prompt(ctx, std::min(2, t.n_batch), 0, t.n_batch, t.n_threads);
}
if (t.n_gen > 0) {
test_gen(ctx, 1, 0, t.n_threads);
@@ -1264,7 +1224,6 @@ int main(int argc, char ** argv) {
if (t.n_gen > 0) {
test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads);
}
uint64_t t_ns = get_time_ns() - t_start;
t.samples_ns.push_back(t_ns);
}

View File

@@ -21,8 +21,12 @@ android {
useSupportLibrary = true
}
ndk {
// Add NDK properties if wanted, e.g.
// abiFilters += listOf("arm64-v8a")
// Workaround for https://github.com/llvm/llvm-project/issues/65820
// affecting armeabi-v7a. Skip armeabi-v7a when invoked with
// -Pskip-armeabi-v7a (e.g., ./gradlew build -Pskip-armeabi-v7a).
if (project.hasProperty("skip-armeabi-v7a")) {
abiFilters += listOf("arm64-v8a", "x86_64", "x86")
}
}
externalNativeBuild {
cmake {

View File

@@ -33,45 +33,6 @@ jclass la_int_var;
jmethodID la_int_var_value;
jmethodID la_int_var_inc;
std::string cached_token_chars;
bool is_valid_utf8(const char * string) {
if (!string) {
return true;
}
const unsigned char * bytes = (const unsigned char *)string;
int num;
while (*bytes != 0x00) {
if ((*bytes & 0x80) == 0x00) {
// U+0000 to U+007F
num = 1;
} else if ((*bytes & 0xE0) == 0xC0) {
// U+0080 to U+07FF
num = 2;
} else if ((*bytes & 0xF0) == 0xE0) {
// U+0800 to U+FFFF
num = 3;
} else if ((*bytes & 0xF8) == 0xF0) {
// U+10000 to U+10FFFF
num = 4;
} else {
return false;
}
bytes += 1;
for (int i = 1; i < num; ++i) {
if ((*bytes & 0xC0) != 0x80) {
return false;
}
bytes += 1;
}
}
return true;
}
static void log_callback(ggml_log_level level, const char * fmt, void * data) {
if (level == GGML_LOG_LEVEL_ERROR) __android_log_print(ANDROID_LOG_ERROR, TAG, fmt, data);
else if (level == GGML_LOG_LEVEL_INFO) __android_log_print(ANDROID_LOG_INFO, TAG, fmt, data);
@@ -334,8 +295,6 @@ Java_com_example_llama_Llm_completion_1init(
jint n_len
) {
cached_token_chars.clear();
const auto text = env->GetStringUTFChars(jtext, 0);
const auto context = reinterpret_cast<llama_context *>(context_pointer);
const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
@@ -413,16 +372,8 @@ Java_com_example_llama_Llm_completion_1loop(
}
auto new_token_chars = llama_token_to_piece(context, new_token_id);
cached_token_chars += new_token_chars;
jstring new_token = nullptr;
if (is_valid_utf8(cached_token_chars.c_str())) {
new_token = env->NewStringUTF(cached_token_chars.c_str());
LOGi("cached: %s, new_token_chars: `%s`, id: %d", cached_token_chars.c_str(), new_token_chars.c_str(), new_token_id);
cached_token_chars.clear();
} else {
new_token = env->NewStringUTF("");
}
LOGi("new_token_chars: `%s`", new_token_chars.c_str());
auto new_token = env->NewStringUTF(new_token_chars.c_str());
llama_batch_clear(*batch);
llama_batch_add(*batch, new_token_id, n_cur, { 0 }, true);

View File

@@ -71,7 +71,7 @@ class Llm {
batch: Long,
nLen: Int,
ncur: IntVar
): String?
): String
private external fun kv_cache_clear(context: Long)
@@ -115,7 +115,7 @@ class Llm {
val ncur = IntVar(completion_init(state.context, state.batch, message, nlen))
while (ncur.value <= nlen) {
val str = completion_loop(state.context, state.batch, nlen, ncur)
if (str == null) {
if (str.isEmpty()) {
break
}
emit(str)

View File

@@ -129,7 +129,7 @@ actor LlamaContext {
for i1 in 0..<tokens_list.count {
let i = Int(i1)
llama_batch_add(&batch, tokens_list[i], Int32(i), [0], false)
llama_batch_add(&batch, tokens_list[i], llama_pos(i), [0], false)
}
batch.logits[Int(batch.n_tokens) - 1] = 1 // true
@@ -183,7 +183,7 @@ actor LlamaContext {
// tokens_list.append(new_token_id)
llama_batch_clear(&batch)
llama_batch_add(&batch, new_token_id, n_cur, [0], true)
llama_batch_add(&batch, new_token_id, llama_pos(n_cur), [0], true)
n_decode += 1
n_cur += 1
@@ -210,7 +210,7 @@ actor LlamaContext {
let n_tokens = pp
for i in 0..<n_tokens {
llama_batch_add(&batch, 0, Int32(i), [0], false)
llama_batch_add(&batch, 0, llama_pos(i), [0], false)
}
batch.logits[Int(batch.n_tokens) - 1] = 1 // true
@@ -221,7 +221,6 @@ actor LlamaContext {
if llama_decode(context, batch) != 0 {
print("llama_decode() failed during prompt")
}
llama_synchronize(context)
let t_pp_end = ggml_time_us()
@@ -235,13 +234,12 @@ actor LlamaContext {
llama_batch_clear(&batch)
for j in 0..<pl {
llama_batch_add(&batch, 0, Int32(i), [Int32(j)], true)
llama_batch_add(&batch, 0, llama_pos(i), [Int32(j)], true)
}
if llama_decode(context, batch) != 0 {
print("llama_decode() failed during text generation")
}
llama_synchronize(context)
}
let t_tg_end = ggml_time_us()

View File

@@ -1,13 +1,11 @@
# MobileVLM
Currently this implementation supports [MobileVLM-1.7B](https://huggingface.co/mtgv/MobileVLM-1.7B) / [MobileVLM_V2-1.7B](https://huggingface.co/mtgv/MobileVLM_V2-1.7B) variants.
Currently this implementation supports [MobileVLM-v1.7](https://huggingface.co/mtgv/MobileVLM-1.7B) variants.
for more information, please go to [Meituan-AutoML/MobileVLM](https://github.com/Meituan-AutoML/MobileVLM)
The implementation is based on llava, and is compatible with llava and mobileVLM. The usage is basically same as llava.
Notice: The overall process of model inference for both **MobileVLM** and **MobileVLM_V2** models is the same, but the process of model conversion is a little different. Therefore, using **MobileVLM-1.7B** as an example, the different conversion step will be shown.
## Usage
Build with cmake or run `make llava-cli` to build it.
@@ -36,7 +34,7 @@ git clone https://huggingface.co/openai/clip-vit-large-patch14-336
python ./examples/llava/llava-surgery.py -m path/to/MobileVLM-1.7B
```
3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:
3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` to convert the LLaVA image encoder to GGUF:
```sh
python ./examples/llava/convert-image-encoder-to-gguf \
@@ -46,14 +44,6 @@ python ./examples/llava/convert-image-encoder-to-gguf \
--projector-type ldp
```
```sh
python ./examples/llava/convert-image-encoder-to-gguf \
-m path/to/clip-vit-large-patch14-336 \
--llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \
--output-dir path/to/MobileVLM-1.7B_V2 \
--projector-type ldpv2
```
4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
```sh
@@ -78,7 +68,7 @@ cd examples/llava/android/build_64
### run on Android
refer to `android/adb_run.sh`, modify resources' `name` and `path`
## Some result on Android with `Snapdragon 888` chip
## some result on Android with `Snapdragon 888` chip
### case 1
**input**
```sh
@@ -109,6 +99,7 @@ llama_print_timings: total time = 34731.93 ms
--image /data/local/tmp/cat.jpeg \
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"
```
**output**
```sh
encode_image_with_clip: image encoded in 21149.51 ms by CLIP ( 146.87 ms per image patch)
@@ -120,82 +111,12 @@ llama_print_timings: eval time = 1279.03 ms / 18 runs ( 71.06 m
llama_print_timings: total time = 34570.79 ms
```
## Some result on Android with `Snapdragon 778G` chip
### MobileVLM-1.7B case
#### llava-cli release-b2005
**input**
```sh
/data/local/tmp/llava-cli \
-m /data/local/tmp/ggml-model-q4_k.gguf \
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
-t 4 \
--image /data/local/tmp/many_llamas.jpeg \
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat's that? ASSISTANT:"
```
**output**
```sh
encode_image_with_clip: image encoded in 18728.52 ms by CLIP ( 130.06 ms per image patch)
system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
user_prompt: \nWhat's that? ASSISTANT:
A group of llamas are standing in a green pasture.
llama_print_timings: load time = 20357.33 ms
llama_print_timings: sample time = 2.96 ms / 14 runs ( 0.21 ms per token, 4734.53 tokens per second)
llama_print_timings: prompt eval time = 8119.49 ms / 191 tokens ( 42.51 ms per token, 23.52 tokens per second)
llama_print_timings: eval time = 1005.75 ms / 14 runs ( 71.84 ms per token, 13.92 tokens per second)
llama_print_timings: total time = 28038.34 ms / 205 tokens
```
#### llava-cli latest-version
**input**
Just the same as above.
**output**(seems to be much slower)
```sh
encode_image_with_clip: image embedding created: 144 tokens
encode_image_with_clip: image encoded in 288268.88 ms by CLIP ( 2001.87 ms per image patch)
system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
user_prompt: \nWhat's that? ASSISTANT:
It is a group of sheep standing together in a grass field.
llama_print_timings: load time = 818120.91 ms
llama_print_timings: sample time = 3.44 ms / 14 runs ( 0.25 ms per token, 4067.40 tokens per second)
llama_print_timings: prompt eval time = 529274.69 ms / 191 tokens ( 2771.07 ms per token, 0.36 tokens per second)
llama_print_timings: eval time = 43894.02 ms / 13 runs ( 3376.46 ms per token, 0.30 tokens per second)
llama_print_timings: total time = 865441.76 ms / 204 tokens
```
### MobileVLM_V2-1.7B case
#### llava-cli release-2005b
**input**
Just the same as above.
**output**
```sh
encode_image_with_clip: image encoded in 20609.61 ms by CLIP ( 143.12 ms per image patch)
system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
user_prompt: \nWhat's that? ASSISTANT:
This image captures a lively scene of 20 llamas in motion on an expansive, grassy field. The llama is scattered across the landscape with some standing and others sitting down as if taking rest or observing their surroundings from different vantage points within this verdant setting.
The background offers glimpses into a picturesque town nestled amidst hills under an overcast sky, adding depth to the scene while also emphasizing that distance between these llama and human-made structures like houses or roads in which they roam freely without any barriers around them. The image is framed by text at both right angles on white backgrounds against a contrasting blue backdrop with green foliage, further drawing attention to the llamas amidst their natural habitat while also inviting viewers into this picturesque landscape within town limits of Alta Llama
llama_print_timings: load time = 22406.77 ms
llama_print_timings: sample time = 49.26 ms / 186 runs ( 0.26 ms per token, 3776.27 tokens per second)
llama_print_timings: prompt eval time = 9044.54 ms / 191 tokens ( 47.35 ms per token, 21.12 tokens per second)
llama_print_timings: eval time = 14497.49 ms / 186 runs ( 77.94 ms per token, 12.83 tokens per second)
llama_print_timings: total time = 44411.01 ms / 377 tokens
```
## Orin compile and run
### compile
```sh
make LLAMA_CUDA=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
make LLAMA_CUBLAS=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
```
### run on Orin
### case 1
**input**
@@ -244,121 +165,8 @@ llama_print_timings: eval time = 166.65 ms / 11 runs ( 15.15 m
llama_print_timings: total time = 1365.47 ms / 243 tokens
```
## Running on Intel(R) Core(TM) i7-10750H
### Operating system
Ubuntu22.04
### compile
```sh
make -j32
```
### MobileVLM-1.7B case
**input**
```sh
-m /path/to/ggml-model-q4_k.gguf \
--mmproj /path/to/mmproj-model-f16.gguf \
--image /path/to/many_llamas.jpeg
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat's that? ASSISTANT:" \
```
**output**
```sh
encode_image_with_clip: image embedding created: 144 tokens
encode_image_with_clip: image encoded in 2730.94 ms by CLIP ( 18.96 ms per image patch)
system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
user_prompt: \nWhat's that?ASSISTANT:
A group of llamas are walking together in a field.
llama_print_timings: load time = 5506.60 ms
llama_print_timings: sample time = 0.44 ms / 13 runs ( 0.03 ms per token, 29545.45 tokens per second)
llama_print_timings: prompt eval time = 2031.58 ms / 190 tokens ( 10.69 ms per token, 93.52 tokens per second)
llama_print_timings: eval time = 438.92 ms / 12 runs ( 36.58 ms per token, 27.34 tokens per second)
llama_print_timings: total time = 5990.25 ms / 202 tokens
```
### MobileVLM_V2-1.7B case
**input**
Just the same as above.
**ouput**
```sh
encode_image_with_clip: image embedding created: 144 tokens
encode_image_with_clip: image encoded in 3223.89 ms by CLIP ( 22.39 ms per image patch)
system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
user_prompt: \nWhat's that?ASSISTANT:
The image captures a tranquil scene in a park, where a group of approximately 20 llamas are gathered. The llamas, a mix of white and black, are standing in a line, their black and white patterns contrasting with the lush green grass of the park. The lamas are arranged in a line, suggesting a social order.
The park itself is lush and green, with trees dotting the landscape in the background. A sign reading "Llamas Tico Ana" is also visible in the image, possibly indicating the location or the breed of the llamas. The image seems to be taken from a distance, providing a wide view of the scene and the surrounding environment.
The llamas' positions relative to each other, the sign, and the trees create a harmonious composition. The image does not contain any discernible text. The overall scene is one of peace and natural beauty, with the llamas in their natural habitat, surrounded by the vibrant colors and lush greenery of the park.
llama_print_timings: load time = 6642.61 ms
llama_print_timings: sample time = 8.15 ms / 223 runs ( 0.04 ms per token, 27358.61 tokens per second)
llama_print_timings: prompt eval time = 2475.07 ms / 190 tokens ( 13.03 ms per token, 76.77 tokens per second)
llama_print_timings: eval time = 8760.60 ms / 222 runs ( 39.46 ms per token, 25.34 tokens per second)
llama_print_timings: total time = 15513.95 ms / 412 tokens
```
## Run on Intel(R) Core(TM) Ultra7 115H
### operation system
Windows11
### comiple
```sh
make -j32
```
### MobileVLM-1.7B case
**input**
```sh
-m /path/to/ggml-model-q4_k.gguf \
--mmproj /path/to/tmp/mmproj-model-f16.gguf \
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat's that? ASSISTANT:" \
```
**output**
```sh
encode_image_with_clip: image encoded in 4902.81 ms by CLIP ( 34.05 ms per image patch)
system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
user_prompt: \nWhat's that? ASSISTANT:
The image features a group of brown and white llamas standing in a grassy field.
llama_print_timings: load time = 7441.06 ms
llama_print_timings: sample time = 0.72 ms / 19 runs ( 0.04 ms per token, 26279.39 tokens per second)
llama_print_timings: prompt eval time = 2090.71 ms / 191 tokens ( 10.95 ms per token, 91.36 tokens per second)
llama_print_timings: eval time = 512.35 ms / 18 runs ( 28.46 ms per token, 35.13 tokens per second)
llama_print_timings: total time = 7987.23 ms / 209 tokens
```
### MobileVLM_V2-1.7B case
**input**
Just the same as above.
**output**
```sh
encode_image_with_clip: image encoded in 4682.44 ms by CLIP ( 32.52 ms per image patch)
system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
user_prompt: \nWhat's that? ASSISTANT:
This image captures a lively scene of a group of 14 llamas in a grassy field. The llamas, with their distinctive black and white coats, are standing and walking in a line, seemingly engaged in a social activity. One
of them, possibly the first in the line, has its back turned, perhaps observing something in the distance.
The llama in the front of the line stands out due to its black and white coloring, which is quite unusual for llama patterns. The llama in the front also seems to be more aware of its surroundings, as it faces the camera, giving a sense of engagement with the viewer.
The image is taken from the side of the llama, providing a clear view of the llama in the front and its companions. The lameness in the llama in
front is not visible, indicating that it might not be the main focus of the photo.
The background of the image features a grassy field, with a fence and a tree visible in the distance. The tree appears to be bare, suggesting that it might be during a time of year when most trees are dormant or have shed their leaves.
llama_print_timings: load time = 7015.35 ms
llama_print_timings: sample time = 10.61 ms / 256 runs ( 0.04 ms per token, 24119.09 tokens per second)
llama_print_timings: prompt eval time = 2052.45 ms / 191 tokens ( 10.75 ms per token, 93.06 tokens per second)
llama_print_timings: eval time = 7259.43 ms / 255 runs ( 28.47 ms per token, 35.13 tokens per second)
llama_print_timings: total time = 14371.19 ms / 446 tokens
```
## Minor shortcomings
The `n_patch` of output in `ldp` is 1/4 of the input. In order to implement quickly, we uniformly modified `clip_n_patches` function to a quarter. when counting the time consumption, the calculated time will be 4 times bigger than the real cost.
## TODO
@@ -373,5 +181,5 @@ llama_print_timings: total time = 14371.19 ms / 446 tokens
## contributor
```sh
zhangjidong05, yangyang260, huyiming03, chenxiaotao03, ZiangWu-77
zhangjidong05, yangyang260, huyiming03, chenxiaotao03
```

View File

@@ -63,20 +63,12 @@ Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` director
```console
git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b
```
2) Install the required Python packages:
```sh
pip install -r examples/llava/requirements.txt
```
3) Use `llava-surgery-v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models:
2) Use `llava-surgery-v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models:
```console
python examples/llava/llava-surgery-v2.py -C -m ../llava-v1.6-vicuna-7b/
```
- you will find a llava.projector and a llava.clip file in your model directory
4) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory:
3) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory:
```console
mkdir vit
cp ../llava-v1.6-vicuna-7b/llava.clip vit/pytorch_model.bin
@@ -84,18 +76,18 @@ cp ../llava-v1.6-vicuna-7b/llava.projector vit/
curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.json -o vit/config.json
```
5) Create the visual gguf model:
4) Create the visual gguf model:
```console
python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
```
- This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP
6) Then convert the model to gguf format:
5) Then convert the model to gguf format:
```console
python ./convert.py ../llava-v1.6-vicuna-7b/ --skip-unknown
```
7) And finally we can run the llava-cli using the 1.6 model version:
6) And finally we can run the llava-cli using the 1.6 model version:
```console
./llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096
```

View File

@@ -7,7 +7,7 @@
#include "ggml-alloc.h"
#include "ggml-backend.h"
#ifdef GGML_USE_CUDA
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
#endif
@@ -119,7 +119,6 @@ static std::string format(const char * fmt, ...) {
#define TN_LLAVA_PROJ "mm.%d.%s"
#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
#define TN_IMAGE_NEWLINE "model.image_newline"
@@ -127,14 +126,12 @@ enum projector_type {
PROJECTOR_TYPE_MLP,
PROJECTOR_TYPE_MLP_NORM,
PROJECTOR_TYPE_LDP,
PROJECTOR_TYPE_LDPV2,
PROJECTOR_TYPE_UNKNOWN,
};
static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_MLP, "mlp" },
{ PROJECTOR_TYPE_LDP, "ldp" },
{ PROJECTOR_TYPE_LDPV2, "ldpv2"},
};
@@ -478,14 +475,6 @@ struct clip_vision_model {
struct ggml_tensor * mm_model_block_2_block_2_0_w;
struct ggml_tensor * mm_model_block_2_block_2_1_w;
struct ggml_tensor * mm_model_block_2_block_2_1_b;
// MobileVLM_V2 projection
struct ggml_tensor * mm_model_mlp_0_w;
struct ggml_tensor * mm_model_mlp_0_b;
struct ggml_tensor * mm_model_mlp_2_w;
struct ggml_tensor * mm_model_mlp_2_b;
struct ggml_tensor * mm_model_peg_0_w;
struct ggml_tensor * mm_model_peg_0_b;
};
struct clip_ctx {
@@ -508,6 +497,7 @@ struct clip_ctx {
// memory buffers to evaluate the model
ggml_backend_buffer_t params_buffer = NULL;
ggml_backend_buffer_t compute_buffer = NULL;
ggml_backend_t backend = NULL;
ggml_gallocr_t compute_alloc = NULL;
@@ -818,30 +808,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
}
embeddings = block_1;
}
else if (ctx->proj_type == PROJECTOR_TYPE_LDPV2)
{
int n_patch = 24;
struct ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b);
mlp_0 = ggml_gelu(ctx0, mlp_0);
struct ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0);
mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b);
// mlp_2 ne = [2048, 576, 1, 1]
// // AVG Pool Layer 2*2, strides = 2
mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 0, 2, 3));
// mlp_2 ne = [576, 2048, 1, 1]
mlp_2 = ggml_reshape_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]);
// mlp_2 ne [24, 24, 2048, 1]
mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
// weight ne = [3, 3, 2048, 1]
struct ggml_tensor * peg_0 = ggml_conv_depthwise_2d(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3));
peg_0 = ggml_add(ctx0, peg_0, mlp_2);
peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
embeddings = peg_0;
}
else {
GGML_ASSERT(false);
}
@@ -969,7 +935,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
}
}
#ifdef GGML_USE_CUDA
#ifdef GGML_USE_CUBLAS
new_clip->backend = ggml_backend_cuda_init(0);
printf("%s: CLIP using CUDA backend\n", __func__);
#endif
@@ -1029,7 +995,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
if (!new_clip->ctx_data) {
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
clip_free(new_clip);
gguf_free(ctx);
return nullptr;
}
@@ -1037,7 +1002,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
if (!fin) {
printf("cannot open model file for loading tensors\n");
clip_free(new_clip);
gguf_free(ctx);
return nullptr;
}
@@ -1059,7 +1023,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
if (!fin) {
printf("%s: failed to seek for tensor %s\n", __func__, name);
clip_free(new_clip);
gguf_free(ctx);
return nullptr;
}
int num_bytes = ggml_nbytes(cur);
@@ -1212,18 +1175,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
vision_model.mm_model_block_2_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight"));
vision_model.mm_model_block_2_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight"));
vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias"));
}
else if (new_clip->proj_type == PROJECTOR_TYPE_LDPV2)
{
// MobilVLM_V2 projection
vision_model.mm_model_mlp_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 0, "weight"));
vision_model.mm_model_mlp_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 0, "bias"));
vision_model.mm_model_mlp_2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 2, "weight"));
vision_model.mm_model_mlp_2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 2, "bias"));
vision_model.mm_model_peg_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "weight"));
vision_model.mm_model_peg_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "bias"));
}
else {
} else {
std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
}
@@ -1280,16 +1232,16 @@ struct clip_image_f32 * clip_image_f32_init() {
void clip_image_u8_free(struct clip_image_u8 * img) { delete img; }
void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) {
if (batch->size > 0) {
delete[] batch->data;
batch->size = 0;
void clip_image_u8_batch_free(struct clip_image_u8_batch & batch) {
if (batch.size > 0) {
delete[] batch.data;
batch.size = 0;
}
}
void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) {
if (batch->size > 0) {
delete[] batch->data;
batch->size = 0;
void clip_image_f32_batch_free(struct clip_image_f32_batch & batch) {
if (batch.size > 0) {
delete[] batch.data;
batch.size = 0;
}
}
@@ -1542,7 +1494,7 @@ static std::vector<clip_image_u8*> divide_to_patches_u8(const clip_image_u8 & im
// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
// res_imgs memory is being allocated here, previous allocations will be freed if found
bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs) {
bool pad_to_square = true;
if (!ctx->has_vision_encoder) {
printf("This gguf file seems to have no vision encoder\n");
@@ -1554,11 +1506,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
pad_to_square = false;
}
// free the previous res_imgs if any set
if (res_imgs->size > 0) {
if (res_imgs.size > 0) {
clip_image_f32_batch_free(res_imgs);
}
res_imgs->data = nullptr;
res_imgs->size = 0;
res_imgs.data = nullptr;
res_imgs.size = 0;
// the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
// see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
@@ -1613,11 +1565,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square
patches.insert(patches.begin(), image_original_resize);
// clip_image_f32_batch_init(patches.size());
res_imgs->size = patches.size();
res_imgs->data = new clip_image_f32[res_imgs->size];
res_imgs.size = patches.size();
res_imgs.data = new clip_image_f32[res_imgs.size];
int num=0;
for (auto& patch : patches) {
normalize_image_u8_to_f32(patch, &res_imgs->data[num], ctx->image_mean, ctx->image_std);
normalize_image_u8_to_f32(patch, &res_imgs.data[num], ctx->image_mean, ctx->image_std);
num++;
}
@@ -1705,9 +1657,9 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
// }
// res_imgs.push_back(res);
res_imgs->size = 1;
res_imgs->data = new clip_image_f32[res_imgs->size];
res_imgs->data[0] = *res;
res_imgs.size = 1;
res_imgs.data = new clip_image_f32[res_imgs.size];
res_imgs.data[0] = *res;
clip_image_f32_free(res);
return true;
@@ -1721,9 +1673,6 @@ void clip_free(clip_ctx * ctx) {
ggml_free(ctx->ctx_data);
gguf_free(ctx->ctx_gguf);
ggml_backend_buffer_free(ctx->params_buffer);
ggml_backend_free(ctx->backend);
ggml_gallocr_free(ctx->compute_alloc);
delete ctx;
}
@@ -1756,7 +1705,7 @@ int clip_n_patches(const struct clip_ctx * ctx) {
int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
n_patches /= 4;
}
@@ -1913,6 +1862,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
std::vector<uint8_t> work(512);
std::vector<float> conv_buf(512);
std::vector<int64_t> hist_all(1 << 4, 0);
size_t total_size_org = 0;
size_t total_size_new = 0;
@@ -1959,7 +1909,6 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
break;
default:
printf("Please use an input file in f32 or f16\n");
gguf_free(ctx_out);
return false;
}
@@ -1968,7 +1917,48 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
}
new_data = work.data();
new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, n_elms/cur->ne[0], cur->ne[0], nullptr);
std::vector<int64_t> hist_cur(1 << 4, 0);
switch (new_type) {
case GGML_TYPE_Q4_0: {
new_size = ggml_quantize_q4_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q4_1: {
new_size = ggml_quantize_q4_1(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q5_0: {
new_size = ggml_quantize_q5_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q5_1: {
new_size = ggml_quantize_q5_1(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q8_0: {
new_size = ggml_quantize_q8_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q2_K: {
new_size = ggml_quantize_q2_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q3_K: {
new_size = ggml_quantize_q3_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q4_K: {
new_size = ggml_quantize_q4_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q5_K: {
new_size = ggml_quantize_q5_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q6_K: {
new_size = ggml_quantize_q6_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
} break;
default: {
fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, new_type);
return false;
}
}
for (size_t j = 0; j < hist_cur.size(); ++j) {
hist_all[j] += hist_cur[j];
}
} else {
new_type = cur->type;
new_data = cur->data;
@@ -2003,6 +1993,17 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
{
printf("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
printf("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
int64_t sum_all = 0;
for (size_t i = 0; i < hist_all.size(); ++i) {
sum_all += hist_all[i];
}
printf("%s: hist: ", __func__);
for (size_t i = 0; i < hist_all.size(); ++i) {
printf("%5.3f ", hist_all[i] / (float)sum_all);
}
printf("\n");
}
return true;
@@ -2012,9 +2013,6 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0];
}
if (ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
return ctx->vision_model.mm_model_peg_0_b->ne[0];
}
if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
return ctx->vision_model.mm_2_b->ne[0];
}

View File

@@ -60,8 +60,8 @@ CLIP_API struct clip_image_f32 * clip_image_f32_init();
CLIP_API void clip_image_u8_free (struct clip_image_u8 * img);
CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch);
CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch & batch);
CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch & batch);
CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
@@ -69,7 +69,7 @@ CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8
CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
/** preprocess img and store the result in res_imgs, pad_to_square may be overriden to false depending on model configuration */
CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs );
CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);

View File

@@ -1,7 +1,6 @@
import argparse
import os
import json
import re
import torch
import numpy as np
@@ -39,11 +38,9 @@ def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: b
def get_tensor_name(name: str) -> str:
if "projection" in name:
return name
if "mm_projector" in name:
name = name.replace("model.mm_projector", "mm")
name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1)
name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1)
return name
return name.replace("model.mm_projector", "mm")
return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln")
@@ -86,7 +83,7 @@ ap.add_argument("--clip-model-is-vision", action="store_true", required=False,
ap.add_argument("--clip-model-is-openclip", action="store_true", required=False,
help="The clip model is from openclip (for ViT-SO400M type))")
ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp")
ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp")
ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5

View File

@@ -152,7 +152,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
ggml_tensor * newline_tmp = clip_get_newline_tensor(ctx_clip);
model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]);
if (newline_tmp->backend != GGML_BACKEND_TYPE_CPU) {
if (newline_tmp->backend != GGML_BACKEND_CPU) {
if (newline_tmp->buffer == NULL) {
printf("newline_tmp tensor buffer is NULL\n");
}
@@ -223,7 +223,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
clip_image_f32_batch img_res_v;
img_res_v.size = 0;
img_res_v.data = nullptr;
if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
if (!clip_image_preprocess(ctx_clip, img, img_res_v)) {
fprintf(stderr, "%s: unable to preprocess image\n", __func__);
delete[] img_res_v.data;
return false;
@@ -338,7 +338,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
if (n_eval > n_batch) {
n_eval = n_batch;
}
llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, (float) *n_past, 1, 0, };
if (llama_decode(ctx_llama, batch)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return false;

View File

@@ -29,9 +29,9 @@ struct llava_image_embed {
};
/** sanity check for clip <-> llava embed size match */
LLAVA_API bool llava_validate_embed_size(const struct llama_context * ctx_llama, const struct clip_ctx * ctx_clip);
LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip);
LLAVA_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);
LLAVA_API bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);
/** build an image embed from image file bytes */
LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);

View File

@@ -3,21 +3,3 @@ add_executable(${TARGET} lookup.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
set(TARGET lookup-create)
add_executable(${TARGET} lookup-create.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
set(TARGET lookup-merge)
add_executable(${TARGET} lookup-merge.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
set(TARGET lookup-stats)
add_executable(${TARGET} lookup-stats.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View File

@@ -1,43 +0,0 @@
#include "ggml.h"
#include "llama.h"
#include "common.h"
#include "ngram-cache.h"
#include <cstdint>
#include <fstream>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>
int main(int argc, char ** argv){
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
return 1;
}
// init llama.cpp
llama_backend_init();
llama_numa_init(params.numa);
llama_model * model = NULL;
llama_context * ctx = NULL;
// load the model
std::tie(model, ctx) = llama_init_from_gpt_params(params);
GGML_ASSERT(model != nullptr);
// tokenize the prompt
const bool add_bos = llama_should_add_bos_token(model);
std::vector<llama_token> inp;
inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
fprintf(stderr, "%s: tokenization done\n", __func__);
llama_ngram_cache ngram_cache;
llama_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true);
fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str());
llama_ngram_cache_save(ngram_cache, params.lookup_cache_static);
}

View File

@@ -1,47 +0,0 @@
#include "ggml.h"
#include "llama.h"
#include "common.h"
#include "ngram-cache.h"
#include <cstdint>
#include <cstdio>
#include <fstream>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>
static void print_usage() {
fprintf(stderr, "Merges multiple lookup cache files into a single one.\n");
fprintf(stderr, "Usage: lookup-merge [--help] lookup_part_1.bin lookup_part_2.bin ... lookup_merged.bin\n");
}
int main(int argc, char ** argv){
if (argc < 3) {
print_usage();
exit(1);
}
std::vector<std::string> args;
args.resize(argc-1);
for (int i = 0; i < argc-1; ++i) {
args[i] = argv[i+1];
if (args[i] == "-h" || args[i] == "--help") {
print_usage();
exit(0);
}
}
fprintf(stderr, "lookup-merge: loading file %s\n", args[0].c_str());
llama_ngram_cache ngram_cache_merged = llama_ngram_cache_load(args[0]);
for (size_t i = 1; i < args.size()-1; ++i) {
fprintf(stderr, "lookup-merge: loading file %s\n", args[i].c_str());
llama_ngram_cache ngram_cache = llama_ngram_cache_load(args[i]);
llama_ngram_cache_merge(ngram_cache_merged, ngram_cache);
}
fprintf(stderr, "lookup-merge: saving file %s\n", args.back().c_str());
llama_ngram_cache_save(ngram_cache_merged, args.back());
}

View File

@@ -1,163 +0,0 @@
#include "ggml.h"
#include "common.h"
#include "llama.h"
#include "log.h"
#include "ngram-cache.h"
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <fstream>
#include <string>
#include <vector>
#include <unordered_map>
int main(int argc, char ** argv){
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
return 1;
}
const int n_draft = params.n_draft;
// init llama.cpp
llama_backend_init();
llama_numa_init(params.numa);
llama_model * model = NULL;
llama_context * ctx = NULL;
// load the model
std::tie(model, ctx) = llama_init_from_gpt_params(params);
llama_set_rng_seed(ctx, params.seed);
GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
// tokenize the prompt
const bool add_bos = llama_should_add_bos_token(model);
LOG("add_bos tgt: %d\n", add_bos);
std::vector<llama_token> inp;
inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
llama_ngram_cache ngram_cache_context;
llama_ngram_cache ngram_cache_dynamic;
llama_ngram_cache ngram_cache_static;
int64_t t_draft_flat_us = 0;
int64_t t_draft_us = 0;
{
const int64_t t_start_draft_us = ggml_time_us();
if (!params.lookup_cache_static.empty()) {
try {
ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
} catch (std::ifstream::failure const &) {
fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
exit(1);
}
}
if (!params.lookup_cache_dynamic.empty()) {
try {
ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic);
} catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
}
t_draft_flat_us += ggml_time_us() - t_start_draft_us;
}
const int n_input = inp.size();
const int n_ctx = params.n_ctx;
int n_drafted = 0;
int n_accept = 0;
const int64_t t_start_ms = ggml_time_ms();
// Iterate over input tokens in chunks of size n_ctx.
// Each chunk is treated as if a sequential generation but with pre-determined tokens to ensure reproducibility.
for (int i_start = 0; i_start + n_ctx < n_input; i_start += n_ctx) {
const std::vector<llama_token> inp_slice(inp.begin() + i_start, inp.begin() + i_start + n_ctx);
std::vector<llama_token> pseudo_output;
pseudo_output.push_back(inp_slice[0]);
while ((int) pseudo_output.size() < n_ctx) {
// Simulate drafting and decoding from draft:
std::vector<llama_token> draft;
draft.push_back(pseudo_output.back());
{
const int64_t t_start_draft_us = ggml_time_us();
llama_ngram_cache_draft(pseudo_output, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
t_draft_us += ggml_time_us() - t_start_draft_us;
}
n_drafted += draft.size() - 1;
for (size_t j = 1; j < draft.size() && (int) pseudo_output.size() < n_ctx; ++j) {
const llama_token ground_truth = inp_slice[pseudo_output.size()];
const llama_token drafted = draft[j];
if (ground_truth != drafted) {
break;
}
++n_accept;
pseudo_output.push_back(ground_truth);
{
const int64_t t_start_draft_us = ggml_time_us();
llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
t_draft_us += ggml_time_us() - t_start_draft_us;
}
}
// After each simulated batch decoding simulate the sampling of a single token:
if ((int) pseudo_output.size() < n_ctx) {
pseudo_output.push_back(inp_slice[pseudo_output.size()]);
{
const int64_t t_start_draft_us = ggml_time_us();
llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
t_draft_us += ggml_time_us() - t_start_draft_us;
}
}
draft.erase(draft.begin());
}
if (i_start > 0 && i_start / 100000 != (i_start - n_ctx) / 100000) {
const int64_t t_now_ms = ggml_time_ms();
const int64_t eta_ms = (n_input - i_start) * (t_now_ms - t_start_ms) / i_start;
const int64_t eta_min = eta_ms / (60*1000);
const int64_t eta_s = (eta_ms - 60*1000*eta_min) / 1000;
LOG_TEE("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s);
}
// After each chunk, update the dynamic ngram cache with the context ngram cache:
llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
ngram_cache_context.clear();
}
LOG_TEE("\n");
LOG_TEE("\n");
LOG_TEE("n_draft = %d\n", n_draft);
LOG_TEE("n_predict = %d\n", n_input - n_input % n_ctx);
LOG_TEE("n_drafted = %d\n", n_drafted);
LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
LOG_TEE("n_accept = %d\n", n_accept);
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
llama_free(ctx);
llama_free_model(model);
llama_backend_free();
fprintf(stderr, "\n\n");
return 0;
}

View File

@@ -1,15 +1,12 @@
#include "common.h"
#include "ggml.h"
#include "llama.h"
#include "common.h"
#include "ngram-cache.h"
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <fstream>
#include <string>
#include <vector>
#include <unordered_map>
int main(int argc, char ** argv){
gpt_params params;
@@ -18,7 +15,11 @@ int main(int argc, char ** argv){
return 1;
}
// max. number of additional tokens to draft if match is found
// max/min n-grams size to search for in prompt
const int ngram_max = 4;
const int ngram_min = 1;
// length of the candidate / draft sequence, if match is found
const int n_draft = params.n_draft;
const bool dump_kv_cache = params.dump_kv_cache;
@@ -38,8 +39,6 @@ int main(int argc, char ** argv){
// load the model
std::tie(model, ctx) = llama_init_from_gpt_params(params);
llama_set_rng_seed(ctx, params.seed);
GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
// tokenize the prompt
const bool add_bos = llama_should_add_bos_token(model);
@@ -48,35 +47,6 @@ int main(int argc, char ** argv){
std::vector<llama_token> inp;
inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
llama_ngram_cache ngram_cache_context;
llama_ngram_cache ngram_cache_dynamic;
llama_ngram_cache ngram_cache_static;
int64_t t_draft_flat_us = 0;
int64_t t_draft_us = 0;
{
// Fill up context ngram cache with tokens from user input:
const int64_t t_start_draft_us = ggml_time_us();
llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false);
if (!params.lookup_cache_static.empty()) {
try {
ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
} catch (std::ifstream::failure const &) {
fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
exit(1);
}
}
if (!params.lookup_cache_dynamic.empty()) {
try {
ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic);
} catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
}
t_draft_flat_us += ggml_time_us() - t_start_draft_us;
}
const int max_context_size = llama_n_ctx(ctx);
const int max_tokens_list_size = max_context_size - 4;
@@ -106,6 +76,8 @@ int main(int argc, char ** argv){
int n_drafted = 0;
int n_accept = 0;
int64_t t_draft_us = 0;
int n_past = inp.size();
bool has_eos = false;
@@ -157,12 +129,6 @@ int main(int argc, char ** argv){
++n_past;
++i_dft;
inp.push_back(id);
{
// Update context ngram cache with the newly accepted token:
const int64_t t_start_draft_us = ggml_time_us();
llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
t_draft_us += ggml_time_us() - t_start_draft_us;
}
if (params.use_color) {
// color accepted draft token
@@ -183,12 +149,6 @@ int main(int argc, char ** argv){
draft.clear();
draft.push_back(id);
inp.push_back(id);
{
// Update context ngram cache with the newly accepted token:
const int64_t t_start_draft_us = ggml_time_us();
llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
t_draft_us += ggml_time_us() - t_start_draft_us;
}
break;
}
@@ -203,19 +163,44 @@ int main(int argc, char ** argv){
llama_batch_clear(batch_tgt);
llama_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);
// Draft already contains a single token sampled from the model:
GGML_ASSERT(draft.size() == 1);
GGML_ASSERT(draft[0] == inp.back());
// generate n_pred tokens through prompt lookup
auto prompt_lookup = [&]() -> void {
const int inp_size = inp.size();
for (int ngram_size = ngram_max ; ngram_size > ngram_min; --ngram_size){
const llama_token * ngram = &inp[inp_size - ngram_size];
for (int i = 0; i <= (int) inp_size - (ngram_size * 2); ++i) {
bool match = true;
for (int j = 0; j < ngram_size; ++j) {
if (inp[i + j] != ngram[j]) {
match = false;
break;
}
}
if (match) {
const int startIdx = i + ngram_size;
const int endIdx = startIdx + n_draft;
if (endIdx < inp_size) {
for (int j = startIdx; j < endIdx; ++j) {
LOG(" - draft candidate %d: %d\n", j, inp[j]);
draft.push_back(inp[j]);
llama_batch_add(batch_tgt, inp[j], n_past + (j - startIdx) + 1, { 0 }, true);
++n_drafted;
}
return;
}
}
}
}
return;
};
const int64_t t_start_draft_us = ggml_time_us();
llama_ngram_cache_draft(inp, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
for (size_t i = 1; i < draft.size(); ++i) {
llama_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
}
prompt_lookup();
t_draft_us += ggml_time_us() - t_start_draft_us;
n_drafted += draft.size() - 1;
llama_decode(ctx, batch_tgt);
++n_past;
@@ -225,24 +210,19 @@ int main(int argc, char ** argv){
auto t_dec_end = ggml_time_us();
// Update dynamic ngram cache with context ngram cache and save it to disk:
llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
llama_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic);
LOG_TEE("\n\n");
LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
LOG_TEE("\n");
LOG_TEE("n_draft = %d\n", n_draft);
LOG_TEE("n_predict = %d\n", n_predict);
LOG_TEE("n_drafted = %d\n", n_drafted);
LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
LOG_TEE("n_draft = %d\n", n_draft);
LOG_TEE("n_predict = %d\n", n_predict);
LOG_TEE("n_drafted = %d\n", n_drafted);
LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
LOG_TEE("n_accept = %d\n", n_accept);
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
LOG_TEE("n_accept = %d\n", n_accept);
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
LOG_TEE("\ntarget:\n");
llama_print_timings(ctx);

View File

@@ -8,7 +8,7 @@ Because this example is "outside of the source tree", it is important to first b
### Considerations
When hardware acceleration libraries are used (e.g. CUDA, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_.
When hardware acceleration libraries are used (e.g. CUBlas, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_.
### Build llama.cpp and install to C:\LlamaCPP directory

View File

@@ -67,7 +67,6 @@ main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 --random-prompt
In this section, we cover the most commonly used options for running the `main` program with the LLaMA models:
- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf).
- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
- `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models.
- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
@@ -296,9 +295,7 @@ These options help improve the performance and memory usage of the LLaMA models.
### Batch Size
- `-b N, --batch-size N`: Set the batch size for prompt processing (default: `2048`). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations.
- `-ub N`, `--ubatch-size N`: physical maximum batch size. This is for pipeline parallelization. Default: `512`.
- `-b N, --batch-size N`: Set the batch size for prompt processing (default: 512). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations.
### Prompt Caching
@@ -318,8 +315,8 @@ These options provide extra functionality and customization when running the LLa
- `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
- `--verbose-prompt`: Print the prompt before generating text.
- `-ngl N, --n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used.
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance.
- `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.

View File

@@ -511,14 +511,6 @@ int main(int argc, char ** argv) {
std::vector<llama_token> embd;
std::vector<llama_token> embd_guidance;
// tokenized antiprompts
std::vector<std::vector<llama_token>> antiprompt_ids;
antiprompt_ids.reserve(params.antiprompt.size());
for (const std::string & antiprompt : params.antiprompt) {
antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true));
}
struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
@@ -556,8 +548,8 @@ int main(int argc, char ** argv) {
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
n_past, n_left, n_ctx, params.n_keep, n_discard);
llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
llama_kv_cache_seq_shift(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
n_past -= n_discard;
@@ -584,9 +576,9 @@ int main(int argc, char ** argv) {
LOG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);
llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd);
llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
llama_kv_cache_seq_add(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd);
llama_kv_cache_seq_shift(ctx, 0, ga_i, n_past, ib*bd);
llama_kv_cache_seq_div (ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
llama_kv_cache_seq_shift(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd);
n_past -= bd;
@@ -777,18 +769,6 @@ int main(int argc, char ** argv) {
}
}
// check for reverse prompt using special tokens
llama_token last_token = llama_sampling_last(ctx_sampling);
for (std::vector<llama_token> ids : antiprompt_ids) {
if (ids.size() == 1 && last_token == ids[0]) {
if (params.interactive) {
is_interacting = true;
}
is_antiprompt = true;
break;
}
}
if (is_antiprompt) {
LOG("found antiprompt: %s\n", last_output.c_str());
}
@@ -878,7 +858,6 @@ int main(int argc, char ** argv) {
const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
const auto line_inp = ::llama_tokenize(ctx, buffer, false, false);
const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
embd_inp.insert(embd_inp.end(), line_pfx.begin(), line_pfx.end());

View File

@@ -107,9 +107,6 @@ int main(int argc, char ** argv) {
// number of simultaneous "clients" to simulate
const int32_t n_clients = params.n_parallel;
// dedicate one sequence to the system prompt
params.n_parallel += 1;
// requests to simulate
const int32_t n_seq = params.n_sequences;
@@ -132,6 +129,7 @@ int main(int argc, char ** argv) {
llama_context * ctx = NULL;
// load the target model
params.logits_all = true;
std::tie(model, ctx) = llama_init_from_gpt_params(params);
// load the prompts from an external file if there are any
@@ -198,8 +196,8 @@ int main(int argc, char ** argv) {
}
// assign the system KV cache to all parallel sequences
for (int32_t i = 1; i <= n_clients; ++i) {
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
for (int32_t i = 1; i < n_clients; ++i) {
llama_kv_cache_seq_cp(ctx, 0, i, 0, n_tokens_system);
}
LOG_TEE("\n");
@@ -223,17 +221,15 @@ int main(int argc, char ** argv) {
client.i_batch = batch.n_tokens;
llama_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id + 1 }, true);
llama_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id }, true);
client.n_decoded += 1;
}
if (batch.n_tokens == 0) {
// all sequences have ended - clear the entire KV cache
for (int i = 1; i <= n_clients; ++i) {
llama_kv_cache_seq_rm(ctx, i, -1, -1);
// but keep the system prompt
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
for (int i = 0; i < n_clients; ++i) {
llama_kv_cache_seq_rm(ctx, i, n_tokens_system, -1);
}
LOG_TEE("%s: clearing the KV cache\n", __func__);
@@ -259,7 +255,7 @@ int main(int argc, char ** argv) {
tokens_prompt = ::llama_tokenize(ctx, client.prompt, false);
for (size_t i = 0; i < tokens_prompt.size(); ++i) {
llama_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false);
llama_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id }, false);
}
// extract the logits only for the last token
@@ -370,8 +366,7 @@ int main(int argc, char ** argv) {
}
// delete only the generated part of the sequence, i.e. keep the system prompt in the cache
llama_kv_cache_seq_rm(ctx, client.id + 1, -1, -1);
llama_kv_cache_seq_cp(ctx, 0, client.id + 1, -1, -1);
llama_kv_cache_seq_rm(ctx, client.id, n_tokens_system, -1);
const auto t_main_end = ggml_time_us();

View File

@@ -126,7 +126,7 @@ int main(int argc, char ** argv) {
const int n_batch = ctx_params.n_batch;
const int n_batch_grp = ctx_params.n_batch/n_grp;
LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos);
LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch);
// print the prompt token-by-token
@@ -146,11 +146,10 @@ int main(int argc, char ** argv) {
const int ib = i/n_batch - 1;
const int bd = n_batch_grp*(n_grp - 1);
llama_kv_cache_seq_add (ctx, 0, n_past - n_batch, n_past, ib*bd);
llama_kv_cache_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
llama_kv_cache_update (ctx);
llama_kv_cache_seq_shift(ctx, 0, n_past - n_batch, n_past, ib*bd);
llama_kv_cache_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
n_past -= bd;
}
llama_batch_clear(batch);
@@ -180,12 +179,10 @@ int main(int argc, char ** argv) {
LOG_TEE("%s: shifting KV cache with %d\n", __func__, n_discard);
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
//llama_kv_cache_defrag (ctx);
llama_kv_cache_update (ctx);
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
n_past -= n_discard;
llama_batch_clear(batch);
@@ -211,12 +208,10 @@ int main(int argc, char ** argv) {
if (n_discard > 0) {
LOG_TEE("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
//llama_kv_cache_defrag (ctx);
llama_kv_cache_update (ctx);
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
n_past -= n_discard;
}
}

View File

@@ -380,7 +380,6 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
const int batch_size = std::min(end - batch_start, n_batch);
//fprintf(stderr, " Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
// TODO: use llama_batch.logits instead of relying on logits_all == true
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
//fprintf(stderr, "%s : failed to eval\n", __func__);
return {tokens, -1, logit_history, prob_history};
@@ -443,7 +442,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
return {tokens, std::exp(nll / count), logit_history, prob_history};
}
static results_perplexity perplexity(llama_context * ctx, const gpt_params & params, const int32_t n_ctx) {
static results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
if (params.ppl_stride > 0) {
return perplexity_v2(ctx, params);
}
@@ -454,6 +453,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
// BOS tokens will be added for each chunk before eval
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
const int n_ctx = llama_n_ctx(ctx);
std::ofstream logits_stream;
if (!params.logits_file.empty()) {
@@ -499,19 +499,13 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
double nll2 = 0.0;
const int num_batches = (n_ctx + n_batch - 1) / n_batch;
const int n_seq = std::max(1, n_batch / n_ctx);
GGML_ASSERT(n_batch < n_ctx || n_batch % n_ctx == 0);
GGML_ASSERT(params.n_ctx == n_seq * n_ctx);
llama_batch batch = llama_batch_init(std::min(n_batch, n_ctx*n_seq), 0, 1);
std::vector<float> logits;
if (num_batches > 1) {
logits.reserve((size_t)n_ctx * n_vocab);
}
fprintf(stderr, "%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);
fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
@@ -524,26 +518,10 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
log_probs.resize(n_ctx * nv);
}
// We get the logits for all the tokens in the context window (params.n_ctx)
// from llama_eval above. Now, based on https://huggingface.co/docs/transformers/perplexity,
// calculate the perplexity over the last half of the window (so the model always has
// some context to predict the token).
//
// We rely on the fact that attention in the forward pass only looks at previous
// tokens here, so the logits returned for each token are an accurate representation
// of what the model would have predicted at that point.
//
// Example, we have a context window of 512, we will compute perplexity for each of the
// last 256 tokens. Then, we split the input up into context window size chunks to
// process the entire prompt.
const int first = n_ctx/2;
for (int i = 0; i < n_chunk; i += n_seq) {
for (int i = 0; i < n_chunk; ++i) {
const int start = i * n_ctx;
const int end = start + n_ctx;
const int n_seq_batch = std::min(n_seq, n_chunk - i);
const auto t_start = std::chrono::high_resolution_clock::now();
// clear the KV cache
@@ -553,54 +531,34 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
const int batch_start = start + j * n_batch;
const int batch_size = std::min(end - batch_start, n_batch);
int n_outputs = 0;
// save original token and restore it after eval
const auto token_org = tokens[batch_start];
batch.n_tokens = 0;
for (int seq = 0; seq < n_seq_batch; seq++) {
int seq_start = batch_start + seq*n_ctx;
// save original token and restore it after eval
const auto token_org = tokens[seq_start];
// add BOS token for the first batch of each chunk
if (add_bos && j == 0) {
tokens[seq_start] = llama_token_bos(llama_get_model(ctx));
}
for (int k = 0; k < batch_size; ++k) {
const int idx = seq*n_ctx + k;
batch.token [idx] = tokens[seq_start + k];
batch.pos [idx] = j*n_batch + k;
batch.n_seq_id[idx] = 1;
batch.seq_id [idx][0] = seq;
batch.logits [idx] = batch.pos[idx] >= first ? 1 : 0;
n_outputs += batch.logits[idx] != 0;
}
batch.n_tokens += batch_size;
// restore the original token in case it was set to BOS
tokens[seq_start] = token_org;
// add BOS token for the first batch of each chunk
if (add_bos && j == 0) {
tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
}
if (llama_decode(ctx, batch)) {
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return {tokens, -1, logit_history, prob_history};
}
if (num_batches > 1 && n_outputs > 0) {
// restore the original token in case it was set to BOS
tokens[batch_start] = token_org;
if (num_batches > 1) {
const auto * batch_logits = llama_get_logits(ctx);
logits.insert(logits.end(), batch_logits, batch_logits + n_outputs * n_vocab);
logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
}
}
const auto t_end = std::chrono::high_resolution_clock::now();
if (i == 0) {
llama_synchronize(ctx);
const auto t_end = std::chrono::high_resolution_clock::now();
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
int total_seconds = (int)(t_total*n_chunk/n_seq);
int total_seconds = (int)(t_total * n_chunk);
if (total_seconds >= 60*60) {
fprintf(stderr, "%d hours ", total_seconds / (60*60));
total_seconds = total_seconds % (60*60);
@@ -608,32 +566,37 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
}
for (int seq = 0; seq < n_seq_batch; seq++) {
const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits_ith(ctx, seq*n_ctx + first);
// We get the logits for all the tokens in the context window (params.n_ctx)
// from llama_eval above. Now, based on https://huggingface.co/docs/transformers/perplexity,
// calculate the perplexity over the last half of the window (so the model always has
// some context to predict the token).
//
// We rely on the fact that attention in the forward pass only looks at previous
// tokens here, so the logits returned for each token are an accurate representation
// of what the model would have predicted at that point.
//
// Example, we have a context window of 512, we will compute perplexity for each of the
// last 256 tokens. Then, we split the input up into context window size chunks to
// process the entire prompt.
const int first = n_ctx/2;
const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
if (!params.logits_file.empty()) {
process_logits(logits_stream, n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
workers, log_probs, nll, nll2);
} else {
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
}
count += n_ctx - first - 1;
llama_token * tokens_data = tokens.data() + start + seq*n_ctx + first;
if (!params.logits_file.empty()) {
process_logits(logits_stream, n_vocab, all_logits,
tokens_data, n_ctx - 1 - first,
workers, log_probs, nll, nll2);
} else {
process_logits(n_vocab, all_logits,
tokens_data, n_ctx - 1 - first,
workers, nll, nll2,
logit_history.data() + start + seq*n_ctx + first,
prob_history.data() + start + seq*n_ctx + first);
}
count += n_ctx - first - 1;
// perplexity is e^(average negative log-likelihood)
if (params.ppl_output_type == 0) {
printf("[%d]%.4lf,", i + seq + 1, std::exp(nll / count));
} else {
double av = nll/count;
double av2 = nll2/count - av*av;
if (av2 > 0) av2 = sqrt(av2/(count-1));
printf("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
}
// perplexity is e^(average negative log-likelihood)
if (params.ppl_output_type == 0) {
printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
} else {
double av = nll/count;
double av2 = nll2/count - av*av;
if (av2 > 0) av2 = sqrt(av2/(count-1));
printf("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
}
fflush(stdout);
@@ -652,13 +615,10 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
printf("Unexpected negative standard deviation of log(prob)\n");
}
llama_batch_free(batch);
return {tokens, ppl, logit_history, prob_history};
}
static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<float> & batch_logits, int32_t n_batch, int32_t n_vocab) {
int prev_outputs = 0;
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
@@ -679,14 +639,7 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<
return false;
}
int n_outputs = 0;
for (int i = 0; i < n_tokens; ++i) {
n_outputs += batch_view.logits[i] != 0;
}
memcpy(batch_logits.data() + prev_outputs*n_vocab, llama_get_logits(ctx), n_outputs*n_vocab*sizeof(float));
prev_outputs += n_outputs;
memcpy(batch_logits.data() + i*n_vocab, llama_get_logits(ctx), n_tokens*n_vocab*sizeof(float));
}
return true;
@@ -793,7 +746,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
size_t ending_logprob_count[4];
double ending_logprob[4];
size_t i_logits; // starting index of logits in the llama_batch
size_t i_batch; // starting index in the llama_batch
size_t common_prefix; // max number of initial tokens that are the same in all sentences
size_t required_tokens; // needed number of tokens to evaluate all 4 endings
std::vector<llama_token> seq_tokens[4];
@@ -856,12 +809,11 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
const int n_batch = params.n_batch;
const int max_tasks_per_batch = 32;
const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
const int max_seq = 4*max_tasks_per_batch;
llama_batch batch = llama_batch_init(n_ctx, 0, 4);
llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
std::vector<float> tok_logits(n_vocab);
// TODO: this could be made smaller; it's currently the worst-case size
std::vector<float> batch_logits(n_vocab*n_ctx);
std::vector<std::pair<size_t, llama_token>> eval_pairs;
@@ -872,17 +824,16 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
int n_cur = 0;
size_t i1 = i0;
size_t i_logits = 0; // this tells us how many logits were needed before this point in the batch
size_t i_batch = 0; // this tells us where in `llama_batch` we are currently
llama_batch_clear(batch);
// batch as much tasks as possible into the available context
// each task has 4 unique sequence ids - one for each ending
// each task has 4 unique seuqnce ids - one for each ending
// the common prefix is shared among the 4 sequences to save tokens
// we extract logits only from the last common token and from all ending tokens of each sequence
while (n_cur + (int) hs_data[i1].required_tokens <= n_ctx) {
auto & hs_cur = hs_data[i1];
int n_logits = 0;
const int s0 = 4*(i1 - i0);
if (s0 + 4 > max_seq) {
@@ -890,23 +841,18 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
}
for (size_t i = 0; i < hs_cur.common_prefix; ++i) {
llama_batch_add(batch, hs_cur.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3 }, false);
llama_batch_add(batch, hs_cur.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3}, false);
}
batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix
n_logits += 1;
for (int s = 0; s < 4; ++s) {
const size_t seq_tokens_size = hs_cur.seq_tokens[s].size();
// TODO: don't evaluate the last token of each sequence
for (size_t i = hs_cur.common_prefix; i < seq_tokens_size; ++i) {
const bool needs_logits = i < seq_tokens_size - 1;
llama_batch_add(batch, hs_cur.seq_tokens[s][i], i, { s0 + s }, needs_logits);
n_logits += needs_logits;
for (size_t i = hs_cur.common_prefix; i < hs_cur.seq_tokens[s].size(); ++i) {
llama_batch_add(batch, hs_cur.seq_tokens[s][i], i, { s0 + s }, true);
}
}
hs_cur.i_logits = i_logits;
i_logits += n_logits;
hs_cur.i_batch = i_batch;
i_batch += hs_cur.required_tokens;
n_cur += hs_data[i1].required_tokens;
if (++i1 == hs_task_count) {
@@ -932,11 +878,12 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
eval_pairs.clear();
for (size_t i = i0; i < i1; ++i) {
auto & hs_cur = hs_data[i];
size_t li = 1; // skip the last logit of the common prefix (computed separately below)
size_t li = hs_cur.common_prefix;
for (int s = 0; s < 4; ++s) {
for (size_t j = hs_cur.common_prefix; j < hs_cur.seq_tokens[s].size() - 1; j++) {
eval_pairs.emplace_back(hs_cur.i_logits + li++, hs_cur.seq_tokens[s][j + 1]);
eval_pairs.emplace_back(hs_cur.i_batch + li++, hs_cur.seq_tokens[s][j + 1]);
}
++li;
}
}
// Then we do the actual calculation
@@ -948,8 +895,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
for (size_t i = i0; i < i1; ++i) {
auto & hs_cur = hs_data[i];
// get the logits of the last token of the common prefix
std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*hs_cur.i_logits, n_vocab*sizeof(float));
std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*(hs_cur.i_batch + hs_cur.common_prefix - 1), n_vocab*sizeof(float));
const auto first_probs = softmax(tok_logits);
@@ -999,7 +945,7 @@ struct winogrande_entry {
std::array<std::string, 2> choices;
int answer;
size_t i_logits;
size_t i_batch;
size_t common_prefix;
size_t required_tokens;
size_t n_base1; // number of tokens for context + choice 1
@@ -1125,7 +1071,6 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
task.common_prefix++;
}
// TODO: the last token of each of the sequences don't need to be evaluated
task.required_tokens = task.common_prefix +
task.seq_tokens[0].size() - task.common_prefix +
task.seq_tokens[1].size() - task.common_prefix;
@@ -1141,12 +1086,11 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
const int n_batch = params.n_batch;
const int max_tasks_per_batch = 128;
const int max_seq = std::min(2*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
const int max_seq = 2*max_tasks_per_batch;
llama_batch batch = llama_batch_init(n_ctx, 0, 2);
llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
std::vector<float> tok_logits(n_vocab);
// TODO: this could be made smaller; it's currently the worst-case size
std::vector<float> batch_logits(n_vocab*n_ctx);
std::vector<std::pair<size_t, llama_token>> eval_pairs;
@@ -1160,33 +1104,29 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
int n_cur = 0;
size_t i1 = i0;
size_t i_logits = 0;
size_t i_batch = 0;
llama_batch_clear(batch);
while (n_cur + (int) data[i1].required_tokens <= n_ctx) {
int n_logits = 0;
const int s0 = 2*(i1 - i0);
if (s0 + 2 > max_seq) {
break;
}
for (size_t i = 0; i < data[i1].common_prefix; ++i) {
llama_batch_add(batch, data[i1].seq_tokens[0][i], i, { s0 + 0, s0 + 1 }, false);
llama_batch_add(batch, data[i1].seq_tokens[0][i], i, { s0 + 0, s0 + 1}, false);
}
batch.logits[batch.n_tokens - 1] = true;
n_logits += 1;
for (int s = 0; s < 2; ++s) {
// TODO: end before the last token, no need to predict past the end of the sequences
for (size_t i = data[i1].common_prefix; i < data[i1].seq_tokens[s].size(); ++i) {
llama_batch_add(batch, data[i1].seq_tokens[s][i], i, { s0 + s }, true);
n_logits += 1;
}
}
data[i1].i_logits = i_logits;
i_logits += n_logits;
data[i1].i_batch = i_batch;
i_batch += data[i1].required_tokens;
n_cur += data[i1].required_tokens;
if (++i1 == data.size()) {
@@ -1217,16 +1157,15 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
const auto& n_base1 = skip_choice ? task.n_base1 : task.common_prefix;
const int last_1st = task.seq_tokens[0].size() - n_base1 > 1 ? 1 : 0;
size_t li = n_base1 - task.common_prefix;
size_t li = n_base1 - 1;
for (size_t j = n_base1-1; j < task.seq_tokens[0].size()-1-last_1st; ++j) {
eval_pairs.emplace_back(task.i_logits + li++, task.seq_tokens[0][j+1]);
eval_pairs.emplace_back(task.i_batch + li++, task.seq_tokens[0][j+1]);
}
const auto& n_base2 = skip_choice ? task.n_base2 : task.common_prefix;
const int last_2nd = task.seq_tokens[1].size() - n_base2 > 1 ? 1 : 0;
// FIXME: this uses the wrong first logits when not skipping the choice word
li = task.seq_tokens[0].size() - task.common_prefix + n_base2 - task.common_prefix;
li = task.seq_tokens[0].size() - task.common_prefix + n_base2 - 1;
for (size_t j = n_base2-1; j < task.seq_tokens[1].size()-1-last_2nd; ++j) {
eval_pairs.emplace_back(task.i_logits + li++, task.seq_tokens[1][j+1]);
eval_pairs.emplace_back(task.i_batch + li++, task.seq_tokens[1][j+1]);
}
}
compute_logprobs(batch_logits.data(), n_vocab, workers, eval_pairs, eval_results);
@@ -1315,7 +1254,7 @@ struct multiple_choice_task {
}
// For evaluation
size_t i_logits; // starting index of logits in the llama_batch
size_t i_batch; // starting index in the llama_batch
size_t common_prefix; // max number of initial tokens that are the same in all sentences
size_t required_tokens; // needed number of tokens to evaluate all answers
std::vector<std::vector<llama_token>> seq_tokens;
@@ -1394,7 +1333,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
std::vector<uint32_t> task_pos(n_task);
strstream.read((char *)task_pos.data(), task_pos.size()*sizeof(uint32_t));
if (strstream.fail()) {
printf("%s: failed to read task positions from prompt\n", __func__);
printf("%s: failed to raad task positions from prompt\n", __func__);
return;
}
@@ -1475,7 +1414,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
return;
}
} else {
int n_dot = std::max((int) n_task/100, 1);
int n_dot = n_task/100;
int i_task = 0;
for (auto& task : tasks) {
++i_task;
@@ -1499,7 +1438,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
const int n_batch = params.n_batch;
const int max_tasks_per_batch = 32;
const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
const int max_seq = 4*max_tasks_per_batch;
llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
@@ -1519,18 +1458,17 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
int n_cur = 0;
size_t i1 = i0;
size_t i_logits = 0; // this tells us how many logits were needed before this point in the batch
size_t i_batch = 0; // this tells us where in `llama_batch` we are currently
llama_batch_clear(batch);
// batch as much tasks as possible into the available context
// each task has 4 unique sequence ids - one for each ending
// each task has 4 unique seuqnce ids - one for each ending
// the common prefix is shared among the 4 sequences to save tokens
// we extract logits only from the last common token and from all ending tokens of each sequence
int s0 = 0;
while (n_cur + (int) tasks[i1].required_tokens <= n_ctx) {
auto& cur_task = tasks[i1];
int n_logits = 0;
int num_answers = cur_task.seq_tokens.size();
if (s0 + num_answers > max_seq) {
@@ -1547,22 +1485,17 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
llama_batch_add(batch, cur_task.seq_tokens[0][i], i, batch_indeces, false);
}
batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix
n_logits += 1;
for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
const size_t seq_tokens_size = cur_task.seq_tokens[s].size();
// TODO: don't evaluate the last token of each sequence
for (size_t i = cur_task.common_prefix; i < seq_tokens_size; ++i) {
const bool needs_logits = i < seq_tokens_size - 1;
llama_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, needs_logits);
n_logits += needs_logits;
for (size_t i = cur_task.common_prefix; i < cur_task.seq_tokens[s].size(); ++i) {
llama_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, true);
}
}
s0 += num_answers;
cur_task.i_logits = i_logits;
i_logits += n_logits;
cur_task.i_batch = i_batch;
i_batch += cur_task.required_tokens;
n_cur += cur_task.required_tokens;
if (++i1 == tasks.size()) {
@@ -1588,11 +1521,12 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
eval_pairs.clear();
for (size_t i = i0; i < i1; ++i) {
auto& cur_task = tasks[i];
size_t li = 1; // skip the last logit of the common prefix (computed separately below)
size_t li = cur_task.common_prefix;
for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
eval_pairs.emplace_back(cur_task.i_logits + li++, cur_task.seq_tokens[s][j + 1]);
eval_pairs.emplace_back(cur_task.i_batch + li++, cur_task.seq_tokens[s][j + 1]);
}
++li;
}
}
// Then we do the actual calculation
@@ -1611,8 +1545,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
//}
//printf("\n common_prefix: %zu\n", cur_task.common_prefix);
// get the logits of the last token of the common prefix
std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*cur_task.i_logits, n_vocab*sizeof(float));
std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*(cur_task.i_batch + cur_task.common_prefix - 1), n_vocab*sizeof(float));
const auto first_probs = softmax(tok_logits);
@@ -1764,7 +1697,6 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
}
// TODO: use llama_batch.logits instead of relying on logits_all == true
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return;
@@ -1850,24 +1782,13 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
int main(int argc, char ** argv) {
gpt_params params;
params.n_batch = 512;
if (!gpt_params_parse(argc, argv, params)) {
return 1;
}
params.logits_all = true;
const int32_t n_ctx = params.n_ctx;
const bool ppl = !params.hellaswag && !params.winogrande && !params.multiple_choice && !params.kl_divergence;
if (ppl) {
int n_seq = std::max(1, params.n_batch / n_ctx);
int32_t n_kv = n_seq * n_ctx;
params.n_parallel = n_seq;
params.n_ctx = n_kv;
params.n_batch = std::min(params.n_batch, n_kv);
} else {
params.n_batch = std::min(params.n_batch, params.n_ctx);
}
params.n_batch = std::min(params.n_batch, params.n_ctx);
if (params.ppl_stride > 0) {
fprintf(stderr, "Will perform strided perplexity calculation -> adjusting context size from %d to %d\n",
@@ -1894,9 +1815,6 @@ int main(int argc, char ** argv) {
llama_model * model;
llama_context * ctx;
// ensure there's at least enough seq_ids for HellaSwag
params.n_parallel = std::max(4, params.n_parallel);
// load the model and apply lora adapter, if any
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == NULL) {
@@ -1926,7 +1844,7 @@ int main(int argc, char ** argv) {
} else if (params.kl_divergence) {
kl_divergence(ctx, params);
} else {
results = perplexity(ctx, params, n_ctx);
results = perplexity(ctx, params);
}
llama_print_timings(ctx);

View File

@@ -23,22 +23,16 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
{ "IQ2_XXS",LLAMA_FTYPE_MOSTLY_IQ2_XXS," 2.06 bpw quantization", },
{ "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization", },
{ "IQ2_S", LLAMA_FTYPE_MOSTLY_IQ2_S, " 2.5 bpw quantization", },
{ "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", },
{ "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", },
{ "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", },
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
{ "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
{ "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization", },
{ "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", },
{ "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", },
{ "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
{ "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization" , },
{ "Q3_K_XS",LLAMA_FTYPE_MOSTLY_Q3_K_XS,"3-bit extra small quantization" , },
{ "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
{ "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
{ "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", },
{ "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.50 bpw non-linear quantization", },
{ "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", },
{ "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.25 bpw non-linear quantization", },
{ "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
{ "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.59G, +0.0992 ppl @ LLaMA-v1-7B", },
{ "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0532 ppl @ LLaMA-v1-7B", },
@@ -88,17 +82,13 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
//
[[noreturn]]
static void usage(const char * executable) {
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
printf(" --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n");
printf(" --include-weights tensor_name: use importance matrix for this/these tensor(s)\n");
printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
printf("Note: --include-weights and --exclude-weights cannot be used together\n");
printf("\nAllowed quantization types:\n");
for (auto & it : QUANT_OPTIONS) {
@@ -112,14 +102,14 @@ static void usage(const char * executable) {
exit(1);
}
static void load_imatrix(const std::string & imatrix_file, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
static void load_imatrix(const std::string& imatrix_file, std::unordered_map<std::string, std::vector<float>>& imatrix_data) {
std::ifstream in(imatrix_file.c_str(), std::ios::binary);
if (!in) {
printf("%s: failed to open %s\n",__func__, imatrix_file.c_str());
printf("%s: failed to open %s\n",__func__,imatrix_file.c_str());
return;
}
int n_entries;
in.read((char *)&n_entries, sizeof(n_entries));
in.read((char*)&n_entries, sizeof(n_entries));
if (in.fail() || n_entries < 1) {
printf("%s: no data in file %s\n", __func__, imatrix_file.c_str());
return;
@@ -129,25 +119,25 @@ static void load_imatrix(const std::string & imatrix_file, std::unordered_map<st
std::vector<char> name_as_vec(len+1);
in.read((char *)name_as_vec.data(), len);
if (in.fail()) {
printf("%s: failed reading name for entry %d from %s\n", __func__, i+1, imatrix_file.c_str());
printf("%s: failed reading name for entry %d from %s\n",__func__,i+1,imatrix_file.c_str());
return;
}
name_as_vec[len] = 0;
std::string name{name_as_vec.data()};
auto & e = imatrix_data[std::move(name)];
auto& e = imatrix_data[std::move(name)];
int ncall;
in.read((char *)&ncall, sizeof(ncall));
in.read((char*)&ncall, sizeof(ncall));
int nval;
in.read((char *)&nval, sizeof(nval));
if (in.fail() || nval < 1) {
printf("%s: failed reading number of values for entry %d\n", __func__, i);
printf("%s: failed reading number of values for entry %d\n",__func__,i);
imatrix_data = {};
return;
}
e.resize(nval);
in.read((char *)e.data(), nval*sizeof(float));
in.read((char*)e.data(), nval*sizeof(float));
if (in.fail()) {
printf("%s: failed reading data for entry %d\n", __func__, i);
printf("%s: failed reading data for entry %d\n",__func__,i);
imatrix_data = {};
return;
}
@@ -155,13 +145,13 @@ static void load_imatrix(const std::string & imatrix_file, std::unordered_map<st
for (auto& v : e) v /= ncall;
}
}
printf("%s: loaded %d importance matrix entries from %s\n", __func__, int(imatrix_data.size()), imatrix_file.c_str());
printf("%s: loaded %d importance matrix entries from %s\n",__func__,int(imatrix_data.size()),imatrix_file.c_str());
}
static void prepare_imatrix(const std::string & imatrix_file,
const std::vector<std::string> & included_weights,
const std::vector<std::string> & excluded_weights,
std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
static void prepare_imatrix(const std::string& imatrix_file,
const std::vector<std::string>& included_weights,
const std::vector<std::string>& excluded_weights,
std::unordered_map<std::string, std::vector<float>>& imatrix_data) {
if (!imatrix_file.empty()) {
load_imatrix(imatrix_file, imatrix_data);
}
@@ -194,55 +184,6 @@ static void prepare_imatrix(const std::string & imatrix_file,
}
}
static ggml_type parse_ggml_type(const char * arg) {
ggml_type result = GGML_TYPE_COUNT;
for (int j = 0; j < GGML_TYPE_COUNT; ++j) {
auto type = ggml_type(j);
const auto * name = ggml_type_name(type);
if (name && strcmp(arg, name) == 0) {
result = type; break;
}
}
return result;
}
static bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
const char* sep = strchr(data, '=');
if (sep == nullptr || sep - data >= 128) {
fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data);
return false;
}
llama_model_kv_override kvo;
std::strncpy(kvo.key, data, sep - data);
kvo.key[sep - data] = 0;
sep++;
if (strncmp(sep, "int:", 4) == 0) {
sep += 4;
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
kvo.int_value = std::atol(sep);
} else if (strncmp(sep, "float:", 6) == 0) {
sep += 6;
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
kvo.float_value = std::atof(sep);
} else if (strncmp(sep, "bool:", 5) == 0) {
sep += 5;
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
if (std::strcmp(sep, "true") == 0) {
kvo.bool_value = true;
} else if (std::strcmp(sep, "false") == 0) {
kvo.bool_value = false;
} else {
fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data);
return false;
}
} else {
fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data);
return false;
}
overrides.emplace_back(std::move(kvo));
return true;
}
int main(int argc, char ** argv) {
if (argc < 3) {
usage(argv[0]);
@@ -253,27 +194,10 @@ int main(int argc, char ** argv) {
int arg_idx = 1;
std::string imatrix_file;
std::vector<std::string> included_weights, excluded_weights;
std::vector<llama_model_kv_override> kv_overrides;
for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
params.quantize_output_tensor = false;
} else if (strcmp(argv[arg_idx], "--output-tensor-type") == 0) {
if (arg_idx < argc-1) {
params.output_tensor_type = parse_ggml_type(argv[++arg_idx]);
} else {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--token-embedding-type") == 0) {
if (arg_idx < argc-1) {
params.token_embedding_type = parse_ggml_type(argv[++arg_idx]);
} else {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--override-kv") == 0) {
if (arg_idx == argc-1 || !parse_kv_override(argv[++arg_idx], kv_overrides)) {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
params.allow_requantize = true;
} else if (strcmp(argv[arg_idx], "--pure") == 0) {
@@ -314,11 +238,6 @@ int main(int argc, char ** argv) {
if (!imatrix_data.empty()) {
params.imatrix = &imatrix_data;
}
if (!kv_overrides.empty()) {
kv_overrides.emplace_back();
kv_overrides.back().key[0] = 0;
params.kv_overrides = &kv_overrides;
}
llama_backend_init();
@@ -340,7 +259,8 @@ int main(int argc, char ** argv) {
if (ftype_str == "COPY") {
params.only_copy = true;
}
} else {
}
else {
fname_out = argv[arg_idx];
arg_idx++;
@@ -370,13 +290,10 @@ int main(int argc, char ** argv) {
}
if ((params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S ||
params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S ||
params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) && imatrix_data.empty()) {
fprintf(stderr, "\n==========================================================================================================\n");
fprintf(stderr, "Please do not use IQ1_S, IQ1_M, IQ2_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
fprintf(stderr, "==========================================================================================================\n\n\n");
params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) && imatrix_data.empty()) {
fprintf(stderr, "\n===============================================================================================\n");
fprintf(stderr, "Please do not use IQ1_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
fprintf(stderr, "===============================================================================================\n\n\n");
return 1;
}

View File

@@ -1,20 +0,0 @@
import json, subprocess, sys, os
assert len(sys.argv) >= 2
[_, pattern, *rest] = sys.argv
print(subprocess.check_output(
[
"python",
os.path.join(
os.path.dirname(os.path.realpath(__file__)),
"json-schema-to-grammar.py"),
*rest,
"-",
"--raw-pattern",
],
text=True,
input=json.dumps({
"type": "string",
"pattern": pattern,
}, indent=2)))

View File

@@ -1,5 +0,0 @@
set(TARGET retrieval)
add_executable(${TARGET} retrieval.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View File

@@ -1,69 +0,0 @@
# llama.cpp/examples/retrieval
Demonstration of simple retrieval technique based on cosine similarity
More info:
https://github.com/ggerganov/llama.cpp/pull/6193
### How to use
`retieval.cpp` has parameters of its own:
- `--context-file`: file to be embedded - state this option multiple times to embed multiple files
- `--chunk-size`: minimum size of each text chunk to be embedded
- `--chunk-separator`: STRING to divide chunks by. newline by default
`retrieval` example can be tested as follows:
```bash
make -j && ./retrieval --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .
```
This chunks and embeds all given files and starts a loop requesting query inputs:
```
Enter query:
```
On each query input, top k chunks are shown along with file name, chunk position within file and original text:
```
Enter query: describe the mit license
batch_decode: n_tokens = 6, n_seq = 1
Top 3 similar chunks:
filename: README.md
filepos: 119
similarity: 0.762334
textdata:
png)
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
[Roadmap](https://github.
--------------------
filename: License
filepos: 0
similarity: 0.725146
textdata:
MIT License
Copyright (c) 2023 Georgi Gerganov
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
--------------------
filename: README.md
filepos: 9178
similarity: 0.621722
textdata:
com/cztomsik/ava) (MIT)
- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
- [pythops/tenere](https://github.
--------------------
```

Some files were not shown because too many files have changed in this diff Show More