mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-04-16 16:27:32 +03:00
Compare commits
107 Commits
b1081
...
build-meta
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
30ac7a4117 | ||
|
|
28eea84ac0 | ||
|
|
65520729a2 | ||
|
|
ac4038aab1 | ||
|
|
23360b15b6 | ||
|
|
bd33e5ab92 | ||
|
|
3103568144 | ||
|
|
5b8530d88c | ||
|
|
e4386f417f | ||
|
|
35195689cd | ||
|
|
cf9b08485c | ||
|
|
47068e5170 | ||
|
|
8f429fa511 | ||
|
|
6519e9c99c | ||
|
|
b7f2aa9e51 | ||
|
|
73a12a6344 | ||
|
|
3730134776 | ||
|
|
d9151e6f57 | ||
|
|
afc43d5f82 | ||
|
|
6460f758db | ||
|
|
ca82cf7bac | ||
|
|
323a9d3b8c | ||
|
|
99161230c4 | ||
|
|
15f1790a75 | ||
|
|
b59beebdbf | ||
|
|
4de22829d9 | ||
|
|
6a31a3bd98 | ||
|
|
cff7b0bf07 | ||
|
|
340af42f09 | ||
|
|
c42f0ec6b3 | ||
|
|
2753415afd | ||
|
|
bc054af97a | ||
|
|
3358c381f6 | ||
|
|
52315a4216 | ||
|
|
8b56b4f2c3 | ||
|
|
21f3d1be86 | ||
|
|
571083f508 | ||
|
|
f04d002844 | ||
|
|
bcf62ba7b4 | ||
|
|
e966ae0574 | ||
|
|
69fdbb9abc | ||
|
|
5d6f19f16b | ||
|
|
0d58936686 | ||
|
|
6c9c23429b | ||
|
|
ee8654bcd0 | ||
|
|
49bb9cbe0f | ||
|
|
ef15649972 | ||
|
|
d8d6977f48 | ||
|
|
5aec2cfaac | ||
|
|
13268c5331 | ||
|
|
4dcd47d71d | ||
|
|
18705a30ef | ||
|
|
e8d9158925 | ||
|
|
bce1fef328 | ||
|
|
528134dd02 | ||
|
|
aeefac4ff7 | ||
|
|
e8422de39e | ||
|
|
92d0b751a7 | ||
|
|
8afe228000 | ||
|
|
71d6975559 | ||
|
|
b532a69b2f | ||
|
|
c90d135eb4 | ||
|
|
0d1c706181 | ||
|
|
9509294420 | ||
|
|
35092fb547 | ||
|
|
dc07dc492e | ||
|
|
ad9ddcff6e | ||
|
|
8341a25957 | ||
|
|
849408957c | ||
|
|
06abf8eeba | ||
|
|
c03a243abf | ||
|
|
fa3582f509 | ||
|
|
e37e69dcc3 | ||
|
|
53885d7256 | ||
|
|
bcce96ba4d | ||
|
|
74e0caeb82 | ||
|
|
d4b5e16c32 | ||
|
|
3a007648f2 | ||
|
|
611363ac79 | ||
|
|
95b6e5212f | ||
|
|
44c117f41e | ||
|
|
43033b7bb4 | ||
|
|
6b73ef1201 | ||
|
|
75fafcbccc | ||
|
|
be475f60af | ||
|
|
3af6b86301 | ||
|
|
35feac6560 | ||
|
|
92b1bbd2ec | ||
|
|
dd0dc366da | ||
|
|
f55538c3cc | ||
|
|
ebcee207b6 | ||
|
|
3e8ff47af6 | ||
|
|
103cfafc77 | ||
|
|
c10704d01e | ||
|
|
230d46c723 | ||
|
|
463173a6c0 | ||
|
|
eaa13a48ff | ||
|
|
da7455d046 | ||
|
|
25423e9185 | ||
|
|
a6d1189fdd | ||
|
|
c48c5bb0b0 | ||
|
|
d0cee0d36d | ||
|
|
edd4c14817 | ||
|
|
1591e2e590 | ||
|
|
789c8c945a | ||
|
|
c1ac54b77a | ||
|
|
730d9c681e |
@@ -13,12 +13,13 @@
|
||||
# It is up to the user to install the correct vendor-specific support.
|
||||
|
||||
Name: llama.cpp-clblast
|
||||
Version: master
|
||||
Version: %( date "+%%Y%%m%%d" )
|
||||
Release: 1%{?dist}
|
||||
Summary: OpenCL Inference of LLaMA model in pure C/C++
|
||||
Summary: OpenCL Inference of LLaMA model in C/C++
|
||||
License: MIT
|
||||
Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
|
||||
BuildRequires: coreutils make gcc-c++ git mesa-libOpenCL-devel
|
||||
BuildRequires: coreutils make gcc-c++ git mesa-libOpenCL-devel clblast-devel
|
||||
Requires: clblast
|
||||
URL: https://github.com/ggerganov/llama.cpp
|
||||
|
||||
%define debug_package %{nil}
|
||||
@@ -35,18 +36,43 @@ make -j LLAMA_CLBLAST=1
|
||||
|
||||
%install
|
||||
mkdir -p %{buildroot}%{_bindir}/
|
||||
cp -p main %{buildroot}%{_bindir}/llamacppclblast
|
||||
cp -p server %{buildroot}%{_bindir}/llamacppclblastserver
|
||||
cp -p simple %{buildroot}%{_bindir}/llamacppclblastsimple
|
||||
cp -p main %{buildroot}%{_bindir}/llamaclblast
|
||||
cp -p server %{buildroot}%{_bindir}/llamaclblastserver
|
||||
cp -p simple %{buildroot}%{_bindir}/llamaclblastsimple
|
||||
|
||||
mkdir -p %{buildroot}/usr/lib/systemd/system
|
||||
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamaclblast.service
|
||||
[Unit]
|
||||
Description=Llama.cpp server, CPU only (no GPU support in this build).
|
||||
After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
EnvironmentFile=/etc/sysconfig/llama
|
||||
ExecStart=/usr/bin/llamaclblastserver $LLAMA_ARGS
|
||||
ExecReload=/bin/kill -s HUP $MAINPID
|
||||
Restart=never
|
||||
|
||||
[Install]
|
||||
WantedBy=default.target
|
||||
EOF
|
||||
|
||||
mkdir -p %{buildroot}/etc/sysconfig
|
||||
%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
|
||||
LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
|
||||
EOF
|
||||
|
||||
%clean
|
||||
rm -rf %{buildroot}
|
||||
rm -rf %{_builddir}/*
|
||||
|
||||
%files
|
||||
%{_bindir}/llamacppclblast
|
||||
%{_bindir}/llamacppclblastserver
|
||||
%{_bindir}/llamacppclblastsimple
|
||||
%{_bindir}/llamaclblast
|
||||
%{_bindir}/llamaclblastserver
|
||||
%{_bindir}/llamaclblastsimple
|
||||
/usr/lib/systemd/system/llamaclblast.service
|
||||
%config /etc/sysconfig/llama
|
||||
|
||||
|
||||
%pre
|
||||
|
||||
@@ -13,7 +13,7 @@
|
||||
# It is up to the user to install the correct vendor-specific support.
|
||||
|
||||
Name: llama.cpp-cublas
|
||||
Version: master
|
||||
Version: %( date "+%%Y%%m%%d" )
|
||||
Release: 1%{?dist}
|
||||
Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
|
||||
License: MIT
|
||||
@@ -40,6 +40,28 @@ cp -p main %{buildroot}%{_bindir}/llamacppcublas
|
||||
cp -p server %{buildroot}%{_bindir}/llamacppcublasserver
|
||||
cp -p simple %{buildroot}%{_bindir}/llamacppcublassimple
|
||||
|
||||
mkdir -p %{buildroot}/usr/lib/systemd/system
|
||||
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacublas.service
|
||||
[Unit]
|
||||
Description=Llama.cpp server, CPU only (no GPU support in this build).
|
||||
After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
EnvironmentFile=/etc/sysconfig/llama
|
||||
ExecStart=/usr/bin/llamacppcublasserver $LLAMA_ARGS
|
||||
ExecReload=/bin/kill -s HUP $MAINPID
|
||||
Restart=never
|
||||
|
||||
[Install]
|
||||
WantedBy=default.target
|
||||
EOF
|
||||
|
||||
mkdir -p %{buildroot}/etc/sysconfig
|
||||
%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
|
||||
LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
|
||||
EOF
|
||||
|
||||
%clean
|
||||
rm -rf %{buildroot}
|
||||
rm -rf %{_builddir}/*
|
||||
@@ -48,6 +70,8 @@ rm -rf %{_builddir}/*
|
||||
%{_bindir}/llamacppcublas
|
||||
%{_bindir}/llamacppcublasserver
|
||||
%{_bindir}/llamacppcublassimple
|
||||
/usr/lib/systemd/system/llamacublas.service
|
||||
%config /etc/sysconfig/llama
|
||||
|
||||
%pre
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
# Notes for llama.cpp:
|
||||
# 1. Tags are currently based on hash - which will not sort asciibetically.
|
||||
# We need to declare standard versioning if people want to sort latest releases.
|
||||
# In the meantime, YYYYMMDD format will be used.
|
||||
# 2. Builds for CUDA/OpenCL support are separate, with different depenedencies.
|
||||
# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
|
||||
# Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
|
||||
@@ -13,12 +14,13 @@
|
||||
# It is up to the user to install the correct vendor-specific support.
|
||||
|
||||
Name: llama.cpp
|
||||
Version: master
|
||||
Version: %( date "+%%Y%%m%%d" )
|
||||
Release: 1%{?dist}
|
||||
Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
|
||||
License: MIT
|
||||
Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
|
||||
BuildRequires: coreutils make gcc-c++ git
|
||||
BuildRequires: coreutils make gcc-c++ git libstdc++-devel
|
||||
Requires: libstdc++
|
||||
URL: https://github.com/ggerganov/llama.cpp
|
||||
|
||||
%define debug_package %{nil}
|
||||
@@ -26,27 +28,52 @@ URL: https://github.com/ggerganov/llama.cpp
|
||||
|
||||
%description
|
||||
CPU inference for Meta's Lllama2 models using default options.
|
||||
Models are not included in this package and must be downloaded separately.
|
||||
|
||||
%prep
|
||||
%autosetup
|
||||
%setup -n llama.cpp-master
|
||||
|
||||
%build
|
||||
make -j
|
||||
|
||||
%install
|
||||
mkdir -p %{buildroot}%{_bindir}/
|
||||
cp -p main %{buildroot}%{_bindir}/llamacpp
|
||||
cp -p server %{buildroot}%{_bindir}/llamacppserver
|
||||
cp -p simple %{buildroot}%{_bindir}/llamacppsimple
|
||||
cp -p main %{buildroot}%{_bindir}/llama
|
||||
cp -p server %{buildroot}%{_bindir}/llamaserver
|
||||
cp -p simple %{buildroot}%{_bindir}/llamasimple
|
||||
|
||||
mkdir -p %{buildroot}/usr/lib/systemd/system
|
||||
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llama.service
|
||||
[Unit]
|
||||
Description=Llama.cpp server, CPU only (no GPU support in this build).
|
||||
After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
EnvironmentFile=/etc/sysconfig/llama
|
||||
ExecStart=/usr/bin/llamaserver $LLAMA_ARGS
|
||||
ExecReload=/bin/kill -s HUP $MAINPID
|
||||
Restart=never
|
||||
|
||||
[Install]
|
||||
WantedBy=default.target
|
||||
EOF
|
||||
|
||||
mkdir -p %{buildroot}/etc/sysconfig
|
||||
%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
|
||||
LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
|
||||
EOF
|
||||
|
||||
%clean
|
||||
rm -rf %{buildroot}
|
||||
rm -rf %{_builddir}/*
|
||||
|
||||
%files
|
||||
%{_bindir}/llamacpp
|
||||
%{_bindir}/llamacppserver
|
||||
%{_bindir}/llamacppsimple
|
||||
%{_bindir}/llama
|
||||
%{_bindir}/llamaserver
|
||||
%{_bindir}/llamasimple
|
||||
/usr/lib/systemd/system/llama.service
|
||||
%config /etc/sysconfig/llama
|
||||
|
||||
%pre
|
||||
|
||||
|
||||
@@ -7,15 +7,12 @@ arg1="$1"
|
||||
# Shift the arguments to remove the first one
|
||||
shift
|
||||
|
||||
# Join the remaining arguments into a single string
|
||||
arg2="$@"
|
||||
|
||||
if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
|
||||
python3 ./convert.py "$arg2"
|
||||
python3 ./convert.py "$@"
|
||||
elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
|
||||
./quantize "$arg2"
|
||||
./quantize "$@"
|
||||
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
|
||||
./main "$arg2"
|
||||
./main "$@"
|
||||
elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
|
||||
echo "Converting PTH to GGML..."
|
||||
for i in `ls $1/$2/ggml-model-f16.bin*`; do
|
||||
@@ -27,7 +24,7 @@ elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
|
||||
fi
|
||||
done
|
||||
elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
|
||||
./server "$arg2"
|
||||
./server "$@"
|
||||
else
|
||||
echo "Unknown command: $arg1"
|
||||
echo "Available commands: "
|
||||
|
||||
@@ -17,3 +17,6 @@ indent_style = tab
|
||||
|
||||
[prompts/*.txt]
|
||||
insert_final_newline = unset
|
||||
|
||||
[examples/server/public/*]
|
||||
indent_size = 2
|
||||
|
||||
13
.github/workflows/build.yml
vendored
13
.github/workflows/build.yml
vendored
@@ -18,7 +18,6 @@ on:
|
||||
env:
|
||||
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||
GGML_NLOOP: 3
|
||||
GGML_NITER: 1
|
||||
GGML_N_THREADS: 1
|
||||
|
||||
jobs:
|
||||
@@ -41,6 +40,12 @@ jobs:
|
||||
run: |
|
||||
CC=gcc-8 make
|
||||
|
||||
- name: Test
|
||||
id: make_test
|
||||
run: |
|
||||
CC=gcc-8 make tests
|
||||
make test
|
||||
|
||||
ubuntu-latest-cmake:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
@@ -157,6 +162,12 @@ jobs:
|
||||
run: |
|
||||
make
|
||||
|
||||
- name: Test
|
||||
id: make_test
|
||||
run: |
|
||||
make tests
|
||||
make test
|
||||
|
||||
macOS-latest-cmake:
|
||||
runs-on: macos-latest
|
||||
|
||||
|
||||
36
.github/workflows/code-coverage.yml
vendored
Normal file
36
.github/workflows/code-coverage.yml
vendored
Normal file
@@ -0,0 +1,36 @@
|
||||
name: Code Coverage
|
||||
on: [push, pull_request]
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
|
||||
jobs:
|
||||
run:
|
||||
runs-on: ubuntu-20.04
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Dependencies
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential gcc-8 lcov
|
||||
|
||||
- name: Build
|
||||
run: CC=gcc-8 make -j LLAMA_CODE_COVERAGE=1 tests
|
||||
|
||||
- name: Run tests
|
||||
run: CC=gcc-8 make test
|
||||
|
||||
- name: Generate coverage report
|
||||
run: |
|
||||
make coverage
|
||||
make lcov-report
|
||||
|
||||
- name: Upload coverage to Codecov
|
||||
uses: codecov/codecov-action@v3
|
||||
env:
|
||||
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
|
||||
with:
|
||||
files: lcov-report/coverage.info
|
||||
43
.github/workflows/gguf-publish.yml
vendored
Normal file
43
.github/workflows/gguf-publish.yml
vendored
Normal file
@@ -0,0 +1,43 @@
|
||||
# This workflow will upload a Python Package using Twine when a GGUF release is created
|
||||
# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
|
||||
|
||||
# See `gguf-py/README.md` for how to make a release.
|
||||
|
||||
# This workflow uses actions that are not certified by GitHub.
|
||||
# They are provided by a third-party and are governed by
|
||||
# separate terms of service, privacy policy, and support
|
||||
# documentation.
|
||||
|
||||
name: Upload Python Package
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
push:
|
||||
# Pattern matched against refs/tags
|
||||
tags:
|
||||
- 'gguf-v*' # Push events to every version tag
|
||||
|
||||
|
||||
jobs:
|
||||
deploy:
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v2
|
||||
with:
|
||||
python-version: '3.9.x'
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
cd gguf-py
|
||||
python -m pip install poetry
|
||||
poetry install
|
||||
|
||||
- name: Build package
|
||||
run: poetry build
|
||||
- name: Publish package
|
||||
uses: pypa/gh-action-pypi-publish@release/v1
|
||||
with:
|
||||
password: ${{ secrets.PYPI_API_TOKEN }}
|
||||
42
.gitignore
vendored
42
.gitignore
vendored
@@ -5,6 +5,11 @@
|
||||
*.bin
|
||||
*.exe
|
||||
*.dll
|
||||
*.log
|
||||
*.gcov
|
||||
*.gcno
|
||||
*.gcda
|
||||
*.dot
|
||||
.DS_Store
|
||||
.build/
|
||||
.cache/
|
||||
@@ -16,6 +21,9 @@
|
||||
.vs/
|
||||
.vscode/
|
||||
|
||||
lcov-report/
|
||||
gcovr-report/
|
||||
|
||||
build*/
|
||||
out/
|
||||
tmp/
|
||||
@@ -23,24 +31,29 @@ tmp/
|
||||
models/*
|
||||
models-mnt
|
||||
|
||||
/main
|
||||
/quantize
|
||||
/quantize-stats
|
||||
/result
|
||||
/perplexity
|
||||
/embedding
|
||||
/train-text-from-scratch
|
||||
/convert-llama2c-to-ggml
|
||||
/simple
|
||||
/benchmark-matmult
|
||||
/vdot
|
||||
/server
|
||||
/Pipfile
|
||||
/baby-llama
|
||||
/beam-search
|
||||
/benchmark-matmult
|
||||
/convert-llama2c-to-ggml
|
||||
/embd-input-test
|
||||
/embedding
|
||||
/gguf
|
||||
/gguf-llama-simple
|
||||
/libllama.so
|
||||
/llama-bench
|
||||
/main
|
||||
/metal
|
||||
/perplexity
|
||||
/quantize
|
||||
/quantize-stats
|
||||
/result
|
||||
/save-load-state
|
||||
/server
|
||||
/simple
|
||||
/speculative
|
||||
/train-text-from-scratch
|
||||
/vdot
|
||||
build-info.h
|
||||
arm_neon.h
|
||||
compile_commands.json
|
||||
@@ -63,10 +76,13 @@ poetry.toml
|
||||
|
||||
# Test binaries
|
||||
tests/test-grammar-parser
|
||||
tests/test-llama-grammar
|
||||
tests/test-double-float
|
||||
tests/test-grad0
|
||||
tests/test-opt
|
||||
tests/test-quantize-fns
|
||||
tests/test-quantize-perf
|
||||
tests/test-sampling
|
||||
tests/test-tokenizer-0
|
||||
tests/test-tokenizer-0-llama
|
||||
tests/test-tokenizer-0-falcon
|
||||
tests/test-tokenizer-1
|
||||
|
||||
@@ -36,6 +36,12 @@ endif()
|
||||
# Option list
|
||||
#
|
||||
|
||||
if (APPLE)
|
||||
set(LLAMA_METAL_DEFAULT ON)
|
||||
else()
|
||||
set(LLAMA_METAL_DEFAULT OFF)
|
||||
endif()
|
||||
|
||||
# general
|
||||
option(LLAMA_STATIC "llama: static link libraries" OFF)
|
||||
option(LLAMA_NATIVE "llama: enable -march=native flag" OFF)
|
||||
@@ -76,7 +82,7 @@ option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some
|
||||
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
|
||||
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
|
||||
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
|
||||
option(LLAMA_METAL "llama: use Metal" OFF)
|
||||
option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
|
||||
option(LLAMA_MPI "llama: use MPI" OFF)
|
||||
option(LLAMA_K_QUANTS "llama: use k-quants" ON)
|
||||
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
|
||||
@@ -158,6 +164,31 @@ if (APPLE AND LLAMA_ACCELERATE)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (LLAMA_METAL)
|
||||
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
|
||||
find_library(METAL_FRAMEWORK Metal REQUIRED)
|
||||
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
|
||||
|
||||
message(STATUS "Metal framework found")
|
||||
|
||||
set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)
|
||||
|
||||
add_compile_definitions(GGML_USE_METAL)
|
||||
#add_compile_definitions(GGML_METAL_NDEBUG)
|
||||
|
||||
# get full path to the file
|
||||
#add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
|
||||
|
||||
# copy ggml-metal.metal to bin directory
|
||||
configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY)
|
||||
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
|
||||
${FOUNDATION_LIBRARY}
|
||||
${METAL_FRAMEWORK}
|
||||
${METALKIT_FRAMEWORK}
|
||||
)
|
||||
endif()
|
||||
|
||||
if (LLAMA_BLAS)
|
||||
if (LLAMA_STATIC)
|
||||
set(BLA_STATIC ON)
|
||||
@@ -293,29 +324,6 @@ if (LLAMA_CUBLAS)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (LLAMA_METAL)
|
||||
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
|
||||
find_library(METAL_FRAMEWORK Metal REQUIRED)
|
||||
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
|
||||
|
||||
set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)
|
||||
|
||||
add_compile_definitions(GGML_USE_METAL)
|
||||
add_compile_definitions(GGML_METAL_NDEBUG)
|
||||
|
||||
# get full path to the file
|
||||
#add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
|
||||
|
||||
# copy ggml-metal.metal to bin directory
|
||||
configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY)
|
||||
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
|
||||
${FOUNDATION_LIBRARY}
|
||||
${METAL_FRAMEWORK}
|
||||
${METALKIT_FRAMEWORK}
|
||||
)
|
||||
endif()
|
||||
|
||||
if (LLAMA_MPI)
|
||||
cmake_minimum_required(VERSION 3.10)
|
||||
find_package(MPI)
|
||||
@@ -402,6 +410,8 @@ if (LLAMA_ALL_WARNINGS)
|
||||
-Wstrict-prototypes
|
||||
-Wpointer-arith
|
||||
-Wmissing-prototypes
|
||||
-Werror=implicit-int
|
||||
-Wno-unused-function
|
||||
)
|
||||
set(cxx_flags
|
||||
-Wall
|
||||
@@ -411,6 +421,10 @@ if (LLAMA_ALL_WARNINGS)
|
||||
-Wno-unused-function
|
||||
-Wno-multichar
|
||||
)
|
||||
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
|
||||
# g++ only
|
||||
set(cxx_flags ${cxx_flags} -Wno-format-truncation)
|
||||
endif()
|
||||
else()
|
||||
# todo : msvc
|
||||
endif()
|
||||
|
||||
309
Makefile
309
Makefile
@@ -1,10 +1,11 @@
|
||||
# Define the default target now so that it is always the first target
|
||||
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple server embd-input-test gguf llama-bench
|
||||
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative tests/test-c.o
|
||||
|
||||
# Binaries only useful for tests
|
||||
TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0
|
||||
TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1
|
||||
|
||||
default: $(BUILD_TARGETS)
|
||||
# Code coverage output files
|
||||
COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
|
||||
|
||||
ifndef UNAME_S
|
||||
UNAME_S := $(shell uname -s)
|
||||
@@ -18,12 +19,13 @@ ifndef UNAME_M
|
||||
UNAME_M := $(shell uname -m)
|
||||
endif
|
||||
|
||||
CCV := $(shell $(CC) --version | head -n 1)
|
||||
CXXV := $(shell $(CXX) --version | head -n 1)
|
||||
|
||||
# Mac OS + Arm can report x86_64
|
||||
# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
ifndef LLAMA_NO_METAL
|
||||
LLAMA_METAL := 1
|
||||
endif
|
||||
|
||||
ifneq ($(UNAME_P),arm)
|
||||
SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
|
||||
ifeq ($(SYSCTL_M),1)
|
||||
@@ -34,6 +36,49 @@ ifeq ($(UNAME_S),Darwin)
|
||||
endif
|
||||
endif
|
||||
|
||||
ifneq '' '$(or $(filter clean,$(MAKECMDGOALS)),$(LLAMA_METAL))'
|
||||
BUILD_TARGETS += metal
|
||||
endif
|
||||
|
||||
default: $(BUILD_TARGETS)
|
||||
|
||||
test:
|
||||
@echo "Running tests..."
|
||||
@for test_target in $(TEST_TARGETS); do \
|
||||
if [ "$$test_target" = "tests/test-tokenizer-0-llama" ]; then \
|
||||
./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
|
||||
elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
|
||||
continue; \
|
||||
elif [ "$$test_target" = "tests/test-tokenizer-1" ]; then \
|
||||
continue; \
|
||||
else \
|
||||
./$$test_target; \
|
||||
fi; \
|
||||
done
|
||||
@echo "All tests have been run."
|
||||
|
||||
all: $(BUILD_TARGETS) $(TEST_TARGETS)
|
||||
|
||||
coverage: ## Run code coverage
|
||||
gcov -pb tests/*.cpp
|
||||
|
||||
lcov-report: coverage ## Generate lcov report
|
||||
mkdir -p lcov-report
|
||||
lcov --capture --directory . --output-file lcov-report/coverage.info
|
||||
genhtml lcov-report/coverage.info --output-directory lcov-report
|
||||
|
||||
gcovr-report: coverage ## Generate gcovr report
|
||||
mkdir -p gcovr-report
|
||||
gcovr --root . --html --html-details --output gcovr-report/coverage.html
|
||||
|
||||
ifdef RISCV_CROSS_COMPILE
|
||||
CC := riscv64-unknown-linux-gnu-gcc
|
||||
CXX := riscv64-unknown-linux-gnu-g++
|
||||
endif
|
||||
|
||||
CCV := $(shell $(CC) --version | head -n 1)
|
||||
CXXV := $(shell $(CXX) --version | head -n 1)
|
||||
|
||||
#
|
||||
# Compile flags
|
||||
#
|
||||
@@ -45,53 +90,48 @@ OPT = -Ofast
|
||||
else
|
||||
OPT = -O3
|
||||
endif
|
||||
CFLAGS = -I. $(OPT) -std=c11 -fPIC
|
||||
CXXFLAGS = -I. -I./common $(OPT) -std=c++11 -fPIC
|
||||
LDFLAGS =
|
||||
MK_CPPFLAGS = -I. -Icommon
|
||||
MK_CFLAGS = $(CPPFLAGS) $(OPT) -std=c11 -fPIC
|
||||
MK_CXXFLAGS = $(CPPFLAGS) $(OPT) -std=c++11 -fPIC
|
||||
MK_LDFLAGS =
|
||||
|
||||
ifdef LLAMA_DEBUG
|
||||
CFLAGS += -O0 -g
|
||||
CXXFLAGS += -O0 -g
|
||||
LDFLAGS += -g
|
||||
MK_CFLAGS += -O0 -g
|
||||
MK_CXXFLAGS += -O0 -g
|
||||
MK_LDFLAGS += -g
|
||||
else
|
||||
CFLAGS += -DNDEBUG
|
||||
CXXFLAGS += -DNDEBUG
|
||||
MK_CPPFLAGS += -DNDEBUG
|
||||
endif
|
||||
|
||||
ifdef LLAMA_SERVER_VERBOSE
|
||||
CXXFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
|
||||
MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
|
||||
endif
|
||||
|
||||
|
||||
ifdef LLAMA_CODE_COVERAGE
|
||||
CXXFLAGS += -fprofile-arcs -ftest-coverage -dumpbase ''
|
||||
endif
|
||||
|
||||
ifdef LLAMA_DISABLE_LOGS
|
||||
CFLAGS += -DLOG_DISABLE_LOGS
|
||||
CXXFLAGS += -DLOG_DISABLE_LOGS
|
||||
endif # LLAMA_DISABLE_LOGS
|
||||
|
||||
# warnings
|
||||
CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \
|
||||
-Wmissing-prototypes
|
||||
CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar
|
||||
MK_CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \
|
||||
-Wmissing-prototypes -Werror=implicit-int -Wno-unused-function
|
||||
MK_CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar
|
||||
|
||||
ifeq '' '$(findstring clang++,$(CXX))'
|
||||
# g++ only
|
||||
CXXFLAGS += -Wno-format-truncation
|
||||
endif
|
||||
|
||||
# OS specific
|
||||
# TODO: support Windows
|
||||
ifeq ($(UNAME_S),Linux)
|
||||
CFLAGS += -pthread
|
||||
CXXFLAGS += -pthread
|
||||
endif
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
CFLAGS += -pthread
|
||||
CXXFLAGS += -pthread
|
||||
endif
|
||||
ifeq ($(UNAME_S),FreeBSD)
|
||||
CFLAGS += -pthread
|
||||
CXXFLAGS += -pthread
|
||||
endif
|
||||
ifeq ($(UNAME_S),NetBSD)
|
||||
CFLAGS += -pthread
|
||||
CXXFLAGS += -pthread
|
||||
endif
|
||||
ifeq ($(UNAME_S),OpenBSD)
|
||||
CFLAGS += -pthread
|
||||
CXXFLAGS += -pthread
|
||||
endif
|
||||
ifeq ($(UNAME_S),Haiku)
|
||||
CFLAGS += -pthread
|
||||
CXXFLAGS += -pthread
|
||||
ifneq '' '$(filter $(UNAME_S),Linux Darwin FreeBSD NetBSD OpenBSD Haiku)'
|
||||
MK_CFLAGS += -pthread
|
||||
MK_CXXFLAGS += -pthread
|
||||
endif
|
||||
|
||||
# detect Windows
|
||||
@@ -117,104 +157,127 @@ ifeq ($(_WIN32),1)
|
||||
endif
|
||||
|
||||
ifdef LLAMA_GPROF
|
||||
CFLAGS += -pg
|
||||
CXXFLAGS += -pg
|
||||
MK_CFLAGS += -pg
|
||||
MK_CXXFLAGS += -pg
|
||||
endif
|
||||
ifdef LLAMA_PERF
|
||||
CFLAGS += -DGGML_PERF
|
||||
CXXFLAGS += -DGGML_PERF
|
||||
MK_CPPFLAGS += -DGGML_PERF
|
||||
endif
|
||||
|
||||
# Architecture specific
|
||||
# TODO: probably these flags need to be tweaked on some architectures
|
||||
# feel free to update the Makefile for your architecture and send a pull request or issue
|
||||
|
||||
ifndef RISCV
|
||||
|
||||
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
|
||||
# Use all CPU extensions that are available:
|
||||
CFLAGS += -march=native -mtune=native
|
||||
CXXFLAGS += -march=native -mtune=native
|
||||
MK_CFLAGS += -march=native -mtune=native
|
||||
MK_CXXFLAGS += -march=native -mtune=native
|
||||
|
||||
# Usage AVX-only
|
||||
#CFLAGS += -mfma -mf16c -mavx
|
||||
#CXXFLAGS += -mfma -mf16c -mavx
|
||||
#MK_CFLAGS += -mfma -mf16c -mavx
|
||||
#MK_CXXFLAGS += -mfma -mf16c -mavx
|
||||
|
||||
# Usage SSSE3-only (Not is SSE3!)
|
||||
#CFLAGS += -mssse3
|
||||
#CXXFLAGS += -mssse3
|
||||
#MK_CFLAGS += -mssse3
|
||||
#MK_CXXFLAGS += -mssse3
|
||||
endif
|
||||
|
||||
# The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
|
||||
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
|
||||
# https://github.com/ggerganov/llama.cpp/issues/2922
|
||||
ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))'
|
||||
CFLAGS += -Xassembler -muse-unaligned-vector-move
|
||||
CXXFLAGS += -Xassembler -muse-unaligned-vector-move
|
||||
endif
|
||||
|
||||
ifneq ($(filter aarch64%,$(UNAME_M)),)
|
||||
# Apple M1, M2, etc.
|
||||
# Raspberry Pi 3, 4, Zero 2 (64-bit)
|
||||
CFLAGS += -mcpu=native
|
||||
CXXFLAGS += -mcpu=native
|
||||
MK_CFLAGS += -mcpu=native
|
||||
MK_CXXFLAGS += -mcpu=native
|
||||
endif
|
||||
|
||||
ifneq ($(filter armv6%,$(UNAME_M)),)
|
||||
# Raspberry Pi 1, Zero
|
||||
CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
|
||||
MK_CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
|
||||
MK_CXXFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
|
||||
endif
|
||||
|
||||
ifneq ($(filter armv7%,$(UNAME_M)),)
|
||||
# Raspberry Pi 2
|
||||
CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
|
||||
MK_CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
|
||||
MK_CXXFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
|
||||
endif
|
||||
|
||||
ifneq ($(filter armv8%,$(UNAME_M)),)
|
||||
# Raspberry Pi 3, 4, Zero 2 (32-bit)
|
||||
CFLAGS += -mfp16-format=ieee -mno-unaligned-access
|
||||
MK_CFLAGS += -mfp16-format=ieee -mno-unaligned-access
|
||||
MK_CXXFLAGS += -mfp16-format=ieee -mno-unaligned-access
|
||||
endif
|
||||
|
||||
ifneq ($(filter ppc64%,$(UNAME_M)),)
|
||||
POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
|
||||
ifneq (,$(findstring POWER9,$(POWER9_M)))
|
||||
CFLAGS += -mcpu=power9
|
||||
CXXFLAGS += -mcpu=power9
|
||||
endif
|
||||
# Require c++23's std::byteswap for big-endian support.
|
||||
ifeq ($(UNAME_M),ppc64)
|
||||
CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN
|
||||
MK_CFLAGS += -mcpu=power9
|
||||
MK_CXXFLAGS += -mcpu=power9
|
||||
endif
|
||||
endif
|
||||
|
||||
else
|
||||
CFLAGS += -march=rv64gcv -mabi=lp64d
|
||||
CXXFLAGS += -march=rv64gcv -mabi=lp64d
|
||||
endif
|
||||
|
||||
ifndef LLAMA_NO_K_QUANTS
|
||||
CFLAGS += -DGGML_USE_K_QUANTS
|
||||
CXXFLAGS += -DGGML_USE_K_QUANTS
|
||||
MK_CPPFLAGS += -DGGML_USE_K_QUANTS
|
||||
OBJS += k_quants.o
|
||||
ifdef LLAMA_QKK_64
|
||||
CFLAGS += -DGGML_QKK_64
|
||||
CXXFLAGS += -DGGML_QKK_64
|
||||
MK_CPPFLAGS += -DGGML_QKK_64
|
||||
endif
|
||||
endif
|
||||
|
||||
ifndef LLAMA_NO_ACCELERATE
|
||||
# Mac M1 - include Accelerate framework.
|
||||
# `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time).
|
||||
# Mac OS - include Accelerate framework.
|
||||
# `-framework Accelerate` works both with Apple Silicon and Mac Intel
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
CFLAGS += -DGGML_USE_ACCELERATE
|
||||
LDFLAGS += -framework Accelerate
|
||||
MK_CPPFLAGS += -DGGML_USE_ACCELERATE
|
||||
MK_LDFLAGS += -framework Accelerate
|
||||
endif
|
||||
endif # LLAMA_NO_ACCELERATE
|
||||
|
||||
ifdef LLAMA_METAL
|
||||
# By default - use GPU acceleration on Mac OS
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
CFLAGS += -DGGML_USE_METAL #-DGGML_METAL_NDEBUG
|
||||
CXXFLAGS += -DGGML_USE_METAL
|
||||
LDFLAGS += -framework Foundation -framework Metal -framework MetalKit
|
||||
OBJS += ggml-metal.o
|
||||
endif
|
||||
endif # LLAMA_METAL
|
||||
|
||||
ifdef LLAMA_MPI
|
||||
CFLAGS += -DGGML_USE_MPI -Wno-cast-qual
|
||||
CXXFLAGS += -DGGML_USE_MPI -Wno-cast-qual
|
||||
MK_CPPFLAGS += -DGGML_USE_MPI
|
||||
MK_CFLAGS += -Wno-cast-qual
|
||||
MK_CXXFLAGS += -Wno-cast-qual
|
||||
OBJS += ggml-mpi.o
|
||||
endif # LLAMA_MPI
|
||||
|
||||
ifdef LLAMA_OPENBLAS
|
||||
CFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags openblas)
|
||||
LDFLAGS += $(shell pkg-config --libs openblas)
|
||||
MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
|
||||
MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
|
||||
MK_LDFLAGS += $(shell pkg-config --libs openblas)
|
||||
endif # LLAMA_OPENBLAS
|
||||
|
||||
ifdef LLAMA_BLIS
|
||||
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
|
||||
LDFLAGS += -lblis -L/usr/local/lib
|
||||
MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
|
||||
MK_LDFLAGS += -lblis -L/usr/local/lib
|
||||
endif # LLAMA_BLIS
|
||||
|
||||
ifdef LLAMA_CUBLAS
|
||||
CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
|
||||
CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
|
||||
LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
|
||||
MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
|
||||
MK_LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
|
||||
OBJS += ggml-cuda.o
|
||||
NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math
|
||||
ifdef LLAMA_CUDA_NVCC
|
||||
@@ -265,14 +328,15 @@ endif # LLAMA_CUBLAS
|
||||
|
||||
ifdef LLAMA_CLBLAST
|
||||
|
||||
CFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags clblast OpenCL)
|
||||
CXXFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags clblast OpenCL)
|
||||
MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
|
||||
MK_CFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
|
||||
MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
|
||||
|
||||
# Mac provides OpenCL as a framework
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
LDFLAGS += -lclblast -framework OpenCL
|
||||
MK_LDFLAGS += -lclblast -framework OpenCL
|
||||
else
|
||||
LDFLAGS += $(shell pkg-config --libs clblast OpenCL)
|
||||
MK_LDFLAGS += $(shell pkg-config --libs clblast OpenCL)
|
||||
endif
|
||||
OBJS += ggml-opencl.o
|
||||
|
||||
@@ -287,10 +351,9 @@ ifdef LLAMA_HIPBLAS
|
||||
LLAMA_CUDA_DMMV_X ?= 32
|
||||
LLAMA_CUDA_MMV_Y ?= 1
|
||||
LLAMA_CUDA_KQUANTS_ITER ?= 2
|
||||
CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
|
||||
CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
|
||||
LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
|
||||
LDFLAGS += -lhipblas -lamdhip64 -lrocblas
|
||||
MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
|
||||
MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
|
||||
MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
|
||||
HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS))
|
||||
HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
|
||||
HIPFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
|
||||
@@ -305,10 +368,9 @@ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
|
||||
endif # LLAMA_HIPBLAS
|
||||
|
||||
ifdef LLAMA_METAL
|
||||
CFLAGS += -DGGML_USE_METAL -DGGML_METAL_NDEBUG
|
||||
CXXFLAGS += -DGGML_USE_METAL
|
||||
LDFLAGS += -framework Foundation -framework Metal -framework MetalKit
|
||||
OBJS += ggml-metal.o
|
||||
MK_CPPFLAGS += -DGGML_USE_METAL #-DGGML_METAL_NDEBUG
|
||||
MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit
|
||||
OBJS += ggml-metal.o
|
||||
endif # LLAMA_METAL
|
||||
|
||||
ifdef LLAMA_METAL
|
||||
@@ -321,11 +383,17 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
endif # LLAMA_MPI
|
||||
|
||||
ifdef LLAMA_NO_K_QUANTS
|
||||
ifndef LLAMA_NO_K_QUANTS
|
||||
k_quants.o: k_quants.c k_quants.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
endif # LLAMA_NO_K_QUANTS
|
||||
|
||||
# combine build flags with cmdline overrides
|
||||
override CPPFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS)
|
||||
override CFLAGS := $(MK_CFLAGS) $(CFLAGS)
|
||||
override CXXFLAGS := $(MK_CXXFLAGS) $(CXXFLAGS)
|
||||
override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
|
||||
|
||||
#
|
||||
# Print build information
|
||||
#
|
||||
@@ -356,7 +424,7 @@ OBJS += ggml-alloc.o
|
||||
llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
common.o: common/common.cpp common/common.h
|
||||
common.o: common/common.cpp common/common.h build-info.h common/log.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
console.o: common/console.cpp common/console.h
|
||||
@@ -369,7 +437,7 @@ libllama.so: llama.o ggml.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
|
||||
|
||||
clean:
|
||||
rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch convert-llama2c-to-ggml embd-input-test gguf llama-bench build-info.h $(TEST_TARGETS)
|
||||
rm -vrf *.o tests/*.o *.so *.dll benchmark-matmult build-info.h *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
|
||||
|
||||
#
|
||||
# Examples
|
||||
@@ -409,18 +477,32 @@ $(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-in
|
||||
embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
|
||||
|
||||
gguf: examples/gguf/gguf.cpp build-info.h ggml.o llama.o $(OBJS)
|
||||
gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp build-info.h ggml.o llama.o $(OBJS)
|
||||
convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
ifdef LLAMA_METAL
|
||||
metal: examples/metal/metal.cpp ggml.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
endif
|
||||
|
||||
build-info.h: $(wildcard .git/index) scripts/build-info.sh
|
||||
@sh scripts/build-info.sh > $@.tmp
|
||||
@if ! cmp -s $@.tmp $@; then \
|
||||
@@ -442,29 +524,38 @@ benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o
|
||||
vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
|
||||
tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o common.o grammar-parser.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-grammar-parser: tests/test-grammar-parser.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
|
||||
tests/test-grammar-parser: tests/test-grammar-parser.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-grad0: tests/test-grad0.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-opt: tests/test-opt.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-quantize-perf: tests/test-quantize-perf.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-sampling: tests/test-sampling.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-tokenizer-0: tests/test-tokenizer-0.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
|
||||
tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-tokenizer-1: tests/test-tokenizer-1.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-c.o: tests/test-c.c llama.h
|
||||
$(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@
|
||||
|
||||
@@ -12,9 +12,18 @@ let package = Package(
|
||||
name: "llama",
|
||||
path: ".",
|
||||
exclude: ["ggml-metal.metal"],
|
||||
sources: ["ggml.c", "llama.cpp"],
|
||||
sources: [
|
||||
"ggml.c",
|
||||
"llama.cpp",
|
||||
"ggml-alloc.c",
|
||||
"k_quants.c"
|
||||
],
|
||||
publicHeadersPath: "spm-headers",
|
||||
cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"]), .define("GGML_USE_ACCELERATE")],
|
||||
cSettings: [
|
||||
.unsafeFlags(["-Wno-shorten-64-to-32"]),
|
||||
.define("GGML_USE_K_QUANTS"),
|
||||
.define("GGML_USE_ACCELERATE")
|
||||
],
|
||||
linkerSettings: [
|
||||
.linkedFramework("Accelerate")
|
||||
]
|
||||
|
||||
77
README.md
77
README.md
@@ -11,6 +11,10 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
|
||||
|
||||
### Hot topics
|
||||
|
||||
- #### IMPORTANT: Tokenizer fixes and API change (developers and projects using `llama.cpp` built-in tokenization must read): https://github.com/ggerganov/llama.cpp/pull/2810
|
||||
|
||||
- GGUFv2 adds support for 64-bit sizes + backwards compatible: https://github.com/ggerganov/llama.cpp/pull/2821
|
||||
|
||||
- Added support for Falcon models: https://github.com/ggerganov/llama.cpp/pull/2717
|
||||
|
||||
- A new file format has been introduced: [GGUF](https://github.com/ggerganov/llama.cpp/pull/2398)
|
||||
@@ -103,17 +107,20 @@ as the main playground for developing new features for the [ggml](https://github
|
||||
|
||||
- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
|
||||
- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
|
||||
- Node.js: [hlhr202/llama-node](https://github.com/hlhr202/llama-node)
|
||||
- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp), [hlhr202/llama-node](https://github.com/hlhr202/llama-node)
|
||||
- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
|
||||
- Rust: [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
|
||||
- C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
|
||||
- Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
|
||||
- Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
|
||||
- React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
|
||||
- Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
|
||||
|
||||
**UI:**
|
||||
|
||||
- [nat/openplayground](https://github.com/nat/openplayground)
|
||||
- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui)
|
||||
- [withcatai/catai](https://github.com/withcatai/catai)
|
||||
|
||||
---
|
||||
|
||||
@@ -273,29 +280,11 @@ In order to build llama.cpp you have three different options.
|
||||
|
||||
### Metal Build
|
||||
|
||||
Using Metal allows the computation to be executed on the GPU for Apple devices:
|
||||
On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
|
||||
To disable the Metal build at compile time use the `LLAMA_NO_METAL=1` flag or the `LLAMA_METAL=OFF` cmake option.
|
||||
|
||||
- Using `make`:
|
||||
|
||||
```bash
|
||||
LLAMA_METAL=1 make
|
||||
```
|
||||
|
||||
- Using `CMake`:
|
||||
|
||||
```bash
|
||||
mkdir build-metal
|
||||
cd build-metal
|
||||
cmake -DLLAMA_METAL=ON ..
|
||||
cmake --build . --config Release
|
||||
```
|
||||
|
||||
When built with Metal support, you can enable GPU inference with the `--gpu-layers|-ngl` command-line argument.
|
||||
Any value larger than 0 will offload the computation to the GPU. For example:
|
||||
|
||||
```bash
|
||||
./main -m ./models/7B/ggml-model-q4_0.gguf -n 128 -ngl 1
|
||||
```
|
||||
When built with Metal support, you can explicitly disable GPU inference with the `--gpu-layers|-ngl 0` command-line
|
||||
argument.
|
||||
|
||||
### MPI Build
|
||||
|
||||
@@ -458,6 +447,8 @@ Building the program with BLAS support may lead to some performance improvements
|
||||
You will need the [OpenCL SDK](https://github.com/KhronosGroup/OpenCL-SDK).
|
||||
- For Ubuntu or Debian, the packages `opencl-headers`, `ocl-icd` may be needed.
|
||||
|
||||
- For Windows, a pre-built SDK is available on the [OpenCL Releases](https://github.com/KhronosGroup/OpenCL-SDK/releases) page.
|
||||
|
||||
- <details>
|
||||
<summary>Installing the OpenCL SDK from source</summary>
|
||||
|
||||
@@ -475,10 +466,27 @@ Building the program with BLAS support may lead to some performance improvements
|
||||
```
|
||||
</details>
|
||||
|
||||
Installing CLBlast: it may be found in your operating system's packages.
|
||||
##### Installing CLBlast
|
||||
|
||||
Pre-built CLBlast binaries may be found on the [CLBlast Releases](https://github.com/CNugteren/CLBlast/releases) page. For Unix variants, it may also be found in your operating system's packages.
|
||||
|
||||
Alternatively, they may be built from source.
|
||||
|
||||
- <details>
|
||||
<summary>If not, then installing from source:</summary>
|
||||
<summary>Windows:</summary>
|
||||
|
||||
```cmd
|
||||
set OPENCL_SDK_ROOT="C:/OpenCL-SDK-v2023.04.17-Win-x64"
|
||||
git clone https://github.com/CNugteren/CLBlast.git
|
||||
mkdir CLBlast\build
|
||||
cd CLBlast\build
|
||||
cmake .. -DBUILD_SHARED_LIBS=OFF -DOVERRIDE_MSVC_FLAGS_TO_MT=OFF -DTUNERS=OFF -DOPENCL_ROOT=%OPENCL_SDK_ROOT% -G "Visual Studio 17 2022" -A x64
|
||||
cmake --build . --config Release
|
||||
cmake --install . --prefix C:/CLBlast
|
||||
```
|
||||
|
||||
- <details>
|
||||
<summary>Unix:</summary>
|
||||
|
||||
```sh
|
||||
git clone https://github.com/CNugteren/CLBlast.git
|
||||
@@ -492,21 +500,32 @@ Building the program with BLAS support may lead to some performance improvements
|
||||
Where `/some/path` is where the built library will be installed (default is `/usr/local`).
|
||||
</details>
|
||||
|
||||
Building:
|
||||
##### Building Llama with CLBlast
|
||||
|
||||
- Build with make:
|
||||
```sh
|
||||
make LLAMA_CLBLAST=1
|
||||
```
|
||||
- CMake:
|
||||
- CMake (Unix):
|
||||
```sh
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -DLLAMA_CLBLAST=ON -DCLBlast_dir=/some/path
|
||||
cmake --build . --config Release
|
||||
```
|
||||
- CMake (Windows):
|
||||
```cmd
|
||||
set CL_BLAST_CMAKE_PKG="C:/CLBlast/lib/cmake/CLBlast"
|
||||
git clone https://github.com/ggerganov/llama.cpp
|
||||
cd llama.cpp
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=%CL_BLAST_CMAKE_PKG% -G "Visual Studio 17 2022" -A x64
|
||||
cmake --build . --config Release
|
||||
cmake --install . --prefix C:/LlamaCPP
|
||||
```
|
||||
|
||||
Running:
|
||||
##### Running Llama with CLBlast
|
||||
|
||||
The CLBlast build supports `--gpu-layers|-ngl` like the CUDA version does.
|
||||
|
||||
@@ -724,8 +743,6 @@ python3 convert.py pygmalion-7b/ --outtype q4_1
|
||||
- [LLaMA 2 7B chat](https://huggingface.co/TheBloke/Llama-2-7B-chat-GGML)
|
||||
- [LLaMA 2 13B chat](https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML)
|
||||
- [LLaMA 2 70B chat](https://huggingface.co/TheBloke/Llama-2-70B-chat-GGML)
|
||||
- Specify `-eps 1e-5` for best generation quality
|
||||
- Specify `-gqa 8` for 70B models to work
|
||||
|
||||
### Verifying the model files
|
||||
|
||||
|
||||
140
ci/run.sh
140
ci/run.sh
@@ -196,17 +196,17 @@ function gg_run_open_llama_3b_v2 {
|
||||
(time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
function check_ppl {
|
||||
qnt="$1"
|
||||
@@ -233,6 +233,48 @@ function gg_run_open_llama_3b_v2 {
|
||||
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
|
||||
# lora
|
||||
function compare_ppl {
|
||||
qnt="$1"
|
||||
ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
|
||||
ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
|
||||
|
||||
if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
|
||||
printf ' - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2"
|
||||
return 20
|
||||
fi
|
||||
|
||||
printf ' - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
|
||||
return 0
|
||||
}
|
||||
|
||||
path_lora="../models-mnt/open-llama/3B-v2/lora"
|
||||
path_shakespeare="../models-mnt/shakespeare"
|
||||
|
||||
shakespeare="${path_shakespeare}/shakespeare.txt"
|
||||
lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
|
||||
|
||||
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_config.json
|
||||
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_model.bin
|
||||
gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/shakespeare.txt
|
||||
|
||||
python3 ../convert-lora-to-ggml.py ${path_lora}
|
||||
|
||||
# f16
|
||||
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
|
||||
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
|
||||
compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
|
||||
|
||||
# q8_0
|
||||
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
|
||||
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
|
||||
compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
|
||||
|
||||
# q8_0 + f16 lora-base
|
||||
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
|
||||
compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
|
||||
|
||||
|
||||
set +e
|
||||
}
|
||||
|
||||
@@ -242,6 +284,7 @@ function gg_sum_open_llama_3b_v2 {
|
||||
gg_printf 'OpenLLaMA 3B-v2:\n'
|
||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
|
||||
gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
|
||||
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
|
||||
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
||||
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
|
||||
@@ -253,6 +296,11 @@ function gg_sum_open_llama_3b_v2 {
|
||||
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
|
||||
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
|
||||
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
|
||||
gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
|
||||
gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
|
||||
gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
|
||||
gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
|
||||
gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
|
||||
}
|
||||
|
||||
# open_llama_7b_v2
|
||||
@@ -310,17 +358,17 @@ function gg_run_open_llama_7b_v2 {
|
||||
./bin/quantize ${model_f16} ${model_q5_k} q5_k
|
||||
./bin/quantize ${model_f16} ${model_q6_k} q6_k
|
||||
|
||||
(time ./bin/main --model ${model_f16} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/main --model ${model_q8_0} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/main --model ${model_q4_0} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/main --model ${model_q4_1} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/main --model ${model_q5_0} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/main --model ${model_q5_1} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/main --model ${model_q2_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/main --model ${model_q3_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/main --model ${model_q4_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/main --model ${model_q5_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/main --model ${model_q6_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
(time ./bin/main --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
@@ -359,6 +407,48 @@ function gg_run_open_llama_7b_v2 {
|
||||
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
|
||||
# lora
|
||||
function compare_ppl {
|
||||
qnt="$1"
|
||||
ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
|
||||
ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
|
||||
|
||||
if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
|
||||
printf ' - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2"
|
||||
return 20
|
||||
fi
|
||||
|
||||
printf ' - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
|
||||
return 0
|
||||
}
|
||||
|
||||
path_lora="../models-mnt/open-llama/7B-v2/lora"
|
||||
path_shakespeare="../models-mnt/shakespeare"
|
||||
|
||||
shakespeare="${path_shakespeare}/shakespeare.txt"
|
||||
lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
|
||||
|
||||
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_config.json
|
||||
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_model.bin
|
||||
gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/shakespeare.txt
|
||||
|
||||
python3 ../convert-lora-to-ggml.py ${path_lora}
|
||||
|
||||
# f16
|
||||
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
|
||||
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
|
||||
compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
|
||||
|
||||
# currently not supported by the CUDA backend
|
||||
# q8_0
|
||||
#(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
|
||||
#(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
|
||||
#compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
|
||||
|
||||
# q8_0 + f16 lora-base
|
||||
#(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
|
||||
#compare_ppl "q8_0 / f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
|
||||
|
||||
set +e
|
||||
}
|
||||
|
||||
@@ -368,6 +458,7 @@ function gg_sum_open_llama_7b_v2 {
|
||||
gg_printf 'OpenLLaMA 7B-v2:\n'
|
||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
|
||||
gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
|
||||
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
|
||||
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
||||
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
|
||||
@@ -379,6 +470,11 @@ function gg_sum_open_llama_7b_v2 {
|
||||
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
|
||||
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
|
||||
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
|
||||
gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
|
||||
gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
|
||||
#gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
|
||||
#gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
|
||||
#gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
|
||||
}
|
||||
|
||||
## main
|
||||
|
||||
14
codecov.yml
Normal file
14
codecov.yml
Normal file
@@ -0,0 +1,14 @@
|
||||
comment: off
|
||||
|
||||
coverage:
|
||||
status:
|
||||
project:
|
||||
default:
|
||||
target: auto
|
||||
threshold: 0
|
||||
base: auto
|
||||
patch:
|
||||
default:
|
||||
target: auto
|
||||
threshold: 0
|
||||
base: auto
|
||||
@@ -1,15 +1,21 @@
|
||||
#include "common.h"
|
||||
#include "build-info.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <iostream>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <string>
|
||||
#include <iterator>
|
||||
#include <algorithm>
|
||||
#include <sstream>
|
||||
#include <unordered_set>
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
#include <fstream>
|
||||
#include <iterator>
|
||||
#include <iostream>
|
||||
#include <regex>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <unordered_set>
|
||||
#include <vector>
|
||||
#include <cinttypes>
|
||||
|
||||
#if defined(__APPLE__) && defined(__MACH__)
|
||||
#include <sys/types.h>
|
||||
@@ -18,12 +24,17 @@
|
||||
|
||||
#if defined(_WIN32)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#define NOMINMAX
|
||||
#ifndef NOMINMAX
|
||||
# define NOMINMAX
|
||||
#endif
|
||||
#include <codecvt>
|
||||
#include <locale>
|
||||
#include <windows.h>
|
||||
#include <fcntl.h>
|
||||
#include <io.h>
|
||||
#else
|
||||
#include <sys/ioctl.h>
|
||||
#include <sys/stat.h>
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
@@ -93,7 +104,6 @@ void process_escapes(std::string& input) {
|
||||
|
||||
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
bool invalid_param = false;
|
||||
bool escape_prompt = false;
|
||||
std::string arg;
|
||||
gpt_params default_params;
|
||||
const std::string arg_prefix = "--";
|
||||
@@ -125,8 +135,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
break;
|
||||
}
|
||||
params.prompt = argv[i];
|
||||
} else if (arg == "-e") {
|
||||
escape_prompt = true;
|
||||
} else if (arg == "-e" || arg == "--escape") {
|
||||
params.escape = true;
|
||||
} else if (arg == "--prompt-cache") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -295,6 +305,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
break;
|
||||
}
|
||||
params.n_keep = std::stoi(argv[i]);
|
||||
} else if (arg == "--draft") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_draft = std::stoi(argv[i]);
|
||||
} else if (arg == "--chunks") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -307,6 +323,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
break;
|
||||
}
|
||||
params.model = argv[i];
|
||||
} else if (arg == "-md" || arg == "--model-draft") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.model_draft = argv[i];
|
||||
} else if (arg == "-a" || arg == "--alias") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -415,6 +437,16 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
break;
|
||||
}
|
||||
params.antiprompt.push_back(argv[i]);
|
||||
} else if (arg == "-ld" || arg == "--logdir") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.logdir = argv[i];
|
||||
|
||||
if (params.logdir.back() != DIRECTORY_SEPARATOR) {
|
||||
params.logdir += DIRECTORY_SEPARATOR;
|
||||
}
|
||||
} else if (arg == "--perplexity") {
|
||||
params.perplexity = true;
|
||||
} else if (arg == "--ppl-stride") {
|
||||
@@ -462,6 +494,9 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
}
|
||||
} else if (arg == "-h" || arg == "--help") {
|
||||
gpt_print_usage(argc, argv, default_params);
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
log_print_usage();
|
||||
#endif // LOG_DISABLE_LOGS
|
||||
exit(0);
|
||||
} else if (arg == "--random-prompt") {
|
||||
params.random_prompt = true;
|
||||
@@ -501,6 +536,25 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
std::istreambuf_iterator<char>(),
|
||||
std::back_inserter(params.grammar)
|
||||
);
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
// Parse args for logging parameters
|
||||
} else if ( log_param_single_parse( argv[i] ) ) {
|
||||
// Do nothing, log_param_single_parse automatically does it's thing
|
||||
// and returns if a match was found and parsed.
|
||||
} else if ( log_param_pair_parse( /*check_but_dont_parse*/ true, argv[i] ) ) {
|
||||
// We have a matching known parameter requiring an argument,
|
||||
// now we need to check if there is anything after this argv
|
||||
// and flag invalid_param or parse it.
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
if( !log_param_pair_parse( /*check_but_dont_parse*/ false, argv[i-1], argv[i]) ) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
// End of Parse args for logging parameters
|
||||
#endif // LOG_DISABLE_LOGS
|
||||
} else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
gpt_print_usage(argc, argv, default_params);
|
||||
@@ -520,7 +574,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (escape_prompt) {
|
||||
if (params.escape) {
|
||||
process_escapes(params.prompt);
|
||||
process_escapes(params.input_prefix);
|
||||
process_escapes(params.input_suffix);
|
||||
@@ -546,7 +600,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
fprintf(stdout, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
||||
fprintf(stdout, " -p PROMPT, --prompt PROMPT\n");
|
||||
fprintf(stdout, " prompt to start generation with (default: empty)\n");
|
||||
fprintf(stdout, " -e process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
|
||||
fprintf(stdout, " -e, --escape process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
|
||||
fprintf(stdout, " --prompt-cache FNAME file to cache prompt state for faster startup (default: none)\n");
|
||||
fprintf(stdout, " --prompt-cache-all if specified, saves user input and generations to cache as well.\n");
|
||||
fprintf(stdout, " not supported with --interactive or other interactive options\n");
|
||||
@@ -596,6 +650,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
fprintf(stdout, " --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
|
||||
fprintf(stdout, " --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
|
||||
fprintf(stdout, " --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
|
||||
fprintf(stdout, " --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
|
||||
fprintf(stdout, " --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
|
||||
if (llama_mlock_supported()) {
|
||||
fprintf(stdout, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
|
||||
@@ -627,6 +682,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
fprintf(stdout, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
|
||||
fprintf(stdout, " -m FNAME, --model FNAME\n");
|
||||
fprintf(stdout, " model path (default: %s)\n", params.model.c_str());
|
||||
fprintf(stdout, " -md FNAME, --model-draft FNAME\n");
|
||||
fprintf(stdout, " draft model for speculative decoding (default: %s)\n", params.model.c_str());
|
||||
fprintf(stdout, " -ld LOGDIR, --logdir LOGDIR\n");
|
||||
fprintf(stdout, " path under which to save YAML logs (no logging if unset)\n");
|
||||
fprintf(stdout, "\n");
|
||||
}
|
||||
|
||||
@@ -658,7 +717,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
|
||||
|
||||
lparams.n_ctx = params.n_ctx;
|
||||
lparams.n_batch = params.n_batch;
|
||||
lparams.n_gpu_layers = params.n_gpu_layers;
|
||||
if (params.n_gpu_layers != -1) {
|
||||
lparams.n_gpu_layers = params.n_gpu_layers;
|
||||
}
|
||||
lparams.main_gpu = params.main_gpu;
|
||||
lparams.tensor_split = params.tensor_split;
|
||||
lparams.low_vram = params.low_vram;
|
||||
@@ -708,6 +769,14 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
|
||||
params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
|
||||
}
|
||||
|
||||
{
|
||||
LOG("warming up the model with an empty run\n");
|
||||
|
||||
const std::vector<llama_token> tmp = { llama_token_bos(lctx), };
|
||||
llama_eval(lctx, tmp.data(), tmp.size(), 0, params.n_threads);
|
||||
llama_reset_timings(lctx);
|
||||
}
|
||||
|
||||
return std::make_tuple(model, lctx);
|
||||
}
|
||||
|
||||
@@ -733,12 +802,12 @@ std::vector<llama_token> llama_tokenize(
|
||||
return result;
|
||||
}
|
||||
|
||||
std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
|
||||
std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
|
||||
std::vector<char> result(8, 0);
|
||||
const int n_tokens = llama_token_to_str(ctx, token, result.data(), result.size());
|
||||
const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size());
|
||||
if (n_tokens < 0) {
|
||||
result.resize(-n_tokens);
|
||||
int check = llama_token_to_str(ctx, token, result.data(), result.size());
|
||||
int check = llama_token_to_piece(ctx, token, result.data(), result.size());
|
||||
GGML_ASSERT(check == -n_tokens);
|
||||
} else {
|
||||
result.resize(n_tokens);
|
||||
@@ -746,3 +815,447 @@ std::string llama_token_to_str(const struct llama_context * ctx, llama_token tok
|
||||
|
||||
return std::string(result.data(), result.size());
|
||||
}
|
||||
|
||||
std::string llama_detokenize_spm(llama_context * ctx, const std::vector<llama_token> & tokens) {
|
||||
const llama_token bos_id = llama_token_bos(ctx);
|
||||
|
||||
std::string piece;
|
||||
std::string result;
|
||||
|
||||
for (size_t i = 0; i < tokens.size(); ++i) {
|
||||
piece = llama_token_to_piece(ctx, tokens[i]);
|
||||
|
||||
// remove the leading space of the first non-BOS token
|
||||
if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) && piece[0] == ' ') {
|
||||
piece = piece.substr(1);
|
||||
}
|
||||
|
||||
result += piece;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_token> & tokens) {
|
||||
std::string piece;
|
||||
std::string result;
|
||||
|
||||
for (size_t i = 0; i < tokens.size(); ++i) {
|
||||
piece = llama_token_to_piece(ctx, tokens[i]);
|
||||
|
||||
result += piece;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
//
|
||||
// Sampling utils
|
||||
//
|
||||
|
||||
llama_token llama_sample_token(
|
||||
struct llama_context * ctx,
|
||||
struct llama_context * ctx_guidance,
|
||||
struct llama_grammar * grammar,
|
||||
const struct gpt_params & params,
|
||||
const std::vector<llama_token> & last_tokens,
|
||||
std::vector<llama_token_data> & candidates,
|
||||
int idx) {
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
const int n_vocab = llama_n_vocab(ctx);
|
||||
|
||||
const float temp = params.temp;
|
||||
const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
|
||||
const float top_p = params.top_p;
|
||||
const float tfs_z = params.tfs_z;
|
||||
const float typical_p = params.typical_p;
|
||||
const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
|
||||
const float repeat_penalty = params.repeat_penalty;
|
||||
const float alpha_presence = params.presence_penalty;
|
||||
const float alpha_frequency = params.frequency_penalty;
|
||||
const int mirostat = params.mirostat;
|
||||
const float mirostat_tau = params.mirostat_tau;
|
||||
const float mirostat_eta = params.mirostat_eta;
|
||||
const bool penalize_nl = params.penalize_nl;
|
||||
|
||||
llama_token id = 0;
|
||||
|
||||
float * logits = llama_get_logits(ctx) + idx * n_vocab;
|
||||
|
||||
// Apply params.logit_bias map
|
||||
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
|
||||
logits[it->first] += it->second;
|
||||
}
|
||||
|
||||
candidates.clear();
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
||||
}
|
||||
|
||||
llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
|
||||
|
||||
if (ctx_guidance) {
|
||||
llama_sample_classifier_free_guidance(ctx, &cur_p, ctx_guidance, params.cfg_scale);
|
||||
}
|
||||
|
||||
// apply penalties
|
||||
if (!last_tokens.empty()) {
|
||||
const float nl_logit = logits[llama_token_nl(ctx)];
|
||||
const int last_n_repeat = std::min(std::min((int)last_tokens.size(), repeat_last_n), n_ctx);
|
||||
|
||||
llama_sample_repetition_penalty(ctx, &cur_p,
|
||||
last_tokens.data() + last_tokens.size() - last_n_repeat,
|
||||
last_n_repeat, repeat_penalty);
|
||||
llama_sample_frequency_and_presence_penalties(ctx, &cur_p,
|
||||
last_tokens.data() + last_tokens.size() - last_n_repeat,
|
||||
last_n_repeat, alpha_frequency, alpha_presence);
|
||||
|
||||
if (!penalize_nl) {
|
||||
for (size_t idx = 0; idx < cur_p.size; idx++) {
|
||||
if (cur_p.data[idx].id == llama_token_nl(ctx)) {
|
||||
cur_p.data[idx].logit = nl_logit;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (grammar != NULL) {
|
||||
llama_sample_grammar(ctx, &cur_p, grammar);
|
||||
}
|
||||
|
||||
if (temp <= 0) {
|
||||
// Greedy sampling
|
||||
id = llama_sample_token_greedy(ctx, &cur_p);
|
||||
} else {
|
||||
if (mirostat == 1) {
|
||||
static float mirostat_mu = 2.0f * mirostat_tau;
|
||||
const int mirostat_m = 100;
|
||||
llama_sample_temperature(ctx, &cur_p, temp);
|
||||
id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
|
||||
} else if (mirostat == 2) {
|
||||
static float mirostat_mu = 2.0f * mirostat_tau;
|
||||
llama_sample_temperature(ctx, &cur_p, temp);
|
||||
id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu);
|
||||
} else {
|
||||
// Temperature sampling
|
||||
llama_sample_top_k (ctx, &cur_p, top_k, 1);
|
||||
llama_sample_tail_free (ctx, &cur_p, tfs_z, 1);
|
||||
llama_sample_typical (ctx, &cur_p, typical_p, 1);
|
||||
llama_sample_top_p (ctx, &cur_p, top_p, 1);
|
||||
llama_sample_temperature(ctx, &cur_p, temp);
|
||||
|
||||
{
|
||||
const int n_top = 10;
|
||||
LOG("top %d candidates:\n", n_top);
|
||||
|
||||
for (int i = 0; i < n_top; i++) {
|
||||
const llama_token id = cur_p.data[i].id;
|
||||
LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx, id).c_str(), cur_p.data[i].p);
|
||||
}
|
||||
}
|
||||
|
||||
id = llama_sample_token(ctx, &cur_p);
|
||||
|
||||
LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx, id).c_str());
|
||||
}
|
||||
}
|
||||
// printf("`%d`", candidates_p.size);
|
||||
|
||||
if (grammar != NULL) {
|
||||
llama_grammar_accept_token(ctx, grammar, id);
|
||||
}
|
||||
|
||||
return id;
|
||||
}
|
||||
|
||||
//
|
||||
// YAML utils
|
||||
//
|
||||
|
||||
// returns true if successful, false otherwise
|
||||
bool create_directory_with_parents(const std::string & path) {
|
||||
#ifdef _WIN32
|
||||
std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
|
||||
std::wstring wpath = converter.from_bytes(path);
|
||||
|
||||
// if the path already exists, check whether it's a directory
|
||||
const DWORD attributes = GetFileAttributesW(wpath.c_str());
|
||||
if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
size_t pos_slash = 0;
|
||||
|
||||
// process path from front to back, procedurally creating directories
|
||||
while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
|
||||
const std::wstring subpath = wpath.substr(0, pos_slash);
|
||||
const wchar_t * test = subpath.c_str();
|
||||
|
||||
const bool success = CreateDirectoryW(test, NULL);
|
||||
if (!success) {
|
||||
const DWORD error = GetLastError();
|
||||
|
||||
// if the path already exists, ensure that it's a directory
|
||||
if (error == ERROR_ALREADY_EXISTS) {
|
||||
const DWORD attributes = GetFileAttributesW(subpath.c_str());
|
||||
if (attributes == INVALID_FILE_ATTRIBUTES || !(attributes & FILE_ATTRIBUTE_DIRECTORY)) {
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
pos_slash += 1;
|
||||
}
|
||||
|
||||
return true;
|
||||
#else
|
||||
// if the path already exists, check whether it's a directory
|
||||
struct stat info;
|
||||
if (stat(path.c_str(), &info) == 0) {
|
||||
return S_ISDIR(info.st_mode);
|
||||
}
|
||||
|
||||
size_t pos_slash = 1; // skip leading slashes for directory creation
|
||||
|
||||
// process path from front to back, procedurally creating directories
|
||||
while ((pos_slash = path.find('/', pos_slash)) != std::string::npos) {
|
||||
const std::string subpath = path.substr(0, pos_slash);
|
||||
struct stat info;
|
||||
|
||||
// if the path already exists, ensure that it's a directory
|
||||
if (stat(subpath.c_str(), &info) == 0) {
|
||||
if (!S_ISDIR(info.st_mode)) {
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
// create parent directories
|
||||
const int ret = mkdir(subpath.c_str(), 0755);
|
||||
if (ret != 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
pos_slash += 1;
|
||||
}
|
||||
|
||||
return true;
|
||||
#endif // _WIN32
|
||||
}
|
||||
|
||||
void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data) {
|
||||
if (data.empty()) {
|
||||
fprintf(stream, "%s:\n", prop_name);
|
||||
return;
|
||||
}
|
||||
|
||||
fprintf(stream, "%s: [", prop_name);
|
||||
for (size_t i = 0; i < data.size() - 1; ++i) {
|
||||
fprintf(stream, "%e, ", data[i]);
|
||||
}
|
||||
fprintf(stream, "%e]\n", data.back());
|
||||
}
|
||||
|
||||
void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data) {
|
||||
if (data.empty()) {
|
||||
fprintf(stream, "%s:\n", prop_name);
|
||||
return;
|
||||
}
|
||||
|
||||
fprintf(stream, "%s: [", prop_name);
|
||||
for (size_t i = 0; i < data.size() - 1; ++i) {
|
||||
fprintf(stream, "%d, ", data[i]);
|
||||
}
|
||||
fprintf(stream, "%d]\n", data.back());
|
||||
}
|
||||
|
||||
void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data) {
|
||||
std::string data_str(data == NULL ? "" : data);
|
||||
|
||||
if (data_str.empty()) {
|
||||
fprintf(stream, "%s:\n", prop_name);
|
||||
return;
|
||||
}
|
||||
|
||||
size_t pos_start = 0;
|
||||
size_t pos_found = 0;
|
||||
|
||||
if (!data_str.empty() && (std::isspace(data_str[0]) || std::isspace(data_str.back()))) {
|
||||
data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
|
||||
data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
|
||||
data_str = "\"" + data_str + "\"";
|
||||
fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
if (data_str.find('\n') == std::string::npos) {
|
||||
fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
fprintf(stream, "%s: |\n", prop_name);
|
||||
while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) {
|
||||
fprintf(stream, " %s\n", data_str.substr(pos_start, pos_found-pos_start).c_str());
|
||||
pos_start = pos_found + 1;
|
||||
}
|
||||
}
|
||||
|
||||
std::string get_sortable_timestamp() {
|
||||
using clock = std::chrono::system_clock;
|
||||
|
||||
const clock::time_point current_time = clock::now();
|
||||
const time_t as_time_t = clock::to_time_t(current_time);
|
||||
char timestamp_no_ns[100];
|
||||
std::strftime(timestamp_no_ns, 100, "%Y_%m_%d-%H_%M_%S", std::localtime(&as_time_t));
|
||||
|
||||
const int64_t ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
|
||||
current_time.time_since_epoch() % 1000000000).count();
|
||||
char timestamp_ns[11];
|
||||
snprintf(timestamp_ns, 11, "%09" PRId64, ns);
|
||||
|
||||
return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
|
||||
}
|
||||
|
||||
void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx,
|
||||
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
|
||||
fprintf(stream, "build_commit: %s\n", BUILD_COMMIT);
|
||||
fprintf(stream, "build_number: %d\n", BUILD_NUMBER);
|
||||
fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
|
||||
|
||||
#ifdef NDEBUG
|
||||
fprintf(stream, "debug: false\n");
|
||||
#else
|
||||
fprintf(stream, "debug: true\n");
|
||||
#endif // NDEBUG
|
||||
|
||||
fprintf(stream, "model_desc: %s\n", model_desc);
|
||||
fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(lctx));
|
||||
|
||||
#ifdef __OPTIMIZE__
|
||||
fprintf(stream, "optimize: true\n");
|
||||
#else
|
||||
fprintf(stream, "optimize: false\n");
|
||||
#endif // __OPTIMIZE__
|
||||
|
||||
fprintf(stream, "time: %s\n", timestamp.c_str());
|
||||
|
||||
fprintf(stream, "\n");
|
||||
fprintf(stream, "###############\n");
|
||||
fprintf(stream, "# User Inputs #\n");
|
||||
fprintf(stream, "###############\n");
|
||||
fprintf(stream, "\n");
|
||||
|
||||
fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
|
||||
fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
|
||||
dump_string_yaml_multiline(stream, "cfg_negative_prompt", params.cfg_negative_prompt.c_str());
|
||||
fprintf(stream, "cfg_scale: %f # default: 1.0\n", params.cfg_scale);
|
||||
fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
|
||||
fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
|
||||
fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
|
||||
fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
|
||||
fprintf(stream, "export: %s # default: false\n", params.export_cgraph ? "true" : "false");
|
||||
fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
|
||||
fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", params.frequency_penalty);
|
||||
dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str());
|
||||
fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
|
||||
fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
|
||||
fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
|
||||
|
||||
const auto logit_bias_eos = params.logit_bias.find(llama_token_eos(lctx));
|
||||
const bool ignore_eos = logit_bias_eos != params.logit_bias.end() && logit_bias_eos->second == -INFINITY;
|
||||
fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");
|
||||
|
||||
dump_string_yaml_multiline(stream, "in_prefix", params.input_prefix.c_str());
|
||||
fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
|
||||
dump_string_yaml_multiline(stream, "in_suffix", params.input_prefix.c_str());
|
||||
fprintf(stream, "instruct: %s # default: false\n", params.instruct ? "true" : "false");
|
||||
fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
|
||||
fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
|
||||
fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
|
||||
fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
|
||||
|
||||
fprintf(stream, "logit_bias:\n");
|
||||
for (std::pair<llama_token, float> lb : params.logit_bias) {
|
||||
if (ignore_eos && lb.first == logit_bias_eos->first) {
|
||||
continue;
|
||||
}
|
||||
fprintf(stream, " %d: %f", lb.first, lb.second);
|
||||
}
|
||||
|
||||
fprintf(stream, "lora: %s\n", params.lora_adapter.c_str());
|
||||
fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
|
||||
fprintf(stream, "low_vram: %s # default: false\n", params.low_vram ? "true" : "false");
|
||||
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
|
||||
fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
|
||||
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", params.mirostat);
|
||||
fprintf(stream, "mirostat_ent: %f # default: 5.0\n", params.mirostat_tau);
|
||||
fprintf(stream, "mirostat_lr: %f # default: 0.1\n", params.mirostat_eta);
|
||||
fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
|
||||
fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str());
|
||||
fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
|
||||
fprintf(stream, "mtest: %s # default: false\n", params.mem_test ? "true" : "false");
|
||||
fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
|
||||
fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
|
||||
fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
|
||||
fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", params.n_probs);
|
||||
fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
|
||||
fprintf(stream, "no_mul_mat_q: %s # default: false\n", !params.mul_mat_q ? "true" : "false");
|
||||
fprintf(stream, "no_penalize_nl: %s # default: false\n", !params.penalize_nl ? "true" : "false");
|
||||
fprintf(stream, "numa: %s # default: false\n", params.numa ? "true" : "false");
|
||||
fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
|
||||
fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
|
||||
fprintf(stream, "presence_penalty: %f # default: 0.0\n", params.presence_penalty);
|
||||
dump_string_yaml_multiline(stream, "prompt", params.prompt.c_str());
|
||||
fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
|
||||
fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
|
||||
fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
|
||||
dump_vector_int_yaml(stream, "prompt_tokens", prompt_tokens);
|
||||
fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false");
|
||||
fprintf(stream, "repeat_penalty: %f # default: 1.1\n", params.repeat_penalty);
|
||||
|
||||
fprintf(stream, "reverse_prompt:\n");
|
||||
for (std::string ap : params.antiprompt) {
|
||||
size_t pos = 0;
|
||||
while ((pos = ap.find('\n', pos)) != std::string::npos) {
|
||||
ap.replace(pos, 1, "\\n");
|
||||
pos += 1;
|
||||
}
|
||||
|
||||
fprintf(stream, " - %s\n", ap.c_str());
|
||||
}
|
||||
|
||||
fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
|
||||
fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
|
||||
fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed);
|
||||
fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
|
||||
fprintf(stream, "temp: %f # default: 0.8\n", params.temp);
|
||||
|
||||
const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES);
|
||||
dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
|
||||
|
||||
fprintf(stream, "tfs: %f # default: 1.0\n", params.tfs_z);
|
||||
fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency());
|
||||
fprintf(stream, "top_k: %d # default: 40\n", params.top_k);
|
||||
fprintf(stream, "top_p: %f # default: 0.95\n", params.top_p);
|
||||
fprintf(stream, "typical_p: %f # default: 1.0\n", params.typical_p);
|
||||
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
|
||||
}
|
||||
|
||||
@@ -4,6 +4,9 @@
|
||||
|
||||
#include "llama.h"
|
||||
|
||||
#define LOG_NO_FILE_LINE_FUNCTION
|
||||
#include "log.h"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <random>
|
||||
@@ -11,6 +14,12 @@
|
||||
#include <unordered_map>
|
||||
#include <tuple>
|
||||
|
||||
#ifdef _WIN32
|
||||
#define DIRECTORY_SEPARATOR '\\'
|
||||
#else
|
||||
#define DIRECTORY_SEPARATOR '/'
|
||||
#endif // _WIN32
|
||||
|
||||
//
|
||||
// CLI argument parsing
|
||||
//
|
||||
@@ -23,8 +32,9 @@ struct gpt_params {
|
||||
int32_t n_ctx = 512; // context size
|
||||
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
|
||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||
int32_t n_draft = 16; // number of tokens to draft during speculative decoding
|
||||
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
|
||||
int32_t n_gpu_layers = 0; // number of layers to store in VRAM
|
||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
|
||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
|
||||
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
||||
@@ -54,6 +64,7 @@ struct gpt_params {
|
||||
float cfg_scale = 1.f; // How strong is guidance
|
||||
|
||||
std::string model = "models/7B/ggml-model-f16.gguf"; // model path
|
||||
std::string model_draft = ""; // draft model for speculative decoding
|
||||
std::string model_alias = "unknown"; // model alias
|
||||
std::string prompt = "";
|
||||
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
|
||||
@@ -61,6 +72,7 @@ struct gpt_params {
|
||||
std::string input_suffix = ""; // string to suffix user inputs with
|
||||
std::string grammar = ""; // optional BNF-like grammar to constrain sampling
|
||||
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
|
||||
std::string logdir = ""; // directory in which to save YAML log files
|
||||
|
||||
std::string lora_adapter = ""; // lora adapter path
|
||||
std::string lora_base = ""; // base model path for the lora adapter
|
||||
@@ -82,6 +94,7 @@ struct gpt_params {
|
||||
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
|
||||
|
||||
bool embedding = false; // get only sentence embedding
|
||||
bool escape = false; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
|
||||
bool interactive_first = false; // wait for user input immediately
|
||||
bool multiline_input = false; // reverse the usage of `\`
|
||||
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
|
||||
@@ -116,11 +129,75 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
|
||||
// Vocab utils
|
||||
//
|
||||
|
||||
// tokenizes a string into a vector of tokens
|
||||
// should work similar to Python's `tokenizer.encode`
|
||||
std::vector<llama_token> llama_tokenize(
|
||||
struct llama_context * ctx,
|
||||
const std::string & text,
|
||||
bool add_bos);
|
||||
|
||||
std::string llama_token_to_str(
|
||||
// tokenizes a token into a piece
|
||||
// should work similar to Python's `tokenizer.id_to_piece`
|
||||
std::string llama_token_to_piece(
|
||||
const struct llama_context * ctx,
|
||||
llama_token token);
|
||||
|
||||
// TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
|
||||
// that takes into account the tokenizer type and decides how to handle the leading space
|
||||
//
|
||||
// detokenizes a vector of tokens into a string
|
||||
// should work similar to Python's `tokenizer.decode`
|
||||
// removes the leading space from the first non-BOS token
|
||||
std::string llama_detokenize_spm(
|
||||
llama_context * ctx,
|
||||
const std::vector<llama_token> & tokens);
|
||||
|
||||
// detokenizes a vector of tokens into a string
|
||||
// should work similar to Python's `tokenizer.decode`
|
||||
std::string llama_detokenize_bpe(
|
||||
llama_context * ctx,
|
||||
const std::vector<llama_token> & tokens);
|
||||
|
||||
//
|
||||
// Sampling utils
|
||||
//
|
||||
|
||||
// this is a common sampling function used across the examples for convenience
|
||||
// it can serve as a starting point for implementing your own sampling function
|
||||
//
|
||||
// required:
|
||||
// - ctx: context to use for sampling
|
||||
// - params: sampling parameters
|
||||
//
|
||||
// optional:
|
||||
// - ctx_guidance: context to use for classifier-free guidance, ignore if NULL
|
||||
// - grammar: grammar to use for sampling, ignore if NULL
|
||||
// - last_tokens: needed for repetition penalty, ignore if empty
|
||||
// - idx: sample from llama_get_logits(ctx) + idx * n_vocab
|
||||
//
|
||||
// returns:
|
||||
// - token: sampled token
|
||||
// - candidates: vector of candidate tokens
|
||||
//
|
||||
llama_token llama_sample_token(
|
||||
struct llama_context * ctx,
|
||||
struct llama_context * ctx_guidance,
|
||||
struct llama_grammar * grammar,
|
||||
const struct gpt_params & params,
|
||||
const std::vector<llama_token> & last_tokens,
|
||||
std::vector<llama_token_data> & candidates,
|
||||
int idx = 0);
|
||||
|
||||
//
|
||||
// YAML utils
|
||||
//
|
||||
|
||||
bool create_directory_with_parents(const std::string & path);
|
||||
void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data);
|
||||
void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data);
|
||||
void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data);
|
||||
std::string get_sortable_timestamp();
|
||||
|
||||
void dump_non_result_info_yaml(
|
||||
FILE * stream, const gpt_params & params, const llama_context * lctx,
|
||||
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
|
||||
|
||||
@@ -235,6 +235,7 @@ namespace console {
|
||||
|
||||
int estimateWidth(char32_t codepoint) {
|
||||
#if defined(_WIN32)
|
||||
(void)codepoint;
|
||||
return 1;
|
||||
#else
|
||||
return wcwidth(codepoint);
|
||||
|
||||
643
common/log.h
Normal file
643
common/log.h
Normal file
@@ -0,0 +1,643 @@
|
||||
#pragma once
|
||||
|
||||
#include <chrono>
|
||||
#include <cstring>
|
||||
#include <sstream>
|
||||
#include <iostream>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <cinttypes>
|
||||
|
||||
// --------------------------------
|
||||
//
|
||||
// Basic usage:
|
||||
//
|
||||
// --------
|
||||
//
|
||||
// The LOG() and LOG_TEE() macros are ready to go by default
|
||||
// they do not require any initialization.
|
||||
//
|
||||
// LOGLN() and LOG_TEELN() are variants which automatically
|
||||
// include \n character at the end of the log string.
|
||||
//
|
||||
// LOG() behaves exactly like printf, by default writing to a logfile.
|
||||
// LOG_TEE() additionally, prints to the screen too ( mimics Unix tee command ).
|
||||
//
|
||||
// Default logfile is named
|
||||
// "llama.<threadID>.log"
|
||||
// Default LOG_TEE() secondary output target is
|
||||
// stderr
|
||||
//
|
||||
// Logs can be dynamically disabled or enabled using functions:
|
||||
// log_disable()
|
||||
// and
|
||||
// log_enable()
|
||||
//
|
||||
// A log target can be changed with:
|
||||
// log_set_target( string )
|
||||
// creating and opening, or re-opening a file by string filename
|
||||
// or
|
||||
// log_set_target( FILE* )
|
||||
// allowing to point at stderr, stdout, or any valid FILE* file handler.
|
||||
//
|
||||
// --------
|
||||
//
|
||||
// End of Basic usage.
|
||||
//
|
||||
// --------------------------------
|
||||
|
||||
// Specifies a log target.
|
||||
// default uses log_handler() with "llama.log" log file
|
||||
// this can be changed, by defining LOG_TARGET
|
||||
// like so:
|
||||
//
|
||||
// #define LOG_TARGET (a valid FILE*)
|
||||
// #include "log.h"
|
||||
//
|
||||
// or it can be simply redirected to stdout or stderr
|
||||
// like so:
|
||||
//
|
||||
// #define LOG_TARGET stderr
|
||||
// #include "log.h"
|
||||
//
|
||||
// The log target can also be redirected to a diffrent function
|
||||
// like so:
|
||||
//
|
||||
// #define LOG_TARGET log_handler_diffrent()
|
||||
// #include "log.h"
|
||||
//
|
||||
// FILE* log_handler_diffrent()
|
||||
// {
|
||||
// return stderr;
|
||||
// }
|
||||
//
|
||||
// or:
|
||||
//
|
||||
// #define LOG_TARGET log_handler_another_one("somelog.log")
|
||||
// #include "log.h"
|
||||
//
|
||||
// FILE* log_handler_another_one(char*filename)
|
||||
// {
|
||||
// static FILE* logfile = nullptr;
|
||||
// (...)
|
||||
// if( !logfile )
|
||||
// {
|
||||
// fopen(...)
|
||||
// }
|
||||
// (...)
|
||||
// return logfile
|
||||
// }
|
||||
//
|
||||
#ifndef LOG_TARGET
|
||||
#define LOG_TARGET log_handler()
|
||||
#endif
|
||||
|
||||
#ifndef LOG_TEE_TARGET
|
||||
#define LOG_TEE_TARGET stderr
|
||||
#endif
|
||||
|
||||
// Utility to obtain "pid" like unique process id and use it when creating log files.
|
||||
inline std::string log_get_pid()
|
||||
{
|
||||
static std::string pid;
|
||||
if (pid.empty())
|
||||
{
|
||||
// std::this_thread::get_id() is the most portable way of obtaining a "process id"
|
||||
// it's not the same as "pid" but is unique enough to solve multiple instances
|
||||
// trying to write to the same log.
|
||||
std::stringstream ss;
|
||||
ss << std::this_thread::get_id();
|
||||
pid = ss.str();
|
||||
}
|
||||
|
||||
return pid;
|
||||
}
|
||||
|
||||
// Utility function for generating log file names with unique id based on thread id.
|
||||
// invocation with log_filename_generator( "llama", "log" ) creates a string "llama.<number>.log"
|
||||
// where the number is a runtime id of the current thread.
|
||||
|
||||
#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(log_file_basename, log_file_extension)
|
||||
|
||||
// INTERNAL, DO NOT USE
|
||||
inline std::string log_filename_generator_impl(const std::string & log_file_basename, const std::string & log_file_extension)
|
||||
{
|
||||
std::stringstream buf;
|
||||
|
||||
buf << log_file_basename;
|
||||
buf << ".";
|
||||
buf << log_get_pid();
|
||||
buf << ".";
|
||||
buf << log_file_extension;
|
||||
|
||||
return buf.str();
|
||||
}
|
||||
|
||||
#ifndef LOG_DEFAULT_FILE_NAME
|
||||
#define LOG_DEFAULT_FILE_NAME log_filename_generator("llama", "log")
|
||||
#endif
|
||||
|
||||
// Utility for turning #define values into string literals
|
||||
// so we can have a define for stderr and
|
||||
// we can print "stderr" instead of literal stderr, etc.
|
||||
#define LOG_STRINGIZE1(s) #s
|
||||
#define LOG_STRINGIZE(s) LOG_STRINGIZE1(s)
|
||||
|
||||
#define LOG_TEE_TARGET_STRING LOG_STRINGIZE(LOG_TEE_TARGET)
|
||||
|
||||
// Allows disabling timestamps.
|
||||
// in order to disable, define LOG_NO_TIMESTAMPS
|
||||
// like so:
|
||||
//
|
||||
// #define LOG_NO_TIMESTAMPS
|
||||
// #include "log.h"
|
||||
//
|
||||
#ifndef LOG_NO_TIMESTAMPS
|
||||
#ifndef _MSC_VER
|
||||
#define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
|
||||
#define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
|
||||
#else
|
||||
#define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
|
||||
#define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
|
||||
#endif
|
||||
#else
|
||||
#define LOG_TIMESTAMP_FMT "%s"
|
||||
#define LOG_TIMESTAMP_VAL ,""
|
||||
#endif
|
||||
|
||||
#ifdef LOG_TEE_TIMESTAMPS
|
||||
#ifndef _MSC_VER
|
||||
#define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
|
||||
#define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
|
||||
#else
|
||||
#define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
|
||||
#define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
|
||||
#endif
|
||||
#else
|
||||
#define LOG_TEE_TIMESTAMP_FMT "%s"
|
||||
#define LOG_TEE_TIMESTAMP_VAL ,""
|
||||
#endif
|
||||
|
||||
// Allows disabling file/line/function prefix
|
||||
// in order to disable, define LOG_NO_FILE_LINE_FUNCTION
|
||||
// like so:
|
||||
//
|
||||
// #define LOG_NO_FILE_LINE_FUNCTION
|
||||
// #include "log.h"
|
||||
//
|
||||
#ifndef LOG_NO_FILE_LINE_FUNCTION
|
||||
#ifndef _MSC_VER
|
||||
#define LOG_FLF_FMT "[%24s:%5d][%24s] "
|
||||
#define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
|
||||
#else
|
||||
#define LOG_FLF_FMT "[%24s:%5ld][%24s] "
|
||||
#define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
|
||||
#endif
|
||||
#else
|
||||
#define LOG_FLF_FMT "%s"
|
||||
#define LOG_FLF_VAL ,""
|
||||
#endif
|
||||
|
||||
#ifdef LOG_TEE_FILE_LINE_FUNCTION
|
||||
#ifndef _MSC_VER
|
||||
#define LOG_TEE_FLF_FMT "[%24s:%5d][%24s] "
|
||||
#define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
|
||||
#else
|
||||
#define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] "
|
||||
#define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
|
||||
#endif
|
||||
#else
|
||||
#define LOG_TEE_FLF_FMT "%s"
|
||||
#define LOG_TEE_FLF_VAL ,""
|
||||
#endif
|
||||
|
||||
// Utility for synchronizing log configuration state
|
||||
// since std::optional was introduced only in c++17
|
||||
enum LogTriState
|
||||
{
|
||||
LogTriStateSame,
|
||||
LogTriStateFalse,
|
||||
LogTriStateTrue
|
||||
};
|
||||
|
||||
// INTERNAL, DO NOT USE
|
||||
// USE LOG() INSTEAD
|
||||
//
|
||||
#ifndef _MSC_VER
|
||||
#define LOG_IMPL(str, ...) \
|
||||
{ \
|
||||
if (LOG_TARGET != nullptr) \
|
||||
{ \
|
||||
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
|
||||
fflush(LOG_TARGET); \
|
||||
} \
|
||||
}
|
||||
#else
|
||||
#define LOG_IMPL(str, ...) \
|
||||
{ \
|
||||
if (LOG_TARGET != nullptr) \
|
||||
{ \
|
||||
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
|
||||
fflush(LOG_TARGET); \
|
||||
} \
|
||||
}
|
||||
#endif
|
||||
|
||||
// INTERNAL, DO NOT USE
|
||||
// USE LOG_TEE() INSTEAD
|
||||
//
|
||||
#ifndef _MSC_VER
|
||||
#define LOG_TEE_IMPL(str, ...) \
|
||||
{ \
|
||||
if (LOG_TARGET != nullptr) \
|
||||
{ \
|
||||
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
|
||||
fflush(LOG_TARGET); \
|
||||
} \
|
||||
if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \
|
||||
{ \
|
||||
fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL, __VA_ARGS__); \
|
||||
fflush(LOG_TEE_TARGET); \
|
||||
} \
|
||||
}
|
||||
#else
|
||||
#define LOG_TEE_IMPL(str, ...) \
|
||||
{ \
|
||||
if (LOG_TARGET != nullptr) \
|
||||
{ \
|
||||
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
|
||||
fflush(LOG_TARGET); \
|
||||
} \
|
||||
if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \
|
||||
{ \
|
||||
fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL "", ##__VA_ARGS__); \
|
||||
fflush(LOG_TEE_TARGET); \
|
||||
} \
|
||||
}
|
||||
#endif
|
||||
|
||||
// The '\0' as a last argument, is a trick to bypass the silly
|
||||
// "warning: ISO C++11 requires at least one argument for the "..." in a variadic macro"
|
||||
// so we can have a single macro which can be called just like printf.
|
||||
|
||||
// Main LOG macro.
|
||||
// behaves like printf, and supports arguments the exact same way.
|
||||
//
|
||||
#ifndef _MSC_VER
|
||||
#define LOG(...) LOG_IMPL(__VA_ARGS__, "")
|
||||
#else
|
||||
#define LOG(str, ...) LOG_IMPL("%s" str, "", __VA_ARGS__, "")
|
||||
#endif
|
||||
|
||||
// Main TEE macro.
|
||||
// does the same as LOG
|
||||
// and
|
||||
// simultaneously writes stderr.
|
||||
//
|
||||
// Secondary target can be changed just like LOG_TARGET
|
||||
// by defining LOG_TEE_TARGET
|
||||
//
|
||||
#ifndef _MSC_VER
|
||||
#define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
|
||||
#else
|
||||
#define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "")
|
||||
#endif
|
||||
|
||||
// LOG macro variants with auto endline.
|
||||
#ifndef _MSC_VER
|
||||
#define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
|
||||
#define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
|
||||
#else
|
||||
#define LOGLN(str, ...) LOG_IMPL("%s" str, "", __VA_ARGS__, "\n")
|
||||
#define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "\n")
|
||||
#endif
|
||||
|
||||
// INTERNAL, DO NOT USE
|
||||
inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr)
|
||||
{
|
||||
static bool _initialized{false};
|
||||
static bool _disabled{(filename.empty() && target == nullptr)};
|
||||
static std::string log_current_filename{filename};
|
||||
static FILE *log_current_target{target};
|
||||
static FILE *logfile = nullptr;
|
||||
|
||||
if (change)
|
||||
{
|
||||
if (disable == LogTriStateTrue)
|
||||
{
|
||||
// Disable primary target
|
||||
_disabled = true;
|
||||
}
|
||||
// If previously disabled, only enable, and keep previous target
|
||||
else if (disable == LogTriStateFalse)
|
||||
{
|
||||
_disabled = false;
|
||||
}
|
||||
// Otherwise, process the arguments
|
||||
else if (log_current_filename != filename || log_current_target != target)
|
||||
{
|
||||
_initialized = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (_disabled)
|
||||
{
|
||||
// Log is disabled
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
if (_initialized)
|
||||
{
|
||||
// with fallback in case something went wrong
|
||||
return logfile ? logfile : stderr;
|
||||
}
|
||||
|
||||
// do the (re)initialization
|
||||
if (target != nullptr)
|
||||
{
|
||||
if (logfile != nullptr && logfile != stdout && logfile != stderr)
|
||||
{
|
||||
fclose(logfile);
|
||||
}
|
||||
|
||||
log_current_filename = LOG_DEFAULT_FILE_NAME;
|
||||
log_current_target = target;
|
||||
|
||||
logfile = target;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (log_current_filename != filename)
|
||||
{
|
||||
if (logfile != nullptr && logfile != stdout && logfile != stderr)
|
||||
{
|
||||
fclose(logfile);
|
||||
}
|
||||
}
|
||||
|
||||
logfile = fopen(filename.c_str(), "w");
|
||||
}
|
||||
|
||||
if (!logfile)
|
||||
{
|
||||
// Verify whether the file was opened, otherwise fallback to stderr
|
||||
logfile = stderr;
|
||||
|
||||
fprintf(stderr, "Failed to open logfile '%s' with error '%s'\n", filename.c_str(), std::strerror(errno));
|
||||
fflush(stderr);
|
||||
|
||||
// At this point we let the init flag be to true below, and let the target fallback to stderr
|
||||
// otherwise we would repeatedly fopen() which was already unsuccessful
|
||||
}
|
||||
|
||||
_initialized = true;
|
||||
|
||||
return logfile ? logfile : stderr;
|
||||
}
|
||||
|
||||
// INTERNAL, DO NOT USE
|
||||
inline FILE *log_handler2_impl(bool change = false, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME)
|
||||
{
|
||||
return log_handler1_impl(change, disable, filename, target);
|
||||
}
|
||||
|
||||
// Disables logs entirely at runtime.
|
||||
// Makes LOG() and LOG_TEE() produce no output,
|
||||
// untill enabled back.
|
||||
#define log_disable() log_disable_impl()
|
||||
|
||||
// INTERNAL, DO NOT USE
|
||||
inline FILE *log_disable_impl()
|
||||
{
|
||||
return log_handler1_impl(true, LogTriStateTrue);
|
||||
}
|
||||
|
||||
// Enables logs at runtime.
|
||||
#define log_enable() log_enable_impl()
|
||||
|
||||
// INTERNAL, DO NOT USE
|
||||
inline FILE *log_enable_impl()
|
||||
{
|
||||
return log_handler1_impl(true, LogTriStateFalse);
|
||||
}
|
||||
|
||||
// Sets target fir logs, either by a file name or FILE* pointer (stdout, stderr, or any valid FILE*)
|
||||
#define log_set_target(target) log_set_target_impl(target)
|
||||
|
||||
// INTERNAL, DO NOT USE
|
||||
inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, filename); }
|
||||
inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, target); }
|
||||
|
||||
// INTERNAL, DO NOT USE
|
||||
inline FILE *log_handler() { return log_handler1_impl(); }
|
||||
|
||||
inline void log_test()
|
||||
{
|
||||
log_disable();
|
||||
LOG("01 Hello World to nobody, because logs are disabled!\n")
|
||||
log_enable();
|
||||
LOG("02 Hello World to default output, which is \"%s\" ( Yaaay, arguments! )!\n", LOG_STRINGIZE(LOG_TARGET))
|
||||
LOG_TEE("03 Hello World to **both** default output and " LOG_TEE_TARGET_STRING "!\n")
|
||||
log_set_target(stderr);
|
||||
LOG("04 Hello World to stderr!\n")
|
||||
LOG_TEE("05 Hello World TEE with double printing to stderr prevented!\n")
|
||||
log_set_target(LOG_DEFAULT_FILE_NAME);
|
||||
LOG("06 Hello World to default log file!\n")
|
||||
log_set_target(stdout);
|
||||
LOG("07 Hello World to stdout!\n")
|
||||
log_set_target(LOG_DEFAULT_FILE_NAME);
|
||||
LOG("08 Hello World to default log file again!\n")
|
||||
log_disable();
|
||||
LOG("09 Hello World _1_ into the void!\n")
|
||||
log_enable();
|
||||
LOG("10 Hello World back from the void ( you should not see _1_ in the log or the output )!\n")
|
||||
log_disable();
|
||||
log_set_target("llama.anotherlog.log");
|
||||
LOG("11 Hello World _2_ to nobody, new target was selected but logs are still disabled!\n")
|
||||
log_enable();
|
||||
LOG("12 Hello World this time in a new file ( you should not see _2_ in the log or the output )?\n")
|
||||
log_set_target("llama.yetanotherlog.log");
|
||||
LOG("13 Hello World this time in yet new file?\n")
|
||||
log_set_target(log_filename_generator("llama_autonamed", "log"));
|
||||
LOG("14 Hello World in log with generated filename!\n")
|
||||
#ifdef _MSC_VER
|
||||
LOG_TEE("15 Hello msvc TEE without arguments\n")
|
||||
LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test")
|
||||
LOG_TEELN("17 Hello msvc TEELN without arguments\n")
|
||||
LOG_TEELN("18 Hello msvc TEELN with (%d)(%s) arguments\n", 1, "test")
|
||||
LOG("19 Hello msvc LOG without arguments\n")
|
||||
LOG("20 Hello msvc LOG with (%d)(%s) arguments\n", 1, "test")
|
||||
LOGLN("21 Hello msvc LOGLN without arguments\n")
|
||||
LOGLN("22 Hello msvc LOGLN with (%d)(%s) arguments\n", 1, "test")
|
||||
#endif
|
||||
}
|
||||
|
||||
inline bool log_param_single_parse(const std::string & param)
|
||||
{
|
||||
if ( param == "--log-test")
|
||||
{
|
||||
log_test();
|
||||
return true;
|
||||
}
|
||||
|
||||
if ( param == "--log-disable")
|
||||
{
|
||||
log_disable();
|
||||
return true;
|
||||
}
|
||||
|
||||
if ( param == "--log-enable")
|
||||
{
|
||||
log_enable();
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
inline bool log_param_pair_parse(bool check_but_dont_parse, const std::string & param, const std::string & next = std::string())
|
||||
{
|
||||
if ( param == "--log-file")
|
||||
{
|
||||
if (!check_but_dont_parse)
|
||||
{
|
||||
log_set_target(log_filename_generator(next.empty() ? "unnamed" : next, "log"));
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
inline void log_print_usage()
|
||||
{
|
||||
fprintf(stdout, "log options:\n");
|
||||
/* format
|
||||
fprintf(stdout, " -h, --help show this help message and exit\n");*/
|
||||
/* spacing
|
||||
fprintf(stdout, "__-param----------------Description\n");*/
|
||||
fprintf(stdout, " --log-test Run simple logging test\n");
|
||||
fprintf(stdout, " --log-disable Disable trace logs\n");
|
||||
fprintf(stdout, " --log-enable Enable trace logs\n");
|
||||
fprintf(stdout, " --log-file Specify a log filename (without extension)\n");
|
||||
fprintf(stdout, " Log file will be tagged with unique ID and written as \"<name>.<ID>.log\"\n"); /* */
|
||||
}
|
||||
|
||||
#define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv)
|
||||
|
||||
// INTERNAL, DO NOT USE
|
||||
inline void log_dump_cmdline_impl(int argc, char **argv)
|
||||
{
|
||||
std::stringstream buf;
|
||||
for (int i = 0; i < argc; ++i)
|
||||
{
|
||||
if (std::string(argv[i]).find(' ') != std::string::npos)
|
||||
{
|
||||
buf << " \"" << argv[i] <<"\"";
|
||||
}
|
||||
else
|
||||
{
|
||||
buf << " " << argv[i];
|
||||
}
|
||||
}
|
||||
LOGLN("Cmd:%s", buf.str().c_str())
|
||||
}
|
||||
|
||||
#define log_tostr(var) log_var_to_string_impl(var).c_str()
|
||||
|
||||
inline std::string log_var_to_string_impl(bool var)
|
||||
{
|
||||
return var ? "true" : "false";
|
||||
}
|
||||
|
||||
inline std::string log_var_to_string_impl(std::string var)
|
||||
{
|
||||
return var;
|
||||
}
|
||||
|
||||
inline std::string log_var_to_string_impl(const std::vector<int> & var)
|
||||
{
|
||||
std::stringstream buf;
|
||||
buf << "[ ";
|
||||
bool first = true;
|
||||
for (auto e : var)
|
||||
{
|
||||
if (first)
|
||||
{
|
||||
first = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
buf << ", ";
|
||||
}
|
||||
buf << std::to_string(e);
|
||||
}
|
||||
buf << " ]";
|
||||
|
||||
return buf.str();
|
||||
}
|
||||
|
||||
#define LOG_TOKENS_TOSTR_PRETTY(ctx, tokens) \
|
||||
[&tokens, &ctx]() \
|
||||
{ \
|
||||
std::stringstream buf; \
|
||||
buf << "[ "; \
|
||||
\
|
||||
bool first = true; \
|
||||
for (const auto &token : tokens) \
|
||||
{ \
|
||||
if (!first) \
|
||||
buf << ", "; \
|
||||
else \
|
||||
first = false; \
|
||||
\
|
||||
auto detokenized = llama_token_to_piece(ctx, token); \
|
||||
\
|
||||
detokenized.erase( \
|
||||
std::remove_if( \
|
||||
detokenized.begin(), \
|
||||
detokenized.end(), \
|
||||
[](const unsigned char c) { return !std::isprint(c); }), \
|
||||
detokenized.end()); \
|
||||
\
|
||||
buf \
|
||||
<< "'" << detokenized << "'" \
|
||||
<< ":" << std::to_string(token); \
|
||||
} \
|
||||
buf << " ]"; \
|
||||
\
|
||||
return buf.str(); \
|
||||
}() \
|
||||
.c_str()
|
||||
|
||||
#ifdef LOG_DISABLE_LOGS
|
||||
|
||||
#undef LOG
|
||||
#define LOG(...) // dummy stub
|
||||
#undef LOGLN
|
||||
#define LOGLN(...) // dummy stub
|
||||
|
||||
#undef LOG_TEE
|
||||
#define LOG_TEE(...) fprintf(stderr, __VA_ARGS__); // convert to normal fprintf
|
||||
|
||||
#undef LOG_TEELN
|
||||
#define LOG_TEELN(...) fprintf(stderr, __VA_ARGS__); // convert to normal fprintf
|
||||
|
||||
#undef LOG_DISABLE
|
||||
#define LOG_DISABLE() // dummy stub
|
||||
|
||||
#undef LOG_ENABLE
|
||||
#define LOG_ENABLE() // dummy stub
|
||||
|
||||
#undef LOG_ENABLE
|
||||
#define LOG_ENABLE() // dummy stub
|
||||
|
||||
#undef LOG_SET_TARGET
|
||||
#define LOG_SET_TARGET(...) // dummy stub
|
||||
|
||||
#undef LOG_DUMP_CMDLINE
|
||||
#define LOG_DUMP_CMDLINE(...) // dummy stub
|
||||
|
||||
#endif // LOG_DISABLE_LOGS
|
||||
@@ -1,17 +1,24 @@
|
||||
#!/usr/bin/env python3
|
||||
# HF falcon--> gguf conversion
|
||||
|
||||
import gguf
|
||||
import os
|
||||
import sys
|
||||
import struct
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import struct
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer # type: ignore[import]
|
||||
|
||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
|
||||
import gguf
|
||||
|
||||
from typing import Any, List
|
||||
from pathlib import Path
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
def bytes_to_unicode():
|
||||
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
|
||||
@@ -32,11 +39,10 @@ def bytes_to_unicode():
|
||||
bs.append(b)
|
||||
cs.append(2**8+n)
|
||||
n += 1
|
||||
cs = [chr(n) for n in cs]
|
||||
return dict(zip(bs, cs))
|
||||
return dict(zip(bs, (chr(n) for n in cs)))
|
||||
|
||||
|
||||
def count_model_parts(dir_model: str) -> int:
|
||||
def count_model_parts(dir_model: Path) -> int:
|
||||
num_parts = 0
|
||||
for filename in os.listdir(dir_model):
|
||||
if filename.startswith("pytorch_model-"):
|
||||
@@ -47,17 +53,22 @@ def count_model_parts(dir_model: str) -> int:
|
||||
return num_parts
|
||||
|
||||
|
||||
if len(sys.argv) < 3:
|
||||
print("Usage: convert-h5-to-ggml.py dir-model ftype\n")
|
||||
print(" ftype == 0 -> float32")
|
||||
print(" ftype == 1 -> float16")
|
||||
def parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(description="Convert a Falcon model to a GGML compatible file")
|
||||
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
|
||||
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
|
||||
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
|
||||
parser.add_argument("ftype", type=int, choices=[0, 1], help="output format - use 0 for float32, 1 for float16", default = 1)
|
||||
return parser.parse_args()
|
||||
|
||||
args = parse_args()
|
||||
|
||||
dir_model = args.model
|
||||
ftype = args.ftype
|
||||
if not dir_model.is_dir():
|
||||
print(f'Error: {args.model} is not a directory', file = sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
# output in the same directory as the model
|
||||
dir_model = sys.argv[1]
|
||||
last_dir = os.path.basename(os.path.normpath(dir_model))
|
||||
|
||||
# possible tensor data types
|
||||
# ftype == 0 -> float32
|
||||
# ftype == 1 -> float16
|
||||
@@ -65,25 +76,21 @@ last_dir = os.path.basename(os.path.normpath(dir_model))
|
||||
# map from ftype to string
|
||||
ftype_str = ["f32", "f16"]
|
||||
|
||||
ftype = 1
|
||||
if len(sys.argv) > 2:
|
||||
ftype = int(sys.argv[2])
|
||||
if ftype < 0 or ftype > 1:
|
||||
print("Invalid ftype: " + str(ftype))
|
||||
if args.outfile is not None:
|
||||
fname_out = args.outfile
|
||||
else:
|
||||
# output in the same directory as the model by default
|
||||
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
|
||||
|
||||
sys.exit(1)
|
||||
print("gguf: loading model "+dir_model.name)
|
||||
|
||||
fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
|
||||
|
||||
print("gguf: loading model "+last_dir)
|
||||
|
||||
with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
|
||||
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
||||
hparams = json.load(f)
|
||||
|
||||
if hparams["architectures"][0] != "RWForCausalLM":
|
||||
print("Model architecture not supported: " + hparams["architectures"][0])
|
||||
|
||||
sys.exit()
|
||||
sys.exit(1)
|
||||
|
||||
# get number of model parts
|
||||
num_parts = count_model_parts(dir_model)
|
||||
@@ -113,77 +120,58 @@ gguf_writer.add_file_type(ftype)
|
||||
|
||||
print("gguf: get tokenizer metadata")
|
||||
|
||||
tokens: List[str] = []
|
||||
scores: List[float] = []
|
||||
toktypes: List[int] = []
|
||||
merges: List[str] = []
|
||||
tokens: list[bytearray] = []
|
||||
scores: list[float] = []
|
||||
toktypes: list[int] = []
|
||||
|
||||
tokenizer_json_file = dir_model / 'tokenizer.json'
|
||||
if not tokenizer_json_file.is_file():
|
||||
print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
if Path(dir_model + "/tokenizer.json").is_file():
|
||||
# gpt2 tokenizer
|
||||
gguf_writer.add_tokenizer_model("gpt2")
|
||||
# gpt2 tokenizer
|
||||
gguf_writer.add_tokenizer_model("gpt2")
|
||||
|
||||
print("gguf: get gpt2 tokenizer merges")
|
||||
with open(tokenizer_json_file, "r", encoding="utf-8") as f:
|
||||
tokenizer_json = json.load(f)
|
||||
|
||||
with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
|
||||
tokenizer_json = json.load(f)
|
||||
merges = tokenizer_json["model"]["merges"]
|
||||
print("gguf: get gpt2 tokenizer vocab")
|
||||
|
||||
gguf_writer.add_token_merges(merges)
|
||||
vocab_size = len(tokenizer_json["model"]["vocab"])
|
||||
|
||||
print("gguf: get gpt2 tokenizer vocab")
|
||||
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
|
||||
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
||||
|
||||
vocab_size = len(tokenizer_json["model"]["vocab"])
|
||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
||||
byte_encoder = bytes_to_unicode()
|
||||
byte_decoder = {v: k for k, v in byte_encoder.items()}
|
||||
|
||||
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
|
||||
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
||||
for i in range(vocab_size):
|
||||
if i in reverse_vocab:
|
||||
try:
|
||||
text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
|
||||
except KeyError:
|
||||
text = bytearray()
|
||||
for c in reverse_vocab[i]:
|
||||
if ord(c) < 256: # single byte character
|
||||
text.append(byte_decoder[ord(c)])
|
||||
else: # multibyte special token character
|
||||
text.extend(c.encode('utf-8'))
|
||||
else:
|
||||
print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
|
||||
pad_token = f"[PAD{i}]".encode("utf8")
|
||||
text = bytearray(pad_token)
|
||||
|
||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
||||
byte_encoder = bytes_to_unicode()
|
||||
byte_decoder = {v: k for k, v in byte_encoder.items()}
|
||||
tokens.append(text)
|
||||
scores.append(0.0) # dymmy
|
||||
toktypes.append(gguf.TokenType.NORMAL) # dummy
|
||||
|
||||
for i in range(vocab_size):
|
||||
if i in reverse_vocab:
|
||||
try:
|
||||
text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
|
||||
except KeyError:
|
||||
text = bytearray()
|
||||
for c in reverse_vocab[i]:
|
||||
if ord(c) < 256: # single byte character
|
||||
text.append(byte_decoder[ord(c)])
|
||||
else: # multibyte special token character
|
||||
text.extend(c.encode('utf-8'))
|
||||
else:
|
||||
print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
|
||||
pad_token = f"[PAD{i}]".encode("utf8")
|
||||
text = bytearray(pad_token)
|
||||
|
||||
tokens.append(text)
|
||||
scores.append(0.0) # dymmy
|
||||
toktypes.append(gguf.TokenType.NORMAL) # dummy
|
||||
|
||||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
gguf_writer.add_token_types(toktypes)
|
||||
|
||||
print("gguf: get special token ids")
|
||||
# Look for special tokens in config.json
|
||||
|
||||
if "bos_token_id" in hparams and hparams["bos_token_id"] != None:
|
||||
gguf_writer.add_bos_token_id(hparams["bos_token_id"])
|
||||
|
||||
if "eos_token_id" in hparams and hparams["eos_token_id"] != None:
|
||||
gguf_writer.add_eos_token_id(hparams["eos_token_id"])
|
||||
|
||||
if "unk_token_id" in hparams and hparams["unk_token_id"] != None:
|
||||
gguf_writer.add_unk_token_id(hparams["unk_token_id"])
|
||||
|
||||
if "sep_token_id" in hparams and hparams["sep_token_id"] != None:
|
||||
gguf_writer.add_sep_token_id(hparams["sep_token_id"])
|
||||
|
||||
if "pad_token_id" in hparams and hparams["pad_token_id"] != None:
|
||||
gguf_writer.add_pad_token_id(hparams["pad_token_id"])
|
||||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
|
||||
special_vocab.add_to_gguf(gguf_writer)
|
||||
|
||||
# TENSORS
|
||||
|
||||
@@ -199,15 +187,17 @@ head_dim = hparams["hidden_size"] // n_head
|
||||
print("gguf: get tensor metadata")
|
||||
|
||||
if num_parts == 0:
|
||||
part_names = ("pytorch_model.bin",)
|
||||
part_names = iter(("pytorch_model.bin",))
|
||||
else:
|
||||
part_names = (
|
||||
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
|
||||
)
|
||||
|
||||
for part_name in part_names:
|
||||
if args.vocab_only:
|
||||
break
|
||||
print("gguf: loading model part '" + part_name + "'")
|
||||
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
|
||||
model_part = torch.load(dir_model / part_name, map_location="cpu")
|
||||
|
||||
for name in model_part.keys():
|
||||
data = model_part[name]
|
||||
@@ -238,11 +228,8 @@ for part_name in part_names:
|
||||
data = data.squeeze().numpy()
|
||||
|
||||
# map tensor names
|
||||
if name.endswith(".weight") and name[:-7] in tensor_map:
|
||||
name = tensor_map[name[:-7]] + ".weight"
|
||||
elif name.endswith(".bias") and name[:-5] in tensor_map:
|
||||
name = tensor_map[name[:-5]] + ".bias"
|
||||
else:
|
||||
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print("Can not map tensor '" + name + "'")
|
||||
sys.exit()
|
||||
|
||||
@@ -261,19 +248,20 @@ for part_name in part_names:
|
||||
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
|
||||
print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
|
||||
|
||||
gguf_writer.add_tensor(name, data)
|
||||
gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
|
||||
print("gguf: write header")
|
||||
gguf_writer.write_header_to_file()
|
||||
print("gguf: write metadata")
|
||||
gguf_writer.write_kv_data_to_file()
|
||||
print("gguf: write tensors")
|
||||
gguf_writer.write_tensors_to_file()
|
||||
if not args.vocab_only:
|
||||
print("gguf: write tensors")
|
||||
gguf_writer.write_tensors_to_file()
|
||||
|
||||
gguf_writer.close()
|
||||
|
||||
print("gguf: model successfully exported to '" + fname_out + "'")
|
||||
print(f"gguf: model successfully exported to '{fname_out}'")
|
||||
print("")
|
||||
|
||||
@@ -1,17 +1,23 @@
|
||||
#!/usr/bin/env python3
|
||||
# HF gptneox--> gguf conversion
|
||||
|
||||
import gguf
|
||||
import os
|
||||
import sys
|
||||
import struct
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import struct
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer # type: ignore[import]
|
||||
|
||||
from typing import Any, List
|
||||
from pathlib import Path
|
||||
from transformers import AutoTokenizer
|
||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
|
||||
import gguf
|
||||
|
||||
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
|
||||
|
||||
@@ -34,11 +40,10 @@ def bytes_to_unicode():
|
||||
bs.append(b)
|
||||
cs.append(2**8+n)
|
||||
n += 1
|
||||
cs = [chr(n) for n in cs]
|
||||
return dict(zip(bs, cs))
|
||||
return dict(zip(bs, (chr(n) for n in cs)))
|
||||
|
||||
|
||||
def count_model_parts(dir_model: str) -> int:
|
||||
def count_model_parts(dir_model: Path) -> int:
|
||||
num_parts = 0
|
||||
for filename in os.listdir(dir_model):
|
||||
if filename.startswith("pytorch_model-"):
|
||||
@@ -49,17 +54,22 @@ def count_model_parts(dir_model: str) -> int:
|
||||
return num_parts
|
||||
|
||||
|
||||
if len(sys.argv) < 3:
|
||||
print("Usage: convert-h5-to-ggml.py dir-model ftype\n")
|
||||
print(" ftype == 0 -> float32")
|
||||
print(" ftype == 1 -> float16")
|
||||
def parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(description="Convert a GPT-NeoX model to a GGML compatible file")
|
||||
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
|
||||
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
|
||||
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
|
||||
parser.add_argument("ftype", type=int, choices=[0, 1], help="output format - use 0 for float32, 1 for float16", default = 1)
|
||||
return parser.parse_args()
|
||||
|
||||
args = parse_args()
|
||||
|
||||
dir_model = args.model
|
||||
ftype = args.ftype
|
||||
if not dir_model.is_dir():
|
||||
print(f'Error: {args.model} is not a directory', file = sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
# output in the same directory as the model
|
||||
dir_model = sys.argv[1]
|
||||
last_dir = os.path.basename(os.path.normpath(dir_model))
|
||||
|
||||
# possible tensor data types
|
||||
# ftype == 0 -> float32
|
||||
# ftype == 1 -> float16
|
||||
@@ -67,19 +77,15 @@ last_dir = os.path.basename(os.path.normpath(dir_model))
|
||||
# map from ftype to string
|
||||
ftype_str = ["f32", "f16"]
|
||||
|
||||
ftype = 1
|
||||
if len(sys.argv) > 2:
|
||||
ftype = int(sys.argv[2])
|
||||
if ftype < 0 or ftype > 1:
|
||||
print("Invalid ftype: " + str(ftype))
|
||||
if args.outfile is not None:
|
||||
fname_out = args.outfile
|
||||
else:
|
||||
# output in the same directory as the model by default
|
||||
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
|
||||
|
||||
sys.exit(1)
|
||||
print("gguf: loading model "+dir_model.name)
|
||||
|
||||
fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
|
||||
|
||||
print("gguf: loading model "+last_dir)
|
||||
|
||||
with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
|
||||
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
||||
hparams = json.load(f)
|
||||
|
||||
if hparams["architectures"][0] != "GPTNeoXForCausalLM":
|
||||
@@ -97,7 +103,7 @@ print("gguf: get model metadata")
|
||||
|
||||
block_count = hparams["num_hidden_layers"]
|
||||
|
||||
gguf_writer.add_name(last_dir)
|
||||
gguf_writer.add_name(dir_model.name)
|
||||
gguf_writer.add_context_length(hparams["max_position_embeddings"])
|
||||
gguf_writer.add_embedding_length(hparams["hidden_size"])
|
||||
gguf_writer.add_block_count(block_count)
|
||||
@@ -111,86 +117,52 @@ gguf_writer.add_layer_norm_eps(hparams["layer_norm_eps"])
|
||||
|
||||
print("gguf: get tokenizer metadata")
|
||||
|
||||
tokens: List[str] = []
|
||||
merges: List[str] = []
|
||||
tokens: list[bytearray] = []
|
||||
|
||||
tokenizer_json_file = dir_model / 'tokenizer.json'
|
||||
if not tokenizer_json_file.is_file():
|
||||
print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
if Path(dir_model + "/tokenizer.json").is_file():
|
||||
# gpt2 tokenizer
|
||||
gguf_writer.add_tokenizer_model("gpt2")
|
||||
# gpt2 tokenizer
|
||||
gguf_writer.add_tokenizer_model("gpt2")
|
||||
|
||||
print("gguf: get gpt2 tokenizer merges")
|
||||
with open(tokenizer_json_file, "r", encoding="utf-8") as f:
|
||||
tokenizer_json = json.load(f)
|
||||
|
||||
with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
|
||||
tokenizer_json = json.load(f)
|
||||
merges = tokenizer_json["model"]["merges"]
|
||||
print("gguf: get gpt2 tokenizer vocab")
|
||||
|
||||
gguf_writer.add_token_merges(merges)
|
||||
vocab_size = len(tokenizer_json["model"]["vocab"])
|
||||
|
||||
print("gguf: get gpt2 tokenizer vocab")
|
||||
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
|
||||
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
||||
|
||||
vocab_size = len(tokenizer_json["model"]["vocab"])
|
||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
||||
byte_encoder = bytes_to_unicode()
|
||||
byte_decoder = {v: k for k, v in byte_encoder.items()}
|
||||
|
||||
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
|
||||
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
||||
for i in range(vocab_size):
|
||||
if i in reverse_vocab:
|
||||
try:
|
||||
text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
|
||||
except KeyError:
|
||||
text = bytearray()
|
||||
for c in reverse_vocab[i]:
|
||||
if ord(c) < 256: # single byte character
|
||||
text.append(byte_decoder[ord(c)])
|
||||
else: # multibyte special token character
|
||||
text.extend(c.encode('utf-8'))
|
||||
else:
|
||||
print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
|
||||
pad_token = f"[PAD{i}]".encode("utf8")
|
||||
text = bytearray(pad_token)
|
||||
|
||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
||||
byte_encoder = bytes_to_unicode()
|
||||
byte_decoder = {v: k for k, v in byte_encoder.items()}
|
||||
tokens.append(text)
|
||||
|
||||
for i in range(vocab_size):
|
||||
if i in reverse_vocab:
|
||||
try:
|
||||
text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
|
||||
except KeyError:
|
||||
text = bytearray()
|
||||
for c in reverse_vocab[i]:
|
||||
if ord(c) < 256: # single byte character
|
||||
text.append(byte_decoder[ord(c)])
|
||||
else: # multibyte special token character
|
||||
text.extend(c.encode('utf-8'))
|
||||
else:
|
||||
print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
|
||||
pad_token = f"[PAD{i}]".encode("utf8")
|
||||
text = bytearray(pad_token)
|
||||
|
||||
tokens.append(text)
|
||||
|
||||
gguf_writer.add_token_list(tokens)
|
||||
|
||||
if "added_tokens" in tokenizer_json and Path(dir_model + "/tokenizer_config.json").is_file():
|
||||
print("gguf: get special token ids")
|
||||
|
||||
with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
|
||||
tokenizer_config = json.load(f)
|
||||
|
||||
# find special token ids
|
||||
|
||||
if "bos_token" in tokenizer_config:
|
||||
for key in tokenizer_json["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["bos_token"]:
|
||||
gguf_writer.add_bos_token_id(key["id"])
|
||||
|
||||
if "eos_token" in tokenizer_config:
|
||||
for key in tokenizer_json["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["eos_token"]:
|
||||
gguf_writer.add_eos_token_id(key["id"])
|
||||
|
||||
if "unk_token" in tokenizer_config:
|
||||
for key in tokenizer_json["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["unk_token"]:
|
||||
gguf_writer.add_unk_token_id(key["id"])
|
||||
|
||||
if "sep_token" in tokenizer_config:
|
||||
for key in tokenizer_json["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["sep_token"]:
|
||||
gguf_writer.add_sep_token_id(key["id"])
|
||||
|
||||
if "pad_token" in tokenizer_config:
|
||||
for key in tokenizer_json["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["pad_token"]:
|
||||
gguf_writer.add_pad_token_id(key["id"])
|
||||
gguf_writer.add_token_list(tokens)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
|
||||
special_vocab.add_to_gguf(gguf_writer)
|
||||
|
||||
# TENSORS
|
||||
|
||||
@@ -200,13 +172,15 @@ tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
|
||||
print("gguf: get tensor metadata")
|
||||
|
||||
if num_parts == 0:
|
||||
part_names = ("pytorch_model.bin",)
|
||||
part_names = iter(("pytorch_model.bin",))
|
||||
else:
|
||||
part_names = (
|
||||
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
|
||||
)
|
||||
|
||||
for part_name in part_names:
|
||||
if args.vocab_only:
|
||||
break
|
||||
print("gguf: loading model part '" + part_name + "'")
|
||||
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
|
||||
|
||||
@@ -226,11 +200,8 @@ for part_name in part_names:
|
||||
data = data.squeeze().numpy()
|
||||
|
||||
# map tensor names
|
||||
if name.endswith(".weight") and name[:-7] in tensor_map:
|
||||
name = tensor_map[name[:-7]] + ".weight"
|
||||
elif name.endswith(".bias") and name[:-5] in tensor_map:
|
||||
name = tensor_map[name[:-5]] + ".bias"
|
||||
else:
|
||||
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print("Can not map tensor '" + name + "'")
|
||||
sys.exit()
|
||||
|
||||
@@ -249,19 +220,20 @@ for part_name in part_names:
|
||||
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
|
||||
print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
|
||||
|
||||
gguf_writer.add_tensor(name, data)
|
||||
gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
|
||||
print("gguf: write header")
|
||||
gguf_writer.write_header_to_file()
|
||||
print("gguf: write metadata")
|
||||
gguf_writer.write_kv_data_to_file()
|
||||
print("gguf: write tensors")
|
||||
gguf_writer.write_tensors_to_file()
|
||||
if not args.vocab_only:
|
||||
print("gguf: write tensors")
|
||||
gguf_writer.write_tensors_to_file()
|
||||
|
||||
gguf_writer.close()
|
||||
|
||||
print("gguf: model successfully exported to '" + fname_out + "'")
|
||||
print(f"gguf: model successfully exported to '{fname_out}'")
|
||||
print("")
|
||||
|
||||
@@ -1,308 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# 7b pth llama --> gguf conversion
|
||||
# Only models with a single datafile are supported, like 7B
|
||||
# HF files required in the model dir: config.json tokenizer_config.json tokenizer.json tokenizer.model
|
||||
|
||||
import gguf
|
||||
import os
|
||||
import sys
|
||||
import struct
|
||||
import json
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from typing import Any, List
|
||||
from pathlib import Path
|
||||
from sentencepiece import SentencePieceProcessor
|
||||
|
||||
#NDArray = np.ndarray[Any, Any]
|
||||
# compatible with python < 3.9
|
||||
NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
|
||||
|
||||
|
||||
def count_model_parts(dir_model: str) -> int:
|
||||
num_parts = 0
|
||||
for filename in os.listdir(dir_model):
|
||||
if filename.startswith("consolidated."):
|
||||
num_parts += 1
|
||||
|
||||
if num_parts > 0:
|
||||
print("gguf: found " + str(num_parts) + " model parts")
|
||||
return num_parts
|
||||
|
||||
|
||||
if len(sys.argv) < 3:
|
||||
print("Usage: convert-h5-to-ggml.py dir-model ftype\n")
|
||||
print(" ftype == 0 -> float32")
|
||||
print(" ftype == 1 -> float16")
|
||||
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
# output in the same directory as the model
|
||||
dir_model = sys.argv[1]
|
||||
last_dir = os.path.basename(os.path.normpath(dir_model))
|
||||
|
||||
|
||||
# possible tensor data types
|
||||
# ftype == 0 -> float32
|
||||
# ftype == 1 -> float16
|
||||
|
||||
# map from ftype to string
|
||||
ftype_str = ["f32", "f16"]
|
||||
|
||||
ftype = 1
|
||||
if len(sys.argv) > 2:
|
||||
ftype = int(sys.argv[2])
|
||||
if ftype < 0 or ftype > 1:
|
||||
print("Invalid ftype: " + str(ftype))
|
||||
|
||||
sys.exit(1)
|
||||
|
||||
fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
|
||||
|
||||
print("gguf: loading model "+last_dir)
|
||||
|
||||
with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
|
||||
hparams = json.load(f)
|
||||
|
||||
if hparams["architectures"][0] != "LlamaForCausalLM":
|
||||
print("Model architecture not supported: " + hparams["architectures"][0])
|
||||
sys.exit()
|
||||
|
||||
# get number of model parts
|
||||
num_parts = count_model_parts(dir_model)
|
||||
|
||||
if num_parts > 1:
|
||||
print("gguf: Only models with a single datafile are supported.")
|
||||
|
||||
sys.exit()
|
||||
|
||||
ARCH=gguf.MODEL_ARCH.LLAMA
|
||||
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
|
||||
|
||||
|
||||
print("gguf: get model metadata")
|
||||
|
||||
block_count = hparams["num_hidden_layers"]
|
||||
head_count = hparams["num_attention_heads"]
|
||||
|
||||
if "num_key_value_heads" in hparams:
|
||||
head_count_kv = hparams["num_key_value_heads"]
|
||||
else:
|
||||
head_count_kv = head_count
|
||||
|
||||
if "_name_or_path" in hparams:
|
||||
hf_repo = hparams["_name_or_path"]
|
||||
else:
|
||||
hf_repo = ""
|
||||
|
||||
if "max_sequence_length" in hparams:
|
||||
ctx_length = hparams["max_sequence_length"]
|
||||
elif "max_position_embeddings" in hparams:
|
||||
ctx_length = hparams["max_position_embeddings"]
|
||||
else:
|
||||
print("gguf: can not find ctx length parameter.")
|
||||
|
||||
sys.exit()
|
||||
|
||||
|
||||
gguf_writer.add_name(last_dir)
|
||||
gguf_writer.add_source_hf_repo(hf_repo)
|
||||
gguf_writer.add_tensor_data_layout("Meta AI original pth")
|
||||
gguf_writer.add_context_length(ctx_length)
|
||||
gguf_writer.add_embedding_length(hparams["hidden_size"])
|
||||
gguf_writer.add_block_count(block_count)
|
||||
gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
|
||||
gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
|
||||
gguf_writer.add_head_count(head_count)
|
||||
gguf_writer.add_head_count_kv(head_count_kv)
|
||||
gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
|
||||
|
||||
if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
|
||||
if "type" in hparams["rope_scaling"]:
|
||||
if hparams["rope_scaling"]["type"] == "linear":
|
||||
gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
|
||||
|
||||
|
||||
# TOKENIZATION
|
||||
|
||||
print("gguf: get tokenizer metadata")
|
||||
|
||||
tokens: List[bytes] = []
|
||||
scores: List[float] = []
|
||||
toktypes: List[int] = []
|
||||
|
||||
if Path(dir_model + "/tokenizer.model").is_file():
|
||||
# vocab type sentencepiece
|
||||
print("gguf: get sentencepiece tokenizer vocab and scores")
|
||||
|
||||
tokenizer = SentencePieceProcessor(dir_model + "/tokenizer.model")
|
||||
|
||||
for i in range(tokenizer.vocab_size()):
|
||||
text: bytes
|
||||
score: float
|
||||
|
||||
piece = tokenizer.id_to_piece(i)
|
||||
text = piece.encode("utf-8")
|
||||
score = tokenizer.get_score(i)
|
||||
|
||||
toktype = 1 # defualt to normal token type
|
||||
if tokenizer.is_unknown(i):
|
||||
toktype = 2
|
||||
if tokenizer.is_control(i):
|
||||
toktype = 3
|
||||
|
||||
# toktype = 4 is user-defined = tokens from added_tokens.json
|
||||
|
||||
if tokenizer.is_unused(i):
|
||||
toktype = 5
|
||||
if tokenizer.is_byte(i):
|
||||
toktype = 6
|
||||
|
||||
tokens.append(text)
|
||||
scores.append(score)
|
||||
toktypes.append(toktype)
|
||||
|
||||
if Path(dir_model + "/added_tokens.json").is_file():
|
||||
with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f:
|
||||
addtokens_json = json.load(f)
|
||||
|
||||
print("gguf: get added tokens")
|
||||
|
||||
for key in addtokens_json:
|
||||
tokens.append( key.encode("utf-8") )
|
||||
scores.append(-1000.0)
|
||||
toktypes.append(4) # user-defined token type
|
||||
|
||||
gguf_writer.add_tokenizer_model("llama")
|
||||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
gguf_writer.add_token_types(toktypes)
|
||||
|
||||
|
||||
print("gguf: get special token ids")
|
||||
|
||||
if Path(dir_model + "/tokenizer.json").is_file():
|
||||
# Look for special tokens in tokenizer.json if it exists
|
||||
|
||||
with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
|
||||
tokenizer = json.load(f)
|
||||
|
||||
if "added_tokens" in tokenizer and Path(dir_model + "/tokenizer_config.json").is_file():
|
||||
|
||||
with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
|
||||
tokenizer_config = json.load(f)
|
||||
|
||||
if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] != None:
|
||||
for key in tokenizer["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["bos_token"]["content"]:
|
||||
gguf_writer.add_bos_token_id(key["id"])
|
||||
|
||||
if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] != None:
|
||||
for key in tokenizer["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["eos_token"]["content"]:
|
||||
gguf_writer.add_eos_token_id(key["id"])
|
||||
|
||||
if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] != None:
|
||||
for key in tokenizer["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["unk_token"]["content"]:
|
||||
gguf_writer.add_unk_token_id(key["id"])
|
||||
|
||||
if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] != None:
|
||||
for key in tokenizer["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["sep_token"]["content"]:
|
||||
gguf_writer.add_sep_token_id(key["id"])
|
||||
|
||||
if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] != None:
|
||||
for key in tokenizer["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["pad_token"]["content"]:
|
||||
gguf_writer.add_pad_token_id(key["id"])
|
||||
else:
|
||||
# If no tokenizer.json: Look for special tokens in config.json
|
||||
|
||||
if "bos_token_id" in hparams and hparams["bos_token_id"] != None:
|
||||
gguf_writer.add_bos_token_id(hparams["bos_token_id"])
|
||||
|
||||
if "eos_token_id" in hparams and hparams["eos_token_id"] != None:
|
||||
gguf_writer.add_eos_token_id(hparams["eos_token_id"])
|
||||
|
||||
if "unk_token_id" in hparams and hparams["unk_token_id"] != None:
|
||||
gguf_writer.add_unk_token_id(hparams["unk_token_id"])
|
||||
|
||||
if "sep_token_id" in hparams and hparams["sep_token_id"] != None:
|
||||
gguf_writer.add_sep_token_id(hparams["sep_token_id"])
|
||||
|
||||
if "pad_token_id" in hparams and hparams["pad_token_id"] != None:
|
||||
gguf_writer.add_pad_token_id(hparams["pad_token_id"])
|
||||
|
||||
|
||||
# TENSORS
|
||||
|
||||
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
|
||||
|
||||
# tensor info
|
||||
print("gguf: get tensor metadata")
|
||||
|
||||
part_names = (f"consolidated.{n:02}.pth" for n in range(0, num_parts))
|
||||
|
||||
for part_name in part_names:
|
||||
print("gguf: loading model part '" + part_name + "'")
|
||||
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
|
||||
|
||||
for name in model_part.keys():
|
||||
data = model_part[name]
|
||||
|
||||
# we don't need these
|
||||
if name == "rope.freqs":
|
||||
continue
|
||||
|
||||
old_dtype = data.dtype
|
||||
|
||||
# convert any unsupported data types to float32
|
||||
if data.dtype != torch.float16 and data.dtype != torch.float32:
|
||||
data = data.to(torch.float32)
|
||||
|
||||
data = data.squeeze().numpy()
|
||||
|
||||
# map tensor names
|
||||
if name.endswith(".weight") and name[:-7] in tensor_map:
|
||||
name = tensor_map[name[:-7]] + ".weight"
|
||||
elif name.endswith(".bias") and name[:-5] in tensor_map:
|
||||
name = tensor_map[name[:-5]] + ".bias"
|
||||
else:
|
||||
print("Can not map tensor '" + name + "'")
|
||||
sys.exit()
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
||||
# if f32 desired, convert any float16 to float32
|
||||
if ftype == 0 and data_dtype == np.float16:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
|
||||
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
|
||||
|
||||
gguf_writer.add_tensor(name, data)
|
||||
|
||||
|
||||
print("gguf: write header")
|
||||
gguf_writer.write_header_to_file()
|
||||
print("gguf: write metadata")
|
||||
gguf_writer.write_kv_data_to_file()
|
||||
print("gguf: write tensors")
|
||||
gguf_writer.write_tensors_to_file()
|
||||
|
||||
gguf_writer.close()
|
||||
|
||||
|
||||
print("gguf: model successfully exported to '" + fname_out + "'")
|
||||
print("")
|
||||
@@ -1,9 +1,17 @@
|
||||
#!/usr/bin/env python3
|
||||
import sys, struct, math, argparse
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import math
|
||||
import struct
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
|
||||
import os
|
||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
|
||||
import gguf
|
||||
|
||||
# Note: Does not support GGML_QKK_64
|
||||
@@ -72,10 +80,10 @@ class Vocab:
|
||||
class Tensor:
|
||||
def __init__(self):
|
||||
self.name = None
|
||||
self.dims = ()
|
||||
self.dims: tuple[int, ...] = ()
|
||||
self.dtype = None
|
||||
self.start_offset = 0
|
||||
self.len_bytes = 0
|
||||
self.len_bytes = np.int64(0)
|
||||
|
||||
def load(self, data, offset):
|
||||
orig_offset = offset
|
||||
@@ -119,7 +127,7 @@ class GGMLV3Model:
|
||||
offset += hp.load(data, offset)
|
||||
vocab = Vocab()
|
||||
offset += vocab.load(data, offset, hp.n_vocab)
|
||||
tensors = []
|
||||
tensors: list[Tensor] = []
|
||||
tensor_map = {}
|
||||
while offset < len(data):
|
||||
tensor = Tensor()
|
||||
@@ -134,13 +142,14 @@ class GGMLV3Model:
|
||||
return offset
|
||||
|
||||
class GGMLToGGUF:
|
||||
def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None):
|
||||
def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None, special_vocab = None):
|
||||
hp = ggml_model.hyperparameters
|
||||
self.model = ggml_model
|
||||
self.data = data
|
||||
self.cfg = cfg
|
||||
self.params_override = params_override
|
||||
self.vocab_override = vocab_override
|
||||
self.special_vocab = special_vocab
|
||||
if params_override is not None:
|
||||
n_kv_head = params_override.n_head_kv
|
||||
else:
|
||||
@@ -162,6 +171,8 @@ class GGMLToGGUF:
|
||||
gguf_writer = gguf.GGUFWriter(self.cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
|
||||
self.add_params(gguf_writer)
|
||||
self.add_vocab(gguf_writer)
|
||||
if self.special_vocab is not None:
|
||||
self.special_vocab.add_to_gguf(gguf_writer)
|
||||
self.add_tensors(gguf_writer)
|
||||
print(" gguf: write header")
|
||||
gguf_writer.write_header_to_file()
|
||||
@@ -259,20 +270,13 @@ class GGMLToGGUF:
|
||||
gguf_writer.add_eos_token_id(2)
|
||||
|
||||
def add_tensors(self, gguf_writer):
|
||||
nm = self.name_map
|
||||
tensor_map = self.name_map
|
||||
data = self.data
|
||||
print(f'* Adding {len(self.model.tensors)} tensor(s)')
|
||||
for tensor in self.model.tensors:
|
||||
name = str(tensor.name, 'UTF-8')
|
||||
if name.endswith('.weight'):
|
||||
name = name[:-7]
|
||||
suffix = '.weight'
|
||||
elif name.endswith('.bias'):
|
||||
name = name[:-5]
|
||||
suffix = '.bias'
|
||||
mapped_name = nm.get(name)
|
||||
mapped_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
|
||||
assert mapped_name is not None, f'Bad name {name}'
|
||||
mapped_name += suffix
|
||||
tempdims = list(tensor.dims[:])
|
||||
if len(tempdims) > 1:
|
||||
temp = tempdims[1]
|
||||
@@ -302,13 +306,15 @@ def handle_metadata(cfg, hp):
|
||||
else:
|
||||
raise ValueError('Unable to load metadata')
|
||||
vocab = convert.load_vocab(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir, cfg.vocabtype)
|
||||
# FIXME: Respect cfg.vocab_dir?
|
||||
svocab = gguf.SpecialVocab(cfg.model_metadata_dir)
|
||||
convert.check_vocab_size(params, vocab)
|
||||
return (params, vocab)
|
||||
return (params, vocab, svocab)
|
||||
|
||||
def handle_args():
|
||||
parser = argparse.ArgumentParser(description = 'Convert GGMLv3 models to GGUF')
|
||||
parser.add_argument('--input', '-i', type = Path, help = 'Input GGMLv3 filename')
|
||||
parser.add_argument('--output', '-o', type = Path, help ='Output GGUF filename')
|
||||
parser.add_argument('--input', '-i', type = Path, required = True, help = 'Input GGMLv3 filename')
|
||||
parser.add_argument('--output', '-o', type = Path, required = True, help ='Output GGUF filename')
|
||||
parser.add_argument('--name', help = 'Set model name')
|
||||
parser.add_argument('--desc', help = 'Set model description')
|
||||
parser.add_argument('--gqa', type = int, default = 1, help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
|
||||
@@ -330,14 +336,16 @@ def main():
|
||||
print(f'* GGML model hyperparameters: {model.hyperparameters}')
|
||||
vocab_override = None
|
||||
params_override = None
|
||||
special_vocab = None
|
||||
if cfg.model_metadata_dir is not None:
|
||||
(params_override, vocab_override) = handle_metadata(cfg, model.hyperparameters)
|
||||
(params_override, vocab_override, special_vocab) = handle_metadata(cfg, model.hyperparameters)
|
||||
print('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.')
|
||||
print(f'* Overriding params: {params_override}')
|
||||
print(f'* Overriding vocab: {vocab_override}')
|
||||
print(f'* Special vocab: {special_vocab}')
|
||||
else:
|
||||
print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
|
||||
converter = GGMLToGGUF(model, data, cfg, params_override = params_override, vocab_override = vocab_override)
|
||||
converter = GGMLToGGUF(model, data, cfg, params_override = params_override, vocab_override = vocab_override, special_vocab = special_vocab)
|
||||
converter.save()
|
||||
print(f'* Successful completion. Output saved to: {cfg.output}')
|
||||
|
||||
|
||||
@@ -1,328 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# HF llama --> gguf conversion
|
||||
|
||||
import gguf
|
||||
import os
|
||||
import sys
|
||||
import struct
|
||||
import json
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from typing import Any, List, Optional
|
||||
from pathlib import Path
|
||||
from sentencepiece import SentencePieceProcessor
|
||||
|
||||
#NDArray = np.ndarray[Any, Any]
|
||||
# compatible with python < 3.9
|
||||
NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
|
||||
|
||||
# reverse HF permute back to original pth layout
|
||||
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py
|
||||
|
||||
|
||||
def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: Optional[int] = None) -> NDArray:
|
||||
if n_kv_head is not None and n_head != n_kv_head:
|
||||
n_head //= n_kv_head
|
||||
|
||||
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
|
||||
.swapaxes(1, 2)
|
||||
.reshape(weights.shape))
|
||||
|
||||
|
||||
def count_model_parts(dir_model: str) -> int:
|
||||
num_parts = 0
|
||||
|
||||
for filename in os.listdir(dir_model):
|
||||
if filename.startswith("pytorch_model-"):
|
||||
num_parts += 1
|
||||
|
||||
if num_parts > 0:
|
||||
print("gguf: found " + str(num_parts) + " model parts")
|
||||
|
||||
return num_parts
|
||||
|
||||
|
||||
if len(sys.argv) < 3:
|
||||
print("Usage: convert-h5-to-ggml.py dir-model ftype\n")
|
||||
print(" ftype == 0 -> float32")
|
||||
print(" ftype == 1 -> float16")
|
||||
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
# output in the same directory as the model
|
||||
dir_model = sys.argv[1]
|
||||
last_dir = os.path.basename(os.path.normpath(dir_model))
|
||||
|
||||
|
||||
# possible tensor data types
|
||||
# ftype == 0 -> float32
|
||||
# ftype == 1 -> float16
|
||||
|
||||
|
||||
# map from ftype to string
|
||||
ftype_str = ["f32", "f16"]
|
||||
|
||||
ftype = 1
|
||||
if len(sys.argv) > 2:
|
||||
ftype = int(sys.argv[2])
|
||||
if ftype < 0 or ftype > 1:
|
||||
print("Invalid ftype: " + str(ftype))
|
||||
|
||||
sys.exit(1)
|
||||
|
||||
fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
|
||||
|
||||
print("gguf: loading model "+last_dir)
|
||||
|
||||
with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
|
||||
hparams = json.load(f)
|
||||
|
||||
if hparams["architectures"][0] != "LlamaForCausalLM":
|
||||
print("Model architecture not supported: " + hparams["architectures"][0])
|
||||
|
||||
sys.exit()
|
||||
|
||||
# get number of model parts
|
||||
num_parts = count_model_parts(dir_model)
|
||||
|
||||
ARCH=gguf.MODEL_ARCH.LLAMA
|
||||
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
|
||||
|
||||
print("gguf: get model metadata")
|
||||
|
||||
block_count = hparams["num_hidden_layers"]
|
||||
head_count = hparams["num_attention_heads"]
|
||||
|
||||
if "num_key_value_heads" in hparams:
|
||||
head_count_kv = hparams["num_key_value_heads"]
|
||||
else:
|
||||
head_count_kv = head_count
|
||||
|
||||
if "_name_or_path" in hparams:
|
||||
hf_repo = hparams["_name_or_path"]
|
||||
else:
|
||||
hf_repo = ""
|
||||
|
||||
if "max_sequence_length" in hparams:
|
||||
ctx_length = hparams["max_sequence_length"]
|
||||
elif "max_position_embeddings" in hparams:
|
||||
ctx_length = hparams["max_position_embeddings"]
|
||||
else:
|
||||
print("gguf: can not find ctx length parameter.")
|
||||
|
||||
sys.exit()
|
||||
|
||||
|
||||
gguf_writer.add_name(last_dir)
|
||||
gguf_writer.add_source_hf_repo(hf_repo)
|
||||
gguf_writer.add_tensor_data_layout("Meta AI original pth")
|
||||
gguf_writer.add_context_length(ctx_length)
|
||||
gguf_writer.add_embedding_length(hparams["hidden_size"])
|
||||
gguf_writer.add_block_count(block_count)
|
||||
gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
|
||||
gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
|
||||
gguf_writer.add_head_count(head_count)
|
||||
gguf_writer.add_head_count_kv(head_count_kv)
|
||||
gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
|
||||
|
||||
if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
|
||||
if "type" in hparams["rope_scaling"]:
|
||||
if hparams["rope_scaling"]["type"] == "linear":
|
||||
gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
|
||||
|
||||
|
||||
# TOKENIZATION
|
||||
|
||||
print("gguf: get tokenizer metadata")
|
||||
|
||||
tokens: List[bytes] = []
|
||||
scores: List[float] = []
|
||||
toktypes: List[int] = []
|
||||
|
||||
if Path(dir_model + "/tokenizer.model").is_file():
|
||||
# vocab type sentencepiece
|
||||
print("gguf: get sentencepiece tokenizer vocab, scores and token types")
|
||||
|
||||
tokenizer = SentencePieceProcessor(dir_model + "/tokenizer.model")
|
||||
|
||||
for i in range(tokenizer.vocab_size()):
|
||||
text: bytes
|
||||
score: float
|
||||
|
||||
piece = tokenizer.id_to_piece(i)
|
||||
text = piece.encode("utf-8")
|
||||
score = tokenizer.get_score(i)
|
||||
|
||||
toktype = 1 # defualt to normal token type
|
||||
if tokenizer.is_unknown(i):
|
||||
toktype = 2
|
||||
if tokenizer.is_control(i):
|
||||
toktype = 3
|
||||
|
||||
# toktype = 4 is user-defined = tokens from added_tokens.json
|
||||
|
||||
if tokenizer.is_unused(i):
|
||||
toktype = 5
|
||||
if tokenizer.is_byte(i):
|
||||
toktype = 6
|
||||
|
||||
tokens.append(text)
|
||||
scores.append(score)
|
||||
toktypes.append(toktype)
|
||||
|
||||
if Path(dir_model + "/added_tokens.json").is_file():
|
||||
with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f:
|
||||
addtokens_json = json.load(f)
|
||||
|
||||
print("gguf: get added tokens")
|
||||
|
||||
for key in addtokens_json:
|
||||
tokens.append( key.encode("utf-8") )
|
||||
scores.append(-1000.0)
|
||||
toktypes.append(4) # user-defined token type
|
||||
|
||||
|
||||
gguf_writer.add_tokenizer_model("llama")
|
||||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
gguf_writer.add_token_types(toktypes)
|
||||
|
||||
|
||||
print("gguf: get special token ids")
|
||||
|
||||
if Path(dir_model + "/tokenizer.json").is_file():
|
||||
# Look for special tokens in tokenizer.json if it exists
|
||||
|
||||
with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
|
||||
tokenizer = json.load(f)
|
||||
|
||||
if "added_tokens" in tokenizer and Path(dir_model + "/tokenizer_config.json").is_file():
|
||||
|
||||
with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
|
||||
tokenizer_config = json.load(f)
|
||||
|
||||
if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] != None:
|
||||
for key in tokenizer["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["bos_token"]["content"]:
|
||||
gguf_writer.add_bos_token_id(key["id"])
|
||||
|
||||
if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] != None:
|
||||
for key in tokenizer["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["eos_token"]["content"]:
|
||||
gguf_writer.add_eos_token_id(key["id"])
|
||||
|
||||
if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] != None:
|
||||
for key in tokenizer["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["unk_token"]["content"]:
|
||||
gguf_writer.add_unk_token_id(key["id"])
|
||||
|
||||
if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] != None:
|
||||
for key in tokenizer["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["sep_token"]["content"]:
|
||||
gguf_writer.add_sep_token_id(key["id"])
|
||||
|
||||
if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] != None:
|
||||
for key in tokenizer["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["pad_token"]["content"]:
|
||||
gguf_writer.add_pad_token_id(key["id"])
|
||||
else:
|
||||
# If no tokenizer.json: Look for special tokens in config.json
|
||||
|
||||
if "bos_token_id" in hparams and hparams["bos_token_id"] != None:
|
||||
gguf_writer.add_bos_token_id(hparams["bos_token_id"])
|
||||
|
||||
if "eos_token_id" in hparams and hparams["eos_token_id"] != None:
|
||||
gguf_writer.add_eos_token_id(hparams["eos_token_id"])
|
||||
|
||||
if "unk_token_id" in hparams and hparams["unk_token_id"] != None:
|
||||
gguf_writer.add_unk_token_id(hparams["unk_token_id"])
|
||||
|
||||
if "sep_token_id" in hparams and hparams["sep_token_id"] != None:
|
||||
gguf_writer.add_sep_token_id(hparams["sep_token_id"])
|
||||
|
||||
if "pad_token_id" in hparams and hparams["pad_token_id"] != None:
|
||||
gguf_writer.add_pad_token_id(hparams["pad_token_id"])
|
||||
|
||||
|
||||
# TENSORS
|
||||
|
||||
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
|
||||
|
||||
# tensor info
|
||||
print("gguf: get tensor metadata")
|
||||
|
||||
if num_parts == 0:
|
||||
part_names = ("pytorch_model.bin",)
|
||||
else:
|
||||
part_names = (
|
||||
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
|
||||
)
|
||||
|
||||
for part_name in part_names:
|
||||
print("gguf: loading model part '" + part_name + "'")
|
||||
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
|
||||
|
||||
for name in model_part.keys():
|
||||
data = model_part[name]
|
||||
|
||||
# we don't need these
|
||||
if name.endswith(".rotary_emb.inv_freq"):
|
||||
continue
|
||||
|
||||
old_dtype = data.dtype
|
||||
|
||||
# convert any unsupported data types to float32
|
||||
if data.dtype != torch.float16 and data.dtype != torch.float32:
|
||||
data = data.to(torch.float32)
|
||||
|
||||
data = data.squeeze().numpy()
|
||||
|
||||
# reverse permute these
|
||||
if name.endswith(".q_proj.weight"):
|
||||
data = reverse_hf_permute(data, head_count)
|
||||
if name.endswith(".k_proj.weight"):
|
||||
data = reverse_hf_permute(data, head_count, head_count_kv)
|
||||
|
||||
# map tensor names
|
||||
if name.endswith(".weight") and name[:-7] in tensor_map:
|
||||
name = tensor_map[name[:-7]] + ".weight"
|
||||
elif name.endswith(".bias") and name[:-5] in tensor_map:
|
||||
name = tensor_map[name[:-5]] + ".bias"
|
||||
else:
|
||||
print("Can not map tensor '" + name + "'")
|
||||
sys.exit()
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
||||
# if f32 desired, convert any float16 to float32
|
||||
if ftype == 0 and data_dtype == np.float16:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
|
||||
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
|
||||
|
||||
gguf_writer.add_tensor(name, data)
|
||||
|
||||
|
||||
print("gguf: write header")
|
||||
gguf_writer.write_header_to_file()
|
||||
print("gguf: write metadata")
|
||||
gguf_writer.write_kv_data_to_file()
|
||||
print("gguf: write tensors")
|
||||
gguf_writer.write_tensors_to_file()
|
||||
|
||||
gguf_writer.close()
|
||||
|
||||
|
||||
print("gguf: model successfully exported to '" + fname_out + "'")
|
||||
print("")
|
||||
@@ -1,15 +1,17 @@
|
||||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import struct
|
||||
import sys
|
||||
from typing import Any, Dict, Sequence, TextIO
|
||||
from typing import Any, BinaryIO, Sequence
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
NUMPY_TYPE_TO_FTYPE: Dict[str, int] = {"float32": 0, "float16": 1}
|
||||
NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}
|
||||
|
||||
|
||||
HF_SUBLAYER_TO_GGML = {
|
||||
@@ -46,7 +48,7 @@ def translate_tensor_name(t: str) -> str:
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def write_file_header(fout: TextIO, params: Dict[str, Any]) -> None:
|
||||
def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
|
||||
fout.write(b"ggla"[::-1]) # magic (ggml lora)
|
||||
fout.write(struct.pack("i", 1)) # file version
|
||||
fout.write(struct.pack("i", params["r"]))
|
||||
@@ -60,7 +62,7 @@ def write_file_header(fout: TextIO, params: Dict[str, Any]) -> None:
|
||||
|
||||
|
||||
def write_tensor_header(
|
||||
self, name: str, shape: Sequence[int], data_type: np.dtype
|
||||
self, name: str, shape: Sequence[int], data_type: np.dtype[Any]
|
||||
) -> None:
|
||||
sname = name.encode("utf-8")
|
||||
fout.write(
|
||||
|
||||
493
convert.py
493
convert.py
@@ -1,6 +1,6 @@
|
||||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import gguf
|
||||
import argparse
|
||||
import concurrent.futures
|
||||
import copy
|
||||
@@ -17,52 +17,99 @@ import re
|
||||
import signal
|
||||
import struct
|
||||
import sys
|
||||
import time
|
||||
import zipfile
|
||||
import numpy as np
|
||||
|
||||
from abc import ABCMeta, abstractmethod
|
||||
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar, Union)
|
||||
from sentencepiece import SentencePieceProcessor # type: ignore
|
||||
from typing import IO, TYPE_CHECKING, Any, Callable, Generator, Iterable, Literal, Sequence, TypeVar
|
||||
|
||||
import numpy as np
|
||||
from sentencepiece import SentencePieceProcessor # type: ignore[import]
|
||||
|
||||
import os
|
||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
|
||||
import gguf
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from typing_extensions import TypeAlias
|
||||
from typing import TypeAlias
|
||||
|
||||
if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
|
||||
faulthandler.register(signal.SIGUSR1)
|
||||
|
||||
NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
|
||||
NDArray: TypeAlias = 'np.ndarray[Any, Any]'
|
||||
|
||||
ARCH=gguf.MODEL_ARCH.LLAMA
|
||||
NAMES=gguf.MODEL_TENSOR_NAMES[ARCH]
|
||||
|
||||
DEFAULT_CONCURRENCY = 8
|
||||
#
|
||||
# data types
|
||||
#
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class UnquantizedDataType:
|
||||
class DataType:
|
||||
name: str
|
||||
dtype: np.dtype[Any]
|
||||
valid_conversions: list[str]
|
||||
|
||||
DT_F16 = UnquantizedDataType('F16')
|
||||
DT_F32 = UnquantizedDataType('F32')
|
||||
DT_I32 = UnquantizedDataType('I32')
|
||||
DT_BF16 = UnquantizedDataType('BF16')
|
||||
def elements_to_bytes(self, n_elements: int) -> int:
|
||||
return n_elements * self.dtype.itemsize
|
||||
|
||||
DataType = Union[UnquantizedDataType]
|
||||
@dataclass(frozen=True)
|
||||
class UnquantizedDataType(DataType):
|
||||
pass
|
||||
|
||||
DATA_TYPE_TO_NUMPY: Dict[DataType, 'np.dtype[Any]'] = {
|
||||
DT_BF16: np.dtype(np.uint16),
|
||||
DT_F16: np.dtype(np.float16),
|
||||
DT_F32: np.dtype(np.float32),
|
||||
DT_I32: np.dtype(np.int32),
|
||||
}
|
||||
DT_F16 = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0'])
|
||||
DT_F32 = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0'])
|
||||
DT_I32 = UnquantizedDataType('I32', dtype = np.dtype(np.int16), valid_conversions = [])
|
||||
DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0'])
|
||||
|
||||
NUMPY_TYPE_TO_DATA_TYPE: Dict['np.dtype[Any]', DataType] = \
|
||||
{dtype: data_type for (data_type, dtype) in DATA_TYPE_TO_NUMPY.items()}
|
||||
@dataclass(frozen=True)
|
||||
class QuantizedDataType(DataType):
|
||||
block_size: int
|
||||
quantized_dtype: np.dtype[Any]
|
||||
ggml_type: gguf.GGMLQuantizationType
|
||||
|
||||
SAFETENSORS_DATA_TYPES: Dict[str, DataType] = {
|
||||
def quantize(self, arr: NDArray) -> NDArray:
|
||||
raise NotImplementedError(f'Quantization for {self.name} not implemented')
|
||||
|
||||
def elements_to_bytes(self, n_elements: int) -> int:
|
||||
assert n_elements % self.block_size == 0, f'Invalid number of elements {n_elements} for {self.name} with block size {self.block_size}'
|
||||
return self.quantized_dtype.itemsize * (n_elements // self.block_size)
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class Q8_0QuantizedDataType(QuantizedDataType):
|
||||
# Mini Q8_0 quantization in Python!
|
||||
def quantize(self, arr: NDArray) -> NDArray:
|
||||
assert arr.size % self.block_size == 0 and arr.size != 0, f'Bad array size {arr.size}'
|
||||
assert arr.dtype == np.float32, f'Bad array type {arr.dtype}'
|
||||
n_blocks = arr.size // self.block_size
|
||||
blocks = arr.reshape((n_blocks, self.block_size))
|
||||
# Much faster implementation of block quantization contributed by @Cebtenzzre
|
||||
def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[tuple[Any, Any]]:
|
||||
d = abs(blocks).max(axis = 1) / np.float32(127)
|
||||
with np.errstate(divide = 'ignore'):
|
||||
qs = (blocks / d[:, None]).round()
|
||||
qs[d == 0] = 0
|
||||
yield from zip(d, qs)
|
||||
return np.fromiter(quantize_blocks_q8_0(blocks), count = n_blocks, dtype = self.quantized_dtype)
|
||||
|
||||
DT_Q8_0 = Q8_0QuantizedDataType('Q8_0',
|
||||
dtype = np.dtype(np.float32), valid_conversions = [],
|
||||
ggml_type = gguf.GGMLQuantizationType.Q8_0, block_size = 32,
|
||||
quantized_dtype = np.dtype([('d', '<f2'), ('qs', 'i1', (32,))]))
|
||||
|
||||
# Quantized types skipped here because they may also map to np.float32
|
||||
NUMPY_TYPE_TO_DATA_TYPE: dict[np.dtype[Any], DataType] = {}
|
||||
for dt in (DT_BF16, DT_F16, DT_F32, DT_I32):
|
||||
if dt.dtype in NUMPY_TYPE_TO_DATA_TYPE:
|
||||
raise ValueError(f'Invalid duplicate data type {dt}')
|
||||
NUMPY_TYPE_TO_DATA_TYPE[dt.dtype] = dt
|
||||
|
||||
SAFETENSORS_DATA_TYPES: dict[str, DataType] = {
|
||||
'BF16': DT_BF16,
|
||||
'F16': DT_F16,
|
||||
'F32': DT_F32,
|
||||
@@ -73,20 +120,22 @@ SAFETENSORS_DATA_TYPES: Dict[str, DataType] = {
|
||||
# TODO: rename to LLAMAFileType
|
||||
# TODO: move to `gguf.py`
|
||||
class GGMLFileType(enum.IntEnum):
|
||||
AllF32 = 0
|
||||
MostlyF16 = 1 # except 1d tensors
|
||||
AllF32 = 0
|
||||
MostlyF16 = 1 # except 1d tensors
|
||||
MostlyQ8_0 = 7 # except 1d tensors
|
||||
|
||||
def type_for_tensor(self, name: str, tensor: 'LazyTensor') -> DataType:
|
||||
if len(tensor.shape) == 1:
|
||||
# 1D tensors are always F32.
|
||||
return DT_F32
|
||||
elif self == GGMLFileType.AllF32:
|
||||
return DT_F32
|
||||
elif self == GGMLFileType.MostlyF16:
|
||||
return DT_F16
|
||||
else:
|
||||
def type_for_tensor(self, name: str, tensor: LazyTensor) -> DataType:
|
||||
dt = GGML_FILE_TYPE_TO_DATA_TYPE.get(self)
|
||||
if dt is None:
|
||||
raise ValueError(self)
|
||||
# 1D tensors are always F32.
|
||||
return dt if len(tensor.shape) > 1 else DT_F32
|
||||
|
||||
GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
|
||||
GGMLFileType.AllF32 : DT_F32,
|
||||
GGMLFileType.MostlyF16 : DT_F16,
|
||||
GGMLFileType.MostlyQ8_0: DT_Q8_0,
|
||||
}
|
||||
|
||||
#
|
||||
# hparams loading
|
||||
@@ -104,13 +153,13 @@ class Params:
|
||||
n_head_kv: int
|
||||
f_norm_eps: float
|
||||
|
||||
f_rope_freq_base: Optional[float] = None
|
||||
f_rope_scale: Optional[float] = None
|
||||
f_rope_freq_base: float | None = None
|
||||
f_rope_scale: float | None = None
|
||||
|
||||
ftype: Optional[GGMLFileType] = None
|
||||
ftype: GGMLFileType | None = None
|
||||
|
||||
# path to the directory containing the model files
|
||||
path_model: Optional['Path'] = None
|
||||
path_model: Path | None = None
|
||||
|
||||
@staticmethod
|
||||
def find_n_mult(n_ff: int, n_embd: int) -> int:
|
||||
@@ -122,7 +171,7 @@ class Params:
|
||||
raise Exception(f"failed to find n_mult for (n_ff={n_ff}, n_embd={n_embd}).")
|
||||
|
||||
@staticmethod
|
||||
def guessed(model: 'LazyModel') -> 'Params':
|
||||
def guessed(model: LazyModel) -> Params:
|
||||
# try transformer naming first
|
||||
n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape
|
||||
|
||||
@@ -158,7 +207,7 @@ class Params:
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
|
||||
def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
|
||||
config = json.load(open(config_path))
|
||||
|
||||
n_vocab = config["vocab_size"]
|
||||
@@ -203,7 +252,7 @@ class Params:
|
||||
# LLaMA v2 70B params.json
|
||||
# {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1
|
||||
@staticmethod
|
||||
def loadOriginalParamsJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
|
||||
def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
|
||||
config = json.load(open(config_path))
|
||||
|
||||
n_vocab = config["vocab_size"] if "vocab_size" in config else -1
|
||||
@@ -247,7 +296,7 @@ class Params:
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def load(model_plus: 'ModelPlus') -> 'Params':
|
||||
def load(model_plus: ModelPlus) -> Params:
|
||||
hf_config_path = model_plus.paths[0].parent / "config.json"
|
||||
orig_config_path = model_plus.paths[0].parent / "params.json"
|
||||
|
||||
@@ -255,8 +304,10 @@ class Params:
|
||||
params = Params.loadHFTransformerJson(model_plus.model, hf_config_path)
|
||||
elif orig_config_path.exists():
|
||||
params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path)
|
||||
else:
|
||||
elif model_plus.format != 'none':
|
||||
params = Params.guessed(model_plus.model)
|
||||
else:
|
||||
raise ValueError('Cannot guess params when model format is none')
|
||||
|
||||
params.path_model = model_plus.paths[0].parent
|
||||
|
||||
@@ -268,19 +319,31 @@ class Params:
|
||||
#
|
||||
|
||||
class BpeVocab:
|
||||
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
|
||||
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
|
||||
self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
|
||||
added_tokens: Dict[str, int]
|
||||
added_tokens: dict[str, int]
|
||||
if fname_added_tokens is not None:
|
||||
# FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
|
||||
added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
|
||||
else:
|
||||
added_tokens = {}
|
||||
# Fall back to trying to find the added tokens in tokenizer.json
|
||||
tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json'
|
||||
if not tokenizer_json_file.is_file():
|
||||
added_tokens = {}
|
||||
else:
|
||||
tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8"))
|
||||
added_tokens = dict(
|
||||
(item['content'], item['id'])
|
||||
for item in tokenizer_json.get('added_tokens', [])
|
||||
# Added tokens here can be duplicates of the main vocabulary.
|
||||
if item['content'] not in self.bpe_tokenizer )
|
||||
|
||||
vocab_size: int = len(self.bpe_tokenizer)
|
||||
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
|
||||
actual_ids = sorted(added_tokens.values())
|
||||
if expected_ids != actual_ids:
|
||||
raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")
|
||||
expected_end_id = vocab_size + len(actual_ids) - 1
|
||||
raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}")
|
||||
|
||||
items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
|
||||
self.added_tokens_list = [text for (text, idx) in items]
|
||||
@@ -289,33 +352,45 @@ class BpeVocab:
|
||||
self.fname_tokenizer = fname_tokenizer
|
||||
self.fname_added_tokens = fname_added_tokens
|
||||
|
||||
def bpe_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]:
|
||||
def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||
tokenizer = self.bpe_tokenizer
|
||||
from transformers.models.gpt2 import tokenization_gpt2
|
||||
from transformers.models.gpt2 import tokenization_gpt2 # type: ignore[import]
|
||||
byte_encoder = tokenization_gpt2.bytes_to_unicode()
|
||||
byte_decoder = {v: k for k, v in byte_encoder.items()}
|
||||
score = 0.0
|
||||
for i, item in enumerate(tokenizer):
|
||||
text: bytes = item.encode("utf-8")
|
||||
score: float = -i
|
||||
yield text, score, gguf.TokenType.USER_DEFINED
|
||||
# FIXME: These shouldn't be hardcoded, but it's probably better than the current behavior?
|
||||
if i <= 258 and text.startswith(b'<') and text.endswith(b'>'):
|
||||
if i == 0 and text == b'<unk>':
|
||||
toktype = gguf.TokenType.UNKNOWN
|
||||
elif i == 1 or i == 2:
|
||||
toktype = gguf.TokenType.CONTROL
|
||||
elif i >= 3 and text.startswith(b'<0x'):
|
||||
toktype = gguf.TokenType.BYTE
|
||||
else:
|
||||
toktype = gguf.TokenType.NORMAL
|
||||
else:
|
||||
toktype = gguf.TokenType.NORMAL
|
||||
yield text, score, toktype
|
||||
|
||||
def added_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]:
|
||||
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||
for text in self.added_tokens_list:
|
||||
score = -1000.0
|
||||
yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
|
||||
|
||||
def all_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]:
|
||||
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||
yield from self.bpe_tokens()
|
||||
yield from self.added_tokens()
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
|
||||
return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
|
||||
|
||||
|
||||
class SentencePieceVocab:
|
||||
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
|
||||
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
|
||||
self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
|
||||
added_tokens: Dict[str, int]
|
||||
added_tokens: dict[str, int]
|
||||
if fname_added_tokens is not None:
|
||||
added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
|
||||
else:
|
||||
@@ -334,7 +409,7 @@ class SentencePieceVocab:
|
||||
self.fname_tokenizer = fname_tokenizer
|
||||
self.fname_added_tokens = fname_added_tokens
|
||||
|
||||
def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]:
|
||||
def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||
tokenizer = self.sentencepiece_tokenizer
|
||||
for i in range(tokenizer.vocab_size()):
|
||||
piece = tokenizer.id_to_piece(i)
|
||||
@@ -358,20 +433,19 @@ class SentencePieceVocab:
|
||||
|
||||
yield text, score, toktype
|
||||
|
||||
def added_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]:
|
||||
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||
for text in self.added_tokens_list:
|
||||
score = -1000.0
|
||||
yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
|
||||
|
||||
def all_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]:
|
||||
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||
yield from self.sentencepiece_tokens()
|
||||
yield from self.added_tokens()
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
|
||||
|
||||
Vocab = Union[BpeVocab, SentencePieceVocab]
|
||||
|
||||
Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab'
|
||||
|
||||
#
|
||||
# data loading
|
||||
@@ -391,18 +465,18 @@ class Tensor(metaclass=ABCMeta):
|
||||
data_type: DataType
|
||||
|
||||
@abstractmethod
|
||||
def astype(self, data_type: DataType) -> 'Tensor': ...
|
||||
def astype(self, data_type: DataType) -> Tensor: ...
|
||||
@abstractmethod
|
||||
def permute(self, n_head: int, n_head_kv: int) -> 'Tensor': ...
|
||||
def permute(self, n_head: int, n_head_kv: int) -> Tensor: ...
|
||||
@abstractmethod
|
||||
def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': ...
|
||||
def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor: ...
|
||||
@abstractmethod
|
||||
def part(self, n_part: int) -> 'UnquantizedTensor': ...
|
||||
def part(self, n_part: int) -> UnquantizedTensor: ...
|
||||
@abstractmethod
|
||||
def to_ggml(self) -> 'GGMLCompatibleTensor': ...
|
||||
def to_ggml(self) -> GGMLCompatibleTensor: ...
|
||||
|
||||
|
||||
def bf16_to_fp32(bf16_arr: np.ndarray) -> np.ndarray:
|
||||
def bf16_to_fp32(bf16_arr: np.ndarray[Any, np.dtype[np.uint16]]) -> NDArray:
|
||||
assert bf16_arr.dtype == np.uint16, f"Input array should be of dtype uint16, but got {bf16_arr.dtype}"
|
||||
fp32_arr = bf16_arr.astype(np.uint32) << 16
|
||||
return fp32_arr.view(np.float32)
|
||||
@@ -415,27 +489,27 @@ class UnquantizedTensor(Tensor):
|
||||
self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]
|
||||
|
||||
def astype(self, data_type: DataType) -> Tensor:
|
||||
dtype = DATA_TYPE_TO_NUMPY[data_type]
|
||||
dtype = data_type.dtype
|
||||
if self.data_type == DT_BF16:
|
||||
self.ndarray = bf16_to_fp32(self.ndarray)
|
||||
return UnquantizedTensor(self.ndarray.astype(dtype))
|
||||
|
||||
def to_ggml(self) -> 'UnquantizedTensor':
|
||||
def to_ggml(self) -> UnquantizedTensor:
|
||||
return self
|
||||
|
||||
def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor':
|
||||
def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor:
|
||||
r = self.ndarray.shape[0] // 3
|
||||
return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head))
|
||||
return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head, n_head_kv))
|
||||
|
||||
def part(self, n_part: int) -> 'UnquantizedTensor':
|
||||
def part(self, n_part: int) -> UnquantizedTensor:
|
||||
r = self.ndarray.shape[0] // 3
|
||||
return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])
|
||||
|
||||
def permute(self, n_head: int, n_head_kv: int) -> 'UnquantizedTensor':
|
||||
def permute(self, n_head: int, n_head_kv: int) -> UnquantizedTensor:
|
||||
return UnquantizedTensor(permute(self.ndarray, n_head, n_head_kv))
|
||||
|
||||
|
||||
def load_unquantized(lazy_tensor: 'LazyTensor', expected_dtype: Any = None, convert: bool = False) -> NDArray:
|
||||
def load_unquantized(lazy_tensor: LazyTensor, expected_dtype: Any = None, convert: bool = False) -> NDArray:
|
||||
tensor = lazy_tensor.load()
|
||||
assert isinstance(tensor, UnquantizedTensor)
|
||||
|
||||
@@ -451,38 +525,24 @@ def load_unquantized(lazy_tensor: 'LazyTensor', expected_dtype: Any = None, conv
|
||||
return tensor.ndarray
|
||||
|
||||
|
||||
GGMLCompatibleTensor = Union[UnquantizedTensor]
|
||||
|
||||
|
||||
class DeferredPermutedTensor(Tensor):
|
||||
def __init__(self, base: Tensor, n_head: int, n_head_kv: int) -> None:
|
||||
self.base = base
|
||||
self.n_head = n_head
|
||||
self.data_type = self.base.data_type
|
||||
|
||||
def astype(self, data_type: DataType) -> Tensor:
|
||||
return self.base.astype(data_type).permute(self.n_head, self.n_head_kv)
|
||||
|
||||
def to_ggml(self) -> GGMLCompatibleTensor:
|
||||
return self.base.to_ggml().permute(self.n_head, self.n_head_kv)
|
||||
|
||||
def permute(self, n_head: int, n_head_kv: int) -> Tensor:
|
||||
raise Exception("shouldn't permute twice")
|
||||
GGMLCompatibleTensor = UnquantizedTensor
|
||||
|
||||
|
||||
@dataclass
|
||||
class LazyTensor:
|
||||
_load: Callable[[], Tensor]
|
||||
shape: List[int]
|
||||
shape: list[int]
|
||||
data_type: DataType
|
||||
description: str
|
||||
|
||||
def load(self) -> Tensor:
|
||||
ret = self._load()
|
||||
assert ret.data_type == self.data_type, (self.data_type, ret.data_type, self.description)
|
||||
# Should be okay if it maps to the same numpy type?
|
||||
assert ret.data_type == self.data_type or (self.data_type.dtype == ret.data_type.dtype), \
|
||||
(self.data_type, ret.data_type, self.description)
|
||||
return ret
|
||||
|
||||
def astype(self, data_type: DataType) -> 'LazyTensor':
|
||||
def astype(self, data_type: DataType) -> LazyTensor:
|
||||
self.validate_conversion_to(data_type)
|
||||
|
||||
def load() -> Tensor:
|
||||
@@ -490,28 +550,28 @@ class LazyTensor:
|
||||
return LazyTensor(load, self.shape, data_type, f'convert({data_type}) {self.description}')
|
||||
|
||||
def validate_conversion_to(self, data_type: DataType) -> None:
|
||||
if data_type == self.data_type:
|
||||
return
|
||||
if data_type != self.data_type and data_type.name not in self.data_type.valid_conversions:
|
||||
raise ValueError(f'Cannot validate conversion from {self.data_type} to {data_type}.')
|
||||
|
||||
|
||||
LazyModel = Dict[str, LazyTensor]
|
||||
LazyModel: TypeAlias = 'dict[str, LazyTensor]'
|
||||
|
||||
|
||||
@dataclass
|
||||
class ModelPlus:
|
||||
model: LazyModel
|
||||
paths: List[Path] # Where this was read from.
|
||||
format: Literal['ggml', 'torch', 'safetensors']
|
||||
vocab: Optional[Vocab] # For GGML models (which have vocab built in), the vocab.
|
||||
paths: list[Path] # Where this was read from.
|
||||
format: Literal['ggml', 'torch', 'safetensors', 'none']
|
||||
vocab: Vocab | None # For GGML models (which have vocab built in), the vocab.
|
||||
|
||||
|
||||
def merge_sharded(models: List[LazyModel]) -> LazyModel:
|
||||
def merge_sharded(models: list[LazyModel]) -> LazyModel:
|
||||
# Original LLaMA models have each file contain one part of each tensor.
|
||||
# Use a dict instead of a set to preserve order.
|
||||
names = {name: None for model in models for name in model}
|
||||
|
||||
def convert(name: str) -> LazyTensor:
|
||||
lazy_tensors: List[LazyTensor] = [model[name] for model in models]
|
||||
lazy_tensors: list[LazyTensor] = [model[name] for model in models]
|
||||
if len(lazy_tensors) == 1:
|
||||
# only one file; don't go through this procedure since there might
|
||||
# be quantized tensors
|
||||
@@ -539,7 +599,7 @@ def merge_sharded(models: List[LazyModel]) -> LazyModel:
|
||||
return {name: convert(name) for name in names}
|
||||
|
||||
|
||||
def merge_multifile_models(models_plus: List[ModelPlus]) -> ModelPlus:
|
||||
def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
|
||||
formats = set(mp.format for mp in models_plus)
|
||||
assert len(formats) == 1, "different formats?"
|
||||
format = formats.pop()
|
||||
@@ -567,12 +627,12 @@ def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTe
|
||||
return lazy_tensor.load().permute(n_head, n_head_kv)
|
||||
return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)
|
||||
|
||||
def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int) -> LazyTensor:
|
||||
def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_kv: int) -> LazyTensor:
|
||||
def load() -> Tensor:
|
||||
return lazy_tensor.load().permute_part(n_part, n_head)
|
||||
return lazy_tensor.load().permute_part(n_part, n_head, n_head_kv)
|
||||
s = lazy_tensor.shape.copy()
|
||||
s[0] = s[0] // 3
|
||||
return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)
|
||||
return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)
|
||||
|
||||
def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
|
||||
def load() -> Tensor:
|
||||
@@ -617,9 +677,7 @@ class LazyUnpickler(pickle.Unpickler):
|
||||
info = self.zip_file.getinfo(filename)
|
||||
|
||||
def load(offset: int, elm_count: int) -> NDArray:
|
||||
dtype = DATA_TYPE_TO_NUMPY.get(data_type)
|
||||
if dtype is None:
|
||||
raise Exception("tensor stored in unsupported format")
|
||||
dtype = data_type.dtype
|
||||
fp = self.zip_file.open(info)
|
||||
fp.seek(offset * dtype.itemsize)
|
||||
size = elm_count * dtype.itemsize
|
||||
@@ -629,7 +687,7 @@ class LazyUnpickler(pickle.Unpickler):
|
||||
description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
|
||||
return LazyStorage(load=load, kind=pid[1], description=description)
|
||||
|
||||
# @staticmethod
|
||||
@staticmethod
|
||||
def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,
|
||||
# pyright: ignore[reportSelfClsParameterName]
|
||||
requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
|
||||
@@ -641,13 +699,15 @@ class LazyUnpickler(pickle.Unpickler):
|
||||
description = f'pickled storage_offset={storage_offset} in {storage.description}'
|
||||
return LazyTensor(load, list(size), storage.kind.data_type, description)
|
||||
|
||||
# @staticmethod
|
||||
@staticmethod
|
||||
def rebuild_from_type_v2(func, new_type, args, state):
|
||||
return func(*args)
|
||||
|
||||
CLASSES: Dict[Any, Any] = {
|
||||
('torch._tensor', '_rebuild_from_type_v2'): rebuild_from_type_v2,
|
||||
('torch._utils', '_rebuild_tensor_v2'): lazy_rebuild_tensor_v2,
|
||||
CLASSES: dict[tuple[str, str], Any] = {
|
||||
# getattr used here as a workaround for mypy not being smart enough to detrmine
|
||||
# the staticmethods have a __func__ attribute.
|
||||
('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
|
||||
('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'),
|
||||
('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16),
|
||||
('torch', 'HalfStorage'): LazyStorageKind(DT_F16),
|
||||
('torch', 'FloatStorage'): LazyStorageKind(DT_F32),
|
||||
@@ -676,15 +736,15 @@ def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus:
|
||||
|
||||
def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus:
|
||||
header_size, = struct.unpack('<Q', fp.read(8))
|
||||
header: Dict[str, Dict[str, Any]] = json.loads(fp.read(header_size))
|
||||
header: dict[str, dict[str, Any]] = json.loads(fp.read(header_size))
|
||||
# Use mmap for the actual data to avoid race conditions with the file offset.
|
||||
mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ))
|
||||
byte_buf = mapped[8 + header_size:]
|
||||
|
||||
def convert(info: Dict[str, Any]) -> LazyTensor:
|
||||
def convert(info: dict[str, Any]) -> LazyTensor:
|
||||
data_type = SAFETENSORS_DATA_TYPES[info['dtype']]
|
||||
numpy_dtype = DATA_TYPE_TO_NUMPY[data_type]
|
||||
shape: List[int] = info['shape']
|
||||
numpy_dtype = data_type.dtype
|
||||
shape: list[int] = info['shape']
|
||||
begin, end = info['data_offsets']
|
||||
assert 0 <= begin <= end <= len(byte_buf)
|
||||
assert end - begin == math.prod(shape) * numpy_dtype.itemsize
|
||||
@@ -723,23 +783,40 @@ def lazy_load_file(path: Path) -> ModelPlus:
|
||||
In = TypeVar('In')
|
||||
Out = TypeVar('Out')
|
||||
|
||||
def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int) -> Iterable[Out]:
|
||||
def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: int | None = None, use_processpool_executor: bool = False) -> Iterable[Out]:
|
||||
'''Parallel map, but with backpressure. If the caller doesn't call `next`
|
||||
fast enough, this will stop calling `func` at some point rather than
|
||||
letting results pile up in memory. Specifically, there is a max of one
|
||||
output value buffered per thread.'''
|
||||
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||
futures: List[concurrent.futures.Future[Out]] = []
|
||||
items_rev = list(iterable)[::-1]
|
||||
for i in range(min(concurrency, len(items_rev))):
|
||||
futures.append(executor.submit(func, items_rev.pop()))
|
||||
if concurrency < 2:
|
||||
yield from map(func, iterable)
|
||||
# Not reached.
|
||||
iterable = iter(iterable)
|
||||
executor_class: type[ThreadPoolExecutor] | type[ProcessPoolExecutor]
|
||||
if use_processpool_executor:
|
||||
executor_class = ProcessPoolExecutor
|
||||
else:
|
||||
executor_class = ThreadPoolExecutor
|
||||
with executor_class(max_workers = max_workers) as executor:
|
||||
futures: list[concurrent.futures.Future[Out]] = []
|
||||
done = False
|
||||
for _ in range(concurrency):
|
||||
try:
|
||||
futures.append(executor.submit(func, next(iterable)))
|
||||
except StopIteration:
|
||||
done = True
|
||||
break
|
||||
|
||||
while futures:
|
||||
result = futures.pop(0).result()
|
||||
if items_rev:
|
||||
futures.append(executor.submit(func, items_rev.pop()))
|
||||
while not done and len(futures) < concurrency:
|
||||
try:
|
||||
futures.append(executor.submit(func, next(iterable)))
|
||||
except StopIteration:
|
||||
done = True
|
||||
break
|
||||
yield result
|
||||
|
||||
|
||||
def check_vocab_size(params: Params, vocab: Vocab) -> None:
|
||||
if params.n_vocab != vocab.vocab_size:
|
||||
assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab)
|
||||
@@ -763,10 +840,12 @@ class OutputFile:
|
||||
|
||||
def add_meta_arch(self, params: Params) -> None:
|
||||
name = "LLaMA"
|
||||
|
||||
# TODO: better logic to determine model name
|
||||
if (params.n_ctx == 4096):
|
||||
name = "LLaMA v2"
|
||||
if params.path_model:
|
||||
name = str(params.path_model.parent).split('/')[-1]
|
||||
elif params.path_model:
|
||||
name = str(params.path_model.parent).split('/')[-1]
|
||||
|
||||
self.gguf.add_name (name)
|
||||
self.gguf.add_context_length (params.n_ctx)
|
||||
@@ -791,25 +870,31 @@ class OutputFile:
|
||||
tokens = []
|
||||
scores = []
|
||||
toktypes = []
|
||||
# NOTE: `all_tokens` returns the the base vocabulary and added tokens
|
||||
# TODO: add special tokens?
|
||||
# NOTE: `all_tokens` returns the base vocabulary and added tokens
|
||||
for text, score, toktype in vocab.all_tokens():
|
||||
tokens.append(text)
|
||||
scores.append(score)
|
||||
toktypes.append(toktype)
|
||||
|
||||
self.gguf.add_tokenizer_model("llama")
|
||||
if isinstance(vocab, SentencePieceVocab):
|
||||
self.gguf.add_tokenizer_model("llama")
|
||||
elif isinstance(vocab, BpeVocab):
|
||||
self.gguf.add_tokenizer_model("gpt2")
|
||||
else:
|
||||
raise ValueError(f'Unknown vocab type: Not BpeVocab or SentencePieceVocab')
|
||||
self.gguf.add_token_list(tokens)
|
||||
self.gguf.add_token_scores(scores)
|
||||
self.gguf.add_token_types(toktypes)
|
||||
|
||||
def add_meta_special_vocab(self, svocab: gguf.SpecialVocab) -> None:
|
||||
svocab.add_to_gguf(self.gguf)
|
||||
|
||||
def add_tensor_info(self, name: str, tensor: LazyTensor) -> None:
|
||||
n_elements = 1
|
||||
for dim in tensor.shape:
|
||||
n_elements *= dim
|
||||
data_type = DATA_TYPE_TO_NUMPY[tensor.data_type]
|
||||
data_nbytes = n_elements * data_type.itemsize
|
||||
self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes)
|
||||
n_elements = int(np.prod(tensor.shape))
|
||||
raw_dtype = getattr(tensor.data_type, 'ggml_type', None)
|
||||
data_type = getattr(tensor.data_type, 'quantized_type', None) or tensor.data_type.dtype
|
||||
data_nbytes = tensor.data_type.elements_to_bytes(n_elements)
|
||||
self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes, raw_dtype = raw_dtype)
|
||||
|
||||
def write_meta(self) -> None:
|
||||
self.gguf.write_header_to_file()
|
||||
@@ -822,7 +907,7 @@ class OutputFile:
|
||||
self.gguf.close()
|
||||
|
||||
@staticmethod
|
||||
def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab) -> None:
|
||||
def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab) -> None:
|
||||
check_vocab_size(params, vocab)
|
||||
|
||||
of = OutputFile(fname_out)
|
||||
@@ -830,12 +915,27 @@ class OutputFile:
|
||||
# meta data
|
||||
of.add_meta_arch(params)
|
||||
of.add_meta_vocab(vocab)
|
||||
of.add_meta_special_vocab(svocab)
|
||||
|
||||
of.write_meta()
|
||||
|
||||
of.close()
|
||||
|
||||
@staticmethod
|
||||
def write_all(fname_out: Path, params: Params, model: LazyModel, vocab: Vocab) -> None:
|
||||
def do_item(item: tuple[str, LazyTensor]) -> tuple[DataType, NDArray]:
|
||||
name, lazy_tensor = item
|
||||
tensor = lazy_tensor.load().to_ggml()
|
||||
return (lazy_tensor.data_type, tensor.ndarray)
|
||||
|
||||
@staticmethod
|
||||
def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray:
|
||||
dt, arr = item
|
||||
if not isinstance(dt, QuantizedDataType):
|
||||
return arr
|
||||
return dt.quantize(arr)
|
||||
|
||||
@staticmethod
|
||||
def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY) -> None:
|
||||
check_vocab_size(params, vocab)
|
||||
|
||||
of = OutputFile(fname_out)
|
||||
@@ -843,6 +943,7 @@ class OutputFile:
|
||||
# meta data
|
||||
of.add_meta_arch(params)
|
||||
of.add_meta_vocab(vocab)
|
||||
of.add_meta_special_vocab(svocab)
|
||||
|
||||
# tensor info
|
||||
for name, lazy_tensor in model.items():
|
||||
@@ -851,27 +952,32 @@ class OutputFile:
|
||||
of.write_meta()
|
||||
of.write_tensor_info()
|
||||
|
||||
def do_item(item: Tuple[str, LazyTensor]) -> NDArray:
|
||||
name, lazy_tensor = item
|
||||
return lazy_tensor.load().to_ggml().ndarray
|
||||
|
||||
# tensor data
|
||||
ndarrays = bounded_parallel_map(do_item, model.items(), concurrency=8)
|
||||
ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency)
|
||||
if ftype == GGMLFileType.MostlyQ8_0:
|
||||
ndarrays = bounded_parallel_map(OutputFile.maybe_do_quantize, ndarrays_inner, concurrency = concurrency, max_workers = concurrency, use_processpool_executor = True)
|
||||
else:
|
||||
ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
|
||||
|
||||
start = time.time()
|
||||
for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
|
||||
elapsed = time.time() - start
|
||||
size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
|
||||
padi = len(str(len(model)))
|
||||
print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type}")
|
||||
print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}")
|
||||
of.gguf.write_tensor_data(ndarray)
|
||||
|
||||
of.close()
|
||||
|
||||
def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFileType:
|
||||
def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
|
||||
wq_type = model[NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0)+".weight"].data_type
|
||||
|
||||
if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
|
||||
return GGMLFileType.AllF32
|
||||
if output_type_str == "f16" or (output_type_str is None and wq_type in (DT_F16, DT_BF16)):
|
||||
return GGMLFileType.MostlyF16
|
||||
if output_type_str == "q8_0":
|
||||
return GGMLFileType.MostlyQ8_0
|
||||
|
||||
name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()}
|
||||
|
||||
@@ -882,7 +988,8 @@ def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyM
|
||||
for (name, tensor) in model.items()}
|
||||
|
||||
def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
|
||||
tmap = gguf.get_tensor_name_map(ARCH, params.n_layer)
|
||||
tmap = gguf.TensorNameMap(ARCH, params.n_layer)
|
||||
should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))
|
||||
|
||||
tmp = model
|
||||
|
||||
@@ -898,37 +1005,31 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
|
||||
tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head)
|
||||
tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head, params.n_head_kv)
|
||||
tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy (model[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
|
||||
del tmp[f"model.layers.{i}.self_attn.W_pack.weight"]
|
||||
else:
|
||||
break
|
||||
|
||||
out: LazyModel = {}
|
||||
for name, lazy_tensor in model.items():
|
||||
name_new = name
|
||||
|
||||
if name in tmap:
|
||||
name_new = tmap[name]
|
||||
elif name.endswith(".weight") and name[:-7] in tmap:
|
||||
name_new = tmap[name[:-7]] + ".weight"
|
||||
elif name.endswith(".bias") and name[:-5] in tmap:
|
||||
name_new = tmap[name[:-5]] + ".bias"
|
||||
else:
|
||||
tensor_type, name_new = tmap.get_type_and_name(name, try_suffixes = (".weight", ".bias")) or (None, None)
|
||||
if name_new is None:
|
||||
raise Exception(f"Unexpected tensor name: {name}")
|
||||
|
||||
if gguf.should_skip_tensor_TMP(ARCH, params.n_layer, name_new):
|
||||
if tensor_type in should_skip:
|
||||
print(f"skipping tensor {name_new}")
|
||||
continue
|
||||
else:
|
||||
print(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type} | {lazy_tensor.shape}")
|
||||
out[name_new] = lazy_tensor
|
||||
|
||||
print(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}")
|
||||
out[name_new] = lazy_tensor
|
||||
|
||||
return out
|
||||
|
||||
def nth_multifile_path(path: Path, n: int) -> Optional[Path]:
|
||||
def nth_multifile_path(path: Path, n: int) -> Path | None:
|
||||
'''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
|
||||
the nth path in the model.
|
||||
'''
|
||||
# Support the following patterns:
|
||||
patterns: List[Tuple[str, str]] = [
|
||||
patterns: list[tuple[str, str]] = [
|
||||
# - x.00.pth, x.01.pth, etc.
|
||||
(r'\.[0-9]{2}\.pth$', f'.{n:02}.pth'),
|
||||
# - x-00001-of-00002.bin, x-00002-of-00002.bin, etc.
|
||||
@@ -944,11 +1045,11 @@ def nth_multifile_path(path: Path, n: int) -> Optional[Path]:
|
||||
return None
|
||||
|
||||
|
||||
def find_multifile_paths(path: Path) -> List[Path]:
|
||||
def find_multifile_paths(path: Path) -> list[Path]:
|
||||
'''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
|
||||
the whole list of paths in the model.
|
||||
'''
|
||||
ret: List[Path] = []
|
||||
ret: list[Path] = []
|
||||
for i in itertools.count():
|
||||
nth_path = nth_multifile_path(path, i)
|
||||
if nth_path is None:
|
||||
@@ -979,7 +1080,7 @@ def load_some_model(path: Path) -> ModelPlus:
|
||||
path = files[0]
|
||||
|
||||
paths = find_multifile_paths(path)
|
||||
models_plus: List[ModelPlus] = []
|
||||
models_plus: list[ModelPlus] = []
|
||||
for path in paths:
|
||||
print(f"Loading model file {path}")
|
||||
models_plus.append(lazy_load_file(path))
|
||||
@@ -988,7 +1089,7 @@ def load_some_model(path: Path) -> ModelPlus:
|
||||
return model_plus
|
||||
|
||||
|
||||
def load_vocab(path: Path, vocabtype: Optional[str]) -> Union[BpeVocab, SentencePieceVocab]:
|
||||
def load_vocab(path: Path, vocabtype: str | None) -> Vocab:
|
||||
# Be extra-friendly and accept either a file or a directory. Also, if it's
|
||||
# a directory, it might be the model directory, and tokenizer.model might
|
||||
# be in the parent of that.
|
||||
@@ -1019,10 +1120,11 @@ def load_vocab(path: Path, vocabtype: Optional[str]) -> Union[BpeVocab, Sentence
|
||||
raise ValueError(f"Unsupported vocabulary type {vocabtype}")
|
||||
|
||||
|
||||
def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path:
|
||||
def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
|
||||
namestr = {
|
||||
GGMLFileType.AllF32: "f32",
|
||||
GGMLFileType.MostlyF16: "f16",
|
||||
GGMLFileType.MostlyQ8_0:"q8_0",
|
||||
}[file_type]
|
||||
ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf"
|
||||
if ret in model_paths:
|
||||
@@ -1041,24 +1143,33 @@ def do_dump_model(model_plus: ModelPlus) -> None:
|
||||
print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}")
|
||||
|
||||
|
||||
def main(args_in: Optional[List[str]] = None) -> None:
|
||||
def main(args_in: list[str] | None = None) -> None:
|
||||
parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
|
||||
parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
|
||||
parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
|
||||
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
|
||||
parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)")
|
||||
parser.add_argument("--outtype", choices=["f32", "f16", "q8_0"], help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
|
||||
parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
|
||||
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
|
||||
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
|
||||
parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm")
|
||||
parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
|
||||
parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY)
|
||||
args = parser.parse_args(args_in)
|
||||
|
||||
if args.dump_single:
|
||||
model_plus = lazy_load_file(args.model)
|
||||
do_dump_model(model_plus)
|
||||
return
|
||||
|
||||
model_plus = load_some_model(args.model)
|
||||
if not args.vocab_only:
|
||||
model_plus = load_some_model(args.model)
|
||||
else:
|
||||
model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None)
|
||||
|
||||
if args.dump:
|
||||
do_dump_model(model_plus)
|
||||
return
|
||||
|
||||
params = Params.load(model_plus)
|
||||
if params.n_ctx == -1:
|
||||
@@ -1073,39 +1184,41 @@ def main(args_in: Optional[List[str]] = None) -> None:
|
||||
params.ftype = {
|
||||
"f32": GGMLFileType.AllF32,
|
||||
"f16": GGMLFileType.MostlyF16,
|
||||
"q8_0": GGMLFileType.MostlyQ8_0,
|
||||
}[args.outtype]
|
||||
|
||||
print(f"params = {params}")
|
||||
|
||||
vocab: Vocab
|
||||
if args.vocab_only:
|
||||
vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
|
||||
assert args.outfile, "need --outfile if using --vocab-only"
|
||||
# FIXME: Try to respect vocab_dir somehow?
|
||||
vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
|
||||
special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype == 'bpe')
|
||||
outfile = args.outfile
|
||||
OutputFile.write_vocab_only(outfile, params, vocab)
|
||||
OutputFile.write_vocab_only(outfile, params, vocab, special_vocab)
|
||||
print(f"Wrote {outfile}")
|
||||
return
|
||||
|
||||
if model_plus.vocab is not None and args.vocab_dir is None:
|
||||
vocab = model_plus.vocab
|
||||
else:
|
||||
if args.dump:
|
||||
do_dump_model(model_plus)
|
||||
return
|
||||
vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
|
||||
vocab = load_vocab(vocab_dir, args.vocabtype)
|
||||
# FIXME: Try to respect vocab_dir somehow?
|
||||
special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype == 'bpe')
|
||||
|
||||
if model_plus.vocab is not None and args.vocab_dir is None:
|
||||
vocab = model_plus.vocab
|
||||
else:
|
||||
vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
|
||||
vocab = load_vocab(vocab_dir, args.vocabtype)
|
||||
model = model_plus.model
|
||||
model = convert_model_names(model, params)
|
||||
ftype = pick_output_type(model, args.outtype)
|
||||
model = convert_to_output_type(model, ftype)
|
||||
outfile = args.outfile or default_outfile(model_plus.paths, ftype)
|
||||
|
||||
model = model_plus.model
|
||||
model = convert_model_names(model, params)
|
||||
ftype = pick_output_type(model, args.outtype)
|
||||
model = convert_to_output_type(model, ftype)
|
||||
outfile = args.outfile or default_outfile(model_plus.paths, ftype)
|
||||
params.ftype = ftype
|
||||
print(f"Writing {outfile}, format {ftype}")
|
||||
|
||||
params.ftype = ftype
|
||||
print(f"Writing {outfile}, format {ftype}")
|
||||
|
||||
OutputFile.write_all(outfile, params, model, vocab)
|
||||
print(f"Wrote {outfile}")
|
||||
OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency)
|
||||
print(f"Wrote {outfile}")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@@ -23,9 +23,10 @@ else()
|
||||
add_subdirectory(train-text-from-scratch)
|
||||
add_subdirectory(convert-llama2c-to-ggml)
|
||||
add_subdirectory(simple)
|
||||
add_subdirectory(speculative)
|
||||
add_subdirectory(embd-input)
|
||||
add_subdirectory(llama-bench)
|
||||
add_subdirectory(beam_search)
|
||||
add_subdirectory(beam-search)
|
||||
if (LLAMA_METAL)
|
||||
add_subdirectory(metal)
|
||||
endif()
|
||||
|
||||
@@ -1617,15 +1617,10 @@ int main(int argc, char ** argv) {
|
||||
|
||||
float error_before_opt = ggml_get_f32_1d(e, 0);
|
||||
|
||||
struct ggml_opt_params opt_params_adam = ggml_opt_default_params(GGML_OPT_ADAM);
|
||||
struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS);
|
||||
opt_params_adam.print_forward_graph = false;
|
||||
opt_params_adam.print_backward_graph = false;
|
||||
opt_params_lbfgs.print_forward_graph = false;
|
||||
opt_params_lbfgs.print_backward_graph = false;
|
||||
opt_params_adam.adam.n_iter = 16;
|
||||
opt_params_lbfgs.lbfgs.n_iter = 16;
|
||||
// ggml_opt(ctx0, opt_params_adam, e);
|
||||
ggml_opt(ctx0, opt_params_lbfgs, e);
|
||||
//
|
||||
ggml_build_forward_expand(&gf, e);
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
set(TARGET beam_search)
|
||||
add_executable(${TARGET} beam_search.cpp)
|
||||
set(TARGET beam-search)
|
||||
add_executable(${TARGET} beam-search.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
@@ -22,7 +22,9 @@
|
||||
#include <unistd.h>
|
||||
#elif defined (_WIN32)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#define NOMINMAX
|
||||
#ifndef NOMINMAX
|
||||
# define NOMINMAX
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#include <signal.h>
|
||||
#endif
|
||||
@@ -35,7 +37,7 @@ struct ostream_beam_view {
|
||||
std::ostream& operator<<(std::ostream& os, const ostream_beam_view & obv) {
|
||||
os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
|
||||
for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
|
||||
os << llama_token_to_str(obv.ctx, obv.beam_view.tokens[i]);
|
||||
os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]);
|
||||
}
|
||||
return os << ')';
|
||||
}
|
||||
@@ -73,7 +75,7 @@ void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_stat
|
||||
assert(0u < beams_state.n_beams);
|
||||
const llama_token * tokens = beams_state.beam_views[0].tokens;
|
||||
std::copy(tokens, tokens + n, callback_data.response.end() - n);
|
||||
printf("%lu", n);
|
||||
printf("%zu", n);
|
||||
}
|
||||
fflush(stdout);
|
||||
#if 1 // DEBUG: print current beams for this iteration
|
||||
@@ -145,7 +147,7 @@ int main(int argc, char ** argv)
|
||||
|
||||
if (tokens_list.size() > max_tokens_list_size)
|
||||
{
|
||||
fprintf( stderr , "%s: error: prompt too long (%lu tokens, max %lu)\n" ,
|
||||
fprintf( stderr , "%s: error: prompt too long (%zu tokens, max %zu)\n" ,
|
||||
__func__ , tokens_list.size() , max_tokens_list_size );
|
||||
return 1;
|
||||
}
|
||||
@@ -156,7 +158,7 @@ int main(int argc, char ** argv)
|
||||
|
||||
for( auto id : tokens_list )
|
||||
{
|
||||
std::cout << llama_token_to_str(ctx, id);
|
||||
std::cout << llama_token_to_piece(ctx, id);
|
||||
}
|
||||
std::cout << std::flush;
|
||||
|
||||
@@ -175,7 +177,7 @@ int main(int argc, char ** argv)
|
||||
|
||||
std::cout << "\n\n";
|
||||
for (llama_token const token_id : callback_data.response) {
|
||||
std::cout << llama_token_to_str(ctx,token_id);
|
||||
std::cout << llama_token_to_piece(ctx,token_id);
|
||||
}
|
||||
std::cout << std::endl;
|
||||
|
||||
@@ -11,6 +11,6 @@ cd ..
|
||||
#
|
||||
# "--keep 48" is based on the contents of prompts/chat-with-bob.txt
|
||||
#
|
||||
./main -m ./models/7B/ggml-model-q4_0.bin -c 512 -b 1024 -n 256 --keep 48 \
|
||||
./main -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
|
||||
--repeat_penalty 1.0 --color -i \
|
||||
-r "User:" -f prompts/chat-with-bob.txt
|
||||
|
||||
@@ -12,18 +12,14 @@ usage: ./convert-llama2c-to-ggml [options]
|
||||
|
||||
options:
|
||||
-h, --help show this help message and exit
|
||||
--copy-vocab-from-model FNAME model path from which to copy vocab (default 'tokenizer.bin')
|
||||
--copy-vocab-from-model FNAME path of gguf llama model or llama2.c vocabulary from which to copy vocab (default 'models/7B/ggml-model-f16.gguf')
|
||||
--llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model
|
||||
--llama2c-output-model FNAME model path to save the converted llama2.c model (default ak_llama_model.bin')
|
||||
```
|
||||
|
||||
An example command using a model from [karpathy/tinyllamas](https://huggingface.co/karpathy/tinyllamas) is as follows:
|
||||
|
||||
`$ ./convert-llama2c-to-ggml --copy-vocab-from-model ../llama2.c/tokenizer.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.ggmlv3.bin`
|
||||
|
||||
For now the generated model is in the legacy GGJTv3 format, so you need to convert it to gguf manually:
|
||||
|
||||
`$ python ./convert-llama-ggmlv3-to-gguf.py --eps 1e-5 --input stories42M.ggmlv3.bin --output stories42M.gguf.bin`
|
||||
`$ ./convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin`
|
||||
|
||||
Now you can use the model with a command like:
|
||||
|
||||
|
||||
@@ -10,9 +10,48 @@
|
||||
#include <ctime>
|
||||
#include <random>
|
||||
#include <stdexcept>
|
||||
#include <sstream>
|
||||
#include <algorithm>
|
||||
#include <string>
|
||||
|
||||
// GGUF keys & tensor names.
|
||||
|
||||
#define KV_GENERAL_ARCHITECTURE "general.architecture"
|
||||
#define KV_GENERAL_NAME "general.name"
|
||||
|
||||
#define KV_TOKENIZER_MODEL "tokenizer.ggml.model"
|
||||
#define KV_TOKENIZER_LIST "tokenizer.ggml.tokens"
|
||||
#define KV_TOKENIZER_TOKEN_TYPE "tokenizer.ggml.token_type"
|
||||
#define KV_TOKENIZER_SCORES "tokenizer.ggml.scores"
|
||||
#define KV_TOKENIZER_BOS_ID "tokenizer.ggml.bos_token_id"
|
||||
#define KV_TOKENIZER_EOS_ID "tokenizer.ggml.eos_token_id"
|
||||
#define KV_TOKENIZER_UNK_ID "tokenizer.ggml.unknown_token_id"
|
||||
#define KV_TOKENIZER_SEP_ID "tokenizer.ggml.seperator_token_id"
|
||||
#define KV_TOKENIZER_PAD_ID "tokenizer.ggml.padding_token_id"
|
||||
#define KV_TOKENIZER_HF_JSON "tokenizer.huggingface.json"
|
||||
|
||||
#define KV_CONTEXT_LENGTH "llama.context_length"
|
||||
#define KV_EMBEDDING_LENGTH "llama.embedding_length"
|
||||
#define KV_BLOCK_COUNT "llama.block_count"
|
||||
#define KV_FEED_FORWARD_LENGTH "llama.feed_forward_length"
|
||||
#define KV_ATTENTION_HEAD_COUNT "llama.attention.head_count"
|
||||
#define KV_ATTENTION_HEAD_COUNT_KV "llama.attention.head_count_kv"
|
||||
#define KV_ATTENTION_LAYERNORM_RMS_EPS "llama.attention.layer_norm_rms_epsilon"
|
||||
#define KV_ROPE_DIMENSION_COUNT "llama.rope.dimension_count"
|
||||
|
||||
#define TN_TOKEN_EMBD "token_embd.weight"
|
||||
#define TN_OUTPUT_NORM "output_norm.weight"
|
||||
#define TN_OUTPUT "output.weight"
|
||||
#define TN_ATTN_NORM "blk.%d.attn_norm.weight"
|
||||
#define TN_ATTN_Q "blk.%d.attn_q.weight"
|
||||
#define TN_ATTN_K "blk.%d.attn_k.weight"
|
||||
#define TN_ATTN_V "blk.%d.attn_v.weight"
|
||||
#define TN_ATTN_OUTPUT "blk.%d.attn_output.weight"
|
||||
#define TN_FFN_NORM "blk.%d.ffn_norm.weight"
|
||||
#define TN_FFN_GATE "blk.%d.ffn_gate.weight"
|
||||
#define TN_FFN_DOWN "blk.%d.ffn_down.weight"
|
||||
#define TN_FFN_UP "blk.%d.ffn_up.weight"
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
@@ -20,6 +59,11 @@
|
||||
#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
|
||||
#define LLAMA_FILE_VERSION_GGJT_V3 3
|
||||
|
||||
#define TOKENIZER_NAME "llama"
|
||||
#define UNKNOWN_TOKEN_ID 0
|
||||
#define BOS_TOKEN_ID 1
|
||||
#define EOS_TOKEN_ID 2
|
||||
|
||||
//////////////////////////////////////// llama2.c model structs and functions to load models, alloc memory etc.
|
||||
typedef struct {
|
||||
int dim; // transformer dimension
|
||||
@@ -31,7 +75,7 @@ typedef struct {
|
||||
int seq_len; // max sequence length
|
||||
} Config;
|
||||
|
||||
typedef struct {
|
||||
struct TransformerWeights {
|
||||
// token embedding table
|
||||
float* token_embedding_table; // (vocab_size, dim)
|
||||
// weights for rmsnorms
|
||||
@@ -53,7 +97,22 @@ typedef struct {
|
||||
// float* freq_cis_imag; // (seq_len, dim/2)
|
||||
// (optional) classifier weights for the logits, on the last layer
|
||||
float* wcls;
|
||||
} TransformerWeights;
|
||||
|
||||
~TransformerWeights() {
|
||||
delete[] token_embedding_table;
|
||||
delete[] rms_att_weight;
|
||||
delete[] rms_ffn_weight;
|
||||
delete[] wq;
|
||||
delete[] wk;
|
||||
delete[] wv;
|
||||
delete[] wo;
|
||||
delete[] w1;
|
||||
delete[] w2;
|
||||
delete[] w3;
|
||||
delete[] rms_final_weight;
|
||||
delete[] wcls;
|
||||
}
|
||||
};
|
||||
|
||||
void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
|
||||
// we calloc instead of malloc to keep valgrind happy
|
||||
@@ -129,21 +188,6 @@ int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shar
|
||||
return 0;
|
||||
}
|
||||
|
||||
void free_weights(TransformerWeights* w) {
|
||||
delete w->token_embedding_table;
|
||||
delete w->rms_att_weight;
|
||||
delete w->rms_ffn_weight;
|
||||
delete w->wq;
|
||||
delete w->wk;
|
||||
delete w->wv;
|
||||
delete w->wo;
|
||||
delete w->w1;
|
||||
delete w->w2;
|
||||
delete w->w3;
|
||||
delete w->rms_final_weight;
|
||||
if (w->wcls) delete w->wcls;
|
||||
}
|
||||
|
||||
void print_sample_weights(TransformerWeights *w){
|
||||
printf("----- Quick print of first of the weight vales of all the variables\n");
|
||||
printf("%f\n", w->token_embedding_table[0]);
|
||||
@@ -183,6 +227,7 @@ struct my_llama_hparams {
|
||||
uint32_t n_vocab = 32000;
|
||||
uint32_t n_ctx = 512; // this is provided as user input?
|
||||
uint32_t n_embd = 4096;
|
||||
uint32_t n_ff = 11008;
|
||||
uint32_t n_mult = 4;
|
||||
uint32_t n_head = 32;
|
||||
uint32_t n_layer = 32;
|
||||
@@ -214,6 +259,8 @@ struct my_llama_layer {
|
||||
struct my_llama_model {
|
||||
struct ggml_context * ctx = NULL;
|
||||
|
||||
std::string name;
|
||||
|
||||
my_llama_hparams hparams;
|
||||
|
||||
struct ggml_tensor * tok_embeddings;
|
||||
@@ -276,18 +323,13 @@ struct train_params {
|
||||
int mem_compute1_gb;
|
||||
};
|
||||
|
||||
uint32_t get_n_ff(const struct my_llama_hparams* hparams) {
|
||||
const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult;
|
||||
return n_ff;
|
||||
}
|
||||
|
||||
void print_params(struct my_llama_hparams * params) {
|
||||
printf("%s: n_vocab: %d\n", __func__, params->n_vocab);
|
||||
printf("%s: n_ctx: %d\n", __func__, params->n_ctx);
|
||||
printf("%s: n_embd: %d\n", __func__, params->n_embd);
|
||||
printf("%s: n_mult: %d\n", __func__, params->n_mult);
|
||||
printf("%s: n_head: %d\n", __func__, params->n_head);
|
||||
printf("%s: n_ff: %d\n", __func__, get_n_ff(params));
|
||||
printf("%s: n_ff: %d\n", __func__, params->n_ff);
|
||||
printf("%s: n_layer: %d\n", __func__, params->n_layer);
|
||||
printf("%s: n_rot: %d\n", __func__, params->n_rot);
|
||||
}
|
||||
@@ -299,7 +341,7 @@ void init_model(struct my_llama_model * model) {
|
||||
const uint32_t n_layer = hparams.n_layer;
|
||||
const uint32_t n_vocab = hparams.n_vocab;
|
||||
|
||||
const uint32_t n_ff = get_n_ff(&hparams);
|
||||
const uint32_t n_ff = hparams.n_ff;
|
||||
struct ggml_context * ctx = model->ctx;
|
||||
|
||||
model->train_its = 0;
|
||||
@@ -481,21 +523,6 @@ struct llama_file {
|
||||
return std::string(chars.data(), len);
|
||||
}
|
||||
|
||||
void write_raw(const void * ptr, size_t size) {
|
||||
if (size == 0) {
|
||||
return;
|
||||
}
|
||||
errno = 0;
|
||||
size_t ret = std::fwrite(ptr, size, 1, fp);
|
||||
if (ret != 1) {
|
||||
throw std::runtime_error(format("write error: %s", strerror(errno)));
|
||||
}
|
||||
}
|
||||
|
||||
void write_u32(std::uint32_t val) {
|
||||
write_raw(&val, sizeof(val));
|
||||
}
|
||||
|
||||
~llama_file() {
|
||||
if (fp) {
|
||||
std::fclose(fp);
|
||||
@@ -503,30 +530,6 @@ struct llama_file {
|
||||
}
|
||||
};
|
||||
|
||||
void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) {
|
||||
if (tensor == NULL) {
|
||||
file->write_u32(0);
|
||||
file->write_u32(0);
|
||||
file->write_u32(GGML_TYPE_F32);
|
||||
file->seek((0-file->tell()) & 31, SEEK_CUR);
|
||||
return;
|
||||
}
|
||||
const char * name = ggml_get_name(tensor);
|
||||
uint32_t name_len = strlen(name);
|
||||
uint32_t nd = tensor->n_dims;
|
||||
uint32_t ne[4] = { (uint32_t)tensor->ne[0],
|
||||
(uint32_t)tensor->ne[1],
|
||||
(uint32_t)tensor->ne[2],
|
||||
(uint32_t)tensor->ne[3] };
|
||||
file->write_u32(nd);
|
||||
file->write_u32(name_len);
|
||||
file->write_u32(tensor->type);
|
||||
file->write_raw(ne, sizeof(ne[0]) * nd);
|
||||
file->write_raw(name, name_len);
|
||||
file->seek((0-file->tell()) & 31, SEEK_CUR);
|
||||
file->write_raw(tensor->data, ggml_nbytes(tensor));
|
||||
}
|
||||
|
||||
bool is_ggml_file(const char *filename) {
|
||||
llama_file file(filename, "rb");
|
||||
if (file.size < 4) {
|
||||
@@ -536,53 +539,105 @@ bool is_ggml_file(const char *filename) {
|
||||
return magic == GGUF_MAGIC;
|
||||
}
|
||||
|
||||
static std::string llama_escape_whitespaces(const std::string& text) {
|
||||
std::ostringstream out;
|
||||
for (char c : text) {
|
||||
if (c == ' ') out << "\xe2\x96\x81";
|
||||
else out << c;
|
||||
}
|
||||
return out.str();
|
||||
}
|
||||
|
||||
void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
|
||||
#pragma message("TODO: implement reading vocabulary using gguf")
|
||||
// // heuristic to infer whether vocab is from ggml or from llama2.c vocabulary
|
||||
// if (is_ggml_file(filename)) {
|
||||
//
|
||||
// struct llama_context_params llama_params = llama_context_default_params();
|
||||
// llama_params.vocab_only = true;
|
||||
//
|
||||
// struct llama_model * lmodel = llama_load_model_from_file(filename, llama_params);
|
||||
// struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);
|
||||
//
|
||||
// const int n_vocab = llama_n_vocab(lctx);
|
||||
// vocab->id_to_token.resize(n_vocab);
|
||||
// for (int i=0; i<n_vocab; ++i) {
|
||||
// vocab->id_to_token[i].text = llama_token_get_text(lctx, i);
|
||||
// vocab->id_to_token[i].score = llama_token_get_score(lctx, i);
|
||||
// vocab->id_to_token[i].type = llama_token_get_type(lctx, i);
|
||||
// vocab->token_to_id.emplace(vocab->id_to_token[i].text, i);
|
||||
// }
|
||||
// llama_free(lctx);
|
||||
// llama_free_model(lmodel);
|
||||
// } else
|
||||
{ // assume llama2.c vocabulary
|
||||
printf("Assuming llama2.c vocabulary since %s is not a ggml file\n", filename);
|
||||
if (is_ggml_file(filename)) {
|
||||
struct ggml_context * ctx_data = NULL;
|
||||
|
||||
struct gguf_init_params params = {
|
||||
/*.no_alloc = */ false,
|
||||
/*.ctx = */ &ctx_data,
|
||||
};
|
||||
|
||||
struct gguf_context * ctx = gguf_init_from_file(filename, params);
|
||||
GGML_ASSERT(ctx != NULL);
|
||||
|
||||
const int model_idx = gguf_find_key(ctx, KV_TOKENIZER_MODEL);
|
||||
GGML_ASSERT(model_idx >= 0);
|
||||
std::string tokenizer_name = gguf_get_val_str(ctx, model_idx);
|
||||
GGML_ASSERT(tokenizer_name == TOKENIZER_NAME);
|
||||
|
||||
const int token_idx = gguf_find_key(ctx, KV_TOKENIZER_LIST);
|
||||
GGML_ASSERT(token_idx >= 0);
|
||||
|
||||
const int score_idx = gguf_find_key(ctx, KV_TOKENIZER_SCORES);
|
||||
GGML_ASSERT(score_idx >= 0);
|
||||
const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
|
||||
|
||||
const int toktype_idx = gguf_find_key(ctx, KV_TOKENIZER_TOKEN_TYPE);
|
||||
GGML_ASSERT(toktype_idx >= 0);
|
||||
const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
|
||||
|
||||
const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
|
||||
|
||||
vocab->id_to_token.resize(n_vocab);
|
||||
|
||||
for (uint32_t i = 0; i < n_vocab; i++) {
|
||||
std::string word = gguf_get_arr_str(ctx, token_idx, i);
|
||||
|
||||
vocab->token_to_id[word] = i;
|
||||
|
||||
auto & token_data = vocab->id_to_token[i];
|
||||
token_data.text = std::move(word);
|
||||
token_data.score = scores[i];
|
||||
token_data.type = (llama_token_type) toktypes[i];
|
||||
}
|
||||
ggml_free(ctx_data);
|
||||
gguf_free(ctx);
|
||||
} else {
|
||||
// assume llama2.c vocabulary
|
||||
printf("Assuming llama2.c vocabulary since %s is not a gguf file\n", filename);
|
||||
llama_file file(filename, "rb");
|
||||
if (!file.fp) {
|
||||
fprintf(stderr, "error: %s: %s\n", strerror(errno), filename);
|
||||
exit(1);
|
||||
}
|
||||
const int n_vocab = config->vocab_size;
|
||||
/* uint32_t max_token_length = */ file.read_u32(); // unused
|
||||
vocab->id_to_token.resize(n_vocab);
|
||||
for (int i=0; i<n_vocab; ++i) {
|
||||
for (llama_vocab::id id=0; id<n_vocab; ++id) {
|
||||
float_t score = file.read_f32();
|
||||
uint32_t len = file.read_u32();
|
||||
std::string text = file.read_string(len);
|
||||
// Special-case handling of <0xXX> single byte tokens.
|
||||
char byte_val;
|
||||
if (sscanf(text.c_str(), "<0x%02hhX>", &byte_val) == 1) {
|
||||
char cstr[2] = { byte_val, 0 };
|
||||
text = cstr;
|
||||
|
||||
unsigned char byte_val;
|
||||
llama_vocab::ttype type = LLAMA_TOKEN_TYPE_NORMAL;
|
||||
if (id == UNKNOWN_TOKEN_ID) {
|
||||
text = "<unk>";
|
||||
type = LLAMA_TOKEN_TYPE_UNKNOWN;
|
||||
} else if (id == BOS_TOKEN_ID) {
|
||||
text = "<s>";
|
||||
type = LLAMA_TOKEN_TYPE_CONTROL;
|
||||
} else if (id == EOS_TOKEN_ID) {
|
||||
text = "</s>";
|
||||
type = LLAMA_TOKEN_TYPE_CONTROL;
|
||||
} else if (text.empty()) {
|
||||
type = LLAMA_TOKEN_TYPE_CONTROL;
|
||||
} else if (sscanf(text.c_str(), "<0x%02hhX>", &byte_val) == 1) {
|
||||
// Text of byte tokens is already in the expected format.
|
||||
type = LLAMA_TOKEN_TYPE_BYTE;
|
||||
} else {
|
||||
type = LLAMA_TOKEN_TYPE_NORMAL;
|
||||
}
|
||||
vocab->id_to_token[i].text = text;
|
||||
vocab->id_to_token[i].score = score;
|
||||
vocab->id_to_token[i].type = LLAMA_TOKEN_TYPE_UNDEFINED;
|
||||
vocab->token_to_id.emplace(text, i);
|
||||
text = llama_escape_whitespaces(text);
|
||||
|
||||
vocab->id_to_token[id].text = text;
|
||||
vocab->id_to_token[id].score = score;
|
||||
vocab->id_to_token[id].type = type;
|
||||
vocab->token_to_id.emplace(text, id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void stuff_karpathy_weights_into_gg(struct ggml_tensor * gg_weights, float * karpathy_weights){
|
||||
void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
|
||||
int ct;
|
||||
switch (gg_weights->n_dims){
|
||||
case 1:
|
||||
@@ -619,86 +674,120 @@ void stuff_karpathy_weights_into_gg(struct ggml_tensor * gg_weights, float * kar
|
||||
}
|
||||
|
||||
void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename) {
|
||||
struct llama_file file(filename, "wb");
|
||||
if (file.fp == NULL) {
|
||||
return;
|
||||
}
|
||||
|
||||
#pragma message("TODO: implement file saving using gguf")
|
||||
// write_magic
|
||||
file.write_u32(LLAMA_FILE_MAGIC_GGJT); // magic
|
||||
file.write_u32(LLAMA_FILE_VERSION_GGJT_V3); // version
|
||||
// write_hparams
|
||||
file.write_u32(model->hparams.n_vocab);
|
||||
file.write_u32(model->hparams.n_embd);
|
||||
file.write_u32(model->hparams.n_mult);
|
||||
file.write_u32(model->hparams.n_head);
|
||||
file.write_u32(model->hparams.n_layer);
|
||||
file.write_u32(model->hparams.n_rot);
|
||||
file.write_u32(LLAMA_FTYPE_ALL_F32);
|
||||
|
||||
// write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
|
||||
uint32_t n_vocab = model->hparams.n_vocab;
|
||||
for (uint32_t i = 0; i < n_vocab; i++) {
|
||||
const auto & token_data = vocab->id_to_token.at(i);
|
||||
file.write_u32((uint32_t) token_data.text.size());
|
||||
file.write_raw(token_data.text.data(), token_data.text.size());
|
||||
file.write_raw(&token_data.score, sizeof(token_data.score));
|
||||
}
|
||||
|
||||
// stuff AK weights into GG weights one by one.
|
||||
// convert AK weights into GG weights one by one.
|
||||
// w->token_embedding_table -> model->tok_embeddings
|
||||
// float* -> struct ggml_tensor
|
||||
stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
|
||||
stuff_karpathy_weights_into_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table);
|
||||
convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table);
|
||||
convert_weights_ak_to_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table);
|
||||
|
||||
stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
|
||||
convert_weights_ak_to_gg(model->norm, w->rms_final_weight);
|
||||
//print_row(model->norm, 0);
|
||||
|
||||
// for rms-att-weight
|
||||
int row_length = model->hparams.n_embd;
|
||||
const auto & hparams = model->hparams;
|
||||
//int n_ff = model->hparams.n_embd;
|
||||
int n_ff = get_n_ff(&hparams);
|
||||
int n_ff = model->hparams.n_ff;
|
||||
|
||||
for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
|
||||
auto & layer = model->layers[i];
|
||||
// 1d
|
||||
stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
|
||||
stuff_karpathy_weights_into_gg(layer.ffn_norm , &w->rms_ffn_weight[i*row_length]);
|
||||
convert_weights_ak_to_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
|
||||
convert_weights_ak_to_gg(layer.ffn_norm , &w->rms_ffn_weight[i*row_length]);
|
||||
|
||||
// from 3d matrix layer x dim x dim to 2d matrix dim x dim
|
||||
stuff_karpathy_weights_into_gg(layer.wq , &w->wq[i*row_length*row_length]);
|
||||
stuff_karpathy_weights_into_gg(layer.wk , &w->wk[i*row_length*row_length]);
|
||||
stuff_karpathy_weights_into_gg(layer.wv , &w->wv[i*row_length*row_length]);
|
||||
stuff_karpathy_weights_into_gg(layer.wo , &w->wo[i*row_length*row_length]);
|
||||
convert_weights_ak_to_gg(layer.wq , &w->wq[i*row_length*row_length]);
|
||||
convert_weights_ak_to_gg(layer.wk , &w->wk[i*row_length*row_length]);
|
||||
convert_weights_ak_to_gg(layer.wv , &w->wv[i*row_length*row_length]);
|
||||
convert_weights_ak_to_gg(layer.wo , &w->wo[i*row_length*row_length]);
|
||||
|
||||
stuff_karpathy_weights_into_gg(layer.w1 , &w->w1[i*row_length*n_ff]);
|
||||
stuff_karpathy_weights_into_gg(layer.w2 , &w->w2[i*n_ff*row_length]);
|
||||
stuff_karpathy_weights_into_gg(layer.w3 , &w->w3[i*row_length*n_ff]);
|
||||
convert_weights_ak_to_gg(layer.w1 , &w->w1[i*row_length*n_ff]);
|
||||
convert_weights_ak_to_gg(layer.w2 , &w->w2[i*n_ff*row_length]);
|
||||
convert_weights_ak_to_gg(layer.w3 , &w->w3[i*row_length*n_ff]);
|
||||
}
|
||||
|
||||
struct gguf_context * ctx = gguf_init_empty();
|
||||
|
||||
std::vector<const char*> tokens;
|
||||
std::vector<float> scores;
|
||||
std::vector<llama_token_type> token_types;
|
||||
for (const llama_vocab::token_data & token_data : vocab->id_to_token) {
|
||||
tokens.push_back(token_data.text.c_str());
|
||||
scores.push_back(token_data.score);
|
||||
token_types.push_back(token_data.type);
|
||||
}
|
||||
gguf_set_arr_str(ctx, KV_TOKENIZER_LIST, tokens.data(), tokens.size());
|
||||
gguf_set_arr_data(ctx, KV_TOKENIZER_SCORES, GGUF_TYPE_FLOAT32, scores.data(), scores.size());
|
||||
gguf_set_arr_data(ctx, KV_TOKENIZER_TOKEN_TYPE, GGUF_TYPE_INT32, token_types.data(), token_types.size());
|
||||
|
||||
gguf_set_val_str(ctx, KV_TOKENIZER_MODEL, TOKENIZER_NAME);
|
||||
|
||||
gguf_set_val_str(ctx, KV_GENERAL_ARCHITECTURE, "llama");
|
||||
gguf_set_val_str(ctx, KV_GENERAL_NAME, "llama");
|
||||
|
||||
// special tokens
|
||||
gguf_set_val_u32(ctx, KV_TOKENIZER_UNK_ID, UNKNOWN_TOKEN_ID);
|
||||
gguf_set_val_u32(ctx, KV_TOKENIZER_BOS_ID, BOS_TOKEN_ID);
|
||||
gguf_set_val_u32(ctx, KV_TOKENIZER_EOS_ID, EOS_TOKEN_ID);
|
||||
gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, -1);
|
||||
gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, -1);
|
||||
|
||||
gguf_set_val_u32(ctx, KV_CONTEXT_LENGTH, model->hparams.n_ctx);
|
||||
gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd);
|
||||
gguf_set_val_u32(ctx, KV_FEED_FORWARD_LENGTH, model->hparams.n_ff);
|
||||
gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head);
|
||||
// n_head_kv is optional, default to n_head
|
||||
// gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, ...);
|
||||
gguf_set_val_u32(ctx, KV_BLOCK_COUNT, model->hparams.n_layer);
|
||||
gguf_set_val_u32(ctx, KV_ROPE_DIMENSION_COUNT, model->hparams.n_rot);
|
||||
gguf_set_val_f32(ctx, KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f);
|
||||
|
||||
// write tensors
|
||||
write_tensor(&file, model->tok_embeddings);
|
||||
write_tensor(&file, model->norm);
|
||||
write_tensor(&file, model->output); // ?
|
||||
ggml_set_name(model->tok_embeddings, TN_TOKEN_EMBD);
|
||||
gguf_add_tensor(ctx, model->tok_embeddings);
|
||||
|
||||
ggml_set_name(model->norm, TN_OUTPUT_NORM);
|
||||
gguf_add_tensor(ctx, model->norm);
|
||||
|
||||
ggml_set_name(model->output, TN_OUTPUT);
|
||||
gguf_add_tensor(ctx, model->output);
|
||||
|
||||
for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
|
||||
auto & layer = model->layers[i];
|
||||
|
||||
write_tensor(&file, layer.attention_norm);
|
||||
write_tensor(&file, layer.wq);
|
||||
write_tensor(&file, layer.wk);
|
||||
write_tensor(&file, layer.wv);
|
||||
write_tensor(&file, layer.wo);
|
||||
write_tensor(&file, layer.ffn_norm);
|
||||
write_tensor(&file, layer.w1);
|
||||
write_tensor(&file, layer.w2);
|
||||
write_tensor(&file, layer.w3);
|
||||
ggml_format_name(layer.wq, TN_ATTN_Q, i);
|
||||
gguf_add_tensor(ctx, layer.wq);
|
||||
|
||||
ggml_format_name(layer.wk, TN_ATTN_K, i);
|
||||
gguf_add_tensor(ctx, layer.wk);
|
||||
|
||||
ggml_format_name(layer.wv, TN_ATTN_V, i);
|
||||
gguf_add_tensor(ctx, layer.wv);
|
||||
|
||||
ggml_format_name(layer.wo, TN_ATTN_OUTPUT, i);
|
||||
gguf_add_tensor(ctx, layer.wo);
|
||||
|
||||
ggml_format_name(layer.attention_norm, TN_ATTN_NORM, i);
|
||||
gguf_add_tensor(ctx, layer.attention_norm);
|
||||
|
||||
ggml_format_name(layer.w1, TN_FFN_GATE, i);
|
||||
gguf_add_tensor(ctx, layer.w1);
|
||||
|
||||
ggml_format_name(layer.w2, TN_FFN_DOWN, i);
|
||||
gguf_add_tensor(ctx, layer.w2);
|
||||
|
||||
ggml_format_name(layer.w3, TN_FFN_UP, i);
|
||||
gguf_add_tensor(ctx, layer.w3);
|
||||
|
||||
ggml_format_name(layer.ffn_norm, TN_FFN_NORM, i);
|
||||
gguf_add_tensor(ctx, layer.ffn_norm);
|
||||
}
|
||||
|
||||
gguf_write_to_file(ctx, filename, false);
|
||||
gguf_free(ctx);
|
||||
}
|
||||
|
||||
struct train_params get_default_train_params() {
|
||||
struct train_params params;
|
||||
params.fn_vocab_model = "tokenizer.bin";
|
||||
params.fn_vocab_model = "models/7B/ggml-model-f16.gguf";
|
||||
params.fn_llama2c_output_model = "ak_llama_model.bin";
|
||||
params.fn_train_data = "shakespeare.txt";
|
||||
params.fn_checkpoint_in = "checkpoint.bin";
|
||||
@@ -751,7 +840,7 @@ void print_usage(int /*argc*/, char ** argv, const struct train_params * params)
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "options:\n");
|
||||
fprintf(stderr, " -h, --help show this help message and exit\n");
|
||||
fprintf(stderr, " --copy-vocab-from-model FNAME llama2.c vocabulary or ggmlv3 model path from which to copy vocab (default '%s')\n", params->fn_vocab_model);
|
||||
fprintf(stderr, " --copy-vocab-from-model FNAME path of gguf llama model or llama2.c vocabulary from which to copy vocab (default '%s')\n", params->fn_vocab_model);
|
||||
fprintf(stderr, " --llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model\n");
|
||||
fprintf(stderr, " --llama2c-output-model FNAME model path to save the converted llama2.c model (default %s')\n", params->fn_llama2c_output_model);
|
||||
fprintf(stderr, "\n");
|
||||
@@ -812,13 +901,21 @@ bool params_parse(int argc, char ** argv, struct train_params * params) {
|
||||
return true;
|
||||
}
|
||||
|
||||
std::string basename(const std::string &path) {
|
||||
size_t pos = path.find_last_of("/\\");
|
||||
if (pos == std::string::npos) {
|
||||
return path;
|
||||
}
|
||||
return path.substr(pos + 1);
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
struct train_params params = get_default_train_params();
|
||||
if (!params_parse(argc, argv, ¶ms)) {
|
||||
return 1;
|
||||
}
|
||||
Config config;
|
||||
TransformerWeights weights;
|
||||
TransformerWeights weights = {};
|
||||
{
|
||||
FILE *file = fopen(params.fn_llama2c_model, "rb");
|
||||
if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; }
|
||||
@@ -840,6 +937,7 @@ int main(int argc, char ** argv) {
|
||||
model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx);
|
||||
model.hparams.n_ctx = params.n_ctx;
|
||||
model.hparams.n_embd = config.dim; //params.n_embd;
|
||||
model.hparams.n_ff = config.hidden_dim;
|
||||
model.hparams.n_mult = 32;//params.n_mult;
|
||||
model.hparams.n_head = config.n_heads; //params.n_head;
|
||||
model.hparams.n_layer = config.n_layers; //params.n_layer;
|
||||
@@ -853,11 +951,11 @@ int main(int argc, char ** argv) {
|
||||
model.ctx = ggml_init(lcparams);
|
||||
|
||||
init_model(&model);
|
||||
model.name = basename(params.fn_llama2c_model);
|
||||
save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model);
|
||||
|
||||
printf("Saving llama.c model file %s in ggml format at %s\n", params.fn_llama2c_model, params.fn_llama2c_output_model);
|
||||
|
||||
ggml_free(model.ctx);
|
||||
free_weights(&weights);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -214,7 +214,7 @@ const char * sampling(struct MyModel * mymodel) {
|
||||
if (id == llama_token_eos(ctx)) {
|
||||
ret = "</s>";
|
||||
} else {
|
||||
ret = llama_token_to_str(ctx, id);
|
||||
ret = llama_token_to_piece(ctx, id);
|
||||
}
|
||||
eval_id(mymodel, id);
|
||||
return ret.c_str();
|
||||
|
||||
@@ -56,9 +56,6 @@ int main(int argc, char ** argv) {
|
||||
|
||||
int n_past = 0;
|
||||
|
||||
// Add a space in front of the first character to match OG llama tokenizer behavior
|
||||
params.prompt.insert(0, 1, ' ');
|
||||
|
||||
// tokenize the prompt
|
||||
auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
|
||||
|
||||
@@ -67,7 +64,7 @@ int main(int argc, char ** argv) {
|
||||
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
||||
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
||||
for (int i = 0; i < (int) embd_inp.size(); i++) {
|
||||
fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());
|
||||
fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
5
examples/gguf/CMakeLists.txt
Normal file
5
examples/gguf/CMakeLists.txt
Normal file
@@ -0,0 +1,5 @@
|
||||
set(TARGET gguf)
|
||||
add_executable(${TARGET} gguf.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
@@ -30,6 +30,9 @@ bool gguf_ex_write(const std::string & fname) {
|
||||
gguf_set_val_u32 (ctx, "some.parameter.uint32", 0x12345678);
|
||||
gguf_set_val_i32 (ctx, "some.parameter.int32", -0x12345679);
|
||||
gguf_set_val_f32 (ctx, "some.parameter.float32", 0.123456789f);
|
||||
gguf_set_val_u64 (ctx, "some.parameter.uint64", 0x123456789abcdef0ull);
|
||||
gguf_set_val_i64 (ctx, "some.parameter.int64", -0x123456789abcdef1ll);
|
||||
gguf_set_val_f64 (ctx, "some.parameter.float64", 0.1234567890123456789);
|
||||
gguf_set_val_bool(ctx, "some.parameter.bool", true);
|
||||
gguf_set_val_str (ctx, "some.parameter.string", "hello world");
|
||||
|
||||
|
||||
@@ -660,9 +660,10 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
|
||||
ggml_tensor * gpt_neox_ff(
|
||||
const gpt_neox_block &block,
|
||||
ggml_context * ctx0,
|
||||
ggml_tensor * inp) {
|
||||
ggml_tensor * inp,
|
||||
const gpt_neox_hparams &hparams) {
|
||||
|
||||
ggml_tensor * cur = ggml_norm(ctx0, inp);
|
||||
ggml_tensor * cur = ggml_norm(ctx0, inp, hparams.norm_eps);
|
||||
|
||||
cur = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, block.ln_2_g, cur), cur), ggml_repeat(ctx0, block.ln_2_b, cur));
|
||||
cur = ggml_mul_mat(ctx0, block.c_mlp_fc_w, cur);
|
||||
@@ -753,7 +754,7 @@ bool gpt_neox_eval(
|
||||
// self-attention
|
||||
{
|
||||
{
|
||||
cur = ggml_norm(ctx0, inpL);
|
||||
cur = ggml_norm(ctx0, inpL, hparams.norm_eps);
|
||||
|
||||
cur = ggml_add(ctx0,
|
||||
ggml_mul(ctx0, ggml_repeat(ctx0, model.blocks[il].ln_1_g, cur), cur),
|
||||
@@ -844,7 +845,7 @@ bool gpt_neox_eval(
|
||||
if (hparams.par_res == 0) {
|
||||
struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL);
|
||||
|
||||
cur = gpt_neox_ff(model.blocks[il], ctx0, inpFF);
|
||||
cur = gpt_neox_ff(model.blocks[il], ctx0, inpFF, hparams);
|
||||
|
||||
// input for next layer
|
||||
inpL = ggml_add(ctx0, cur, inpFF);
|
||||
@@ -853,7 +854,7 @@ bool gpt_neox_eval(
|
||||
|
||||
// this is independent of the self-attention result, so it could be done in parallel to the self-attention
|
||||
// note here we pass inpL instead of cur
|
||||
cur = gpt_neox_ff(model.blocks[il], ctx0, inpL);
|
||||
cur = gpt_neox_ff(model.blocks[il], ctx0, inpL, hparams);
|
||||
|
||||
// layer input + FF
|
||||
cur = ggml_add(ctx0, cur, inpFF);
|
||||
@@ -867,7 +868,7 @@ bool gpt_neox_eval(
|
||||
|
||||
// norm
|
||||
{
|
||||
inpL = ggml_norm(ctx0, inpL);
|
||||
inpL = ggml_norm(ctx0, inpL, hparams.norm_eps);
|
||||
|
||||
// inpL = ln_f_g*inpL + ln_f_b
|
||||
inpL = ggml_add(ctx0,
|
||||
|
||||
7
examples/llama-bench/llama-bench.cpp
Executable file → Normal file
7
examples/llama-bench/llama-bench.cpp
Executable file → Normal file
@@ -3,6 +3,9 @@
|
||||
#include <cassert>
|
||||
#include <chrono>
|
||||
#include <cinttypes>
|
||||
#include <clocale>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
#include <iterator>
|
||||
@@ -10,7 +13,6 @@
|
||||
#include <numeric>
|
||||
#include <regex>
|
||||
#include <sstream>
|
||||
#include <stdio.h>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
@@ -916,6 +918,9 @@ static void llama_null_log_callback(enum llama_log_level level, const char * tex
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
// try to set locale for unicode characters in markdown
|
||||
setlocale(LC_CTYPE, ".UTF-8");
|
||||
|
||||
#if !defined(NDEBUG)
|
||||
fprintf(stderr, "warning: asserts enabled, performance may be affected\n");
|
||||
#endif
|
||||
|
||||
@@ -8,7 +8,7 @@ function! Llm()
|
||||
let buffer_content = join(getline(1, '$'), "\n")
|
||||
|
||||
" Create the JSON payload
|
||||
let json_payload = {"temp":0.72,"top_k":100,"top_p":0.73,"repeat_penalty":1.100000023841858,"n_predict":10,"stream": v:false}
|
||||
let json_payload = {"temp":0.72,"top_k":100,"top_p":0.73,"repeat_penalty":1.100000023841858,"n_predict":256,"stop": ["\n\n\n"],"stream": v:false}
|
||||
let json_payload.prompt = buffer_content
|
||||
|
||||
" Define the curl command
|
||||
@@ -25,3 +25,4 @@ function! Llm()
|
||||
endfunction
|
||||
|
||||
command! Llm call Llm()
|
||||
noremap <F2> :Llm<CR>
|
||||
|
||||
@@ -34,7 +34,7 @@ For an interactive experience, try this command:
|
||||
#### Unix-based systems (Linux, macOS, etc.):
|
||||
|
||||
```bash
|
||||
./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " \
|
||||
./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -p \
|
||||
'User: Hi
|
||||
AI: Hello. I am an AI chatbot. Would you like to talk?
|
||||
User: Sure!
|
||||
@@ -45,7 +45,7 @@ User:'
|
||||
#### Windows:
|
||||
|
||||
```powershell
|
||||
main.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -e --prompt "User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:"
|
||||
main.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -e -p "User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:"
|
||||
```
|
||||
|
||||
The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it):
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
#endif
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#include "console.h"
|
||||
#include "llama.h"
|
||||
#include "build-info.h"
|
||||
@@ -17,6 +18,7 @@
|
||||
#include <ctime>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
@@ -36,9 +38,57 @@
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
static llama_context ** g_ctx;
|
||||
static llama_context ** g_ctx;
|
||||
static llama_model ** g_model;
|
||||
static gpt_params * g_params;
|
||||
static std::vector<llama_token> * g_input_tokens;
|
||||
static std::ostringstream * g_output_ss;
|
||||
static std::vector<llama_token> * g_output_tokens;
|
||||
static bool is_interacting = false;
|
||||
|
||||
void write_logfile(
|
||||
const llama_context * ctx, const gpt_params & params, const llama_model * model,
|
||||
const std::vector<llama_token> input_tokens, const std::string output, const std::vector<llama_token> output_tokens) {
|
||||
|
||||
if (params.logdir.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
const std::string timestamp = get_sortable_timestamp();
|
||||
|
||||
const bool success = create_directory_with_parents(params.logdir);
|
||||
if (!success) {
|
||||
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
|
||||
__func__, params.logdir.c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
const std::string logfile_path = params.logdir + timestamp + ".yml";
|
||||
FILE * logfile = fopen(logfile_path.c_str(), "w");
|
||||
|
||||
if (logfile == NULL) {
|
||||
fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
fprintf(logfile, "binary: main\n");
|
||||
char model_desc[128];
|
||||
llama_model_desc(model, model_desc, sizeof(model_desc));
|
||||
dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc);
|
||||
|
||||
fprintf(logfile, "\n");
|
||||
fprintf(logfile, "######################\n");
|
||||
fprintf(logfile, "# Generation Results #\n");
|
||||
fprintf(logfile, "######################\n");
|
||||
fprintf(logfile, "\n");
|
||||
|
||||
dump_string_yaml_multiline(logfile, "output", output.c_str());
|
||||
dump_vector_int_yaml(logfile, "output_tokens", output_tokens);
|
||||
|
||||
llama_dump_timing_info_yaml(logfile, ctx);
|
||||
fclose(logfile);
|
||||
}
|
||||
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
||||
void sigint_handler(int signo) {
|
||||
if (signo == SIGINT) {
|
||||
@@ -48,6 +98,7 @@ void sigint_handler(int signo) {
|
||||
console::cleanup();
|
||||
printf("\n");
|
||||
llama_print_timings(*g_ctx);
|
||||
write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
|
||||
_exit(130);
|
||||
}
|
||||
}
|
||||
@@ -56,11 +107,21 @@ void sigint_handler(int signo) {
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
g_params = ¶ms;
|
||||
|
||||
if (gpt_params_parse(argc, argv, params) == false) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
log_set_target(log_filename_generator("main", "log"));
|
||||
LOG_TEE("Log start\n");
|
||||
log_dump_cmdline(argc, argv);
|
||||
#endif // LOG_DISABLE_LOGS
|
||||
|
||||
// TODO: Dump params ?
|
||||
//LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity));
|
||||
|
||||
// save choice to use color for later
|
||||
// (note for later: this is a slightly awkward choice)
|
||||
console::init(params.simple_io, params.use_color);
|
||||
@@ -83,42 +144,37 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
if (params.rope_freq_base != 10000.0) {
|
||||
fprintf(stderr, "%s: warning: changing RoPE frequency base to %g (default 10000.0)\n", __func__, params.rope_freq_base);
|
||||
LOG_TEE("%s: warning: changing RoPE frequency base to %g (default 10000.0)\n", __func__, params.rope_freq_base);
|
||||
}
|
||||
|
||||
if (params.rope_freq_scale != 1.0) {
|
||||
fprintf(stderr, "%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale);
|
||||
LOG_TEE("%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale);
|
||||
}
|
||||
|
||||
if (params.n_ctx > 2048) {
|
||||
// TODO: determine the actual max context of the model (e.g. 4096 for LLaMA v2) and use that instead of 2048
|
||||
fprintf(stderr, "%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified)\n", __func__, params.n_ctx);
|
||||
} else if (params.n_ctx < 8) {
|
||||
fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__);
|
||||
params.n_ctx = 8;
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
|
||||
LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
|
||||
|
||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
||||
params.seed = time(NULL);
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
|
||||
LOG_TEE("%s: seed = %u\n", __func__, params.seed);
|
||||
|
||||
std::mt19937 rng(params.seed);
|
||||
if (params.random_prompt) {
|
||||
params.prompt = gpt_random_prompt(rng);
|
||||
}
|
||||
|
||||
LOG("%s: llama backend init\n", __func__);
|
||||
llama_backend_init(params.numa);
|
||||
|
||||
llama_model * model;
|
||||
llama_context * ctx;
|
||||
llama_context * ctx_guidance = NULL;
|
||||
g_model = &model;
|
||||
g_ctx = &ctx;
|
||||
|
||||
// load the model and apply lora adapter, if any
|
||||
LOG("%s: load the model and apply lora adapter, if any\n", __func__);
|
||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||
if (params.cfg_scale > 1.f) {
|
||||
struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
|
||||
@@ -126,14 +182,21 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
if (model == NULL) {
|
||||
fprintf(stderr, "%s: error: unable to load model\n", __func__);
|
||||
LOG_TEE("%s: error: unable to load model\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (params.n_ctx > llama_n_ctx(ctx)) {
|
||||
LOG_TEE("%s: warning: base model only supports context sizes no greater than %d tokens (%d specified)\n", __func__, llama_n_ctx(ctx), params.n_ctx);
|
||||
} else if (params.n_ctx < 8) {
|
||||
LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
|
||||
params.n_ctx = 8;
|
||||
}
|
||||
|
||||
// print system information
|
||||
{
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("system_info: n_threads = %d / %d | %s\n",
|
||||
params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
|
||||
}
|
||||
|
||||
@@ -141,7 +204,7 @@ int main(int argc, char ** argv) {
|
||||
// uncomment the "used_mem" line in llama.cpp to see the results
|
||||
if (params.mem_test) {
|
||||
{
|
||||
fprintf(stderr, "%s: testing memory usage for n_batch = %d, n_ctx = %d\n", __func__, params.n_batch, params.n_ctx);
|
||||
LOG_TEE("%s: testing memory usage for n_batch = %d, n_ctx = %d\n", __func__, params.n_batch, params.n_ctx);
|
||||
|
||||
const std::vector<llama_token> tmp(params.n_batch, llama_token_bos(ctx));
|
||||
llama_eval(ctx, tmp.data(), tmp.size(), params.n_ctx, params.n_threads);
|
||||
@@ -167,7 +230,7 @@ int main(int argc, char ** argv) {
|
||||
std::vector<llama_token> session_tokens;
|
||||
|
||||
if (!path_session.empty()) {
|
||||
fprintf(stderr, "%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
|
||||
LOG_TEE("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
|
||||
|
||||
// fopen to check for existing session
|
||||
FILE * fp = std::fopen(path_session.c_str(), "rb");
|
||||
@@ -177,38 +240,38 @@ int main(int argc, char ** argv) {
|
||||
session_tokens.resize(params.n_ctx);
|
||||
size_t n_token_count_out = 0;
|
||||
if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
|
||||
fprintf(stderr, "%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
|
||||
LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
|
||||
return 1;
|
||||
}
|
||||
session_tokens.resize(n_token_count_out);
|
||||
llama_set_rng_seed(ctx, params.seed);
|
||||
|
||||
fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());
|
||||
LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());
|
||||
} else {
|
||||
fprintf(stderr, "%s: session file does not exist, will create\n", __func__);
|
||||
LOG_TEE("%s: session file does not exist, will create\n", __func__);
|
||||
}
|
||||
}
|
||||
|
||||
// Add BOS if SPM tokenizer
|
||||
const bool add_bos = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
|
||||
LOG("add_bos: %d\n", add_bos);
|
||||
|
||||
// tokenize the prompt
|
||||
std::vector<llama_token> embd_inp;
|
||||
|
||||
if (llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM) {
|
||||
// Add a space in front of the first character to match OG llama tokenizer behavior
|
||||
params.prompt.insert(0, 1, ' ');
|
||||
}
|
||||
|
||||
if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
|
||||
LOG("tokenize the prompt\n");
|
||||
embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
|
||||
} else {
|
||||
LOG("use session tokens\n");
|
||||
embd_inp = session_tokens;
|
||||
}
|
||||
|
||||
LOG("prompt: \"%s\"\n", log_tostr(params.prompt));
|
||||
LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp));
|
||||
|
||||
// Should not run without any tokens
|
||||
if (embd_inp.empty()) {
|
||||
embd_inp.push_back(llama_token_bos(ctx));
|
||||
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp));
|
||||
}
|
||||
|
||||
// Tokenize negative prompt
|
||||
@@ -216,24 +279,31 @@ int main(int argc, char ** argv) {
|
||||
int guidance_offset = 0;
|
||||
int original_prompt_len = 0;
|
||||
if (ctx_guidance) {
|
||||
params.cfg_negative_prompt.insert(0, 1, ' ');
|
||||
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(params.cfg_negative_prompt));
|
||||
|
||||
guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos);
|
||||
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));
|
||||
|
||||
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
|
||||
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp));
|
||||
|
||||
original_prompt_len = original_inp.size();
|
||||
guidance_offset = (int)guidance_inp.size() - original_prompt_len;
|
||||
LOG("original_prompt_len: %s", log_tostr(original_prompt_len));
|
||||
LOG("guidance_offset: %s", log_tostr(guidance_offset));
|
||||
}
|
||||
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
LOG("n_ctx: %d\n", n_ctx);
|
||||
|
||||
if ((int) embd_inp.size() > n_ctx - 4) {
|
||||
fprintf(stderr, "%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
|
||||
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// debug message about similarity of saved session, if applicable
|
||||
size_t n_matching_session_tokens = 0;
|
||||
if (session_tokens.size()) {
|
||||
if (session_tokens.size() > 0) {
|
||||
for (llama_token id : session_tokens) {
|
||||
if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) {
|
||||
break;
|
||||
@@ -241,22 +311,27 @@ int main(int argc, char ** argv) {
|
||||
n_matching_session_tokens++;
|
||||
}
|
||||
if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) {
|
||||
fprintf(stderr, "%s: using full prompt from session file\n", __func__);
|
||||
LOG_TEE("%s: using full prompt from session file\n", __func__);
|
||||
} else if (n_matching_session_tokens >= embd_inp.size()) {
|
||||
fprintf(stderr, "%s: session file has exact match for prompt!\n", __func__);
|
||||
LOG_TEE("%s: session file has exact match for prompt!\n", __func__);
|
||||
} else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
|
||||
fprintf(stderr, "%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
|
||||
LOG_TEE("%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
|
||||
__func__, n_matching_session_tokens, embd_inp.size());
|
||||
} else {
|
||||
fprintf(stderr, "%s: session file matches %zu / %zu tokens of prompt\n",
|
||||
LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n",
|
||||
__func__, n_matching_session_tokens, embd_inp.size());
|
||||
}
|
||||
}
|
||||
|
||||
LOGLN(
|
||||
"recalculate the cached logits (check): embd_inp.empty() %s, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu, embd_inp.size() %zu",
|
||||
log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size(), embd_inp.size());
|
||||
|
||||
// if we will use the cache for the full prompt without reaching the end of the cache, force
|
||||
// reevaluation of the last token token to recalculate the cached logits
|
||||
if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() &&
|
||||
session_tokens.size() > embd_inp.size()) {
|
||||
if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) {
|
||||
LOGLN("recalculate the cached logits (do): session_tokens.resize( %zu )", embd_inp.size() - 1);
|
||||
|
||||
session_tokens.resize(embd_inp.size() - 1);
|
||||
}
|
||||
|
||||
@@ -269,6 +344,9 @@ int main(int argc, char ** argv) {
|
||||
const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos);
|
||||
const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false);
|
||||
|
||||
LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx));
|
||||
LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx));
|
||||
|
||||
// in instruct mode, we inject a prefix and a suffix to each input by the user
|
||||
if (params.instruct) {
|
||||
params.interactive_first = true;
|
||||
@@ -281,30 +359,30 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
if (params.verbose_prompt) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
||||
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
||||
LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
||||
for (int i = 0; i < (int) embd_inp.size(); i++) {
|
||||
fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());
|
||||
LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||
}
|
||||
|
||||
if (ctx_guidance) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
|
||||
fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
|
||||
LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
|
||||
for (int i = 0; i < (int) guidance_inp.size(); i++) {
|
||||
fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]).c_str());
|
||||
LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
|
||||
}
|
||||
}
|
||||
|
||||
if (params.n_keep > 0) {
|
||||
fprintf(stderr, "%s: static prompt based on n_keep: '", __func__);
|
||||
LOG_TEE("%s: static prompt based on n_keep: '", __func__);
|
||||
for (int i = 0; i < params.n_keep; i++) {
|
||||
fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i]).c_str());
|
||||
LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||
}
|
||||
fprintf(stderr, "'\n");
|
||||
LOG_TEE("'\n");
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
LOG_TEE("\n");
|
||||
}
|
||||
|
||||
if (params.interactive) {
|
||||
@@ -321,47 +399,48 @@ int main(int argc, char ** argv) {
|
||||
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
||||
#endif
|
||||
|
||||
fprintf(stderr, "%s: interactive mode on.\n", __func__);
|
||||
LOG_TEE("%s: interactive mode on.\n", __func__);
|
||||
|
||||
if (params.antiprompt.size()) {
|
||||
for (auto antiprompt : params.antiprompt) {
|
||||
fprintf(stderr, "Reverse prompt: '%s'\n", antiprompt.c_str());
|
||||
for (const auto & antiprompt : params.antiprompt) {
|
||||
LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
if (params.input_prefix_bos) {
|
||||
fprintf(stderr, "Input prefix with BOS\n");
|
||||
LOG_TEE("Input prefix with BOS\n");
|
||||
}
|
||||
|
||||
if (!params.input_prefix.empty()) {
|
||||
fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str());
|
||||
LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
|
||||
}
|
||||
|
||||
if (!params.input_suffix.empty()) {
|
||||
fprintf(stderr, "Input suffix: '%s'\n", params.input_suffix.c_str());
|
||||
LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
|
||||
}
|
||||
}
|
||||
fprintf(stderr, "sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
|
||||
LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
|
||||
params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau);
|
||||
fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
||||
fprintf(stderr, "\n\n");
|
||||
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
||||
LOG_TEE("\n\n");
|
||||
|
||||
struct llama_grammar * grammar = NULL;
|
||||
grammar_parser::parse_state parsed_grammar;
|
||||
llama_grammar * grammar = NULL;
|
||||
|
||||
if (!params.grammar.empty()) {
|
||||
parsed_grammar = grammar_parser::parse(params.grammar.c_str());
|
||||
// will be empty (default) if there are parse errors
|
||||
if (parsed_grammar.rules.empty()) {
|
||||
return 1;
|
||||
}
|
||||
fprintf(stderr, "%s: grammar:\n", __func__);
|
||||
LOG_TEE("%s: grammar:\n", __func__);
|
||||
grammar_parser::print_grammar(stderr, parsed_grammar);
|
||||
fprintf(stderr, "\n");
|
||||
LOG_TEE("\n");
|
||||
|
||||
{
|
||||
auto it = params.logit_bias.find(llama_token_eos(ctx));
|
||||
if (it != params.logit_bias.end() && it->second == -INFINITY) {
|
||||
fprintf(stderr, "%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
|
||||
LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -371,8 +450,8 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// TODO: replace with ring-buffer
|
||||
std::vector<llama_token> last_n_tokens(n_ctx);
|
||||
std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
|
||||
std::vector<llama_token> last_tokens(n_ctx);
|
||||
std::fill(last_tokens.begin(), last_tokens.end(), 0);
|
||||
|
||||
if (params.interactive) {
|
||||
const char *control_message;
|
||||
@@ -384,11 +463,11 @@ int main(int argc, char ** argv) {
|
||||
" - To return control without starting a new line, end your input with '/'.\n"
|
||||
" - If you want to submit another line, end your input with '\\'.\n";
|
||||
}
|
||||
fprintf(stderr, "== Running in interactive mode. ==\n"
|
||||
LOG_TEE("== Running in interactive mode. ==\n");
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
||||
" - Press Ctrl+C to interject at any time.\n"
|
||||
LOG_TEE( " - Press Ctrl+C to interject at any time.\n");
|
||||
#endif
|
||||
"%s\n", control_message);
|
||||
LOG_TEE( "%s\n", control_message);
|
||||
|
||||
is_interacting = params.interactive_first;
|
||||
}
|
||||
@@ -403,33 +482,37 @@ int main(int argc, char ** argv) {
|
||||
int n_session_consumed = 0;
|
||||
int n_past_guidance = 0;
|
||||
|
||||
std::vector<int> input_tokens; g_input_tokens = &input_tokens;
|
||||
std::vector<int> output_tokens; g_output_tokens = &output_tokens;
|
||||
std::ostringstream output_ss; g_output_ss = &output_ss;
|
||||
|
||||
// the first thing we will do is to output the prompt, so set color accordingly
|
||||
console::set_display(console::prompt);
|
||||
|
||||
std::vector<llama_token> embd;
|
||||
std::vector<llama_token> embd_guidance;
|
||||
|
||||
// do one empty run to warm up the model
|
||||
{
|
||||
const std::vector<llama_token> tmp = { llama_token_bos(ctx), };
|
||||
llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
|
||||
llama_reset_timings(ctx);
|
||||
}
|
||||
const int n_vocab = llama_n_vocab(ctx);
|
||||
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
|
||||
while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
|
||||
// predict
|
||||
if (embd.size() > 0) {
|
||||
// Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
|
||||
// --prompt or --file which uses the same value.
|
||||
auto max_embd_size = n_ctx - 4;
|
||||
int max_embd_size = n_ctx - 4;
|
||||
|
||||
// Ensure the input doesn't exceed the context size by truncating embd if necessary.
|
||||
if ((int)embd.size() > max_embd_size) {
|
||||
auto skipped_tokens = embd.size() - max_embd_size;
|
||||
if ((int) embd.size() > max_embd_size) {
|
||||
const int skipped_tokens = (int) embd.size() - max_embd_size;
|
||||
embd.resize(max_embd_size);
|
||||
|
||||
console::set_display(console::error);
|
||||
printf("<<input too long: skipped %zu token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
|
||||
printf("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
|
||||
console::set_display(console::reset);
|
||||
fflush(stdout);
|
||||
embd.resize(max_embd_size);
|
||||
}
|
||||
|
||||
// infinite text generation via context swapping
|
||||
@@ -438,28 +521,26 @@ int main(int argc, char ** argv) {
|
||||
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
|
||||
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
|
||||
if (params.n_predict == -2) {
|
||||
fprintf(stderr, "\n\n%s: context full, stopping generation\n", __func__);
|
||||
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
|
||||
break;
|
||||
}
|
||||
|
||||
const int n_left = n_past - params.n_keep;
|
||||
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d\n", n_past, n_left, n_ctx, params.n_keep);
|
||||
|
||||
// always keep the first token - BOS
|
||||
n_past = std::max(1, params.n_keep);
|
||||
n_past = std::max(1, params.n_keep);
|
||||
n_past_guidance = std::max(1, params.n_keep + guidance_offset);
|
||||
|
||||
// insert n_left/2 tokens at the start of embd from last_n_tokens
|
||||
embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());
|
||||
LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
|
||||
|
||||
// stop saving session if we run out of context
|
||||
// insert n_left/2 tokens at the start of embd from last_tokens
|
||||
embd.insert(embd.begin(), last_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_tokens.end() - embd.size());
|
||||
|
||||
LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));
|
||||
|
||||
LOG("clear session path\n");
|
||||
path_session.clear();
|
||||
|
||||
//printf("\n---\n");
|
||||
//printf("resetting: '");
|
||||
//for (int i = 0; i < (int) embd.size(); i++) {
|
||||
// printf("%s", llama_token_to_str(ctx, embd[i]));
|
||||
//}
|
||||
//printf("'\n");
|
||||
//printf("\n---\n");
|
||||
}
|
||||
|
||||
// try to reuse a matching prefix from the loaded session instead of re-eval (via n_past)
|
||||
@@ -489,7 +570,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
if (ctx_guidance) {
|
||||
int input_size = 0;
|
||||
llama_token* input_buf = NULL;
|
||||
llama_token * input_buf = NULL;
|
||||
|
||||
if (n_past_guidance < (int) guidance_inp.size()) {
|
||||
// Guidance context should have the same data with these modifications:
|
||||
@@ -505,22 +586,19 @@ int main(int argc, char ** argv) {
|
||||
);
|
||||
}
|
||||
|
||||
input_buf = embd_guidance.data();
|
||||
input_buf = embd_guidance.data();
|
||||
input_size = embd_guidance.size();
|
||||
//fprintf(stderr, "\n---------------------\n");
|
||||
//for (int i = 0; i < (int) embd_guidance.size(); i++) {
|
||||
//fprintf(stderr, "%s", llama_token_to_str(ctx, embd_guidance[i]));
|
||||
//}
|
||||
//fprintf(stderr, "\n---------------------\n");
|
||||
|
||||
LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance));
|
||||
} else {
|
||||
input_buf = embd.data();
|
||||
input_buf = embd.data();
|
||||
input_size = embd.size();
|
||||
}
|
||||
|
||||
for (int i = 0; i < input_size; i += params.n_batch) {
|
||||
int n_eval = std::min(input_size - i, params.n_batch);
|
||||
if (llama_eval(ctx_guidance, input_buf + i, n_eval, n_past_guidance, params.n_threads)) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
LOG_TEE("%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -533,11 +611,17 @@ int main(int argc, char ** argv) {
|
||||
if (n_eval > params.n_batch) {
|
||||
n_eval = params.n_batch;
|
||||
}
|
||||
|
||||
LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));
|
||||
|
||||
if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
LOG_TEE("%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
n_past += n_eval;
|
||||
|
||||
LOG("n_past = %d\n", n_past);
|
||||
}
|
||||
|
||||
if (embd.size() > 0 && !path_session.empty()) {
|
||||
@@ -550,106 +634,21 @@ int main(int argc, char ** argv) {
|
||||
embd_guidance.clear();
|
||||
|
||||
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
|
||||
// out of user input, sample next token
|
||||
const float temp = params.temp;
|
||||
const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
|
||||
const float top_p = params.top_p;
|
||||
const float tfs_z = params.tfs_z;
|
||||
const float typical_p = params.typical_p;
|
||||
const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
|
||||
const float repeat_penalty = params.repeat_penalty;
|
||||
const float alpha_presence = params.presence_penalty;
|
||||
const float alpha_frequency = params.frequency_penalty;
|
||||
const int mirostat = params.mirostat;
|
||||
const float mirostat_tau = params.mirostat_tau;
|
||||
const float mirostat_eta = params.mirostat_eta;
|
||||
const bool penalize_nl = params.penalize_nl;
|
||||
|
||||
// optionally save the session on first sample (for faster prompt loading next time)
|
||||
if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) {
|
||||
need_to_save_session = false;
|
||||
llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
|
||||
|
||||
LOG("saved session to %s\n", path_session.c_str());
|
||||
}
|
||||
|
||||
llama_token id = 0;
|
||||
const llama_token id = llama_sample_token(ctx, ctx_guidance, grammar, params, last_tokens, candidates);
|
||||
|
||||
{
|
||||
auto logits = llama_get_logits(ctx);
|
||||
auto n_vocab = llama_n_vocab(ctx);
|
||||
last_tokens.erase(last_tokens.begin());
|
||||
last_tokens.push_back(id);
|
||||
|
||||
// Apply params.logit_bias map
|
||||
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
|
||||
logits[it->first] += it->second;
|
||||
}
|
||||
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, last_tokens));
|
||||
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
||||
}
|
||||
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
|
||||
if (ctx_guidance) {
|
||||
llama_sample_classifier_free_guidance(ctx, &candidates_p, ctx_guidance, params.cfg_scale);
|
||||
}
|
||||
|
||||
// Apply penalties
|
||||
float nl_logit = logits[llama_token_nl(ctx)];
|
||||
auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
|
||||
llama_sample_repetition_penalty(ctx, &candidates_p,
|
||||
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
|
||||
last_n_repeat, repeat_penalty);
|
||||
llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
|
||||
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
|
||||
last_n_repeat, alpha_frequency, alpha_presence);
|
||||
if (!penalize_nl) {
|
||||
for (size_t idx = 0; idx < candidates_p.size; idx++) {
|
||||
if (candidates_p.data[idx].id == llama_token_nl(ctx)) {
|
||||
candidates_p.data[idx].logit = nl_logit;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (grammar != NULL) {
|
||||
llama_sample_grammar(ctx, &candidates_p, grammar);
|
||||
}
|
||||
|
||||
if (temp <= 0) {
|
||||
// Greedy sampling
|
||||
id = llama_sample_token_greedy(ctx, &candidates_p);
|
||||
} else {
|
||||
if (mirostat == 1) {
|
||||
static float mirostat_mu = 2.0f * mirostat_tau;
|
||||
const int mirostat_m = 100;
|
||||
llama_sample_temperature(ctx, &candidates_p, temp);
|
||||
id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
|
||||
} else if (mirostat == 2) {
|
||||
static float mirostat_mu = 2.0f * mirostat_tau;
|
||||
llama_sample_temperature(ctx, &candidates_p, temp);
|
||||
id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
|
||||
} else {
|
||||
// Temperature sampling
|
||||
llama_sample_top_k(ctx, &candidates_p, top_k, 1);
|
||||
llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
|
||||
llama_sample_typical(ctx, &candidates_p, typical_p, 1);
|
||||
llama_sample_top_p(ctx, &candidates_p, top_p, 1);
|
||||
llama_sample_temperature(ctx, &candidates_p, temp);
|
||||
id = llama_sample_token(ctx, &candidates_p);
|
||||
}
|
||||
}
|
||||
// printf("`%d`", candidates_p.size);
|
||||
|
||||
if (grammar != NULL) {
|
||||
llama_grammar_accept_token(ctx, grammar, id);
|
||||
}
|
||||
|
||||
last_n_tokens.erase(last_n_tokens.begin());
|
||||
last_n_tokens.push_back(id);
|
||||
}
|
||||
|
||||
// add it to the context
|
||||
embd.push_back(id);
|
||||
|
||||
// echo this to console
|
||||
@@ -657,12 +656,15 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// decrement remaining sampling budget
|
||||
--n_remain;
|
||||
|
||||
LOG("n_remain: %d\n", n_remain);
|
||||
} else {
|
||||
// some user input remains from prompt or interaction, forward it to processing
|
||||
LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
|
||||
while ((int) embd_inp.size() > n_consumed) {
|
||||
embd.push_back(embd_inp[n_consumed]);
|
||||
last_n_tokens.erase(last_n_tokens.begin());
|
||||
last_n_tokens.push_back(embd_inp[n_consumed]);
|
||||
last_tokens.erase(last_tokens.begin());
|
||||
last_tokens.push_back(embd_inp[n_consumed]);
|
||||
++n_consumed;
|
||||
if ((int) embd.size() >= params.n_batch) {
|
||||
break;
|
||||
@@ -673,23 +675,30 @@ int main(int argc, char ** argv) {
|
||||
// display text
|
||||
if (input_echo) {
|
||||
for (auto id : embd) {
|
||||
printf("%s", llama_token_to_str(ctx, id).c_str());
|
||||
const std::string token_str = llama_token_to_piece(ctx, id);
|
||||
printf("%s", token_str.c_str());
|
||||
|
||||
if (embd.size() > 1) {
|
||||
input_tokens.push_back(id);
|
||||
} else {
|
||||
output_tokens.push_back(id);
|
||||
output_ss << token_str;
|
||||
}
|
||||
}
|
||||
fflush(stdout);
|
||||
}
|
||||
// reset color to default if we there is no pending user input
|
||||
if (input_echo && (int)embd_inp.size() == n_consumed) {
|
||||
if (input_echo && (int) embd_inp.size() == n_consumed) {
|
||||
console::set_display(console::reset);
|
||||
}
|
||||
|
||||
// if not currently processing queued inputs;
|
||||
if ((int) embd_inp.size() <= n_consumed) {
|
||||
|
||||
// check for reverse prompt
|
||||
if (params.antiprompt.size()) {
|
||||
std::string last_output;
|
||||
for (auto id : last_n_tokens) {
|
||||
last_output += llama_token_to_str(ctx, id);
|
||||
for (auto id : last_tokens) {
|
||||
last_output += llama_token_to_piece(ctx, id);
|
||||
}
|
||||
|
||||
is_antiprompt = false;
|
||||
@@ -702,7 +711,7 @@ int main(int argc, char ** argv) {
|
||||
? last_output.length() - static_cast<size_t>(antiprompt.length() + extra_padding)
|
||||
: 0;
|
||||
|
||||
if (last_output.find(antiprompt.c_str(), search_start_pos) != std::string::npos) {
|
||||
if (last_output.find(antiprompt, search_start_pos) != std::string::npos) {
|
||||
if (params.interactive) {
|
||||
is_interacting = true;
|
||||
console::set_display(console::user_input);
|
||||
@@ -712,10 +721,16 @@ int main(int argc, char ** argv) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (is_antiprompt) {
|
||||
LOG("found antiprompt: %s\n", last_output.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
// deal with end of text token in interactive mode
|
||||
if (last_n_tokens.back() == llama_token_eos(ctx)) {
|
||||
if (last_tokens.back() == llama_token_eos(ctx)) {
|
||||
LOG("found EOS token\n");
|
||||
|
||||
if (params.interactive) {
|
||||
if (params.antiprompt.size() != 0) {
|
||||
// tokenize and inject first reverse prompt
|
||||
@@ -734,16 +749,20 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
if (n_past > 0 && is_interacting) {
|
||||
LOG("waiting for user input\n");
|
||||
|
||||
if (params.instruct) {
|
||||
printf("\n> ");
|
||||
}
|
||||
|
||||
if (params.input_prefix_bos) {
|
||||
LOG("adding input prefix BOS token\n");
|
||||
embd_inp.push_back(llama_token_bos(ctx));
|
||||
}
|
||||
|
||||
std::string buffer;
|
||||
if (!params.input_prefix.empty()) {
|
||||
LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
|
||||
buffer += params.input_prefix;
|
||||
printf("%s", buffer.c_str());
|
||||
}
|
||||
@@ -763,25 +782,43 @@ int main(int argc, char ** argv) {
|
||||
if (buffer.length() > 1) {
|
||||
// append input suffix if any
|
||||
if (!params.input_suffix.empty()) {
|
||||
LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
|
||||
buffer += params.input_suffix;
|
||||
printf("%s", params.input_suffix.c_str());
|
||||
}
|
||||
|
||||
LOG("buffer: '%s'\n", buffer.c_str());
|
||||
|
||||
const size_t original_size = embd_inp.size();
|
||||
|
||||
// instruct mode: insert instruction prefix
|
||||
if (params.instruct && !is_antiprompt) {
|
||||
LOG("inserting instruction prefix\n");
|
||||
n_consumed = embd_inp.size();
|
||||
embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
|
||||
}
|
||||
|
||||
auto line_inp = ::llama_tokenize(ctx, buffer, false);
|
||||
const auto line_inp = ::llama_tokenize(ctx, buffer, false);
|
||||
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp));
|
||||
|
||||
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
|
||||
|
||||
// instruct mode: insert response suffix
|
||||
if (params.instruct) {
|
||||
LOG("inserting instruction suffix\n");
|
||||
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
|
||||
}
|
||||
|
||||
for (size_t i = original_size; i < embd_inp.size(); ++i) {
|
||||
const llama_token token = embd_inp[i];
|
||||
output_tokens.push_back(token);
|
||||
output_ss << llama_token_to_piece(ctx, token);
|
||||
}
|
||||
|
||||
n_remain -= line_inp.size();
|
||||
LOG("n_remain: %d\n", n_remain);
|
||||
} else {
|
||||
LOG("empty line, passing control back\n");
|
||||
}
|
||||
|
||||
input_echo = false; // do not echo this again
|
||||
@@ -793,7 +830,7 @@ int main(int argc, char ** argv) {
|
||||
if (grammar != NULL) {
|
||||
llama_grammar_free(grammar);
|
||||
|
||||
std::vector<const llama_grammar_element *> grammar_rules( parsed_grammar.c_rules());
|
||||
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
|
||||
grammar = llama_grammar_init(
|
||||
grammar_rules.data(), grammar_rules.size(),
|
||||
parsed_grammar.symbol_ids.at("root"));
|
||||
@@ -805,7 +842,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// end of text token
|
||||
if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !(params.instruct || params.interactive)) {
|
||||
fprintf(stderr, " [end of text]\n");
|
||||
LOG_TEE(" [end of text]\n");
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -818,11 +855,13 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) {
|
||||
fprintf(stderr, "\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
|
||||
LOG_TEE("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
|
||||
llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
|
||||
}
|
||||
|
||||
llama_print_timings(ctx);
|
||||
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
|
||||
|
||||
if (ctx_guidance) { llama_free(ctx_guidance); }
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
@@ -832,5 +871,9 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
llama_backend_free();
|
||||
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
LOG_TEE("Log end\n")
|
||||
#endif // LOG_DISABLE_LOGS
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -3,16 +3,79 @@
|
||||
#include "build-info.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
#include <sstream>
|
||||
#include <cstring>
|
||||
#include <thread>
|
||||
#include <mutex>
|
||||
#include <vector>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
struct results_perplexity {
|
||||
std::vector<llama_token> tokens;
|
||||
double ppl_value;
|
||||
std::vector<float> logits;
|
||||
std::vector<float> probs;
|
||||
};
|
||||
|
||||
struct results_log_softmax {
|
||||
double log_softmax;
|
||||
float logit;
|
||||
float prob;
|
||||
};
|
||||
|
||||
void write_logfile(const llama_context * ctx, const gpt_params & params,
|
||||
const llama_model * model, const struct results_perplexity & results) {
|
||||
|
||||
if (params.logdir.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (params.hellaswag) {
|
||||
fprintf(stderr, "%s: warning: logging results is not implemented for HellaSwag. No files will be written.\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
const std::string timestamp = get_sortable_timestamp();
|
||||
|
||||
const bool success = create_directory_with_parents(params.logdir);
|
||||
if (!success) {
|
||||
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
|
||||
__func__, params.logdir.c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
const std::string logfile_path = params.logdir + timestamp + ".yml";
|
||||
FILE * logfile = fopen(logfile_path.c_str(), "w");
|
||||
|
||||
if (logfile == NULL) {
|
||||
fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
fprintf(logfile, "binary: main\n");
|
||||
char model_desc[128];
|
||||
llama_model_desc(model, model_desc, sizeof(model_desc));
|
||||
dump_non_result_info_yaml(logfile, params, ctx, timestamp, results.tokens, model_desc);
|
||||
|
||||
fprintf(logfile, "\n");
|
||||
fprintf(logfile, "######################\n");
|
||||
fprintf(logfile, "# Perplexity Results #\n");
|
||||
fprintf(logfile, "######################\n");
|
||||
fprintf(logfile, "\n");
|
||||
|
||||
dump_vector_float_yaml(logfile, "logits", results.logits);
|
||||
fprintf(logfile, "ppl_value: %f\n", results.ppl_value);
|
||||
dump_vector_float_yaml(logfile, "probs", results.probs);
|
||||
|
||||
llama_dump_timing_info_yaml(logfile, ctx);
|
||||
fclose(logfile);
|
||||
}
|
||||
|
||||
std::vector<float> softmax(const std::vector<float>& logits) {
|
||||
std::vector<float> probs(logits.size());
|
||||
float max_logit = logits[0];
|
||||
@@ -29,20 +92,20 @@ std::vector<float> softmax(const std::vector<float>& logits) {
|
||||
return probs;
|
||||
}
|
||||
|
||||
float log_softmax(int n_vocab, const float * logits, int tok) {
|
||||
results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
|
||||
float max_logit = logits[0];
|
||||
for (int i = 1; i < n_vocab; ++i) max_logit = std::max(max_logit, logits[i]);
|
||||
double sum_exp = 0.0;
|
||||
for (int i = 0; i < n_vocab; ++i) sum_exp += expf(logits[i] - max_logit);
|
||||
return logits[tok] - max_logit - log(sum_exp);
|
||||
return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp};
|
||||
}
|
||||
|
||||
void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread>& workers,
|
||||
double& nll, double& nll2) {
|
||||
void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
|
||||
double & nll, double & nll2, float * logit_history, float * prob_history) {
|
||||
|
||||
std::mutex mutex;
|
||||
int counter = 0;
|
||||
auto compute = [&mutex, &counter, &nll, &nll2, n_vocab, logits, tokens, n_token] () {
|
||||
auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
|
||||
double local_nll = 0, local_nll2 = 0;
|
||||
while (true) {
|
||||
std::unique_lock<std::mutex> lock(mutex);
|
||||
@@ -52,34 +115,51 @@ void process_logits(int n_vocab, const float * logits, const int * tokens, int n
|
||||
break;
|
||||
}
|
||||
lock.unlock();
|
||||
double v = -log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]);
|
||||
const results_log_softmax results = log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]);
|
||||
const double v = -results.log_softmax;
|
||||
local_nll += v;
|
||||
local_nll2 += v*v;
|
||||
|
||||
logit_history[i] = results.logit;
|
||||
prob_history[i] = results.prob;
|
||||
}
|
||||
};
|
||||
for (auto& w : workers) w = std::thread(compute);
|
||||
for (auto & w : workers) w = std::thread(compute);
|
||||
compute();
|
||||
for (auto& w : workers) w.join();
|
||||
for (auto & w : workers) w.join();
|
||||
|
||||
}
|
||||
|
||||
void perplexity_v2(llama_context * ctx, const gpt_params & params) {
|
||||
results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
|
||||
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
|
||||
// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
|
||||
// Output: `perplexity: 13.5106 [114/114]`
|
||||
// BOS tokens will be added for each chunk before eval
|
||||
|
||||
if (params.ppl_stride <= 0) {
|
||||
fprintf(stderr, "%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
|
||||
return;
|
||||
}
|
||||
|
||||
const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
|
||||
const bool add_bos = is_spm;
|
||||
|
||||
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
|
||||
|
||||
auto tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
|
||||
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
|
||||
|
||||
if (int(tokens.size()) < 2*params.n_ctx) {
|
||||
fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*params.n_ctx,
|
||||
params.n_ctx);
|
||||
fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
|
||||
return {std::move(tokens), 0., {}, {}};
|
||||
}
|
||||
|
||||
std::vector<float> logit_history;
|
||||
std::vector<float> prob_history;
|
||||
|
||||
logit_history.resize(tokens.size());
|
||||
prob_history.resize(tokens.size());
|
||||
|
||||
if (params.ppl_stride <= 0) {
|
||||
fprintf(stderr, "%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
|
||||
return {tokens, -1, logit_history, prob_history};
|
||||
}
|
||||
|
||||
const int calc_chunk = params.n_ctx;
|
||||
|
||||
@@ -88,7 +168,7 @@ void perplexity_v2(llama_context * ctx, const gpt_params & params) {
|
||||
if (int(tokens.size()) <= calc_chunk) {
|
||||
fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
|
||||
tokens.size(), params.n_ctx, params.ppl_stride);
|
||||
return;
|
||||
return {tokens, -1, logit_history, prob_history};
|
||||
}
|
||||
|
||||
const int n_chunk_max = (tokens.size() - calc_chunk + params.ppl_stride - 1) / params.ppl_stride;
|
||||
@@ -120,7 +200,7 @@ void perplexity_v2(llama_context * ctx, const gpt_params & params) {
|
||||
//fprintf(stderr, " Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
|
||||
if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) {
|
||||
//fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return;
|
||||
return {tokens, -1, logit_history, prob_history};
|
||||
}
|
||||
|
||||
// save original token and restore it after eval
|
||||
@@ -161,6 +241,8 @@ void perplexity_v2(llama_context * ctx, const gpt_params & params) {
|
||||
logits.begin() + (j + 1) * n_vocab);
|
||||
|
||||
const float prob = softmax(tok_logits)[tokens[start + j + 1]];
|
||||
logit_history[start + j + 1] = tok_logits[tokens[start + j + 1]];
|
||||
prob_history[start + j + 1] = prob;
|
||||
|
||||
nll += -std::log(prob);
|
||||
++count;
|
||||
@@ -174,12 +256,14 @@ void perplexity_v2(llama_context * ctx, const gpt_params & params) {
|
||||
fflush(stdout);
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
return {tokens, std::exp(nll / count), logit_history, prob_history};
|
||||
}
|
||||
|
||||
void perplexity(llama_context * ctx, const gpt_params & params) {
|
||||
results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
|
||||
|
||||
if (params.ppl_stride > 0) {
|
||||
perplexity_v2(ctx, params);
|
||||
return;
|
||||
return perplexity_v2(ctx, params);
|
||||
}
|
||||
|
||||
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
|
||||
@@ -190,9 +274,26 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
|
||||
const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
|
||||
const bool add_bos = is_spm;
|
||||
|
||||
auto tim1 = std::chrono::high_resolution_clock::now();
|
||||
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
|
||||
|
||||
auto tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
|
||||
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
|
||||
|
||||
auto tim2 = std::chrono::high_resolution_clock::now();
|
||||
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
|
||||
|
||||
if (int(tokens.size()) < 2*params.n_ctx) {
|
||||
fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*params.n_ctx,
|
||||
params.n_ctx);
|
||||
fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
|
||||
return {std::move(tokens), 0., {}, {}};
|
||||
}
|
||||
|
||||
std::vector<float> logit_history;
|
||||
logit_history.resize(tokens.size());
|
||||
|
||||
std::vector<float> prob_history;
|
||||
prob_history.resize(tokens.size());
|
||||
|
||||
const int n_chunk_max = tokens.size() / params.n_ctx;
|
||||
|
||||
@@ -232,7 +333,7 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
|
||||
|
||||
if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return;
|
||||
return {tokens, -1, logit_history, prob_history};
|
||||
}
|
||||
|
||||
// restore the original token in case it was set to BOS
|
||||
@@ -267,8 +368,9 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
|
||||
// Example, we have a context window of 512, we will compute perplexity for each of the
|
||||
// last 256 tokens. Then, we split the input up into context window size chunks to
|
||||
// process the entire prompt.
|
||||
const int first = std::min(512, params.n_ctx/2);
|
||||
process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, params.n_ctx - 1 - first, workers, nll, nll2);
|
||||
const int first = params.n_ctx/2;
|
||||
process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, params.n_ctx - 1 - first,
|
||||
workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
|
||||
count += params.n_ctx - first - 1;
|
||||
|
||||
// perplexity is e^(average negative log-likelihood)
|
||||
@@ -283,16 +385,19 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
|
||||
fflush(stdout);
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
nll2 /= count;
|
||||
nll /= count;
|
||||
const double ppl = exp(nll);
|
||||
nll2 -= nll * nll;
|
||||
if (nll2 > 0) {
|
||||
nll2 = sqrt(nll2/(count-1));
|
||||
double ppl = exp(nll);
|
||||
printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
|
||||
} else {
|
||||
printf("Unexpected negative standard deviation of log(prob)\n");
|
||||
}
|
||||
|
||||
return {tokens, ppl, logit_history, prob_history};
|
||||
}
|
||||
|
||||
std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch,
|
||||
@@ -392,7 +497,7 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||
hs_data[i].context = prompt_lines[idx*6];
|
||||
hs_data[i].gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
|
||||
for (size_t j=0; j < 4; j++) {
|
||||
hs_data[i].ending[j] = " " + prompt_lines[idx*6+2+j];
|
||||
hs_data[i].ending[j] = prompt_lines[idx*6+2+j];
|
||||
}
|
||||
|
||||
// Delete the selected random example from the prompt
|
||||
@@ -417,7 +522,7 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||
size_t context_size = context_embd.size();
|
||||
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
ending_tokens[i] = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[i], add_bos);
|
||||
ending_tokens[i] = ::llama_tokenize(ctx, hs_data[task_idx].context + " " + hs_data[task_idx].ending[i], add_bos);
|
||||
for (int k = 0; k < int(context_size); ++k) {
|
||||
if (ending_tokens[i][k] != context_embd[k]) {
|
||||
fprintf(stderr, "Oops: ending %d of task %d differs from context at position %d\n",i,int(task_idx),k);
|
||||
@@ -563,11 +668,6 @@ int main(int argc, char ** argv) {
|
||||
params.n_ctx += params.ppl_stride/2;
|
||||
}
|
||||
|
||||
if (params.n_ctx > 2048) {
|
||||
fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
|
||||
"expect poor results\n", __func__, params.n_ctx);
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
|
||||
|
||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
||||
@@ -593,6 +693,11 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (params.n_ctx > llama_n_ctx(ctx)) {
|
||||
fprintf(stderr, "%s: warning: model might not support context sizes greater than %d tokens (%d specified);"
|
||||
"expect poor results\n", __func__, llama_n_ctx(ctx), params.n_ctx);
|
||||
}
|
||||
|
||||
// print system information
|
||||
{
|
||||
fprintf(stderr, "\n");
|
||||
@@ -600,13 +705,16 @@ int main(int argc, char ** argv) {
|
||||
params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
|
||||
}
|
||||
|
||||
struct results_perplexity results;
|
||||
if (params.hellaswag) {
|
||||
hellaswag_score(ctx, params);
|
||||
} else {
|
||||
perplexity(ctx, params);
|
||||
results = perplexity(ctx, params);
|
||||
}
|
||||
|
||||
llama_print_timings(ctx);
|
||||
write_logfile(ctx, params, model, results);
|
||||
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
|
||||
@@ -35,6 +35,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
|
||||
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
|
||||
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", },
|
||||
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
|
||||
// Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
|
||||
{ "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", },
|
||||
};
|
||||
|
||||
|
||||
@@ -71,12 +73,17 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std:
|
||||
// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
|
||||
//
|
||||
void usage(const char * executable) {
|
||||
fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
|
||||
fprintf(stderr, " --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
|
||||
fprintf(stderr, " --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
|
||||
fprintf(stderr, "\nAllowed quantization types:\n");
|
||||
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
|
||||
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
|
||||
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
|
||||
printf("\nAllowed quantization types:\n");
|
||||
for (auto & it : QUANT_OPTIONS) {
|
||||
printf(" %2d or %-6s : %s\n", it.ftype, it.name.c_str(), it.desc.c_str());
|
||||
if (it.name != "COPY") {
|
||||
printf(" %2d or ", it.ftype);
|
||||
} else {
|
||||
printf(" ");
|
||||
}
|
||||
printf("%-6s : %s\n", it.name.c_str(), it.desc.c_str());
|
||||
}
|
||||
exit(1);
|
||||
}
|
||||
@@ -100,7 +107,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
}
|
||||
|
||||
if (argc - arg_idx < 3) {
|
||||
if (argc - arg_idx < 2) {
|
||||
usage(argv[0]);
|
||||
}
|
||||
|
||||
@@ -114,13 +121,16 @@ int main(int argc, char ** argv) {
|
||||
std::string ftype_str;
|
||||
if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
|
||||
std::string fpath;
|
||||
const size_t pos = fname_inp.find_last_of('/');
|
||||
const size_t pos = fname_inp.find_last_of("/\\");
|
||||
if (pos != std::string::npos) {
|
||||
fpath = fname_inp.substr(0, pos + 1);
|
||||
}
|
||||
// export as [inp path]/ggml-model-[ftype].gguf
|
||||
fname_out = fpath + "ggml-model-" + ftype_str + ".gguf";
|
||||
arg_idx++;
|
||||
if (ftype_str == "COPY") {
|
||||
params.only_copy = true;
|
||||
}
|
||||
}
|
||||
else {
|
||||
fname_out = argv[arg_idx];
|
||||
@@ -133,6 +143,10 @@ int main(int argc, char ** argv) {
|
||||
if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
|
||||
fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]);
|
||||
return 1;
|
||||
} else {
|
||||
if (ftype_str == "COPY") {
|
||||
params.only_copy = true;
|
||||
}
|
||||
}
|
||||
arg_idx++;
|
||||
}
|
||||
|
||||
@@ -87,7 +87,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
auto next_token = llama_sample_token(ctx, &candidates_p);
|
||||
auto next_token_str = llama_token_to_str(ctx, next_token);
|
||||
auto next_token_str = llama_token_to_piece(ctx, next_token);
|
||||
last_n_tokens_data.push_back(next_token);
|
||||
|
||||
printf("%s", next_token_str.c_str());
|
||||
@@ -147,7 +147,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
auto next_token = llama_sample_token(ctx2, &candidates_p);
|
||||
auto next_token_str = llama_token_to_str(ctx2, next_token);
|
||||
auto next_token_str = llama_token_to_piece(ctx2, next_token);
|
||||
last_n_tokens_data.push_back(next_token);
|
||||
|
||||
printf("%s", next_token_str.c_str());
|
||||
|
||||
@@ -164,6 +164,12 @@ node index.js
|
||||
|
||||
Note that the special `BOS` token is not added in front of the text and also a space character is not inserted automatically as it is for `/completion`.
|
||||
|
||||
- **POST** `/detokenize`: Convert tokens to text.
|
||||
|
||||
*Options:*
|
||||
|
||||
`tokens`: Set the tokens to detokenize.
|
||||
|
||||
- **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does.
|
||||
|
||||
*Options:*
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -145,7 +145,29 @@
|
||||
color: #888;
|
||||
}
|
||||
|
||||
|
||||
@keyframes loading-bg-wipe {
|
||||
0% {
|
||||
background-position: 0%;
|
||||
}
|
||||
100% {
|
||||
background-position: 100%;
|
||||
}
|
||||
}
|
||||
|
||||
.loading {
|
||||
--loading-color-1: #eeeeee00;
|
||||
--loading-color-2: #eeeeeeff;
|
||||
background-size: 50% 100%;
|
||||
background-image: linear-gradient(90deg, var(--loading-color-1), var(--loading-color-2), var(--loading-color-1));
|
||||
animation: loading-bg-wipe 2s linear infinite;
|
||||
}
|
||||
|
||||
@media (prefers-color-scheme: dark) {
|
||||
.loading {
|
||||
--loading-color-1: #22222200;
|
||||
--loading-color-2: #222222ff;
|
||||
}
|
||||
.popover-content {
|
||||
background-color: black;
|
||||
}
|
||||
@@ -321,7 +343,10 @@
|
||||
const llamaStats = signal(null)
|
||||
const controller = signal(null)
|
||||
|
||||
const generating = computed(() => controller.value == null )
|
||||
// currently generating a completion?
|
||||
const generating = computed(() => controller.value != null)
|
||||
|
||||
// has the user started a chat?
|
||||
const chatStarted = computed(() => session.value.transcript.length > 0)
|
||||
|
||||
const transcriptUpdate = (transcript) => {
|
||||
@@ -430,11 +455,19 @@
|
||||
return html`
|
||||
<form onsubmit=${submit}>
|
||||
<div>
|
||||
<textarea type="text" rows=2 onkeypress=${enterSubmits} value="${message}" oninput=${(e) => message.value = e.target.value} placeholder="Say something..."/>
|
||||
<textarea
|
||||
className=${generating.value ? "loading" : null}
|
||||
oninput=${(e) => message.value = e.target.value}
|
||||
onkeypress=${enterSubmits}
|
||||
placeholder="Say something..."
|
||||
rows=2
|
||||
type="text"
|
||||
value="${message}"
|
||||
/>
|
||||
</div>
|
||||
<div class="right">
|
||||
<button type="submit" disabled=${!generating.value} >Send</button>
|
||||
<button onclick=${stop} disabled=${generating}>Stop</button>
|
||||
<button type="submit" disabled=${generating.value}>Send</button>
|
||||
<button onclick=${stop} disabled=${!generating.value}>Stop</button>
|
||||
<button onclick=${reset}>Reset</button>
|
||||
</div>
|
||||
</form>
|
||||
|
||||
@@ -17,6 +17,8 @@
|
||||
#include "completion.js.hpp"
|
||||
#include "json-schema-to-grammar.mjs.hpp"
|
||||
|
||||
#include <cstddef>
|
||||
|
||||
#ifndef SERVER_VERBOSE
|
||||
#define SERVER_VERBOSE 1
|
||||
#endif
|
||||
@@ -94,7 +96,7 @@ static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
|
||||
std::string ret;
|
||||
for (; begin != end; ++begin)
|
||||
{
|
||||
ret += llama_token_to_str(ctx, *begin);
|
||||
ret += llama_token_to_piece(ctx, *begin);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
@@ -123,7 +125,7 @@ static void server_log(const char *level, const char *function, int line,
|
||||
// format incomplete utf-8 multibyte character for output
|
||||
static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
|
||||
{
|
||||
std::string out = token == -1 ? "" : llama_token_to_str(ctx, token);
|
||||
std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
|
||||
// if the size is 1 and first bit is 1, meaning it's a partial character
|
||||
// (size > 1 meaning it's already a known token)
|
||||
if (out.size() == 1 && (out[0] & 0x80) == 0x80)
|
||||
@@ -286,7 +288,6 @@ struct llama_server_context
|
||||
std::vector<llama_token> p;
|
||||
if (first)
|
||||
{
|
||||
s.insert(0, 1, ' '); // add a space if it's the first
|
||||
p = ::llama_tokenize(ctx, s, add_bos);
|
||||
first = false;
|
||||
}
|
||||
@@ -309,7 +310,6 @@ struct llama_server_context
|
||||
else
|
||||
{
|
||||
auto s = json_prompt.template get<std::string>();
|
||||
s.insert(0, 1, ' '); // always add a first space
|
||||
prompt_tokens = ::llama_tokenize(ctx, s, add_bos);
|
||||
}
|
||||
|
||||
@@ -566,7 +566,7 @@ struct llama_server_context
|
||||
|
||||
if (!embd.empty() && embd.back() == llama_token_eos(ctx))
|
||||
{
|
||||
// stopping_word = llama_token_to_str(ctx, embd.back());
|
||||
// stopping_word = llama_token_to_piece(ctx, embd.back());
|
||||
has_next_token = false;
|
||||
stopped_eos = true;
|
||||
LOG_VERBOSE("eos token found", {});
|
||||
@@ -613,7 +613,7 @@ struct llama_server_context
|
||||
{
|
||||
const completion_token_output token_with_probs = nextToken();
|
||||
|
||||
const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(ctx, token_with_probs.tok);
|
||||
const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok);
|
||||
generated_text += token_text;
|
||||
|
||||
if (params.n_probs > 0)
|
||||
@@ -721,7 +721,7 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
||||
fprintf(stdout, " -ts SPLIT --tensor-split SPLIT\n");
|
||||
fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
|
||||
fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
|
||||
fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n");
|
||||
fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n");
|
||||
fprintf(stdout, " -nommq, --no-mul-mat-q\n");
|
||||
fprintf(stdout, " use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
|
||||
fprintf(stdout, " Not recommended since this is both slower and uses more VRAM.\n");
|
||||
@@ -1040,7 +1040,7 @@ static json format_timings(llama_server_context &llama)
|
||||
{
|
||||
const auto timings = llama_get_timings(llama.ctx);
|
||||
|
||||
assert(timings.n_eval == llama.num_tokens_predicted);
|
||||
assert(timings.n_eval == ptrdiff_t(llama.num_tokens_predicted));
|
||||
|
||||
return json{
|
||||
{"prompt_n", timings.n_p_eval},
|
||||
@@ -1104,6 +1104,12 @@ static json format_tokenizer_response(const std::vector<llama_token> &tokens)
|
||||
{"tokens", tokens}};
|
||||
}
|
||||
|
||||
static json format_detokenized_response(std::string content)
|
||||
{
|
||||
return json{
|
||||
{"content", content}};
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static T json_value(const json &body, const std::string &key, const T &default_value)
|
||||
{
|
||||
@@ -1235,7 +1241,7 @@ void beam_search_callback(void * callback_data, llama_beams_state beams_state) {
|
||||
const llama_token * tokens = beams_state.beam_views[0].tokens;
|
||||
const auto map = [](llama_token tok) { return completion_token_output{{},tok}; };
|
||||
std::transform(tokens, tokens + n, llama.generated_token_probs.end() - n, map);
|
||||
printf("%lu", n);
|
||||
printf("%zu", n);
|
||||
}
|
||||
fflush(stdout);
|
||||
#if 0 // DEBUG: print current beams for this iteration
|
||||
@@ -1248,7 +1254,7 @@ void beam_search_callback(void * callback_data, llama_beams_state beams_state) {
|
||||
|
||||
struct token_translator {
|
||||
llama_context * ctx;
|
||||
std::string operator()(llama_token tok) const { return llama_token_to_str(ctx, tok); }
|
||||
std::string operator()(llama_token tok) const { return llama_token_to_piece(ctx, tok); }
|
||||
std::string operator()(completion_token_output cto) const { return (*this)(cto.tok); }
|
||||
};
|
||||
|
||||
@@ -1358,7 +1364,7 @@ int main(int argc, char **argv)
|
||||
|
||||
while (llama.has_next_token) {
|
||||
const completion_token_output token_with_probs = llama.doCompletion();
|
||||
const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(llama.ctx, token_with_probs.tok);
|
||||
const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(llama.ctx, token_with_probs.tok);
|
||||
|
||||
stop_pos = llama.findStoppingStrings(llama.generated_text,
|
||||
token_text.size(), STOP_FULL);
|
||||
@@ -1373,7 +1379,13 @@ int main(int argc, char **argv)
|
||||
}
|
||||
}
|
||||
|
||||
const json data = format_final_response(llama, llama.generated_text, llama.generated_token_probs);
|
||||
auto probs = llama.generated_token_probs;
|
||||
if (llama.params.n_probs > 0 && llama.stopped_word) {
|
||||
const std::vector<llama_token> stop_word_toks = llama_tokenize(llama.ctx, llama.stopping_word, false);
|
||||
probs = std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.end() - stop_word_toks.size());
|
||||
}
|
||||
|
||||
const json data = format_final_response(llama, llama.generated_text, probs);
|
||||
|
||||
llama_print_timings(llama.ctx);
|
||||
|
||||
@@ -1389,7 +1401,7 @@ int main(int argc, char **argv)
|
||||
if (token_with_probs.tok == -1 || llama.multibyte_pending > 0) {
|
||||
continue;
|
||||
}
|
||||
const std::string token_text = llama_token_to_str(llama.ctx, token_with_probs.tok);
|
||||
const std::string token_text = llama_token_to_piece(llama.ctx, token_with_probs.tok);
|
||||
|
||||
size_t pos = std::min(sent_count, llama.generated_text.size());
|
||||
|
||||
@@ -1450,7 +1462,11 @@ int main(int argc, char **argv)
|
||||
|
||||
if (!llama.has_next_token) {
|
||||
// Generation is done, send extra information.
|
||||
const json data = format_final_response(llama, "", llama.generated_token_probs);
|
||||
const json data = format_final_response(
|
||||
llama,
|
||||
"",
|
||||
std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.begin() + sent_token_probs_index)
|
||||
);
|
||||
|
||||
const std::string str =
|
||||
"data: " +
|
||||
@@ -1501,6 +1517,21 @@ int main(int argc, char **argv)
|
||||
const json data = format_tokenizer_response(tokens);
|
||||
return res.set_content(data.dump(), "application/json"); });
|
||||
|
||||
svr.Post("/detokenize", [&llama](const Request &req, Response &res)
|
||||
{
|
||||
auto lock = llama.lock();
|
||||
|
||||
const json body = json::parse(req.body);
|
||||
std::string content;
|
||||
if (body.count("tokens") != 0)
|
||||
{
|
||||
const std::vector<llama_token> tokens = body["tokens"];
|
||||
content = tokens_to_str(llama.ctx, tokens.cbegin(), tokens.cend());
|
||||
}
|
||||
|
||||
const json data = format_detokenized_response(content);
|
||||
return res.set_content(data.dump(), "application/json"); });
|
||||
|
||||
svr.Post("/embedding", [&llama](const Request &req, Response &res)
|
||||
{
|
||||
auto lock = llama.lock();
|
||||
@@ -1529,7 +1560,7 @@ int main(int argc, char **argv)
|
||||
|
||||
svr.set_exception_handler([](const Request &, Response &res, std::exception_ptr ep)
|
||||
{
|
||||
const auto * fmt = "500 Internal Server Error\n%s";
|
||||
const char fmt[] = "500 Internal Server Error\n%s";
|
||||
char buf[BUFSIZ];
|
||||
try {
|
||||
std::rethrow_exception(std::move(ep));
|
||||
|
||||
@@ -63,7 +63,7 @@ int main(int argc, char ** argv) {
|
||||
fprintf(stderr, "\n\n");
|
||||
|
||||
for (auto id : tokens_list) {
|
||||
fprintf(stderr, "%s", llama_token_to_str(ctx, id).c_str());
|
||||
fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
|
||||
}
|
||||
|
||||
fflush(stderr);
|
||||
@@ -112,7 +112,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// print the new token :
|
||||
printf("%s", llama_token_to_str(ctx, new_token_id).c_str());
|
||||
printf("%s", llama_token_to_piece(ctx, new_token_id).c_str());
|
||||
fflush(stdout);
|
||||
|
||||
// push this new token for next evaluation
|
||||
|
||||
8
examples/speculative/CMakeLists.txt
Normal file
8
examples/speculative/CMakeLists.txt
Normal file
@@ -0,0 +1,8 @@
|
||||
set(TARGET speculative)
|
||||
add_executable(${TARGET} speculative.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
endif()
|
||||
234
examples/speculative/speculative.cpp
Normal file
234
examples/speculative/speculative.cpp
Normal file
@@ -0,0 +1,234 @@
|
||||
#ifndef _GNU_SOURCE
|
||||
#define _GNU_SOURCE
|
||||
#endif
|
||||
|
||||
#include "build-info.h"
|
||||
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
|
||||
if (gpt_params_parse(argc, argv, params) == false) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (params.model_draft.empty()) {
|
||||
fprintf(stderr, "%s: error: --model-draft is required\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
log_set_target(log_filename_generator("speculative", "log"));
|
||||
LOG_TEE("Log start\n");
|
||||
log_dump_cmdline(argc, argv);
|
||||
#endif // LOG_DISABLE_LOGS
|
||||
|
||||
// init llama.cpp
|
||||
llama_backend_init(params.numa);
|
||||
|
||||
llama_model * model_tgt = NULL;
|
||||
llama_model * model_dft = NULL;
|
||||
|
||||
llama_context * ctx_tgt = NULL;
|
||||
llama_context * ctx_dft = NULL;
|
||||
|
||||
// load the target model
|
||||
params.perplexity = true; // HACK: enable logits_all = true
|
||||
std::tie(model_tgt, ctx_tgt) = llama_init_from_gpt_params(params);
|
||||
|
||||
// load the draft model
|
||||
params.model = params.model_draft;
|
||||
std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);
|
||||
|
||||
// tokenize the prompt
|
||||
std::vector<llama_token> inp;
|
||||
inp = ::llama_tokenize(ctx_tgt, params.prompt, true);
|
||||
|
||||
const int max_context_size = llama_n_ctx(ctx_tgt);
|
||||
const int max_tokens_list_size = max_context_size - 4;
|
||||
|
||||
if ((int) inp.size() > max_tokens_list_size) {
|
||||
fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
|
||||
return 1;
|
||||
}
|
||||
|
||||
fprintf(stderr, "\n\n");
|
||||
|
||||
for (auto id : inp) {
|
||||
fprintf(stderr, "%s", llama_token_to_piece(ctx_tgt, id).c_str());
|
||||
}
|
||||
|
||||
fflush(stderr);
|
||||
|
||||
const int n_input = inp.size();
|
||||
|
||||
const auto t_enc_start = ggml_time_us();
|
||||
|
||||
// eval the prompt with both models
|
||||
llama_eval(ctx_tgt, inp.data(), int(inp.size() - 1), 0, params.n_threads);
|
||||
llama_eval(ctx_tgt, &inp.back(), 1, inp.size() - 1, params.n_threads);
|
||||
llama_eval(ctx_dft, inp.data(), int(inp.size()), 0, params.n_threads);
|
||||
|
||||
const auto t_enc_end = ggml_time_us();
|
||||
|
||||
// the 2 models should have the same vocab
|
||||
const int n_ctx = llama_n_ctx(ctx_tgt);
|
||||
const int n_vocab = llama_n_vocab(ctx_tgt);
|
||||
//GGML_ASSERT(n_vocab == llama_n_vocab(ctx_dft));
|
||||
|
||||
// how many tokens to draft each time
|
||||
const int n_draft = params.n_draft;
|
||||
|
||||
int n_predict = 0;
|
||||
int n_drafted = 0;
|
||||
int n_accept = 0;
|
||||
|
||||
int n_past_tgt = inp.size();
|
||||
int n_past_dft = inp.size();
|
||||
|
||||
std::vector<llama_token> drafted;
|
||||
|
||||
std::vector<llama_token> last_tokens(n_ctx);
|
||||
std::fill(last_tokens.begin(), last_tokens.end(), 0);
|
||||
|
||||
for (auto & id : inp) {
|
||||
last_tokens.erase(last_tokens.begin());
|
||||
last_tokens.push_back(id);
|
||||
}
|
||||
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
|
||||
// used to determine end of generation
|
||||
bool has_eos = false;
|
||||
|
||||
const auto t_dec_start = ggml_time_us();
|
||||
|
||||
while (true) {
|
||||
LOG("drafted: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_dft, drafted));
|
||||
|
||||
// sample from the drafted tokens if any
|
||||
int i_dft = 0;
|
||||
while (true) {
|
||||
const llama_token id = llama_sample_token(ctx_tgt, NULL, NULL, params, last_tokens, candidates, i_dft);
|
||||
|
||||
last_tokens.erase(last_tokens.begin());
|
||||
last_tokens.push_back(id);
|
||||
|
||||
//LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, last_tokens));
|
||||
|
||||
const std::string token_str = llama_token_to_piece(ctx_tgt, id);
|
||||
printf("%s", token_str.c_str());
|
||||
fflush(stdout);
|
||||
|
||||
if (id == llama_token_eos(ctx_tgt)) {
|
||||
has_eos = true;
|
||||
}
|
||||
|
||||
++n_predict;
|
||||
|
||||
if (i_dft < (int) drafted.size() && id == drafted[i_dft]) {
|
||||
LOG("drafted token %d accepted\n", id);
|
||||
++n_accept;
|
||||
++n_past_tgt;
|
||||
++n_past_dft;
|
||||
++i_dft;
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// the drafted token was rejected or we are out of drafted tokens
|
||||
llama_eval(ctx_dft, &id, 1, n_past_dft, params.n_threads);
|
||||
++n_past_dft;
|
||||
|
||||
drafted.clear();
|
||||
drafted.push_back(id);
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
if (n_predict > params.n_predict || has_eos) {
|
||||
break;
|
||||
}
|
||||
|
||||
// sample n_draft tokens from the draft model picking the best token
|
||||
int n_past_cur = n_past_dft;
|
||||
for (int i = 0; i < n_draft; ++i) {
|
||||
float * logits = llama_get_logits(ctx_dft);
|
||||
|
||||
candidates.clear();
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
||||
}
|
||||
|
||||
llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
|
||||
|
||||
// computes softmax and sorts the candidates
|
||||
llama_sample_softmax(ctx_dft, &cur_p);
|
||||
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
LOG(" - draft candidate %d: %d (%.3f)\n", i, cur_p.data[i].id, cur_p.data[i].p);
|
||||
}
|
||||
|
||||
// too low probability, stop drafting
|
||||
if (cur_p.data[0].p < 2*cur_p.data[1].p) {
|
||||
break;
|
||||
}
|
||||
|
||||
drafted.push_back(cur_p.data[0].id);
|
||||
++n_drafted;
|
||||
|
||||
if (i < n_draft - 1) {
|
||||
// evaluate the drafted token on the draft model
|
||||
llama_eval(ctx_dft, &drafted.back(), 1, n_past_cur, params.n_threads);
|
||||
++n_past_cur;
|
||||
}
|
||||
}
|
||||
|
||||
// evaluate the target model on the drafted tokens
|
||||
llama_eval(ctx_tgt, drafted.data(), drafted.size(), n_past_tgt, params.n_threads);
|
||||
++n_past_tgt;
|
||||
|
||||
drafted.erase(drafted.begin());
|
||||
}
|
||||
|
||||
auto t_dec_end = ggml_time_us();
|
||||
|
||||
LOG_TEE("\n\n");
|
||||
|
||||
LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
|
||||
LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
|
||||
|
||||
// TODO: make sure these numbers are computed correctly
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("n_draft = %d\n", n_draft);
|
||||
LOG_TEE("n_predict = %d\n", n_predict);
|
||||
LOG_TEE("n_drafted = %d\n", n_drafted);
|
||||
LOG_TEE("n_accept = %d\n", n_accept);
|
||||
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
|
||||
|
||||
LOG_TEE("\ndraft:\n");
|
||||
llama_print_timings(ctx_dft);
|
||||
|
||||
LOG_TEE("\ntarget:\n");
|
||||
llama_print_timings(ctx_tgt);
|
||||
|
||||
llama_free(ctx_tgt);
|
||||
llama_free_model(model_tgt);
|
||||
|
||||
llama_free(ctx_dft);
|
||||
llama_free_model(model_dft);
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
fprintf(stderr, "\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -8,15 +8,15 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s
|
||||
|
||||
# train
|
||||
./bin/train-text-from-scratch \
|
||||
--vocab-model ../models/ggml-vocab.bin \
|
||||
--vocab-model ../models/ggml-vocab-llama.gguf \
|
||||
--ctx 64 --embd 256 --head 8 --layer 16 \
|
||||
--checkpoint-in chk-shakespeare-256x16.bin \
|
||||
--checkpoint-out chk-shakespeare-256x16.bin \
|
||||
--model-out ggml-shakespeare-256x16-f32.bin \
|
||||
--checkpoint-in chk-shakespeare-256x16.gguf \
|
||||
--checkpoint-out chk-shakespeare-256x16.gguf \
|
||||
--model-out ggml-shakespeare-256x16-f32.gguf \
|
||||
--train-data "shakespeare.txt" \
|
||||
-t 6 -b 16 -n 32 --seed 1 --adam-iter 16 \
|
||||
--print-details-interval 0 --predict 16 --use-flash
|
||||
-t 6 -b 16 --seed 1 --adam-iter 256 \
|
||||
--no-checkpointing
|
||||
|
||||
# predict
|
||||
./bin/main -m ggml-shakespeare-256x16-f32.bin
|
||||
./bin/main -m ggml-shakespeare-256x16-f32.gguf
|
||||
```
|
||||
|
||||
@@ -0,0 +1,495 @@
|
||||
#!/usr/bin/env python3
|
||||
# train-text-from-scratch checkpoint --> gguf conversion
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import struct
|
||||
import sys
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
|
||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
sys.path.insert(1, str(Path(__file__).parent / '..' / '..' / 'gguf-py' / 'gguf'))
|
||||
import gguf
|
||||
|
||||
# gguf constants
|
||||
LLM_KV_OPTIMIZER_TYPE = "optimizer.type"
|
||||
LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"
|
||||
LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"
|
||||
LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version"
|
||||
LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count"
|
||||
LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count"
|
||||
LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count"
|
||||
LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized"
|
||||
LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss"
|
||||
LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss"
|
||||
LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count"
|
||||
LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"
|
||||
LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss"
|
||||
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step"
|
||||
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j"
|
||||
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k"
|
||||
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end"
|
||||
LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"
|
||||
|
||||
LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments"
|
||||
LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments"
|
||||
LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"
|
||||
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"
|
||||
|
||||
LLM_KV_TRAINING_FILE_VERSION = "training.file_version"
|
||||
LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"
|
||||
LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"
|
||||
LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"
|
||||
|
||||
class Tensor:
|
||||
def __init__(self, dtype='f', ne=None):
|
||||
if ne is None:
|
||||
ne = []
|
||||
self.dtype = dtype
|
||||
self.ne = ne
|
||||
self.nbytes = 0
|
||||
if self.dtype == 'f':
|
||||
if len(self.ne) == 0:
|
||||
self.nbytes = 0
|
||||
else:
|
||||
self.nbytes = int(np.product(self.ne)) * 4
|
||||
else:
|
||||
raise ValueError(f"Unhandled data type '{self.dtype}'")
|
||||
|
||||
def load(self, data, offset):
|
||||
nd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
namelen = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
dtype = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
|
||||
assert(nd == len(self.ne))
|
||||
ne = []
|
||||
for d in range(nd):
|
||||
n = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
ne.append(n)
|
||||
|
||||
assert(tuple(ne) == tuple(self.ne))
|
||||
|
||||
if self.dtype == 'f':
|
||||
assert(dtype == 0)
|
||||
else:
|
||||
raise ValueError(f"Unhandled data type '{self.dtype}'")
|
||||
|
||||
self.name = bytes(data[offset:offset+namelen]); offset += namelen
|
||||
# 32-byte alignment
|
||||
offset += (0 - offset) & 31
|
||||
self.data = data[offset:offset+self.nbytes]
|
||||
offset += self.nbytes
|
||||
return offset
|
||||
|
||||
def max_storage_size(self):
|
||||
result = 0
|
||||
result += 4 # nd
|
||||
result += 4 # namelen
|
||||
result += 4 # dtype
|
||||
result += len(self.ne)*8 # ne
|
||||
result += 48 # name (maximum as of commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9)
|
||||
result += 31 # 32-byte alignment
|
||||
result += self.nbytes
|
||||
return result
|
||||
|
||||
def save_gguf(self, gguf_writer, name):
|
||||
gguf_writer.add_tensor(
|
||||
name=name,
|
||||
tensor=self.data,
|
||||
raw_shape=np.array(list(reversed(self.ne))),
|
||||
raw_dtype=gguf.GGMLQuantizationType.F32)
|
||||
|
||||
class OptimizationParamsV0:
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def load(self, data, offset):
|
||||
self.type = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_threads = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.past = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.delta = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.print_forward_graph = struct.unpack('<?', bytes(data[offset:offset + 1]))[0]; offset += 4 # 32bit-aligned
|
||||
self.print_backward_graph = struct.unpack('<?', bytes(data[offset:offset + 1]))[0]; offset += 4 # 32bit-aligned
|
||||
self.adam_n_iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_sched = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_decay = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_alpha = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_beta1 = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_beta2 = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_eps = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_eps_f = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_eps_g = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_n_iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_max_linesearch = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_eps = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_ftol = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_wolfe = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_min_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_max_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_linesearch = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
return offset
|
||||
|
||||
class OptimizationContext:
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def load(self, data, offset):
|
||||
self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]
|
||||
offset += 4
|
||||
|
||||
if self.version == 0:
|
||||
params = OptimizationParamsV0()
|
||||
offset = params.load(data, offset)
|
||||
self.past = params.past
|
||||
self.lbfgs_m = params.lbfgs_m
|
||||
self.nx = struct.unpack('N', bytes(data[offset:offset + 8]))[0]; offset += 8
|
||||
self.iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]); offset += 4
|
||||
self.type = params.type
|
||||
|
||||
self.adam_m = Tensor('f', [self.nx])
|
||||
self.adam_v = Tensor('f', [self.nx])
|
||||
self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])
|
||||
|
||||
self.lbfgs_x = Tensor('f', [self.nx])
|
||||
self.lbfgs_xp = Tensor('f', [self.nx])
|
||||
self.lbfgs_g = Tensor('f', [self.nx])
|
||||
self.lbfgs_gp = Tensor('f', [self.nx])
|
||||
self.lbfgs_d = Tensor('f', [self.nx])
|
||||
self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else [])
|
||||
self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
|
||||
self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
|
||||
self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m])
|
||||
self.lbfgs_lmy = Tensor('f', [self.nx, self.lbfgs_m])
|
||||
|
||||
if self.type == 0:
|
||||
# these tensors are stored, but we don't need their data
|
||||
x = Tensor('f', [self.nx])
|
||||
g = Tensor('f', [self.nx])
|
||||
g2 = Tensor('f', [self.nx])
|
||||
mh = Tensor('f', [self.nx])
|
||||
vh = Tensor('f', [self.nx])
|
||||
|
||||
offset = x.load(data, offset)
|
||||
offset = g.load(data, offset)
|
||||
offset = g2.load(data, offset)
|
||||
offset = self.adam_m.load(data, offset)
|
||||
offset = self.adam_v.load(data, offset)
|
||||
offset = mh.load(data, offset)
|
||||
offset = vh.load(data, offset)
|
||||
offset = self.adam_pf.load(data, offset)
|
||||
|
||||
self.adam_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_fx_prev = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
|
||||
elif self.type == 1:
|
||||
offset = self.lbfgs_x.load(data, offset)
|
||||
offset = self.lbfgs_xp.load(data, offset)
|
||||
offset = self.lbfgs_g.load(data, offset)
|
||||
offset = self.lbfgs_gp.load(data, offset)
|
||||
offset = self.lbfgs_d.load(data, offset)
|
||||
offset = self.lbfgs_pf.load(data, offset)
|
||||
offset = self.lbfgs_lmal.load(data, offset)
|
||||
offset = self.lbfgs_lmys.load(data, offset)
|
||||
offset = self.lbfgs_lms.load(data, offset)
|
||||
offset = self.lbfgs_lmy.load(data, offset)
|
||||
|
||||
self.lbfgs_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_j = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_k = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_end = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
|
||||
else:
|
||||
raise ValueError('Unknown optimizer type')
|
||||
|
||||
|
||||
elif self.version == 1:
|
||||
self.past = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.nx = struct.unpack('N', bytes(data[offset:offset + 8]))[0]; offset += 8
|
||||
self.iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]); offset += 4
|
||||
|
||||
self.adam_m = Tensor('f', [self.nx])
|
||||
self.adam_v = Tensor('f', [self.nx])
|
||||
self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])
|
||||
|
||||
self.lbfgs_x = Tensor('f', [self.nx])
|
||||
self.lbfgs_xp = Tensor('f', [self.nx])
|
||||
self.lbfgs_g = Tensor('f', [self.nx])
|
||||
self.lbfgs_gp = Tensor('f', [self.nx])
|
||||
self.lbfgs_d = Tensor('f', [self.nx])
|
||||
self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else [])
|
||||
self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
|
||||
self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
|
||||
self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m])
|
||||
self.lbfgs_lmy = Tensor('f', [self.nx, self.lbfgs_m])
|
||||
|
||||
# forgot to save type in version 1:
|
||||
# guess self.type from number of remaining bytes
|
||||
size_type_0 = 12 + sum([t.max_storage_size() for t in
|
||||
[self.adam_m, self.adam_v]
|
||||
+([self.adam_pf] if (self.past > 0) else [])])
|
||||
size_type_1 = 24 + sum([t.max_storage_size() for t in
|
||||
[self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g,
|
||||
self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf,
|
||||
self.lbfgs_lmal, self.lbfgs_lmys,
|
||||
self.lbfgs_lms, self.lbfgs_lmy]
|
||||
+([self.lbfgs_pf] if (self.past > 0) else [])])
|
||||
# due to alignment padding the size might not by exact
|
||||
# but the difference in size for both types is significant,
|
||||
# so we can just use whichever is closest
|
||||
remaining = len(data) - offset
|
||||
if abs(remaining - size_type_0) < abs(remaining - size_type_1):
|
||||
self.type = 0
|
||||
else:
|
||||
self.type = 1
|
||||
|
||||
if self.type == 0:
|
||||
offset = self.adam_m.load(data, offset)
|
||||
offset = self.adam_v.load(data, offset)
|
||||
offset = self.adam_pf.load(data,offset)
|
||||
|
||||
self.adam_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_fx_prev = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
|
||||
elif self.type == 1:
|
||||
offset = self.lbfgs_x.load(data, offset)
|
||||
offset = self.lbfgs_xp.load(data, offset)
|
||||
offset = self.lbfgs_g.load(data, offset)
|
||||
offset = self.lbfgs_gp.load(data, offset)
|
||||
offset = self.lbfgs_d.load(data, offset)
|
||||
offset = self.lbfgs_pf.load(data, offset)
|
||||
offset = self.lbfgs_lmal.load(data, offset)
|
||||
offset = self.lbfgs_lmys.load(data, offset)
|
||||
offset = self.lbfgs_lms.load(data, offset)
|
||||
offset = self.lbfgs_lmy.load(data, offset)
|
||||
|
||||
self.lbfgs_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_j = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_k = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_end = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
|
||||
else:
|
||||
raise ValueError('Invalid version of checkpoint file')
|
||||
|
||||
return offset
|
||||
|
||||
def save_gguf(self, gguf_writer):
|
||||
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_FILE_VERSION, 0)
|
||||
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, self.past)
|
||||
gguf_writer.add_uint64(LLM_KV_OPTIMIZER_PARAMETER_COUNT, self.nx)
|
||||
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ITERATION_COUNT, self.iter)
|
||||
gguf_writer.add_bool(LLM_KV_OPTIMIZER_JUST_INITIALIZED, self.just_initialized)
|
||||
|
||||
if self.type == 0:
|
||||
gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM)
|
||||
gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, self.adam_fx_best)
|
||||
gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, self.adam_fx_prev)
|
||||
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, self.adam_n_no_improvement)
|
||||
|
||||
self.adam_m.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS)
|
||||
self.adam_v.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS)
|
||||
if self.past > 0:
|
||||
self.adam_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES)
|
||||
|
||||
elif self.type == 1:
|
||||
gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS)
|
||||
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, self.lbfgs_m)
|
||||
gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, self.lbfgs_fx_best)
|
||||
gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, self.lbfgs_step)
|
||||
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, self.lbfgs_j)
|
||||
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, self.lbfgs_k)
|
||||
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, self.lbfgs_end)
|
||||
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, self.lbfgs_n_no_improvement)
|
||||
|
||||
self.lbfgs_x.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS)
|
||||
self.lbfgs_xp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS)
|
||||
self.lbfgs_g.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS)
|
||||
self.lbfgs_gp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS)
|
||||
self.lbfgs_d.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION)
|
||||
if self.past > 0:
|
||||
self.lbfgs_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES)
|
||||
self.lbfgs_lmal.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA)
|
||||
self.lbfgs_lmys.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS)
|
||||
self.lbfgs_lms.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S)
|
||||
self.lbfgs_lmy.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y)
|
||||
else:
|
||||
raise ValueError('Unknown optimizer type')
|
||||
|
||||
class ModelParams:
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def load(self, data, offset):
|
||||
self.n_vocab = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_embd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_mult = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_head = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_layer = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_rot = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
return offset
|
||||
|
||||
def get_n_ff(self):
|
||||
# struct my_llama_model::get_n_ff in train-text-from-scratch.cpp commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9
|
||||
return ((2*(4*self.n_embd)//3 + self.n_mult - 1)//self.n_mult)*self.n_mult
|
||||
|
||||
def save_gguf(self, gguf_writer):
|
||||
# self.n_vocab not saved
|
||||
gguf_writer.add_embedding_length(self.n_embd)
|
||||
gguf_writer.add_head_count(self.n_head)
|
||||
gguf_writer.add_block_count(self.n_layer)
|
||||
gguf_writer.add_rope_dimension_count(self.n_rot)
|
||||
gguf_writer.add_feed_forward_length(self.get_n_ff())
|
||||
|
||||
def tensor_name(key, bid=None):
|
||||
return gguf.MODEL_TENSOR_NAMES[gguf.MODEL_ARCH.LLAMA][key].format(bid=bid) + ".weight"
|
||||
|
||||
class Layer:
|
||||
def __init__(self, params, bid):
|
||||
self.bid = bid
|
||||
self.att_norm = Tensor('f', [params.n_embd])
|
||||
self.wq = Tensor('f', [params.n_embd, params.n_embd])
|
||||
self.wk = Tensor('f', [params.n_embd, params.n_embd])
|
||||
self.wv = Tensor('f', [params.n_embd, params.n_embd])
|
||||
self.wo = Tensor('f', [params.n_embd, params.n_embd])
|
||||
self.ffn_norm = Tensor('f', [params.n_embd])
|
||||
self.w1 = Tensor('f', [params.n_embd, params.get_n_ff()])
|
||||
self.w2 = Tensor('f', [params.get_n_ff(), params.n_embd])
|
||||
self.w3 = Tensor('f', [params.n_embd, params.get_n_ff()])
|
||||
|
||||
def load(self, data, offset):
|
||||
offset = self.att_norm.load(data, offset)
|
||||
offset = self.wq.load(data, offset)
|
||||
offset = self.wk.load(data, offset)
|
||||
offset = self.wv.load(data, offset)
|
||||
offset = self.wo.load(data, offset)
|
||||
offset = self.ffn_norm.load(data, offset)
|
||||
offset = self.w1.load(data, offset)
|
||||
offset = self.w2.load(data, offset)
|
||||
offset = self.w3.load(data, offset)
|
||||
return offset
|
||||
|
||||
def save_gguf(self, gguf_writer):
|
||||
self.att_norm.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid))
|
||||
self.wq.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q, self.bid))
|
||||
self.wk.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K, self.bid))
|
||||
self.wv.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V, self.bid))
|
||||
self.wo.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, self.bid))
|
||||
self.ffn_norm.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM, self.bid))
|
||||
self.w1.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE, self.bid))
|
||||
self.w2.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, self.bid))
|
||||
self.w3.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP, self.bid))
|
||||
|
||||
class Model:
|
||||
def __init__(self):
|
||||
self.params = ModelParams()
|
||||
self.layers = []
|
||||
|
||||
def load(self, data, offset):
|
||||
offset = self.params.load(data, offset)
|
||||
|
||||
self.tok_embd = Tensor('f', [self.params.n_embd, self.params.n_vocab])
|
||||
self.norm = Tensor('f', [self.params.n_embd])
|
||||
self.output = Tensor('f', [self.params.n_embd, self.params.n_vocab])
|
||||
|
||||
offset = self.tok_embd.load(data, offset)
|
||||
offset = self.norm.load(data, offset)
|
||||
offset = self.output.load(data, offset)
|
||||
|
||||
self.layers.clear()
|
||||
for bid in range(self.params.n_layer):
|
||||
layer = Layer(self.params, bid)
|
||||
offset = layer.load(data, offset)
|
||||
self.layers.append(layer)
|
||||
|
||||
return offset
|
||||
|
||||
def save_gguf(self, gguf_writer):
|
||||
self.params.save_gguf(gguf_writer)
|
||||
|
||||
self.tok_embd.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD))
|
||||
self.norm.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM))
|
||||
self.output.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT))
|
||||
|
||||
for layer in self.layers:
|
||||
layer.save_gguf(gguf_writer)
|
||||
|
||||
class Checkpoint:
|
||||
def __init__(self):
|
||||
self.model = Model()
|
||||
self.opt_ctx = OptimizationContext()
|
||||
|
||||
def load(self, data, offset):
|
||||
magic = bytes(reversed(data[offset:offset + 4])); offset += 4
|
||||
if magic != b'ggcp':
|
||||
raise ValueError(f"File header magic indicates, that this is no checkpoint file. Expected 'ggcp', Got '{str(magic)}'")
|
||||
|
||||
self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
if self.version != 0:
|
||||
raise ValueError('Invalid version of checkpoint file')
|
||||
|
||||
self.train_its = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.train_samples = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.train_tokens = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
|
||||
offset = self.model.load(data, offset)
|
||||
offset = self.opt_ctx.load(data, offset)
|
||||
|
||||
return offset
|
||||
|
||||
def save_gguf(self, gguf_writer):
|
||||
gguf_writer.add_file_type(gguf.GGMLQuantizationType.F32)
|
||||
gguf_writer.add_layer_norm_rms_eps(1e-5)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_FILE_VERSION, 0)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_ITERATION_COUNT, self.train_its)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_SAMPLE_COUNT, self.train_samples)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_TOKEN_COUNT, self.train_tokens)
|
||||
self.model.save_gguf(gguf_writer)
|
||||
self.opt_ctx.save_gguf(gguf_writer)
|
||||
|
||||
def handle_args():
|
||||
parser = argparse.ArgumentParser(description = 'Convert train-text-from-scratch checkpoints to GGUF')
|
||||
parser.add_argument('--input', '-i', type = Path, help = 'Input train checkpoint filename', required=True)
|
||||
parser.add_argument('--output', '-o', type = Path, help ='Output GGUF filename', required=True)
|
||||
return parser.parse_args()
|
||||
|
||||
def main():
|
||||
cfg = handle_args()
|
||||
data = np.memmap(cfg.input, mode = 'r')
|
||||
chk = Checkpoint()
|
||||
offset = 0
|
||||
offset = chk.load(data, offset)
|
||||
# we should have read all available data
|
||||
assert(offset == len(data))
|
||||
|
||||
gguf_writer = gguf.GGUFWriter(cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
|
||||
chk.save_gguf(gguf_writer)
|
||||
print(" gguf: write header")
|
||||
gguf_writer.write_header_to_file()
|
||||
print(" gguf: write metadata")
|
||||
gguf_writer.write_kv_data_to_file()
|
||||
print(" gguf: write tensors")
|
||||
gguf_writer.write_tensors_to_file()
|
||||
gguf_writer.close()
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
File diff suppressed because it is too large
Load Diff
183
ggml-alloc.c
183
ggml-alloc.c
@@ -1,3 +1,8 @@
|
||||
// defines MAP_ANONYMOUS
|
||||
#ifndef _GNU_SOURCE
|
||||
#define _GNU_SOURCE
|
||||
#endif
|
||||
|
||||
#include "ggml-alloc.h"
|
||||
#include "ggml.h"
|
||||
#include <assert.h>
|
||||
@@ -6,6 +11,26 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#ifdef __has_include
|
||||
#if __has_include(<unistd.h>)
|
||||
#include <unistd.h>
|
||||
#if defined(_POSIX_MAPPED_FILES)
|
||||
#include <sys/types.h>
|
||||
#include <sys/mman.h>
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(_WIN32)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#ifndef NOMINMAX
|
||||
#define NOMINMAX
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#include <memoryapi.h>
|
||||
#endif
|
||||
|
||||
|
||||
#define UNUSED(x) (void)(x)
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
#define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
|
||||
@@ -99,15 +124,24 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
||||
static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
||||
return ggml_nbytes(tensor);
|
||||
|
||||
UNUSED(alloc);
|
||||
}
|
||||
|
||||
// check if a tensor is allocated by this buffer
|
||||
static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
|
||||
void * ptr = tensor->data;
|
||||
return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
|
||||
}
|
||||
|
||||
void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
||||
size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
|
||||
#ifdef GGML_ALLOCATOR_DEBUG
|
||||
GGML_ASSERT(ggml_is_view(tensor) == false); // views generally get data pointer from one of their sources
|
||||
GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
|
||||
#endif
|
||||
size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
|
||||
size = aligned_offset(NULL, size, alloc->alignment);
|
||||
|
||||
AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
|
||||
@@ -173,17 +207,17 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
|
||||
}
|
||||
|
||||
// this is a very naive implementation, but for our case the number of free blocks should be very small
|
||||
static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
||||
static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
||||
void * ptr = tensor->data;
|
||||
|
||||
if (ptr < alloc->data || (char*)ptr >= (char*)alloc->data + alloc->max_size) {
|
||||
if (ggml_allocr_is_own(alloc, tensor) == false) {
|
||||
// the tensor was not allocated in this buffer
|
||||
// this can happen because the graph allocator will try to free weights and other tensors from different buffers
|
||||
// the easiest way to deal with this is just to ignore it
|
||||
return;
|
||||
}
|
||||
|
||||
size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
|
||||
size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
|
||||
size = aligned_offset(NULL, size, alloc->alignment);
|
||||
AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks);
|
||||
|
||||
@@ -268,7 +302,7 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
|
||||
/*.parse_seq = */ {0},
|
||||
/*.parse_seq_len = */ 0,
|
||||
#ifdef GGML_ALLOCATOR_DEBUG
|
||||
/*.allocated_tensors = */ = {0},
|
||||
/*.allocated_tensors = */ {0},
|
||||
#endif
|
||||
};
|
||||
|
||||
@@ -277,17 +311,64 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
|
||||
return alloc;
|
||||
}
|
||||
|
||||
// address and size of the buffer when measuring
|
||||
// it needs to be large enough to fit all the tensors, but it cannot overlap with other existing buffers
|
||||
static void * const MEASURE_BASE_ADDR = (void *) 0x1000;
|
||||
static const size_t MEASURE_MAX_SIZE = 1ULL<<40; // 1 TB
|
||||
// OS specific functions to allocate and free uncommitted virtual memory
|
||||
static void * alloc_vmem(size_t size) {
|
||||
#if defined(_WIN32)
|
||||
return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
|
||||
#elif defined(_POSIX_MAPPED_FILES)
|
||||
return mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
|
||||
#else
|
||||
// use a fixed address for other platforms
|
||||
uintptr_t base_addr = (uintptr_t)-size - 0x100;
|
||||
return (void *)base_addr;
|
||||
#endif
|
||||
}
|
||||
|
||||
static void free_vmem(void * base_addr, size_t size) {
|
||||
#if defined(_WIN32)
|
||||
VirtualFree(base_addr, 0, MEM_RELEASE);
|
||||
UNUSED(size);
|
||||
#elif defined(_POSIX_MAPPED_FILES)
|
||||
munmap(base_addr, size);
|
||||
#else
|
||||
// nothing to do
|
||||
UNUSED(base_addr);
|
||||
UNUSED(size);
|
||||
#endif
|
||||
}
|
||||
|
||||
// allocate uncommitted virtual memory to measure the size of the graph
|
||||
static void alloc_measure_vmem(void ** base_addr, size_t * size) {
|
||||
// 1TB for 64-bit, 1GB for 32-bit
|
||||
*size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<40;
|
||||
do {
|
||||
*base_addr = alloc_vmem(*size);
|
||||
if (*base_addr != NULL) {
|
||||
AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr);
|
||||
return;
|
||||
}
|
||||
// try again with half the size
|
||||
*size /= 2;
|
||||
} while (*size > 0);
|
||||
|
||||
GGML_ASSERT(!"failed to allocate virtual memory for measure buffer");
|
||||
}
|
||||
|
||||
static void free_measure_vmem(void * base_addr, size_t size) {
|
||||
free_vmem(base_addr, size);
|
||||
}
|
||||
|
||||
struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
|
||||
struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
|
||||
|
||||
void * base_addr;
|
||||
size_t size;
|
||||
|
||||
alloc_measure_vmem(&base_addr, &size);
|
||||
|
||||
*alloc = (struct ggml_allocr){
|
||||
/*.data = */ MEASURE_BASE_ADDR,
|
||||
/*.size = */ MEASURE_MAX_SIZE,
|
||||
/*.data = */ base_addr,
|
||||
/*.size = */ size,
|
||||
/*.alignment = */ alignment,
|
||||
/*.n_free_blocks = */ 0,
|
||||
/*.free_blocks = */ {{0}},
|
||||
@@ -297,7 +378,7 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
|
||||
/*.parse_seq = */ {0},
|
||||
/*.parse_seq_len = */ 0,
|
||||
#ifdef GGML_ALLOCATOR_DEBUG
|
||||
/*.allocated_tensors = */ = {0},
|
||||
/*.allocated_tensors = */ {0},
|
||||
#endif
|
||||
};
|
||||
|
||||
@@ -307,6 +388,9 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
|
||||
}
|
||||
|
||||
void ggml_allocr_free(struct ggml_allocr * alloc) {
|
||||
if (alloc->measure) {
|
||||
free_measure_vmem(alloc->data, alloc->size);
|
||||
}
|
||||
free(alloc);
|
||||
}
|
||||
|
||||
@@ -317,8 +401,7 @@ bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
|
||||
//////////// compute graph allocator
|
||||
|
||||
static bool ggml_is_view(struct ggml_tensor * t) {
|
||||
return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE ||
|
||||
t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY;
|
||||
return t->view_src != NULL;
|
||||
}
|
||||
|
||||
static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
|
||||
@@ -336,28 +419,6 @@ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml
|
||||
return true;
|
||||
}
|
||||
|
||||
static struct ggml_tensor * get_view_parent(struct ggml_tensor * t) {
|
||||
switch (t->op) {
|
||||
case GGML_OP_PERMUTE:
|
||||
case GGML_OP_RESHAPE:
|
||||
case GGML_OP_TRANSPOSE:
|
||||
case GGML_OP_VIEW:
|
||||
return t->src[0];
|
||||
case GGML_OP_CPY:
|
||||
return t->src[1];
|
||||
default:
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
static struct ggml_tensor * get_view_source(struct ggml_tensor * t) {
|
||||
struct ggml_tensor * parent = t;
|
||||
do {
|
||||
parent = get_view_parent(parent);
|
||||
} while (ggml_is_view(parent));
|
||||
return parent;
|
||||
}
|
||||
|
||||
static bool ggml_op_can_inplace(enum ggml_op op) {
|
||||
switch (op) {
|
||||
case GGML_OP_SCALE:
|
||||
@@ -365,7 +426,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
|
||||
case GGML_OP_DIAG_MASK_INF:
|
||||
case GGML_OP_ADD:
|
||||
case GGML_OP_ADD1:
|
||||
case GGML_OP_ACC:
|
||||
case GGML_OP_SUB:
|
||||
case GGML_OP_MUL:
|
||||
case GGML_OP_DIV:
|
||||
@@ -375,7 +435,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
|
||||
case GGML_OP_UNARY:
|
||||
case GGML_OP_ROPE:
|
||||
case GGML_OP_RMS_NORM:
|
||||
case GGML_OP_SET:
|
||||
case GGML_OP_SOFT_MAX:
|
||||
case GGML_OP_CONT:
|
||||
return true;
|
||||
@@ -389,24 +448,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
|
||||
struct hash_node * ht = alloc->hash_table;
|
||||
if (node->data == NULL) {
|
||||
if (ggml_is_view(node)) {
|
||||
size_t offset;
|
||||
switch(node->op) {
|
||||
case GGML_OP_VIEW:
|
||||
memcpy(&offset, node->op_params, sizeof(size_t));
|
||||
node->data = (char *) node->src[0]->data + offset;
|
||||
break;
|
||||
case GGML_OP_PERMUTE:
|
||||
case GGML_OP_RESHAPE:
|
||||
case GGML_OP_TRANSPOSE:
|
||||
node->data = node->src[0]->data;
|
||||
break;
|
||||
case GGML_OP_CPY:
|
||||
node->data = node->src[1]->data;
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(!"unknown view op");
|
||||
break;
|
||||
}
|
||||
assert(node->view_src->data != NULL);
|
||||
node->data = (char *)node->view_src->data + node->view_offs;
|
||||
} else {
|
||||
// see if we can reuse a parent's buffer (inplace)
|
||||
if (ggml_op_can_inplace(node->op)) {
|
||||
@@ -417,8 +460,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
|
||||
}
|
||||
|
||||
// if the node's data is external, then we cannot re-use it
|
||||
if ((char *) parent->data < (char *) alloc->data ||
|
||||
(char *) parent->data >= ((char *) alloc->data + alloc->size)) {
|
||||
if (ggml_allocr_is_own(alloc, parent) == false) {
|
||||
AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
|
||||
continue;
|
||||
}
|
||||
@@ -426,7 +468,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
|
||||
struct hash_node * p_hn = hash_get(ht, parent);
|
||||
if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
|
||||
if (ggml_is_view(parent)) {
|
||||
struct ggml_tensor * view_src = get_view_source(parent);
|
||||
struct ggml_tensor * view_src = parent->view_src;
|
||||
struct hash_node * view_src_hn = hash_get(ht, view_src);
|
||||
if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
|
||||
// TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
|
||||
@@ -452,7 +494,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
|
||||
}
|
||||
}
|
||||
|
||||
static size_t ggml_allocator_alloc_graph_tensors_n(
|
||||
static size_t ggml_allocr_alloc_graph_tensors_n(
|
||||
struct ggml_allocr * alloc,
|
||||
struct ggml_cgraph ** graphs, int n_graphs,
|
||||
struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
|
||||
@@ -468,7 +510,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
|
||||
struct ggml_tensor * node = gf->nodes[i];
|
||||
|
||||
if (ggml_is_view(node)) {
|
||||
struct ggml_tensor * view_src = get_view_source(node);
|
||||
struct ggml_tensor * view_src = node->view_src;
|
||||
hash_get(ht, view_src)->n_views += 1;
|
||||
}
|
||||
|
||||
@@ -530,11 +572,10 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
|
||||
AT_PRINTF("\n");
|
||||
}
|
||||
|
||||
|
||||
// update parents
|
||||
// update immediately if there is no parse_seq
|
||||
// update only at barriers if there is parse_seq
|
||||
if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) {
|
||||
if ((alloc->parse_seq_len == 0) || alloc->parse_seq[ind] == -1) {
|
||||
int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
|
||||
int update_end = alloc->parse_seq_len ? ind : ind + 1;
|
||||
for (int i = update_start; i < update_end; i++) {
|
||||
@@ -553,17 +594,17 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
|
||||
|
||||
if (p_hn->n_children == 0 && p_hn->n_views == 0) {
|
||||
if (ggml_is_view(parent)) {
|
||||
struct ggml_tensor * view_src = get_view_source(parent);
|
||||
struct ggml_tensor * view_src = parent->view_src;
|
||||
struct hash_node * view_src_hn = hash_get(ht, view_src);
|
||||
view_src_hn->n_views -= 1;
|
||||
AT_PRINTF("view_src %s\n", view_src->name);
|
||||
AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
|
||||
if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
|
||||
ggml_allocator_free_tensor(alloc, view_src);
|
||||
ggml_allocr_free_tensor(alloc, view_src);
|
||||
}
|
||||
}
|
||||
else {
|
||||
if (parent->data != node->data) {
|
||||
ggml_allocator_free_tensor(alloc, parent);
|
||||
ggml_allocr_free_tensor(alloc, parent);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -580,7 +621,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
|
||||
for (int i = 0; outputs[g][i] != NULL; i++) {
|
||||
struct ggml_tensor * output = outputs[g][i];
|
||||
AT_PRINTF("output: %s\n", output->name);
|
||||
ggml_allocator_free_tensor(alloc, output);
|
||||
ggml_allocr_free_tensor(alloc, output);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -589,5 +630,5 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
|
||||
}
|
||||
|
||||
size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
|
||||
return ggml_allocator_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
|
||||
return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
|
||||
}
|
||||
|
||||
140
ggml-cuda.cu
140
ggml-cuda.cu
@@ -81,12 +81,29 @@
|
||||
#if defined(GGML_USE_HIPBLAS)
|
||||
#define __CUDA_ARCH__ 1300
|
||||
|
||||
#ifndef __has_builtin
|
||||
#define __has_builtin(x) 0
|
||||
#endif
|
||||
|
||||
typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
|
||||
static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
|
||||
const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
|
||||
const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
|
||||
#if __has_builtin(__builtin_elementwise_sub_sat)
|
||||
const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
|
||||
return reinterpret_cast<const int&>(c);
|
||||
#else
|
||||
int8x4_t c;
|
||||
int16_t tmp;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 4; i++) {
|
||||
tmp = va[i] - vb[i];
|
||||
if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
|
||||
if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
|
||||
c[i] = tmp;
|
||||
}
|
||||
return reinterpret_cast<int&>(c);
|
||||
#endif // __has_builtin(__builtin_elementwise_sub_sat)
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
|
||||
@@ -306,11 +323,11 @@ typedef struct {
|
||||
#define QI4_K (QK_K / (4*QR4_K))
|
||||
#ifdef GGML_QKK_64
|
||||
typedef struct {
|
||||
half d[2]; // super-block scales/mins
|
||||
half dm[2]; // super-block scales/mins
|
||||
uint8_t scales[2]; // 4-bit block scales/mins
|
||||
uint8_t qs[QK_K/2]; // 4--bit quants
|
||||
} block_q4_K;
|
||||
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
|
||||
static_assert(sizeof(block_q4_K) == sizeof(half2) + QK_K/2 + 2, "wrong q4_K block size/padding");
|
||||
#else
|
||||
typedef struct {
|
||||
half2 dm; // super-block scale for quantized scales/mins
|
||||
@@ -447,58 +464,91 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
|
||||
dst[i] = x[i] / (1.0f + expf(-x[i]));
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
|
||||
#pragma unroll
|
||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||
a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
|
||||
a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
|
||||
}
|
||||
return a;
|
||||
}
|
||||
|
||||
template <int block_size>
|
||||
static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
|
||||
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||
const int tid = threadIdx.x;
|
||||
|
||||
const float eps = 1e-5f;
|
||||
|
||||
float mean = 0.0f;
|
||||
float var = 0.0f;
|
||||
float2 mean_var = make_float2(0.f, 0.f);
|
||||
|
||||
for (int col = tid; col < ncols; col += WARP_SIZE) {
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
const float xi = x[row*ncols + col];
|
||||
mean += xi;
|
||||
var += xi * xi;
|
||||
mean_var.x += xi;
|
||||
mean_var.y += xi * xi;
|
||||
}
|
||||
|
||||
// sum up partial sums
|
||||
#pragma unroll
|
||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||
mean += __shfl_xor_sync(0xffffffff, mean, mask, 32);
|
||||
var += __shfl_xor_sync(0xffffffff, var, mask, 32);
|
||||
mean_var = warp_reduce_sum(mean_var);
|
||||
if (block_size > WARP_SIZE) {
|
||||
__shared__ float2 s_sum[32];
|
||||
int warp_id = threadIdx.x / WARP_SIZE;
|
||||
int lane_id = threadIdx.x % WARP_SIZE;
|
||||
if (lane_id == 0) {
|
||||
s_sum[warp_id] = mean_var;
|
||||
}
|
||||
__syncthreads();
|
||||
mean_var = s_sum[lane_id];
|
||||
mean_var = warp_reduce_sum(mean_var);
|
||||
}
|
||||
|
||||
mean /= ncols;
|
||||
var = var / ncols - mean * mean;
|
||||
const float inv_var = rsqrtf(var + eps);
|
||||
const float mean = mean_var.x / ncols;
|
||||
const float var = mean_var.y / ncols - mean * mean;
|
||||
const float inv_std = rsqrtf(var + eps);
|
||||
|
||||
for (int col = tid; col < ncols; col += WARP_SIZE) {
|
||||
dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_var;
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
|
||||
}
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float warp_reduce_sum(float x) {
|
||||
#pragma unroll
|
||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||
x += __shfl_xor_sync(0xffffffff, x, mask, 32);
|
||||
}
|
||||
return x;
|
||||
}
|
||||
|
||||
template <int block_size>
|
||||
static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
|
||||
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||
const int tid = threadIdx.x;
|
||||
|
||||
float tmp = 0.0f; // partial sum for thread in warp
|
||||
|
||||
for (int col = tid; col < ncols; col += WARP_SIZE) {
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
const float xi = x[row*ncols + col];
|
||||
tmp += xi * xi;
|
||||
}
|
||||
|
||||
// sum up partial sums
|
||||
#pragma unroll
|
||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
||||
tmp = warp_reduce_sum(tmp);
|
||||
if (block_size > WARP_SIZE) {
|
||||
__shared__ float s_sum[32];
|
||||
int warp_id = threadIdx.x / WARP_SIZE;
|
||||
int lane_id = threadIdx.x % WARP_SIZE;
|
||||
if (lane_id == 0) {
|
||||
s_sum[warp_id] = tmp;
|
||||
}
|
||||
__syncthreads();
|
||||
tmp = s_sum[lane_id];
|
||||
tmp = warp_reduce_sum(tmp);
|
||||
}
|
||||
|
||||
const float mean = tmp / ncols;
|
||||
const float scale = rsqrtf(mean + eps);
|
||||
|
||||
for (int col = tid; col < ncols; col += WARP_SIZE) {
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
dst[row*ncols + col] = scale * x[row*ncols + col];
|
||||
}
|
||||
}
|
||||
@@ -737,8 +787,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
|
||||
const int tid = threadIdx.x;
|
||||
const uint8_t * q = x[i].qs;
|
||||
float * y = yy + i*QK_K;
|
||||
const float d = (float)x[i].d[0];
|
||||
const float m = (float)x[i].d[1];
|
||||
const float d = (float)x[i].dm[0];
|
||||
const float m = (float)x[i].dm[1];
|
||||
y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
|
||||
y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4);
|
||||
#endif
|
||||
@@ -1155,8 +1205,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
|
||||
const uint16_t * a = (const uint16_t *)x[i].scales;
|
||||
aux16[0] = a[0] & 0x0f0f;
|
||||
aux16[1] = (a[0] >> 4) & 0x0f0f;
|
||||
const float d = (float)x[i].d[0];
|
||||
const float m = (float)x[i].d[1];
|
||||
const float d = (float)x[i].dm[0];
|
||||
const float m = (float)x[i].dm[1];
|
||||
float sum = 0.f;
|
||||
for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
|
||||
sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
|
||||
@@ -2845,8 +2895,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
|
||||
aux16[0] = a[0] & 0x0f0f;
|
||||
aux16[1] = (a[0] >> 4) & 0x0f0f;
|
||||
|
||||
const float dall = bq4_K->d[0];
|
||||
const float dmin = bq4_K->d[1];
|
||||
const float dall = bq4_K->dm[0];
|
||||
const float dmin = bq4_K->dm[1];
|
||||
|
||||
const float d8_1 = __low2float(bq8_1[0].ds);
|
||||
const float d8_2 = __low2float(bq8_1[1].ds);
|
||||
@@ -2929,7 +2979,11 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
|
||||
|
||||
const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
||||
|
||||
#if QK_K == 256
|
||||
x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
|
||||
#else
|
||||
x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
|
||||
#endif
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
@@ -3119,7 +3173,9 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
|
||||
|
||||
const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
||||
|
||||
#if QK_K == 256
|
||||
x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
|
||||
#endif
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
@@ -4180,14 +4236,24 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
|
||||
|
||||
static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % WARP_SIZE == 0);
|
||||
const dim3 block_dims(WARP_SIZE, 1, 1);
|
||||
norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
|
||||
if (ncols < 1024) {
|
||||
const dim3 block_dims(WARP_SIZE, 1, 1);
|
||||
norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
|
||||
} else {
|
||||
const dim3 block_dims(1024, 1, 1);
|
||||
norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
|
||||
}
|
||||
}
|
||||
|
||||
static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % WARP_SIZE == 0);
|
||||
const dim3 block_dims(WARP_SIZE, 1, 1);
|
||||
rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
|
||||
if (ncols < 1024) {
|
||||
const dim3 block_dims(WARP_SIZE, 1, 1);
|
||||
rms_norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
|
||||
} else {
|
||||
const dim3 block_dims(1024, 1, 1);
|
||||
rms_norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
|
||||
}
|
||||
}
|
||||
|
||||
static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
|
||||
@@ -4709,6 +4775,8 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
|
||||
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
||||
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
||||
|
||||
#if QK_K == 256
|
||||
|
||||
int id;
|
||||
CUDA_CHECK(cudaGetDevice(&id));
|
||||
const int compute_capability = g_compute_capabilities[id];
|
||||
@@ -4740,6 +4808,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
|
||||
mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
|
||||
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
static void ggml_mul_mat_q4_K_q8_1_cuda(
|
||||
@@ -4899,8 +4968,8 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
|
||||
|
||||
static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
|
||||
const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
|
||||
GGML_ASSERT(nrows % 2 == 0); // GG: is this assert really needed? I don't see why
|
||||
const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
|
||||
GGML_ASSERT(ncols % 2 == 0);
|
||||
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
|
||||
const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
|
||||
const dim3 block_nums(nrows, num_blocks_x, 1);
|
||||
rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
|
||||
@@ -4908,7 +4977,8 @@ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const i
|
||||
|
||||
static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
|
||||
const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
|
||||
const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
|
||||
GGML_ASSERT(ncols % 2 == 0);
|
||||
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
|
||||
const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
|
||||
const dim3 block_nums(nrows, num_blocks_x, 1);
|
||||
rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
|
||||
@@ -6328,9 +6398,11 @@ void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml
|
||||
|
||||
void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
|
||||
|
||||
const int mode = ((int32_t *) dst->op_params)[2];
|
||||
const bool is_glm = mode & 4;
|
||||
|
||||
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
|
||||
}
|
||||
|
||||
|
||||
@@ -24,6 +24,7 @@
|
||||
|
||||
// max memory buffers that can be mapped to the device
|
||||
#define GGML_METAL_MAX_BUFFERS 16
|
||||
#define GGML_METAL_MAX_COMMAND_BUFFERS 32
|
||||
|
||||
struct ggml_tensor;
|
||||
struct ggml_cgraph;
|
||||
|
||||
231
ggml-metal.m
231
ggml-metal.m
@@ -11,6 +11,7 @@
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
|
||||
// TODO: temporary - reuse llama.cpp logging
|
||||
#ifdef GGML_METAL_NDEBUG
|
||||
#define metal_printf(...)
|
||||
#else
|
||||
@@ -33,12 +34,15 @@ struct ggml_metal_buffer {
|
||||
struct ggml_metal_context {
|
||||
int n_cb;
|
||||
|
||||
float * logits;
|
||||
|
||||
id<MTLDevice> device;
|
||||
id<MTLCommandQueue> queue;
|
||||
id<MTLLibrary> library;
|
||||
|
||||
id<MTLCommandBuffer> command_buffers [GGML_METAL_MAX_COMMAND_BUFFERS];
|
||||
id<MTLComputeCommandEncoder> command_encoders[GGML_METAL_MAX_COMMAND_BUFFERS];
|
||||
|
||||
dispatch_queue_t d_queue;
|
||||
|
||||
int n_buffers;
|
||||
struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
|
||||
|
||||
@@ -72,6 +76,7 @@ struct ggml_metal_context {
|
||||
GGML_METAL_DECL_KERNEL(rms_norm);
|
||||
GGML_METAL_DECL_KERNEL(norm);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32);
|
||||
@@ -110,16 +115,31 @@ static NSString * const msl_library_source = @"see metal.metal";
|
||||
@end
|
||||
|
||||
struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
fprintf(stderr, "%s: allocating\n", __func__);
|
||||
metal_printf("%s: allocating\n", __func__);
|
||||
|
||||
// Show all the Metal device instances in the system
|
||||
NSArray * devices = MTLCopyAllDevices();
|
||||
id <MTLDevice> device;
|
||||
NSString * s;
|
||||
for (device in devices) {
|
||||
s = [device name];
|
||||
metal_printf("%s: found device: %s\n", __func__, [s UTF8String]);
|
||||
}
|
||||
|
||||
// Pick and show default Metal device
|
||||
device = MTLCreateSystemDefaultDevice();
|
||||
s = [device name];
|
||||
metal_printf("%s: picking default device: %s\n", __func__, [s UTF8String]);
|
||||
|
||||
// Configure context
|
||||
struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
|
||||
|
||||
ctx->n_cb = n_cb;
|
||||
ctx->device = MTLCreateSystemDefaultDevice();
|
||||
ctx->device = device;
|
||||
ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
|
||||
ctx->queue = [ctx->device newCommandQueue];
|
||||
ctx->n_buffers = 0;
|
||||
ctx->concur_list_len = 0;
|
||||
|
||||
ctx->d_queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);
|
||||
|
||||
#if 0
|
||||
// compile from source string and show compile log
|
||||
@@ -128,7 +148,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
|
||||
ctx->library = [ctx->device newLibraryWithSource:msl_library_source options:nil error:&error];
|
||||
if (error) {
|
||||
fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
@@ -142,11 +162,11 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
//NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"];
|
||||
NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
|
||||
NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
|
||||
fprintf(stderr, "%s: loading '%s'\n", __func__, [path UTF8String]);
|
||||
metal_printf("%s: loading '%s'\n", __func__, [path UTF8String]);
|
||||
|
||||
NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error];
|
||||
if (error) {
|
||||
fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@@ -158,7 +178,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error];
|
||||
#endif
|
||||
if (error) {
|
||||
fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
@@ -170,11 +190,11 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
#define GGML_METAL_ADD_KERNEL(name) \
|
||||
ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
|
||||
ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
|
||||
fprintf(stderr, "%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
|
||||
metal_printf("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
|
||||
(int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \
|
||||
(int) ctx->pipeline_##name.threadExecutionWidth); \
|
||||
if (error) { \
|
||||
fprintf(stderr, "%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
|
||||
metal_printf("%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
|
||||
return NULL; \
|
||||
}
|
||||
|
||||
@@ -200,6 +220,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(rms_norm);
|
||||
GGML_METAL_ADD_KERNEL(norm);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32);
|
||||
@@ -226,22 +247,81 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
#undef GGML_METAL_ADD_KERNEL
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
||||
fprintf(stderr, "%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
|
||||
metal_printf("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
||||
metal_printf("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
|
||||
if (ctx->device.maxTransferRate != 0) {
|
||||
fprintf(stderr, "%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
|
||||
metal_printf("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
|
||||
} else {
|
||||
fprintf(stderr, "%s: maxTransferRate = built-in GPU\n", __func__);
|
||||
metal_printf("%s: maxTransferRate = built-in GPU\n", __func__);
|
||||
}
|
||||
|
||||
return ctx;
|
||||
}
|
||||
|
||||
void ggml_metal_free(struct ggml_metal_context * ctx) {
|
||||
fprintf(stderr, "%s: deallocating\n", __func__);
|
||||
metal_printf("%s: deallocating\n", __func__);
|
||||
#define GGML_METAL_DEL_KERNEL(name) \
|
||||
[ctx->function_##name release]; \
|
||||
[ctx->pipeline_##name release];
|
||||
|
||||
GGML_METAL_DEL_KERNEL(add);
|
||||
GGML_METAL_DEL_KERNEL(add_row);
|
||||
GGML_METAL_DEL_KERNEL(mul);
|
||||
GGML_METAL_DEL_KERNEL(mul_row);
|
||||
GGML_METAL_DEL_KERNEL(scale);
|
||||
GGML_METAL_DEL_KERNEL(silu);
|
||||
GGML_METAL_DEL_KERNEL(relu);
|
||||
GGML_METAL_DEL_KERNEL(gelu);
|
||||
GGML_METAL_DEL_KERNEL(soft_max);
|
||||
GGML_METAL_DEL_KERNEL(diag_mask_inf);
|
||||
GGML_METAL_DEL_KERNEL(get_rows_f16);
|
||||
GGML_METAL_DEL_KERNEL(get_rows_q4_0);
|
||||
GGML_METAL_DEL_KERNEL(get_rows_q4_1);
|
||||
GGML_METAL_DEL_KERNEL(get_rows_q8_0);
|
||||
GGML_METAL_DEL_KERNEL(get_rows_q2_K);
|
||||
GGML_METAL_DEL_KERNEL(get_rows_q3_K);
|
||||
GGML_METAL_DEL_KERNEL(get_rows_q4_K);
|
||||
GGML_METAL_DEL_KERNEL(get_rows_q5_K);
|
||||
GGML_METAL_DEL_KERNEL(get_rows_q6_K);
|
||||
GGML_METAL_DEL_KERNEL(rms_norm);
|
||||
GGML_METAL_DEL_KERNEL(norm);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q2_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q3_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q4_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q5_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q6_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q4_1_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q2_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q3_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(rope);
|
||||
GGML_METAL_DEL_KERNEL(alibi_f32);
|
||||
GGML_METAL_DEL_KERNEL(cpy_f32_f16);
|
||||
GGML_METAL_DEL_KERNEL(cpy_f32_f32);
|
||||
GGML_METAL_DEL_KERNEL(cpy_f16_f16);
|
||||
|
||||
#undef GGML_METAL_DEL_KERNEL
|
||||
|
||||
for (int i = 0; i < ctx->n_buffers; ++i) {
|
||||
[ctx->buffers[i].metal release];
|
||||
}
|
||||
|
||||
[ctx->library release];
|
||||
[ctx->queue release];
|
||||
[ctx->device release];
|
||||
|
||||
dispatch_release(ctx->d_queue);
|
||||
|
||||
free(ctx);
|
||||
}
|
||||
|
||||
@@ -249,7 +329,7 @@ void * ggml_metal_host_malloc(size_t n) {
|
||||
void * data = NULL;
|
||||
const int result = posix_memalign((void **) &data, getpagesize(), n);
|
||||
if (result != 0) {
|
||||
fprintf(stderr, "%s: error: posix_memalign failed\n", __func__);
|
||||
metal_printf("%s: error: posix_memalign failed\n", __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@@ -261,7 +341,7 @@ void ggml_metal_host_free(void * data) {
|
||||
}
|
||||
|
||||
void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
|
||||
ctx->n_cb = n_cb;
|
||||
ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
|
||||
}
|
||||
|
||||
int ggml_metal_if_optimized(struct ggml_metal_context * ctx) {
|
||||
@@ -277,7 +357,7 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) {
|
||||
// Metal buffer based on the host memory pointer
|
||||
//
|
||||
static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) {
|
||||
//fprintf(stderr, "%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
|
||||
//metal_printf("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
|
||||
|
||||
const int64_t tsize = ggml_nbytes(t);
|
||||
|
||||
@@ -288,13 +368,13 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru
|
||||
if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
|
||||
*offs = (size_t) ioffs;
|
||||
|
||||
//fprintf(stderr, "%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
|
||||
//metal_printf("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
|
||||
|
||||
return ctx->buffers[i].metal;
|
||||
}
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: error: buffer is nil\n", __func__);
|
||||
metal_printf("%s: error: buffer is nil\n", __func__);
|
||||
|
||||
return nil;
|
||||
}
|
||||
@@ -306,7 +386,7 @@ bool ggml_metal_add_buffer(
|
||||
size_t size,
|
||||
size_t max_size) {
|
||||
if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) {
|
||||
fprintf(stderr, "%s: too many buffers\n", __func__);
|
||||
metal_printf("%s: too many buffers\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -316,7 +396,7 @@ bool ggml_metal_add_buffer(
|
||||
const int64_t ioffs = (int64_t) data - (int64_t) ctx->buffers[i].data;
|
||||
|
||||
if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) {
|
||||
fprintf(stderr, "%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name);
|
||||
metal_printf("%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@@ -337,11 +417,11 @@ bool ggml_metal_add_buffer(
|
||||
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
|
||||
|
||||
if (ctx->buffers[ctx->n_buffers].metal == nil) {
|
||||
fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
|
||||
metal_printf("%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
|
||||
return false;
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
|
||||
metal_printf("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
|
||||
|
||||
++ctx->n_buffers;
|
||||
} else {
|
||||
@@ -361,27 +441,27 @@ bool ggml_metal_add_buffer(
|
||||
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
|
||||
|
||||
if (ctx->buffers[ctx->n_buffers].metal == nil) {
|
||||
fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
|
||||
metal_printf("%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
|
||||
return false;
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
|
||||
metal_printf("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
|
||||
if (i + size_step < size) {
|
||||
fprintf(stderr, "\n");
|
||||
metal_printf("\n");
|
||||
}
|
||||
|
||||
++ctx->n_buffers;
|
||||
}
|
||||
}
|
||||
|
||||
fprintf(stderr, ", (%8.2f / %8.2f)",
|
||||
metal_printf(", (%8.2f / %8.2f)",
|
||||
ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
|
||||
ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
||||
|
||||
if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
|
||||
fprintf(stderr, ", warning: current allocated size is greater than the recommended max working set size\n");
|
||||
metal_printf(", warning: current allocated size is greater than the recommended max working set size\n");
|
||||
} else {
|
||||
fprintf(stderr, "\n");
|
||||
metal_printf("\n");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -391,8 +471,6 @@ bool ggml_metal_add_buffer(
|
||||
void ggml_metal_set_tensor(
|
||||
struct ggml_metal_context * ctx,
|
||||
struct ggml_tensor * t) {
|
||||
metal_printf("%s: set input for tensor '%s'\n", __func__, t->name);
|
||||
|
||||
size_t offs;
|
||||
id<MTLBuffer> id_dst = ggml_metal_get_buffer(ctx, t, &offs);
|
||||
|
||||
@@ -402,8 +480,6 @@ void ggml_metal_set_tensor(
|
||||
void ggml_metal_get_tensor(
|
||||
struct ggml_metal_context * ctx,
|
||||
struct ggml_tensor * t) {
|
||||
metal_printf("%s: extract results for tensor '%s'\n", __func__, t->name);
|
||||
|
||||
size_t offs;
|
||||
id<MTLBuffer> id_src = ggml_metal_get_buffer(ctx, t, &offs);
|
||||
|
||||
@@ -498,14 +574,14 @@ void ggml_metal_graph_find_concurrency(
|
||||
}
|
||||
|
||||
if (ctx->concur_list_len > GGML_MAX_CONCUR) {
|
||||
fprintf(stderr, "%s: too many elements for metal ctx->concur_list!\n", __func__);
|
||||
metal_printf("%s: too many elements for metal ctx->concur_list!\n", __func__);
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_metal_graph_compute(
|
||||
struct ggml_metal_context * ctx,
|
||||
struct ggml_cgraph * gf) {
|
||||
metal_printf("%s: evaluating graph\n", __func__);
|
||||
@autoreleasepool {
|
||||
|
||||
// if there is ctx->concur_list, dispatch concurrently
|
||||
// else fallback to serial dispatch
|
||||
@@ -521,29 +597,25 @@ void ggml_metal_graph_compute(
|
||||
|
||||
const int n_cb = ctx->n_cb;
|
||||
|
||||
NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb];
|
||||
|
||||
for (int i = 0; i < n_cb; ++i) {
|
||||
command_buffers[i] = [ctx->queue commandBuffer];
|
||||
ctx->command_buffers[i] = [ctx->queue commandBuffer];
|
||||
|
||||
// enqueue the command buffers in order to specify their execution order
|
||||
[command_buffers[i] enqueue];
|
||||
}
|
||||
[ctx->command_buffers[i] enqueue];
|
||||
|
||||
// TODO: is this the best way to start threads?
|
||||
dispatch_queue_t queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);
|
||||
ctx->command_encoders[i] = [ctx->command_buffers[i] computeCommandEncoderWithDescriptor: edesc];
|
||||
}
|
||||
|
||||
for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
|
||||
const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb;
|
||||
|
||||
dispatch_async(queue, ^{
|
||||
dispatch_async(ctx->d_queue, ^{
|
||||
size_t offs_src0 = 0;
|
||||
size_t offs_src1 = 0;
|
||||
size_t offs_dst = 0;
|
||||
|
||||
id<MTLCommandBuffer> command_buffer = command_buffers[cb_idx];
|
||||
|
||||
id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
|
||||
id<MTLCommandBuffer> command_buffer = ctx->command_buffers[cb_idx];
|
||||
id<MTLComputeCommandEncoder> encoder = ctx->command_encoders[cb_idx];
|
||||
|
||||
const int node_start = (cb_idx + 0) * n_nodes_per_cb;
|
||||
const int node_end = MIN((cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb, n_nodes);
|
||||
@@ -556,7 +628,7 @@ void ggml_metal_graph_compute(
|
||||
continue;
|
||||
}
|
||||
|
||||
metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
|
||||
//metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
|
||||
|
||||
struct ggml_tensor * src0 = gf->nodes[i]->src[0];
|
||||
struct ggml_tensor * src1 = gf->nodes[i]->src[1];
|
||||
@@ -625,6 +697,12 @@ void ggml_metal_graph_compute(
|
||||
} break;
|
||||
case GGML_OP_ADD:
|
||||
{
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
// utilize float4
|
||||
GGML_ASSERT(ne00 % 4 == 0);
|
||||
const int64_t nb = ne00/4;
|
||||
|
||||
if (ggml_nelements(src1) == ne10) {
|
||||
// src1 is a row
|
||||
[encoder setComputePipelineState:ctx->pipeline_add_row];
|
||||
@@ -634,14 +712,20 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
|
||||
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
|
||||
[encoder setBytes:&nb length:sizeof(nb) atIndex:3];
|
||||
|
||||
const int64_t n = ggml_nelements(dst);
|
||||
const int64_t n = ggml_nelements(dst)/4;
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
case GGML_OP_MUL:
|
||||
{
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
// utilize float4
|
||||
GGML_ASSERT(ne00 % 4 == 0);
|
||||
const int64_t nb = ne00/4;
|
||||
|
||||
if (ggml_nelements(src1) == ne10) {
|
||||
// src1 is a row
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_row];
|
||||
@@ -651,9 +735,9 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
|
||||
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
|
||||
[encoder setBytes:&nb length:sizeof(nb) atIndex:3];
|
||||
|
||||
const int64_t n = ggml_nelements(dst);
|
||||
const int64_t n = ggml_nelements(dst)/4;
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
@@ -704,7 +788,7 @@ void ggml_metal_graph_compute(
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
|
||||
metal_printf("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
} break;
|
||||
@@ -785,9 +869,13 @@ void ggml_metal_graph_compute(
|
||||
switch (src0t) {
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
nth0 = 64;
|
||||
nth0 = 32;
|
||||
nth1 = 1;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
|
||||
if (ne11 * ne12 < 4) {
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row];
|
||||
} else {
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
|
||||
}
|
||||
} break;
|
||||
case GGML_TYPE_Q4_0:
|
||||
{
|
||||
@@ -839,8 +927,8 @@ void ggml_metal_graph_compute(
|
||||
GGML_ASSERT(ne02 == 1);
|
||||
GGML_ASSERT(ne12 == 1);
|
||||
|
||||
nth0 = 2;
|
||||
nth1 = 32;
|
||||
nth0 = 4; //1;
|
||||
nth1 = 8; //32;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q5_K:
|
||||
@@ -863,7 +951,7 @@ void ggml_metal_graph_compute(
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
fprintf(stderr, "Asserting on type %d\n",(int)src0t);
|
||||
metal_printf("Asserting on type %d\n",(int)src0t);
|
||||
GGML_ASSERT(false && "not implemented");
|
||||
}
|
||||
};
|
||||
@@ -888,9 +976,12 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBytes:&gqa length:sizeof(gqa) atIndex:17];
|
||||
|
||||
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q8_0 ||
|
||||
src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
|
||||
src0t == GGML_TYPE_Q2_K) {// || src0t == GGML_TYPE_Q4_K) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_Q4_K) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_Q3_K) {
|
||||
#ifdef GGML_QKK_64
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
@@ -904,8 +995,8 @@ void ggml_metal_graph_compute(
|
||||
else if (src0t == GGML_TYPE_Q6_K) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
} else {
|
||||
[encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
int64_t ny = (ne11 + 3)/4;
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
}
|
||||
} break;
|
||||
@@ -1101,7 +1192,7 @@ void ggml_metal_graph_compute(
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
|
||||
metal_printf("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
}
|
||||
@@ -1117,17 +1208,19 @@ void ggml_metal_graph_compute(
|
||||
}
|
||||
|
||||
// wait for all threads to finish
|
||||
dispatch_barrier_sync(queue, ^{});
|
||||
|
||||
[command_buffers[n_cb - 1] waitUntilCompleted];
|
||||
dispatch_barrier_sync(ctx->d_queue, ^{});
|
||||
|
||||
// check status of command buffers
|
||||
// needed to detect if the device ran out-of-memory for example (#1881)
|
||||
for (int i = 0; i < n_cb; i++) {
|
||||
MTLCommandBufferStatus status = (MTLCommandBufferStatus) [command_buffers[i] status];
|
||||
[ctx->command_buffers[i] waitUntilCompleted];
|
||||
|
||||
MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status];
|
||||
if (status != MTLCommandBufferStatusCompleted) {
|
||||
fprintf(stderr, "%s: command buffer %d failed with status %lu\n", __func__, i, status);
|
||||
metal_printf("%s: command buffer %d failed with status %lu\n", __func__, i, status);
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
238
ggml-metal.metal
238
ggml-metal.metal
@@ -25,9 +25,9 @@ typedef struct {
|
||||
} block_q8_0;
|
||||
|
||||
kernel void kernel_add(
|
||||
device const float * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
device const float4 * src0,
|
||||
device const float4 * src1,
|
||||
device float4 * dst,
|
||||
uint tpig[[thread_position_in_grid]]) {
|
||||
dst[tpig] = src0[tpig] + src1[tpig];
|
||||
}
|
||||
@@ -35,18 +35,18 @@ kernel void kernel_add(
|
||||
// assumption: src1 is a row
|
||||
// broadcast src1 into src0
|
||||
kernel void kernel_add_row(
|
||||
device const float * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
device const float4 * src0,
|
||||
device const float4 * src1,
|
||||
device float4 * dst,
|
||||
constant int64_t & nb,
|
||||
uint tpig[[thread_position_in_grid]]) {
|
||||
dst[tpig] = src0[tpig] + src1[tpig % ne00];
|
||||
dst[tpig] = src0[tpig] + src1[tpig % nb];
|
||||
}
|
||||
|
||||
kernel void kernel_mul(
|
||||
device const float * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
device const float4 * src0,
|
||||
device const float4 * src1,
|
||||
device float4 * dst,
|
||||
uint tpig[[thread_position_in_grid]]) {
|
||||
dst[tpig] = src0[tpig] * src1[tpig];
|
||||
}
|
||||
@@ -54,12 +54,12 @@ kernel void kernel_mul(
|
||||
// assumption: src1 is a row
|
||||
// broadcast src1 into src0
|
||||
kernel void kernel_mul_row(
|
||||
device const float * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
device const float4 * src0,
|
||||
device const float4 * src1,
|
||||
device float4 * dst,
|
||||
constant int64_t & nb,
|
||||
uint tpig[[thread_position_in_grid]]) {
|
||||
dst[tpig] = src0[tpig] * src1[tpig % ne00];
|
||||
dst[tpig] = src0[tpig] * src1[tpig % nb];
|
||||
}
|
||||
|
||||
kernel void kernel_scale(
|
||||
@@ -133,19 +133,24 @@ kernel void kernel_soft_max(
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
}
|
||||
|
||||
// broadcast
|
||||
if (tpitg[0] == 0) {
|
||||
buf[0] = buf[0];
|
||||
}
|
||||
//// broadcast - not needed. There is a threadgroup barrier above in the last iteration of
|
||||
// the loop, and when that is done, buf[0] has the correct (synchronized) value
|
||||
//if (tpitg[0] == 0) {
|
||||
// buf[0] = buf[0];
|
||||
//}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
//threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
const float max = buf[0];
|
||||
|
||||
// parallel sum
|
||||
buf[tpitg[0]] = 0.0f;
|
||||
for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
|
||||
buf[tpitg[0]] += exp(psrc0[i00] - max);
|
||||
const float exp_psrc0 = exp(psrc0[i00] - max);
|
||||
buf[tpitg[0]] += exp_psrc0;
|
||||
// Remember the result of exp here. exp is expensive, so we really do not
|
||||
// whish to compute it twice.
|
||||
pdst[i00] = exp_psrc0;
|
||||
}
|
||||
|
||||
// reduce
|
||||
@@ -157,17 +162,18 @@ kernel void kernel_soft_max(
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
}
|
||||
|
||||
// broadcast
|
||||
if (tpitg[0] == 0) {
|
||||
buf[0] = buf[0];
|
||||
}
|
||||
// broadcast - not needed, see above
|
||||
//// broadcast
|
||||
//if (tpitg[0] == 0) {
|
||||
// buf[0] = buf[0];
|
||||
//}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
//threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
const float sum = buf[0];
|
||||
|
||||
for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
|
||||
pdst[i00] = exp(psrc0[i00] - max) / sum;
|
||||
pdst[i00] /= sum;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -214,25 +220,27 @@ kernel void kernel_norm(
|
||||
}
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
}
|
||||
// broadcast
|
||||
if (tpitg == 0) {
|
||||
sum[0] /= ne00;
|
||||
}
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
//// broadcast
|
||||
//if (tpitg == 0) {
|
||||
// sum[0] /= ne00;
|
||||
//}
|
||||
//threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
const float mean = sum[0];
|
||||
|
||||
// recenter
|
||||
// recenter and VARIANCE
|
||||
device float * y = dst + tgpig*ne00;
|
||||
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
|
||||
y[i00] = x[i00] - mean;
|
||||
}
|
||||
|
||||
// VARIANCE
|
||||
// parallel sum
|
||||
sum[tpitg] = 0.0f;
|
||||
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
|
||||
y[i00] = x[i00] - mean;
|
||||
sum[tpitg] += y[i00] * y[i00];
|
||||
}
|
||||
|
||||
//// VARIANCE
|
||||
//// parallel sum
|
||||
//sum[tpitg] = 0.0f;
|
||||
//for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
|
||||
// sum[tpitg] += y[i00] * y[i00];
|
||||
//}
|
||||
// reduce
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
for (uint i = ntg/2; i > 0; i /= 2) {
|
||||
@@ -241,11 +249,11 @@ kernel void kernel_norm(
|
||||
}
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
}
|
||||
// broadcast
|
||||
if (tpitg == 0) {
|
||||
sum[0] /= ne00;
|
||||
}
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
//// broadcast
|
||||
//if (tpitg == 0) {
|
||||
// sum[0] /= ne00;
|
||||
//}
|
||||
//threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
const float variance = sum[0];
|
||||
|
||||
const float scale = 1.0f/sqrt(variance + eps);
|
||||
@@ -435,6 +443,8 @@ kernel void kernel_mul_mat_q4_1_f32(
|
||||
mul_vec_q_n_f32<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg);
|
||||
}
|
||||
|
||||
#define NB_Q8_0 8
|
||||
|
||||
kernel void kernel_mul_mat_q8_0_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
@@ -463,30 +473,30 @@ kernel void kernel_mul_mat_q8_0_f32(
|
||||
device const block_q8_0 * x = (device const block_q8_0 *) src0 + offset0;
|
||||
device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
|
||||
|
||||
float yl[16];
|
||||
float yl[NB_Q8_0];
|
||||
float sumf[nr]={0.f};
|
||||
|
||||
const int ix = tiisg/2;
|
||||
const int il = tiisg%2;
|
||||
const int ix = tiisg/4;
|
||||
const int il = tiisg%4;
|
||||
|
||||
device const float * yb = y + ix * QK8_0 + 16*il;
|
||||
device const float * yb = y + ix * QK8_0 + NB_Q8_0*il;
|
||||
|
||||
// each thread in a SIMD group deals with half a block.
|
||||
for (int ib = ix; ib < nb; ib += nw/2) {
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
// each thread in a SIMD group deals with NB_Q8_0 quants at a time
|
||||
for (int ib = ix; ib < nb; ib += nw/4) {
|
||||
for (int i = 0; i < NB_Q8_0; ++i) {
|
||||
yl[i] = yb[i];
|
||||
}
|
||||
|
||||
for (int row = 0; row < nr; row++) {
|
||||
device const int8_t * qs = x[ib+row*nb].qs + 16*il;
|
||||
device const int8_t * qs = x[ib+row*nb].qs + NB_Q8_0*il;
|
||||
float sumq = 0.f;
|
||||
for (int iq = 0; iq < 16; ++iq) {
|
||||
for (int iq = 0; iq < NB_Q8_0; ++iq) {
|
||||
sumq += qs[iq] * yl[iq];
|
||||
}
|
||||
sumf[row] += sumq*x[ib+row*nb].d;
|
||||
}
|
||||
|
||||
yb += QK8_0 * 16;
|
||||
yb += NB_Q8_0 * nw;
|
||||
}
|
||||
|
||||
for (int row = 0; row < nr; ++row) {
|
||||
@@ -497,6 +507,60 @@ kernel void kernel_mul_mat_q8_0_f32(
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_mul_mat_f16_f32_1row(
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]]) {
|
||||
|
||||
const int64_t r0 = tgpig.x;
|
||||
const int64_t r1 = tgpig.y;
|
||||
const int64_t im = tgpig.z;
|
||||
|
||||
device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
|
||||
device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
|
||||
|
||||
float sumf = 0;
|
||||
if (ne00 < 128) {
|
||||
for (int i = tiisg; i < ne00; i += 32) {
|
||||
sumf += (float) x[i] * (float) y[i];
|
||||
}
|
||||
float all_sum = simd_sum(sumf);
|
||||
if (tiisg == 0) {
|
||||
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
|
||||
}
|
||||
} else {
|
||||
device const half4 * x4 = (device const half4 *) x;
|
||||
device const float4 * y4 = (device const float4 *) y;
|
||||
for (int i = tiisg; i < ne00/4; i += 32) {
|
||||
for (int k = 0; k < 4; ++k) sumf += (float)x4[i][k] * y4[i][k];
|
||||
}
|
||||
float all_sum = simd_sum(sumf);
|
||||
if (tiisg == 0) {
|
||||
for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
|
||||
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#define N_F16_F32 4
|
||||
|
||||
kernel void kernel_mul_mat_f16_f32(
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
@@ -515,37 +579,58 @@ kernel void kernel_mul_mat_f16_f32(
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
threadgroup float * sum [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpig[[thread_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 tptg[[threads_per_threadgroup]]) {
|
||||
uint tiisg[[thread_index_in_simdgroup]]) {
|
||||
|
||||
const int64_t r0 = tgpig.x;
|
||||
const int64_t r1 = tgpig.y;
|
||||
const int64_t rb = tgpig.y*N_F16_F32;
|
||||
const int64_t im = tgpig.z;
|
||||
|
||||
device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
|
||||
device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
|
||||
device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
|
||||
|
||||
sum[tpitg.x] = 0.0f;
|
||||
if (ne00 < 128) {
|
||||
for (int row = 0; row < N_F16_F32; ++row) {
|
||||
int r1 = rb + row;
|
||||
if (r1 >= ne11) {
|
||||
break;
|
||||
}
|
||||
|
||||
for (int i = tpitg.x; i < ne00; i += tptg.x) {
|
||||
sum[tpitg.x] += (float) x[i] * (float) y[i];
|
||||
}
|
||||
device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
|
||||
|
||||
// accumulate the sum from all threads in the threadgroup
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
for (uint i = tptg.x/2; i > 0; i /= 2) {
|
||||
if (tpitg.x < i) {
|
||||
sum[tpitg.x] += sum[tpitg.x + i];
|
||||
float sumf = 0;
|
||||
for (int i = tiisg; i < ne00; i += 32) {
|
||||
sumf += (float) x[i] * (float) y[i];
|
||||
}
|
||||
|
||||
float all_sum = simd_sum(sumf);
|
||||
if (tiisg == 0) {
|
||||
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
device const half4 * x4 = (device const half4 *)x;
|
||||
for (int row = 0; row < N_F16_F32; ++row) {
|
||||
int r1 = rb + row;
|
||||
if (r1 >= ne11) {
|
||||
break;
|
||||
}
|
||||
|
||||
device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
|
||||
device const float4 * y4 = (device const float4 *) y;
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = tiisg; i < ne00/4; i += 32) {
|
||||
for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k];
|
||||
}
|
||||
|
||||
float all_sum = simd_sum(sumf);
|
||||
if (tiisg == 0) {
|
||||
for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
|
||||
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
|
||||
}
|
||||
}
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
}
|
||||
|
||||
if (tpitg.x == 0) {
|
||||
dst[im*ne1*ne0 + r1*ne0 + r0] = sum[0];
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_alibi_f32(
|
||||
@@ -1244,7 +1329,8 @@ kernel void kernel_mul_mat_q4_K_f32(
|
||||
const int r0 = tgpig.x;
|
||||
const int r1 = tgpig.y;
|
||||
const int r2 = tgpig.z;
|
||||
const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
|
||||
//const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
|
||||
const int first_row = r0 * N_DST;
|
||||
const int ib_row = first_row * nb;
|
||||
const uint offset0 = r2/gqa*(nb*ne0);
|
||||
device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0;
|
||||
|
||||
@@ -1334,7 +1334,7 @@ void ggml_cl_free_data(const struct ggml_tensor* tensor) {
|
||||
return;
|
||||
}
|
||||
|
||||
cl_mem mem = (cl_mem)tensor->data;
|
||||
cl_mem mem = (cl_mem)tensor->extra;
|
||||
clReleaseMemObject(mem);
|
||||
}
|
||||
|
||||
@@ -1393,7 +1393,7 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
|
||||
size_t d_size;
|
||||
|
||||
cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
|
||||
cl_mem d_Y = (cl_mem) src1->data; // src1 is already on device, broadcasted.
|
||||
cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
|
||||
cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst
|
||||
|
||||
|
||||
@@ -1491,9 +1491,9 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
||||
size_t d_size;
|
||||
cl_mem d_X;
|
||||
if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
|
||||
d_X = (cl_mem) src0->data;
|
||||
d_X = (cl_mem) src0->extra;
|
||||
} else {
|
||||
d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
|
||||
d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
|
||||
}
|
||||
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
|
||||
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
|
||||
@@ -1567,7 +1567,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
||||
size_t d_size;
|
||||
cl_mem d_X;
|
||||
if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
|
||||
d_X = (cl_mem) src0->data;
|
||||
d_X = (cl_mem) src0->extra;
|
||||
} else {
|
||||
d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
|
||||
}
|
||||
@@ -1697,7 +1697,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
||||
events.emplace_back();
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
|
||||
} else if (src0->backend == GGML_BACKEND_GPU) {
|
||||
d_Q = (cl_mem) src0->data;
|
||||
d_Q = (cl_mem) src0->extra;
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
@@ -1860,6 +1860,6 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
|
||||
|
||||
CL_CHECK(clFinish(queue));
|
||||
|
||||
tensor->data = dst;
|
||||
tensor->extra = dst;
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||
}
|
||||
|
||||
64
ggml.h
64
ggml.h
@@ -130,13 +130,16 @@
|
||||
// The data of the tensor is accessed via the "data" pointer. For example:
|
||||
//
|
||||
// {
|
||||
// struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3);
|
||||
// const int nx = 2;
|
||||
// const int ny = 3;
|
||||
//
|
||||
// // a[2, 1] = 1.0f;
|
||||
// *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f;
|
||||
// struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, ny);
|
||||
//
|
||||
// // a[0, 2] = 2.0f;
|
||||
// *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f;
|
||||
// for (int y = 0; y < ny; y++) {
|
||||
// for (int x = 0; x < nx; x++) {
|
||||
// *(float *) ((char *) a->data + y*a->nb[1] + x*a->nb[0]) = x + y;
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// ...
|
||||
// }
|
||||
@@ -211,12 +214,17 @@
|
||||
#define GGML_MAX_OP_PARAMS 32
|
||||
#define GGML_DEFAULT_N_THREADS 4
|
||||
|
||||
#if UINTPTR_MAX == 0xFFFFFFFF
|
||||
#define GGML_MEM_ALIGN 4
|
||||
#else
|
||||
#define GGML_MEM_ALIGN 16
|
||||
#endif
|
||||
|
||||
#define GGML_EXIT_SUCCESS 0
|
||||
#define GGML_EXIT_ABORTED 1
|
||||
|
||||
#define GGUF_MAGIC 0x46554747 // "GGUF"
|
||||
#define GGUF_VERSION 1
|
||||
#define GGUF_VERSION 2
|
||||
|
||||
#define GGUF_DEFAULT_ALIGNMENT 32
|
||||
|
||||
@@ -471,6 +479,9 @@ extern "C" {
|
||||
int64_t perf_cycles;
|
||||
int64_t perf_time_us;
|
||||
|
||||
struct ggml_tensor * view_src;
|
||||
size_t view_offs;
|
||||
|
||||
void * data;
|
||||
|
||||
char name[GGML_MAX_NAME];
|
||||
@@ -653,7 +664,7 @@ extern "C" {
|
||||
GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
|
||||
GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
|
||||
GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
|
||||
|
||||
@@ -944,11 +955,11 @@ extern "C" {
|
||||
|
||||
// a - x
|
||||
// b - dy
|
||||
// TODO: update with configurable eps
|
||||
GGML_API struct ggml_tensor * ggml_rms_norm_back(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
struct ggml_tensor * b,
|
||||
float eps);
|
||||
|
||||
// A: n columns, m rows
|
||||
// B: n columns, p rows (i.e. we transpose it internally)
|
||||
@@ -1604,7 +1615,8 @@ extern "C" {
|
||||
struct ggml_tensor * tensor);
|
||||
|
||||
|
||||
GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
|
||||
|
||||
GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
|
||||
GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
|
||||
@@ -1669,6 +1681,8 @@ extern "C" {
|
||||
GGML_LINESEARCH_INVALID_PARAMETERS,
|
||||
};
|
||||
|
||||
typedef void (*ggml_opt_callback)(void * data, float * sched);
|
||||
|
||||
// optimization parameters
|
||||
//
|
||||
// see ggml.c (ggml_opt_default_params) for default values
|
||||
@@ -1704,12 +1718,14 @@ extern "C" {
|
||||
|
||||
float sched; // schedule multiplier (fixed, decay or warmup)
|
||||
float decay; // weight decay for AdamW, use 0.0f to disable
|
||||
int decay_min_ndim; // minimum number of tensor dimension to apply weight decay
|
||||
float alpha; // learning rate
|
||||
float beta1;
|
||||
float beta2;
|
||||
float eps; // epsilon for numerical stability
|
||||
float eps_f; // epsilon for convergence test
|
||||
float eps_g; // epsilon for convergence test
|
||||
float gclip; // gradient clipping
|
||||
} adam;
|
||||
|
||||
// LBFGS parameters
|
||||
@@ -1737,14 +1753,12 @@ extern "C" {
|
||||
|
||||
bool just_initialized;
|
||||
|
||||
float loss_before;
|
||||
float loss_after;
|
||||
|
||||
struct {
|
||||
struct ggml_tensor * x; // view of the parameters
|
||||
struct ggml_tensor * g1; // gradient
|
||||
struct ggml_tensor * g2; // gradient squared
|
||||
struct ggml_tensor * m; // first moment
|
||||
struct ggml_tensor * v; // second moment
|
||||
struct ggml_tensor * mh; // first moment hat
|
||||
struct ggml_tensor * vh; // second moment hat
|
||||
struct ggml_tensor * pf; // past function values
|
||||
float fx_best;
|
||||
float fx_prev;
|
||||
@@ -1781,10 +1795,10 @@ extern "C" {
|
||||
|
||||
// initialize optimizer context
|
||||
GGML_API void ggml_opt_init(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_opt_context * opt,
|
||||
struct ggml_opt_params params,
|
||||
int64_t nx);
|
||||
struct ggml_opt_params params,
|
||||
int64_t nx);
|
||||
|
||||
// continue optimizing the function defined by the tensor f
|
||||
GGML_API enum ggml_opt_result ggml_opt_resume(
|
||||
@@ -1798,7 +1812,9 @@ extern "C" {
|
||||
struct ggml_opt_context * opt,
|
||||
struct ggml_tensor * f,
|
||||
struct ggml_cgraph * gf,
|
||||
struct ggml_cgraph * gb);
|
||||
struct ggml_cgraph * gb,
|
||||
ggml_opt_callback callback,
|
||||
void * callback_data);
|
||||
|
||||
//
|
||||
// quantization
|
||||
@@ -1827,6 +1843,9 @@ extern "C" {
|
||||
GGUF_TYPE_BOOL = 7,
|
||||
GGUF_TYPE_STRING = 8,
|
||||
GGUF_TYPE_ARRAY = 9,
|
||||
GGUF_TYPE_UINT64 = 10,
|
||||
GGUF_TYPE_INT64 = 11,
|
||||
GGUF_TYPE_FLOAT64 = 12,
|
||||
GGUF_TYPE_COUNT, // marks the end of the enum
|
||||
};
|
||||
|
||||
@@ -1867,6 +1886,9 @@ extern "C" {
|
||||
GGML_API uint32_t gguf_get_val_u32 (struct gguf_context * ctx, int i);
|
||||
GGML_API int32_t gguf_get_val_i32 (struct gguf_context * ctx, int i);
|
||||
GGML_API float gguf_get_val_f32 (struct gguf_context * ctx, int i);
|
||||
GGML_API uint64_t gguf_get_val_u64 (struct gguf_context * ctx, int i);
|
||||
GGML_API int64_t gguf_get_val_i64 (struct gguf_context * ctx, int i);
|
||||
GGML_API double gguf_get_val_f64 (struct gguf_context * ctx, int i);
|
||||
GGML_API bool gguf_get_val_bool(struct gguf_context * ctx, int i);
|
||||
GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i);
|
||||
GGML_API int gguf_get_arr_n (struct gguf_context * ctx, int i);
|
||||
@@ -1886,6 +1908,9 @@ extern "C" {
|
||||
GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
|
||||
GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val);
|
||||
GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val);
|
||||
GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val);
|
||||
GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t val);
|
||||
GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double val);
|
||||
GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val);
|
||||
GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
|
||||
GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
|
||||
@@ -1944,6 +1969,7 @@ extern "C" {
|
||||
GGML_API int ggml_cpu_has_clblast (void);
|
||||
GGML_API int ggml_cpu_has_gpublas (void);
|
||||
GGML_API int ggml_cpu_has_sse3 (void);
|
||||
GGML_API int ggml_cpu_has_ssse3 (void);
|
||||
GGML_API int ggml_cpu_has_vsx (void);
|
||||
|
||||
//
|
||||
|
||||
@@ -27,8 +27,25 @@ In this case, upgrade Pip to the latest:
|
||||
pip install --upgrade pip
|
||||
```
|
||||
|
||||
## Publishing
|
||||
To publish the package, you need to have `twine` and `build` installed:
|
||||
## Automatic publishing with CI
|
||||
|
||||
There's a GitHub workflow to make a release automatically upon creation of tags in a specified format.
|
||||
|
||||
1. Bump the version in `pyproject.toml`.
|
||||
2. Create a tag named `gguf-vx.x.x` where `x.x.x` is the semantic version number.
|
||||
|
||||
```sh
|
||||
git tag -a gguf-v1.0.0 -m "Version 1.0 release"
|
||||
```
|
||||
|
||||
3. Push the tags.
|
||||
|
||||
```sh
|
||||
git push origin --tags
|
||||
```
|
||||
|
||||
## Manual publishing
|
||||
If you want to publish the package manually for any reason, you need to have `twine` and `build` installed:
|
||||
|
||||
```sh
|
||||
pip install build twine
|
||||
@@ -36,7 +53,7 @@ pip install build twine
|
||||
|
||||
Then, folow these steps to release a new version:
|
||||
|
||||
1. Update the version in `pyproject.toml`.
|
||||
1. Bump the version in `pyproject.toml`.
|
||||
2. Build the package:
|
||||
|
||||
```sh
|
||||
|
||||
@@ -1,19 +1,25 @@
|
||||
#!/usr/bin/env python3
|
||||
import shutil
|
||||
import sys
|
||||
import struct
|
||||
import tempfile
|
||||
import numpy as np
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import struct
|
||||
import sys
|
||||
import tempfile
|
||||
from enum import IntEnum, auto
|
||||
from typing import Any, IO, List, Optional
|
||||
from io import BufferedWriter
|
||||
from pathlib import Path
|
||||
from typing import IO, Any, BinaryIO, Callable, Sequence
|
||||
|
||||
import numpy as np
|
||||
|
||||
#
|
||||
# constants
|
||||
#
|
||||
|
||||
GGUF_MAGIC = 0x46554747
|
||||
GGUF_VERSION = 1
|
||||
GGUF_VERSION = 2
|
||||
GGUF_DEFAULT_ALIGNMENT = 32
|
||||
|
||||
# general
|
||||
@@ -71,35 +77,35 @@ KEY_TOKENIZER_RWKV = "tokenizer.rwkv.world"
|
||||
|
||||
|
||||
class MODEL_ARCH(IntEnum):
|
||||
LLAMA = auto()
|
||||
FALCON = auto()
|
||||
GPT2 = auto()
|
||||
GPTJ = auto()
|
||||
GPTNEOX = auto()
|
||||
MPT = auto()
|
||||
LLAMA : int = auto()
|
||||
FALCON : int = auto()
|
||||
GPT2 : int = auto()
|
||||
GPTJ : int = auto()
|
||||
GPTNEOX: int = auto()
|
||||
MPT : int = auto()
|
||||
|
||||
|
||||
class MODEL_TENSOR(IntEnum):
|
||||
TOKEN_EMBD = auto()
|
||||
POS_EMBD = auto()
|
||||
OUTPUT = auto()
|
||||
OUTPUT_NORM = auto()
|
||||
ROPE_FREQS = auto()
|
||||
ATTN_Q = auto()
|
||||
ATTN_K = auto()
|
||||
ATTN_V = auto()
|
||||
ATTN_QKV = auto()
|
||||
ATTN_OUT = auto()
|
||||
ATTN_NORM = auto()
|
||||
ATTN_NORM_2 = auto()
|
||||
ATTN_ROT_EMBD = auto()
|
||||
FFN_GATE = auto()
|
||||
FFN_DOWN = auto()
|
||||
FFN_UP = auto()
|
||||
FFN_NORM = auto()
|
||||
TOKEN_EMBD : int = auto()
|
||||
POS_EMBD : int = auto()
|
||||
OUTPUT : int = auto()
|
||||
OUTPUT_NORM : int = auto()
|
||||
ROPE_FREQS : int = auto()
|
||||
ATTN_Q : int = auto()
|
||||
ATTN_K : int = auto()
|
||||
ATTN_V : int = auto()
|
||||
ATTN_QKV : int = auto()
|
||||
ATTN_OUT : int = auto()
|
||||
ATTN_NORM : int = auto()
|
||||
ATTN_NORM_2 : int = auto()
|
||||
ATTN_ROT_EMBD: int = auto()
|
||||
FFN_GATE : int = auto()
|
||||
FFN_DOWN : int = auto()
|
||||
FFN_UP : int = auto()
|
||||
FFN_NORM : int = auto()
|
||||
|
||||
|
||||
MODEL_ARCH_NAMES = {
|
||||
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.LLAMA: "llama",
|
||||
MODEL_ARCH.FALCON: "falcon",
|
||||
MODEL_ARCH.GPT2: "gpt2",
|
||||
@@ -108,7 +114,7 @@ MODEL_ARCH_NAMES = {
|
||||
MODEL_ARCH.MPT: "mpt",
|
||||
}
|
||||
|
||||
MODEL_TENSOR_NAMES = {
|
||||
MODEL_TENSOR_NAMES: dict[MODEL_ARCH, dict[MODEL_TENSOR, str]] = {
|
||||
MODEL_ARCH.LLAMA: {
|
||||
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
|
||||
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
|
||||
@@ -154,7 +160,7 @@ MODEL_TENSOR_NAMES = {
|
||||
}
|
||||
|
||||
# tensors that will not be serialized
|
||||
MODEL_TENSOR_SKIP = {
|
||||
MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_ARCH.LLAMA: [
|
||||
MODEL_TENSOR.ROPE_FREQS,
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||
@@ -162,167 +168,198 @@ MODEL_TENSOR_SKIP = {
|
||||
}
|
||||
|
||||
|
||||
# TODO: the following helper functions should be removed
|
||||
# instead, get_tensor_name_map should return tuples of (name, MODEL_TENSOR)
|
||||
# however, my Python is very bad, and I couldn't figure out how to do this, hence these functions
|
||||
# REMOVE
|
||||
def should_skip_tensor_TMP(arch: MODEL_ARCH, n_blocks: int, name: str) -> bool:
|
||||
for skip in MODEL_TENSOR_SKIP.get(arch, []):
|
||||
for i in range(n_blocks):
|
||||
if name == MODEL_TENSOR_NAMES[arch][skip].format(bid=i):
|
||||
return True
|
||||
class TensorNameMap:
|
||||
mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
|
||||
# Token embeddings
|
||||
MODEL_TENSOR.TOKEN_EMBD: (
|
||||
"gpt_neox.embed_in", # gptneox
|
||||
"transformer.wte", # gpt2 mpt
|
||||
"transformer.word_embeddings", # falcon
|
||||
"model.embed_tokens", # llama-hf
|
||||
"tok_embeddings", # llama-pth
|
||||
),
|
||||
|
||||
return False
|
||||
# Position embeddings
|
||||
MODEL_TENSOR.POS_EMBD: (
|
||||
"transformer.wpe", # gpt2
|
||||
),
|
||||
|
||||
# Output
|
||||
MODEL_TENSOR.OUTPUT: (
|
||||
"embed_out", # gptneox
|
||||
"lm_head", # gpt2 mpt falcon llama-hf
|
||||
"output", # llama-pth
|
||||
),
|
||||
|
||||
def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> dict:
|
||||
tensor_map = {}
|
||||
# Output norm
|
||||
MODEL_TENSOR.OUTPUT_NORM: (
|
||||
"gpt_neox.final_layer_norm", # gptneox
|
||||
"transformer.ln_f", # gpt2 falcon
|
||||
"model.norm", # llama-hf
|
||||
"norm", # llama-pth
|
||||
),
|
||||
|
||||
# Token embeddings
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.TOKEN_EMBD, None)
|
||||
# Rope frequencies
|
||||
MODEL_TENSOR.ROPE_FREQS: (
|
||||
"rope.freqs", # llama-pth
|
||||
),
|
||||
}
|
||||
|
||||
tensor_map["gpt_neox.embed_in"] = mapped_to # gptneox
|
||||
tensor_map["transformer.wte"] = mapped_to # gpt2 mpt
|
||||
tensor_map["transformer.word_embeddings"] = mapped_to # falcon
|
||||
tensor_map["model.embed_tokens"] = mapped_to # llama-hf
|
||||
tensor_map["tok_embeddings"] = mapped_to # llama-pth
|
||||
|
||||
# Position embeddings
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.POS_EMBD, None)
|
||||
|
||||
tensor_map["transformer.wpe"] = mapped_to # gpt2
|
||||
|
||||
# Output
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.OUTPUT, None)
|
||||
|
||||
tensor_map["embed_out"] = mapped_to # gptneox
|
||||
tensor_map["lm_head"] = mapped_to # gpt2 mpt falcon llama-hf
|
||||
tensor_map["output"] = mapped_to # llama-pth
|
||||
|
||||
# Output norm
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.OUTPUT_NORM, None)
|
||||
|
||||
tensor_map["gpt_neox.final_layer_norm"] = mapped_to # gptneox
|
||||
tensor_map["transformer.ln_f"] = mapped_to # gpt2 falcon
|
||||
tensor_map["transformer.norm_f"] = mapped_to # mpt
|
||||
tensor_map["model.norm"] = mapped_to # llama-hf
|
||||
tensor_map["norm"] = mapped_to # llama-pth
|
||||
|
||||
# Rope frequencies
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ROPE_FREQS, None)
|
||||
|
||||
tensor_map["rope.freqs"] = mapped_to # llama-pth
|
||||
|
||||
# Attention and feed-forward blocks
|
||||
for i in range(0, n_blocks):
|
||||
block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
|
||||
# Attention norm
|
||||
# TODO: is there are simpler way to write these 2 lines in Python?
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_NORM, None)
|
||||
mapped_to = mapped_to.format(bid=i) if mapped_to else None
|
||||
|
||||
tensor_map["gpt_neox.layers."+str(i)+".input_layernorm"] = mapped_to # gptneox
|
||||
tensor_map["transformer.h."+str(i)+".ln_1"] = mapped_to # gpt2
|
||||
tensor_map["transformer.blocks."+str(i)+".norm_1"] = mapped_to # mpt
|
||||
tensor_map["transformer.h."+str(i)+".input_layernorm"] = mapped_to # falcon7b
|
||||
tensor_map["transformer.h."+str(i)+".ln_mlp"] = mapped_to # falcon40b
|
||||
tensor_map["model.layers."+str(i)+".input_layernorm"] = mapped_to # llama-hf
|
||||
tensor_map["layers."+str(i)+".attention_norm"] = mapped_to # llama-pth
|
||||
MODEL_TENSOR.ATTN_NORM: (
|
||||
"gpt_neox.layers.{bid}.input_layernorm", # gptneox
|
||||
"transformer.h.{bid}.ln_1", # gpt2
|
||||
"transformer.blocks.{bid}.norm_1", # mpt
|
||||
"transformer.h.{bid}.input_layernorm", # falcon7b
|
||||
"transformer.h.{bid}.ln_mlp", # falcon40b
|
||||
"model.layers.{bid}.input_layernorm", # llama-hf
|
||||
"layers.{bid}.attention_norm", # llama-pth
|
||||
),
|
||||
|
||||
# Attention norm 2
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_NORM_2, None)
|
||||
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
|
||||
|
||||
tensor_map["transformer.h."+str(i)+".ln_attn"] = mapped_to # falcon40b
|
||||
MODEL_TENSOR.ATTN_NORM_2: (
|
||||
"transformer.h.{bid}.ln_attn", # falcon40b
|
||||
),
|
||||
|
||||
# Attention query-key-value
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_QKV, None)
|
||||
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
|
||||
|
||||
tensor_map["gpt_neox.layers."+str(i)+".attention.query_key_value"] = mapped_to # gptneox
|
||||
tensor_map["transformer.h."+str(i)+".attn.c_attn"] = mapped_to # gpt2
|
||||
tensor_map["transformer.blocks."+str(i)+".attn.Wqkv"] = mapped_to # mpt
|
||||
tensor_map["transformer.h."+str(i)+".self_attention.query_key_value"] = mapped_to # falcon
|
||||
MODEL_TENSOR.ATTN_QKV: (
|
||||
"gpt_neox.layers.{bid}.attention.query_key_value", # gptneox
|
||||
"transformer.h.{bid}.attn.c_attn", # gpt2
|
||||
"transformer.blocks.{bid}.attn.Wqkv", # mpt
|
||||
"transformer.h.{bid}.self_attention.query_key_value", # falcon
|
||||
),
|
||||
|
||||
# Attention query
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_Q, None)
|
||||
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
|
||||
|
||||
tensor_map["model.layers."+str(i)+".self_attn.q_proj"] = mapped_to # llama-hf
|
||||
tensor_map["layers."+str(i)+".attention.wq"] = mapped_to # llama-pth
|
||||
MODEL_TENSOR.ATTN_Q: (
|
||||
"model.layers.{bid}.self_attn.q_proj", # llama-hf
|
||||
"layers.{bid}.attention.wq", # llama-pth
|
||||
),
|
||||
|
||||
# Attention key
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_K, None)
|
||||
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
|
||||
|
||||
tensor_map["model.layers."+str(i)+".self_attn.k_proj"] = mapped_to # llama-hf
|
||||
tensor_map["layers."+str(i)+".attention.wk"] = mapped_to # llama-pth
|
||||
MODEL_TENSOR.ATTN_K: (
|
||||
"model.layers.{bid}.self_attn.k_proj", # llama-hf
|
||||
"layers.{bid}.attention.wk", # llama-pth
|
||||
),
|
||||
|
||||
# Attention value
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_V, None)
|
||||
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
|
||||
|
||||
tensor_map["model.layers."+str(i)+".self_attn.v_proj"] = mapped_to # llama-hf
|
||||
tensor_map["layers."+str(i)+".attention.wv"] = mapped_to # llama-pth
|
||||
MODEL_TENSOR.ATTN_V: (
|
||||
"model.layers.{bid}.self_attn.v_proj", # llama-hf
|
||||
"layers.{bid}.attention.wv", # llama-pth
|
||||
),
|
||||
|
||||
# Attention output
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_OUT, None)
|
||||
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
|
||||
|
||||
tensor_map["gpt_neox.layers."+str(i)+".attention.dense"] = mapped_to # gptneox
|
||||
tensor_map["transformer.h."+str(i)+".attn.c_proj"] = mapped_to # gpt2
|
||||
tensor_map["transformer.blocks."+str(i)+".attn.out_proj"] = mapped_to # mpt
|
||||
tensor_map["transformer.h."+str(i)+".self_attention.dense"] = mapped_to # falcon
|
||||
tensor_map["model.layers."+str(i)+".self_attn.o_proj"] = mapped_to # llama-hf
|
||||
tensor_map["layers."+str(i)+".attention.wo"] = mapped_to # llama-pth
|
||||
MODEL_TENSOR.ATTN_OUT: (
|
||||
"gpt_neox.layers.{bid}.attention.dense", # gptneox
|
||||
"transformer.h.{bid}.attn.c_proj", # gpt2
|
||||
"transformer.blocks.{bid}.attn.out_proj", # mpt
|
||||
"transformer.h.{bid}.self_attention.dense", # falcon
|
||||
"model.layers.{bid}.self_attn.o_proj", # llama-hf
|
||||
"layers.{bid}.attention.wo", # llama-pth
|
||||
),
|
||||
|
||||
# Rotary embeddings
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_ROT_EMBD, None)
|
||||
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
|
||||
|
||||
tensor_map["model.layers."+str(i)+".self_attn.rotary_emb.inv_freq"] = mapped_to # llama-hf
|
||||
tensor_map["layers."+str(i)+".attention.inner_attention.rope.freqs"] = mapped_to # llama-pth
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD: (
|
||||
"model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf
|
||||
"layers.{bid}.attention.inner_attention.rope.freqs", # llama-pth
|
||||
),
|
||||
|
||||
# Feed-forward norm
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_NORM, None)
|
||||
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
|
||||
|
||||
tensor_map["gpt_neox.layers."+str(i)+".post_attention_layernorm"] = mapped_to # gptneox
|
||||
tensor_map["transformer.h."+str(i)+".ln_2"] = mapped_to # gpt2
|
||||
tensor_map["transformer.blocks."+str(i)+".norm_2"] = mapped_to # mpt
|
||||
tensor_map["model.layers."+str(i)+".post_attention_layernorm"] = mapped_to # llama-hf
|
||||
tensor_map["layers."+str(i)+".ffn_norm"] = mapped_to # llama-pth
|
||||
MODEL_TENSOR.FFN_NORM: (
|
||||
"gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
|
||||
"transformer.h.{bid}.ln_2", # gpt2
|
||||
"transformer.blocks.{bid}.norm_2", # mpt
|
||||
"model.layers.{bid}.post_attention_layernorm", # llama-hf
|
||||
"layers.{bid}.ffn_norm", # llama-pth
|
||||
),
|
||||
|
||||
# Feed-forward up
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_UP, None)
|
||||
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
|
||||
|
||||
tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_h_to_4h"] = mapped_to # gptneox
|
||||
tensor_map["transformer.h."+str(i)+".mlp.c_fc"] = mapped_to # gpt2
|
||||
tensor_map["transformer.blocks."+str(i)+".ffn.up_proj"] = mapped_to # mpt
|
||||
tensor_map["transformer.h."+str(i)+".mlp.dense_h_to_4h"] = mapped_to # falcon
|
||||
tensor_map["model.layers."+str(i)+".mlp.up_proj"] = mapped_to # llama-hf
|
||||
tensor_map["layers."+str(i)+".feed_forward.w3"] = mapped_to # llama-pth
|
||||
MODEL_TENSOR.FFN_UP: (
|
||||
"gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox
|
||||
"transformer.h.{bid}.mlp.c_fc", # gpt2
|
||||
"transformer.blocks.{bid}.ffn.up_proj", # mpt
|
||||
"transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
|
||||
"model.layers.{bid}.mlp.up_proj", # llama-hf
|
||||
"layers.{bid}.feed_forward.w3", # llama-pth
|
||||
),
|
||||
|
||||
# Feed-forward gate
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_GATE, None)
|
||||
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
|
||||
|
||||
tensor_map["model.layers."+str(i)+".mlp.gate_proj"] = mapped_to # llama-hf
|
||||
tensor_map["layers."+str(i)+".feed_forward.w1"] = mapped_to # llama-pth
|
||||
MODEL_TENSOR.FFN_GATE: (
|
||||
"model.layers.{bid}.mlp.gate_proj", # llama-hf
|
||||
"layers.{bid}.feed_forward.w1", # llama-pth
|
||||
),
|
||||
|
||||
# Feed-forward down
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_DOWN, None)
|
||||
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
|
||||
MODEL_TENSOR.FFN_DOWN: (
|
||||
"gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox
|
||||
"transformer.h.{bid}.mlp.c_proj", # gpt2
|
||||
"transformer.blocks.{bid}.ffn.down_proj", # mpt
|
||||
"transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
|
||||
"model.layers.{bid}.mlp.down_proj", # llama-hf
|
||||
"layers.{bid}.feed_forward.w2", # llama-pth
|
||||
),
|
||||
}
|
||||
|
||||
tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_4h_to_h"] = mapped_to # gptneox
|
||||
tensor_map["transformer.h."+str(i)+".mlp.c_proj"] = mapped_to # gpt2
|
||||
tensor_map["transformer.blocks."+str(i)+".ffn.down_proj"] = mapped_to # mpt
|
||||
tensor_map["transformer.h."+str(i)+".mlp.dense_4h_to_h"] = mapped_to # falcon
|
||||
tensor_map["model.layers."+str(i)+".mlp.down_proj"] = mapped_to # llama-hf
|
||||
tensor_map["layers."+str(i)+".feed_forward.w2"] = mapped_to # llama-pth
|
||||
mapping: dict[str, tuple[MODEL_TENSOR, str]]
|
||||
|
||||
return tensor_map
|
||||
tensor_names: dict[MODEL_TENSOR, str]
|
||||
|
||||
def __init__(self, arch: MODEL_ARCH, n_blocks: int):
|
||||
mapping = self.mapping = {}
|
||||
tensor_names = self.tensor_names = MODEL_TENSOR_NAMES[arch]
|
||||
for tensor, keys in self.mappings_cfg.items():
|
||||
tensor_name = tensor_names.get(tensor)
|
||||
if tensor_name is None:
|
||||
continue
|
||||
for key in keys:
|
||||
mapping[key] = (tensor, tensor_name)
|
||||
for bid in range(n_blocks):
|
||||
for tensor, keys in self.block_mappings_cfg.items():
|
||||
tensor_name = tensor_names.get(tensor)
|
||||
if tensor_name is None:
|
||||
continue
|
||||
tensor_name = tensor_name.format(bid = bid)
|
||||
for key in keys:
|
||||
key = key.format(bid = bid)
|
||||
mapping[key] = (tensor, tensor_name)
|
||||
|
||||
def get_type_and_name(self, key: str, try_suffixes: Sequence[str]) -> tuple[MODEL_TENSOR, str] | None:
|
||||
result = self.mapping.get(key)
|
||||
if result is not None:
|
||||
return result
|
||||
for suffix in try_suffixes:
|
||||
if key.endswith(suffix):
|
||||
result = self.mapping.get(key[:-len(suffix)])
|
||||
if result is not None:
|
||||
return (result[0], result[1] + suffix)
|
||||
return None
|
||||
|
||||
def get_name(self, key: str, try_suffixes: Sequence[str]) -> str | None:
|
||||
result = self.get_type_and_name(key, try_suffixes = try_suffixes)
|
||||
if result is None:
|
||||
return None
|
||||
return result[1]
|
||||
|
||||
def get_type(self, key: str, try_suffixes: Sequence[str]) -> MODEL_TENSOR | None:
|
||||
result = self.get_type_and_name(key, try_suffixes = try_suffixes)
|
||||
if result is None:
|
||||
return None
|
||||
return result[0]
|
||||
|
||||
def __getitem__(self, key: str) -> str:
|
||||
try:
|
||||
return self.mapping[key][1]
|
||||
except KeyError:
|
||||
raise KeyError(key)
|
||||
|
||||
def __contains__(self, key: str) -> bool:
|
||||
return key in self.mapping
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return repr(self.mapping)
|
||||
|
||||
def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> TensorNameMap:
|
||||
return TensorNameMap(arch, n_blocks)
|
||||
|
||||
class TokenType(IntEnum):
|
||||
NORMAL = 1
|
||||
@@ -365,6 +402,9 @@ class GGUFValueType(IntEnum):
|
||||
BOOL = 7
|
||||
STRING = 8
|
||||
ARRAY = 9
|
||||
UINT64 = 10
|
||||
INT64 = 11
|
||||
FLOAT64 = 12
|
||||
|
||||
@staticmethod
|
||||
def get_type(val):
|
||||
@@ -378,21 +418,28 @@ class GGUFValueType(IntEnum):
|
||||
return GGUFValueType.BOOL
|
||||
elif isinstance(val, int):
|
||||
return GGUFValueType.INT32
|
||||
# TODO: need help with 64-bit types in Python
|
||||
else:
|
||||
print("Unknown type: "+str(type(val)))
|
||||
sys.exit()
|
||||
|
||||
|
||||
class GGUFWriter:
|
||||
def __init__(self, path: str, arch: str, use_temp_file = True):
|
||||
fout: BufferedWriter
|
||||
arch: str
|
||||
offset_tensor = 0
|
||||
data_alignment = GGUF_DEFAULT_ALIGNMENT
|
||||
kv_data = b""
|
||||
kv_data_count = 0
|
||||
ti_data = b""
|
||||
ti_data_count = 0
|
||||
use_temp_file: bool
|
||||
temp_file: tempfile.SpooledTemporaryFile[bytes] | None = None
|
||||
tensors: list[tuple[np.ndarray[Any, Any], int]]
|
||||
|
||||
def __init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True):
|
||||
self.fout = open(path, "wb")
|
||||
self.arch = arch
|
||||
self.offset_tensor = 0
|
||||
self.data_alignment = GGUF_DEFAULT_ALIGNMENT
|
||||
self.kv_data = b""
|
||||
self.kv_data_count = 0
|
||||
self.ti_data = b""
|
||||
self.ti_data_count = 0
|
||||
self.add_architecture()
|
||||
self.use_temp_file = use_temp_file
|
||||
self.tensors = []
|
||||
@@ -400,8 +447,8 @@ class GGUFWriter:
|
||||
def write_header_to_file(self):
|
||||
self.fout.write(struct.pack("<I", GGUF_MAGIC))
|
||||
self.fout.write(struct.pack("<I", GGUF_VERSION))
|
||||
self.fout.write(struct.pack("<I", self.ti_data_count))
|
||||
self.fout.write(struct.pack("<I", self.kv_data_count))
|
||||
self.fout.write(struct.pack("<Q", self.ti_data_count))
|
||||
self.fout.write(struct.pack("<Q", self.kv_data_count))
|
||||
self.flush()
|
||||
# print("tensors " + str(self.ti_data_count) + " kv " + str(self.kv_data_count))
|
||||
|
||||
@@ -444,6 +491,18 @@ class GGUFWriter:
|
||||
self.add_key(key)
|
||||
self.add_val(val, GGUFValueType.FLOAT32)
|
||||
|
||||
def add_uint64(self, key: str, val: int):
|
||||
self.add_key(key)
|
||||
self.add_val(val, GGUFValueType.UINT64)
|
||||
|
||||
def add_int64(self, key: str, val: int):
|
||||
self.add_key(key)
|
||||
self.add_val(val, GGUFValueType.INT64)
|
||||
|
||||
def add_float64(self, key: str, val: float):
|
||||
self.add_key(key)
|
||||
self.add_val(val, GGUFValueType.FLOAT64)
|
||||
|
||||
def add_bool(self, key: str, val: bool):
|
||||
self.add_key(key)
|
||||
self.add_val(val, GGUFValueType.BOOL)
|
||||
@@ -454,14 +513,27 @@ class GGUFWriter:
|
||||
self.add_key(key)
|
||||
self.add_val(val, GGUFValueType.STRING)
|
||||
|
||||
def add_array(self, key: str, val: list):
|
||||
if not isinstance(val, list):
|
||||
raise ValueError("Value must be a list for array type")
|
||||
def add_array(self, key: str, val: Sequence[Any]):
|
||||
if not isinstance(val, Sequence):
|
||||
raise ValueError("Value must be a sequence for array type")
|
||||
|
||||
self.add_key(key)
|
||||
self.add_val(val, GGUFValueType.ARRAY)
|
||||
|
||||
def add_val(self: str, val: Any, vtype: GGUFValueType = None, add_vtype: bool = True):
|
||||
_simple_value_packing = {
|
||||
GGUFValueType.UINT8: "<B",
|
||||
GGUFValueType.INT8: "<b",
|
||||
GGUFValueType.UINT16: "<H",
|
||||
GGUFValueType.INT16: "<h",
|
||||
GGUFValueType.UINT32: "<I",
|
||||
GGUFValueType.INT32: "<i",
|
||||
GGUFValueType.FLOAT32: "<f",
|
||||
GGUFValueType.UINT64: "<Q",
|
||||
GGUFValueType.INT64: "<q",
|
||||
GGUFValueType.FLOAT64: "<d",
|
||||
GGUFValueType.BOOL: "?" ,
|
||||
}
|
||||
def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool = True):
|
||||
if vtype is None:
|
||||
vtype = GGUFValueType.get_type(val)
|
||||
|
||||
@@ -469,50 +541,38 @@ class GGUFWriter:
|
||||
self.kv_data += struct.pack("<I", vtype)
|
||||
self.kv_data_count += 1
|
||||
|
||||
if vtype == GGUFValueType.UINT8:
|
||||
self.kv_data += struct.pack("<B", val)
|
||||
elif vtype == GGUFValueType.INT8:
|
||||
self.kv_data += struct.pack("<b", val)
|
||||
elif vtype == GGUFValueType.UINT16:
|
||||
self.kv_data += struct.pack("<H", val)
|
||||
elif vtype == GGUFValueType.INT16:
|
||||
self.kv_data += struct.pack("<h", val)
|
||||
elif vtype == GGUFValueType.UINT32:
|
||||
self.kv_data += struct.pack("<I", val)
|
||||
elif vtype == GGUFValueType.INT32:
|
||||
self.kv_data += struct.pack("<i", val)
|
||||
elif vtype == GGUFValueType.FLOAT32:
|
||||
self.kv_data += struct.pack("<f", val)
|
||||
elif vtype == GGUFValueType.BOOL:
|
||||
self.kv_data += struct.pack("?", val)
|
||||
pack_fmt = self._simple_value_packing.get(vtype)
|
||||
if pack_fmt is not None:
|
||||
self.kv_data += struct.pack(pack_fmt, val)
|
||||
elif vtype == GGUFValueType.STRING:
|
||||
encoded_val = val.encode("utf8") if isinstance(val, str) else val
|
||||
self.kv_data += struct.pack("<I", len(encoded_val))
|
||||
self.kv_data += struct.pack("<Q", len(encoded_val))
|
||||
self.kv_data += encoded_val
|
||||
elif vtype == GGUFValueType.ARRAY:
|
||||
ltype = set([GGUFValueType.get_type(item) for item in val])
|
||||
assert len(ltype) == 1, "All items in a GGUF array should be of the same type"
|
||||
self.kv_data += struct.pack("<I", list(ltype)[0])
|
||||
self.kv_data += struct.pack("<I", len(val))
|
||||
elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and len(val) > 0:
|
||||
ltype = GGUFValueType.get_type(val[0])
|
||||
if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
|
||||
raise ValueError("All items in a GGUF array should be of the same type")
|
||||
self.kv_data += struct.pack("<I", ltype)
|
||||
self.kv_data += struct.pack("<Q", len(val))
|
||||
for item in val:
|
||||
self.add_val(item, add_vtype=False)
|
||||
else:
|
||||
raise ValueError("Invalid GGUF metadata value type")
|
||||
raise ValueError("Invalid GGUF metadata value type or value")
|
||||
|
||||
@staticmethod
|
||||
def ggml_pad(x: int, n: int) -> int:
|
||||
return ((x + n - 1) // n) * n
|
||||
|
||||
def add_tensor_info(self, name: str, tensor_shape: np.ndarray, tensor_dtype: np.dtype, tensor_nbytes: int, raw_dtype: Optional[GGMLQuantizationType] = None):
|
||||
def add_tensor_info(self, name: str, tensor_shape: Sequence[int], tensor_dtype: np.dtype[np.float16] | np.dtype[np.float32], tensor_nbytes: int, raw_dtype: GGMLQuantizationType | None = None):
|
||||
assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"
|
||||
|
||||
encoded_name = name.encode("utf8")
|
||||
self.ti_data += struct.pack("<I", len(encoded_name))
|
||||
self.ti_data += struct.pack("<Q", len(encoded_name))
|
||||
self.ti_data += encoded_name
|
||||
n_dims = len(tensor_shape)
|
||||
self.ti_data += struct.pack("<I", n_dims)
|
||||
for i in range(n_dims):
|
||||
self.ti_data += struct.pack("<I", tensor_shape[n_dims - 1 - i])
|
||||
self.ti_data += struct.pack("<Q", tensor_shape[n_dims - 1 - i])
|
||||
if raw_dtype is None:
|
||||
dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
|
||||
else:
|
||||
@@ -522,16 +582,18 @@ class GGUFWriter:
|
||||
self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
|
||||
self.ti_data_count += 1
|
||||
|
||||
def add_tensor(self, name: str, tensor: np.ndarray, raw_shape: Optional[np.ndarray] = None, raw_dtype: Optional[GGMLQuantizationType] = None):
|
||||
if self.use_temp_file and not hasattr(self, "temp_file"):
|
||||
self.temp_file = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
|
||||
self.temp_file.seek(0)
|
||||
def add_tensor(self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None, raw_dtype: GGMLQuantizationType | None = None):
|
||||
if self.use_temp_file and self.temp_file is None:
|
||||
fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
|
||||
fp.seek(0)
|
||||
self.temp_file = fp
|
||||
|
||||
self.add_tensor_info(name, raw_shape if raw_shape is not None else tensor.shape, tensor.dtype, tensor.nbytes, raw_dtype = raw_dtype)
|
||||
shape: Sequence[int] = raw_shape if raw_shape is not None else tensor.shape
|
||||
self.add_tensor_info(name, shape, tensor.dtype, tensor.nbytes, raw_dtype = raw_dtype)
|
||||
|
||||
pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
|
||||
|
||||
if not self.use_temp_file:
|
||||
if self.temp_file is None:
|
||||
self.tensors.append((tensor, pad))
|
||||
return
|
||||
|
||||
@@ -540,25 +602,22 @@ class GGUFWriter:
|
||||
if pad != 0:
|
||||
self.temp_file.write(bytes([0] * pad))
|
||||
|
||||
def write_tensor_data(self, tensor: np.ndarray):
|
||||
pad = GGUFWriter.ggml_pad(self.fout.tell(), self.data_alignment) - self.fout.tell()
|
||||
def write_padding(self, fp: BinaryIO, n: int, align: int | None = None):
|
||||
pad = GGUFWriter.ggml_pad(n, align if align is not None else self.data_alignment) - n
|
||||
if pad != 0:
|
||||
self.fout.write(bytes([0] * pad))
|
||||
fp.write(bytes([0] * pad))
|
||||
|
||||
def write_tensor_data(self, tensor: np.ndarray[Any, Any]):
|
||||
self.write_padding(self.fout, self.fout.tell())
|
||||
tensor.tofile(self.fout)
|
||||
|
||||
pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
|
||||
if pad != 0:
|
||||
self.fout.write(bytes([0] * pad))
|
||||
self.write_padding(self.fout, tensor.nbytes)
|
||||
|
||||
def write_tensors_to_file(self):
|
||||
self.write_ti_data_to_file()
|
||||
|
||||
pad = GGUFWriter.ggml_pad(self.fout.tell(), self.data_alignment) - self.fout.tell()
|
||||
if pad != 0:
|
||||
self.fout.write(bytes([0] * pad))
|
||||
self.write_padding(self.fout, self.fout.tell())
|
||||
|
||||
if not self.use_temp_file:
|
||||
if self.temp_file is None:
|
||||
for (currtensor, currpad) in self.tensors:
|
||||
currtensor.tofile(self.fout)
|
||||
if currpad != 0:
|
||||
@@ -632,10 +691,6 @@ class GGUFWriter:
|
||||
self.add_bool(
|
||||
KEY_USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)
|
||||
|
||||
def add_tensor_data_layout(self, layout: str):
|
||||
self.add_string(
|
||||
KEY_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
|
||||
|
||||
def add_head_count(self, count: int):
|
||||
self.add_uint32(
|
||||
KEY_ATTENTION_HEAD_COUNT.format(arch=self.arch), count)
|
||||
@@ -673,16 +728,16 @@ class GGUFWriter:
|
||||
def add_tokenizer_model(self, model: str):
|
||||
self.add_string(KEY_TOKENIZER_MODEL, model)
|
||||
|
||||
def add_token_list(self, tokens: List):
|
||||
def add_token_list(self, tokens: Sequence[str] | Sequence[bytes] | Sequence[bytearray]):
|
||||
self.add_array(KEY_TOKENIZER_LIST, tokens)
|
||||
|
||||
def add_token_merges(self, merges: List):
|
||||
def add_token_merges(self, merges: Sequence[str] | Sequence[bytes] | Sequence[bytearray]):
|
||||
self.add_array(KEY_TOKENIZER_MERGES, merges)
|
||||
|
||||
def add_token_types(self, types: List[int]):
|
||||
def add_token_types(self, types: Sequence[TokenType] | Sequence[int]):
|
||||
self.add_array(KEY_TOKENIZER_TOKEN_TYPE, types)
|
||||
|
||||
def add_token_scores(self, scores: List[float]):
|
||||
def add_token_scores(self, scores: Sequence[float]):
|
||||
self.add_array(KEY_TOKENIZER_SCORES, scores)
|
||||
|
||||
def add_bos_token_id(self, id: int):
|
||||
@@ -701,6 +756,84 @@ class GGUFWriter:
|
||||
self.add_uint32(KEY_TOKENIZER_PAD_ID, id)
|
||||
|
||||
|
||||
class SpecialVocab:
|
||||
load_merges: bool = False
|
||||
merges: list[str] = []
|
||||
special_token_types: tuple[str, ...] = ('bos', 'eos', 'unk', 'sep', 'pad')
|
||||
special_token_ids: dict[str, int] = {}
|
||||
|
||||
def __init__(self, path: Path, load_merges: bool = False, special_token_types: tuple[str, ...] | None = None):
|
||||
self.special_token_ids = {}
|
||||
self.load_merges = load_merges
|
||||
if special_token_types is not None:
|
||||
self.special_token_types = special_token_types
|
||||
self.load(path)
|
||||
|
||||
def load(self, path: Path):
|
||||
if not self.try_load_from_tokenizer_json(path):
|
||||
self.try_load_from_config_json(path)
|
||||
|
||||
def try_load_from_tokenizer_json(self, path: Path) -> bool:
|
||||
tokenizer_file = path / 'tokenizer.json'
|
||||
if not tokenizer_file.is_file():
|
||||
return False
|
||||
with open(tokenizer_file, 'r', encoding = 'utf-8') as f:
|
||||
tokenizer = json.load(f)
|
||||
if self.load_merges:
|
||||
merges = tokenizer.get('model', {}).get('merges')
|
||||
if isinstance(merges, list) and len(merges) > 0 and isinstance(merges[0], str):
|
||||
self.merges = merges
|
||||
tokenizer_config_file = path / 'tokenizer_config.json'
|
||||
added_tokens = tokenizer.get('added_tokens')
|
||||
if added_tokens is None or not tokenizer_config_file.is_file():
|
||||
return True
|
||||
with open(tokenizer_config_file, 'r', encoding = 'utf-8') as f:
|
||||
tokenizer_config = json.load(f)
|
||||
for typ in self.special_token_types:
|
||||
entry = tokenizer_config.get(f'{typ}_token')
|
||||
if isinstance(entry, str):
|
||||
tc_content = entry
|
||||
elif isinstance(entry, dict):
|
||||
entry_content = entry.get('content')
|
||||
if not isinstance(entry_content, str):
|
||||
continue
|
||||
tc_content = entry_content
|
||||
else:
|
||||
continue
|
||||
for maybe_token_id in (atok.get('id') for atok in added_tokens if atok.get('content') == tc_content):
|
||||
if isinstance(maybe_token_id, int) and maybe_token_id >= 0:
|
||||
self.special_token_ids[typ] = maybe_token_id
|
||||
break
|
||||
return True
|
||||
|
||||
def try_load_from_config_json(self, path: Path) -> bool:
|
||||
config_file = path / 'config.json'
|
||||
if not config_file.is_file():
|
||||
return False
|
||||
with open(config_file, 'r', encoding = 'utf-8') as f:
|
||||
config = json.load(f)
|
||||
for typ in self.special_token_types:
|
||||
maybe_token_id = config.get(f'{typ}_token_id')
|
||||
if isinstance(maybe_token_id, int) and maybe_token_id >= 0:
|
||||
self.special_token_ids[typ] = maybe_token_id
|
||||
return True
|
||||
|
||||
def add_to_gguf(self, gw: GGUFWriter):
|
||||
if len(self.merges) > 0:
|
||||
print(f'gguf: Adding {len(self.merges)} merge(s).')
|
||||
gw.add_token_merges(self.merges)
|
||||
for typ, tokid in self.special_token_ids.items():
|
||||
handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None)
|
||||
if handler is None:
|
||||
print(f'gguf: WARNING: No handler for special token type {typ} with id {tokid} - skipping')
|
||||
continue
|
||||
print(f'gguf: Setting special token type {typ} to {tokid}')
|
||||
handler(tokid)
|
||||
|
||||
def __repr__(self):
|
||||
return f'<SpecialVocab with {len(self.merges)} merges and special tokens {self.special_token_ids if self.special_token_ids else "unset"}>'
|
||||
|
||||
|
||||
# Example usage:
|
||||
if __name__ == "__main__":
|
||||
# Example usage with a file
|
||||
|
||||
0
gguf-py/gguf/py.typed
Normal file
0
gguf-py/gguf/py.typed
Normal file
@@ -1,10 +1,11 @@
|
||||
[tool.poetry]
|
||||
name = "gguf"
|
||||
version = "0.2.1"
|
||||
version = "0.3.2"
|
||||
description = "Write ML models in GGUF for GGML"
|
||||
authors = ["GGML <ggml@ggml.ai>"]
|
||||
packages = [
|
||||
{include = "gguf"},
|
||||
{include = "gguf/py.typed"},
|
||||
]
|
||||
readme = "README.md"
|
||||
homepage = "https://ggml.ai"
|
||||
|
||||
42
grammars/c.gbnf
Normal file
42
grammars/c.gbnf
Normal file
@@ -0,0 +1,42 @@
|
||||
root ::= (declaration)*
|
||||
|
||||
declaration ::= dataType identifier "(" parameter? ")" "{" statement* "}"
|
||||
|
||||
dataType ::= "int" ws | "float" ws | "char" ws
|
||||
identifier ::= [a-zA-Z_] [a-zA-Z_0-9]*
|
||||
|
||||
parameter ::= dataType identifier
|
||||
|
||||
statement ::=
|
||||
( dataType identifier ws "=" ws expression ";" ) |
|
||||
( identifier ws "=" ws expression ";" ) |
|
||||
( identifier ws "(" argList? ")" ";" ) |
|
||||
( "return" ws expression ";" ) |
|
||||
( "while" "(" condition ")" "{" statement* "}" ) |
|
||||
( "for" "(" forInit ";" ws condition ";" ws forUpdate ")" "{" statement* "}" ) |
|
||||
( "if" "(" condition ")" "{" statement* "}" ("else" "{" statement* "}")? ) |
|
||||
( singleLineComment ) |
|
||||
( multiLineComment )
|
||||
|
||||
forInit ::= dataType identifier ws "=" ws expression | identifier ws "=" ws expression
|
||||
forUpdate ::= identifier ws "=" ws expression
|
||||
|
||||
condition ::= expression relationOperator expression
|
||||
relationOperator ::= ("<=" | "<" | "==" | "!=" | ">=" | ">")
|
||||
|
||||
expression ::= term (("+" | "-") term)*
|
||||
term ::= factor(("*" | "/") factor)*
|
||||
|
||||
factor ::= identifier | number | unaryTerm | funcCall | parenExpression
|
||||
unaryTerm ::= "-" factor
|
||||
funcCall ::= identifier "(" argList? ")"
|
||||
parenExpression ::= "(" ws expression ws ")"
|
||||
|
||||
argList ::= expression ("," ws expression)*
|
||||
|
||||
number ::= [0-9]+
|
||||
|
||||
singleLineComment ::= "//" [^\n]* "\n"
|
||||
multiLineComment ::= "/*" ( [^*] | ("*" [^/]) )* "*/"
|
||||
|
||||
ws ::= ([ \t\n]+)
|
||||
52
k_quants.c
52
k_quants.c
@@ -13,6 +13,26 @@
|
||||
//
|
||||
#include <arm_neon.h>
|
||||
|
||||
#if !defined(__aarch64__)
|
||||
inline static int32_t vaddvq_s16(int16x8_t v) {
|
||||
return
|
||||
(int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
|
||||
(int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
|
||||
(int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
|
||||
(int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
|
||||
}
|
||||
|
||||
inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
|
||||
int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
|
||||
int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
|
||||
return vcombine_s16(a0, b0);
|
||||
}
|
||||
|
||||
inline static int32_t vaddvq_s32(int32x4_t v) {
|
||||
return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
|
||||
}
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
#ifdef __wasm_simd128__
|
||||
@@ -183,13 +203,9 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
|
||||
int ntry, float alpha) {
|
||||
float min = x[0];
|
||||
float max = x[0];
|
||||
float sum_x = 0;
|
||||
float sum_x2 = 0;
|
||||
for (int i = 1; i < n; ++i) {
|
||||
if (x[i] < min) min = x[i];
|
||||
if (x[i] > max) max = x[i];
|
||||
sum_x += x[i];
|
||||
sum_x2 += x[i]*x[i];
|
||||
}
|
||||
if (max == min) {
|
||||
for (int i = 0; i < n; ++i) L[i] = 0;
|
||||
@@ -1306,7 +1322,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
|
||||
const uint8x16_t m3 = vdupq_n_u8(0x3);
|
||||
const uint8x16_t m4 = vdupq_n_u8(0xF);
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
const int32x4_t vzero = vdupq_n_s32(0);
|
||||
#endif
|
||||
|
||||
int8x16x2_t q2bytes;
|
||||
uint8_t aux[16];
|
||||
@@ -1612,7 +1630,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
#ifdef __ARM_NEON
|
||||
|
||||
const uint8x16_t m3 = vdupq_n_u8(0x3);
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
const int32x4_t vzero = vdupq_n_s32(0);
|
||||
#endif
|
||||
|
||||
int8x16x4_t q2bytes;
|
||||
|
||||
@@ -2060,7 +2080,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
|
||||
__m256 acc = _mm256_setzero_ps();
|
||||
|
||||
uint32_t *aux;
|
||||
const uint32_t *aux;
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
@@ -2070,7 +2090,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
const int8_t * restrict q8 = y[i].qs;
|
||||
|
||||
// Set up scales
|
||||
aux = (uint32_t *)x[i].scales;
|
||||
aux = (const uint32_t *)x[i].scales;
|
||||
__m128i scales128 = _mm_set_epi32(
|
||||
((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4),
|
||||
((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4),
|
||||
@@ -2596,8 +2616,6 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
const uint8_t * restrict q4 = x[i].qs;
|
||||
const int8_t * restrict q8 = y[i].qs;
|
||||
|
||||
//int32x4_t isum = mzero;
|
||||
|
||||
int32_t sumi1 = 0;
|
||||
int32_t sumi2 = 0;
|
||||
|
||||
@@ -2694,13 +2712,13 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
const __m256i q8l = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
|
||||
__m256i p16l = _mm256_maddubs_epi16(q4l, q8l);
|
||||
p16l = _mm256_madd_epi16(scale_l, p16l);
|
||||
sumi = _mm256_add_epi32(sumi, p16l);
|
||||
|
||||
const __m256i q8h = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
|
||||
__m256i p16h = _mm256_maddubs_epi16(q4h, q8h);
|
||||
p16h = _mm256_madd_epi16(scale_h, p16h);
|
||||
sumi = _mm256_add_epi32(sumi, p16h);
|
||||
const __m256i sumj = _mm256_add_epi32(p16l, p16h);
|
||||
|
||||
sumi = _mm256_add_epi32(sumi, sumj);
|
||||
}
|
||||
|
||||
__m256 vd = _mm256_set1_ps(d);
|
||||
@@ -3096,9 +3114,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
#ifdef __ARM_NEON
|
||||
|
||||
const uint8x16_t m4b = vdupq_n_u8(0xf);
|
||||
const int32x4_t mzero = vdupq_n_s32(0);
|
||||
const uint8x16_t mone = vdupq_n_u8(1);
|
||||
const uint8x16_t mtwo = vdupq_n_u8(2);
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
const int32x4_t mzero = vdupq_n_s32(0);
|
||||
#endif
|
||||
|
||||
int8x16x4_t q5bytes;
|
||||
|
||||
@@ -3441,8 +3461,10 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
#ifdef __ARM_NEON
|
||||
|
||||
const uint8x16_t m4b = vdupq_n_u8(0xf);
|
||||
const int32x4_t mzero = vdupq_n_s32(0);
|
||||
const uint8x16_t mh = vdupq_n_u8(16);
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
const int32x4_t mzero = vdupq_n_s32(0);
|
||||
#endif
|
||||
|
||||
int8x16x4_t q5bytes;
|
||||
uint8x16x4_t q5h;
|
||||
@@ -3660,7 +3682,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
float sum = 0;
|
||||
|
||||
const uint8x16_t m4b = vdupq_n_u8(0xF);
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
const int32x4_t vzero = vdupq_n_s32(0);
|
||||
#endif
|
||||
//const int8x16_t m32s = vdupq_n_s8(32);
|
||||
|
||||
const uint8x16_t mone = vdupq_n_u8(3);
|
||||
@@ -4049,8 +4073,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
float sum = 0;
|
||||
|
||||
const uint8x16_t m4b = vdupq_n_u8(0xF);
|
||||
const int32x4_t vzero = vdupq_n_s32(0);
|
||||
const int8x16_t m32s = vdupq_n_s8(32);
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
const int32x4_t vzero = vdupq_n_s32(0);
|
||||
#endif
|
||||
|
||||
const uint8x16_t mone = vdupq_n_u8(3);
|
||||
|
||||
|
||||
357
llama.cpp
357
llama.cpp
@@ -114,12 +114,17 @@ static size_t utf8_len(char src) {
|
||||
}
|
||||
|
||||
void replace_all(std::string & s, const std::string & search, const std::string & replace) {
|
||||
for (size_t pos = 0; ; pos += replace.length()) {
|
||||
pos = s.find(search, pos);
|
||||
if (pos == std::string::npos) break;
|
||||
s.erase(pos, search.length());
|
||||
s.insert(pos, replace);
|
||||
std::string result;
|
||||
for (size_t pos = 0; ; pos += search.length()) {
|
||||
auto new_pos = s.find(search, pos);
|
||||
if (new_pos == std::string::npos) {
|
||||
result += s.substr(pos, s.size() - pos);
|
||||
break;
|
||||
}
|
||||
result += s.substr(pos, new_pos - pos) + replace;
|
||||
pos = new_pos;
|
||||
}
|
||||
s = std::move(result);
|
||||
}
|
||||
|
||||
static void zeros(std::ofstream & file, size_t n) {
|
||||
@@ -320,6 +325,44 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_GPT2,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_GPTJ,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_GPTNEOX,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_MPT,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_UNKNOWN,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
static llm_arch llm_arch_from_string(const std::string & name) {
|
||||
@@ -606,20 +649,25 @@ struct llama_mmap {
|
||||
throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
|
||||
}
|
||||
|
||||
#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
|
||||
if (prefetch) {
|
||||
// Advise the kernel to preload the mapped memory
|
||||
WIN32_MEMORY_RANGE_ENTRY range;
|
||||
range.VirtualAddress = addr;
|
||||
range.NumberOfBytes = (SIZE_T)size;
|
||||
if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
|
||||
fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
|
||||
llama_format_win_err(GetLastError()).c_str());
|
||||
// PrefetchVirtualMemory is only present on Windows 8 and above, so we dynamically load it
|
||||
BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG);
|
||||
HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");
|
||||
|
||||
// may fail on pre-Windows 8 systems
|
||||
pPrefetchVirtualMemory = reinterpret_cast<decltype(pPrefetchVirtualMemory)> (GetProcAddress(hKernel32, "PrefetchVirtualMemory"));
|
||||
|
||||
if (pPrefetchVirtualMemory) {
|
||||
// advise the kernel to preload the mapped memory
|
||||
WIN32_MEMORY_RANGE_ENTRY range;
|
||||
range.VirtualAddress = addr;
|
||||
range.NumberOfBytes = (SIZE_T)size;
|
||||
if (!pPrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
|
||||
fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
|
||||
llama_format_win_err(GetLastError()).c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
#pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
|
||||
#endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
|
||||
}
|
||||
|
||||
~llama_mmap() {
|
||||
@@ -796,12 +844,12 @@ static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
|
||||
(void) tensor;
|
||||
}
|
||||
|
||||
static std::string llama_token_to_text(const struct llama_context * ctx, llama_token token) {
|
||||
static std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
|
||||
std::vector<char> result(8, 0);
|
||||
const int n_tokens = llama_token_to_str(ctx, token, result.data(), result.size());
|
||||
const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size());
|
||||
if (n_tokens < 0) {
|
||||
result.resize(-n_tokens);
|
||||
int check = llama_token_to_str(ctx, token, result.data(), result.size());
|
||||
int check = llama_token_to_piece(ctx, token, result.data(), result.size());
|
||||
GGML_ASSERT(check == -n_tokens);
|
||||
} else {
|
||||
result.resize(n_tokens);
|
||||
@@ -1144,11 +1192,13 @@ static bool llama_kv_cache_init(
|
||||
|
||||
enum llama_fver {
|
||||
GGUF_FILE_VERSION_V1 = 1,
|
||||
GGUF_FILE_VERSION_V2 = 2,
|
||||
};
|
||||
|
||||
static const char * llama_file_version_name(llama_fver version) {
|
||||
switch (version) {
|
||||
case GGUF_FILE_VERSION_V1: return "GGUF V1 (latest)";
|
||||
case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)";
|
||||
case GGUF_FILE_VERSION_V2: return "GGUF V2 (latest)";
|
||||
}
|
||||
|
||||
return "unknown";
|
||||
@@ -1593,9 +1643,13 @@ static void llm_load_hparams(
|
||||
|
||||
GGUF_GET_KEY(ctx, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT));
|
||||
|
||||
if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
|
||||
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head));
|
||||
if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
|
||||
if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
|
||||
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head));
|
||||
}
|
||||
}
|
||||
// gpt-neox n_rot = rotary_pct * (n_embd / n_head)
|
||||
// gpt-j n_rot = rotary_dim
|
||||
}
|
||||
|
||||
// arch-specific KVs
|
||||
@@ -1635,7 +1689,8 @@ static void llm_load_hparams(
|
||||
}
|
||||
|
||||
// TODO: This should probably be in llama.h
|
||||
static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos);
|
||||
static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos);
|
||||
static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);
|
||||
|
||||
static void llm_load_vocab(
|
||||
llama_model_loader & ml,
|
||||
@@ -1737,7 +1792,11 @@ static void llm_load_vocab(
|
||||
}
|
||||
|
||||
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
|
||||
vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false)[0];
|
||||
if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
|
||||
vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
|
||||
} else {
|
||||
vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false)[0];
|
||||
}
|
||||
|
||||
// special tokens
|
||||
GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID));
|
||||
@@ -2635,18 +2694,20 @@ static struct ggml_cgraph * llm_build_falcon(
|
||||
|
||||
const size_t wsize = ggml_type_size(cur->type);
|
||||
|
||||
struct ggml_tensor * tmpq = ggml_view_3d(
|
||||
// TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for
|
||||
// non-contiguous views is added for the rope operator
|
||||
struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d(
|
||||
ctx0, cur, n_embd_head, n_head, N,
|
||||
wsize * n_embd_head,
|
||||
wsize * n_embd_head * (n_head + 2 * n_head_kv),
|
||||
0);
|
||||
0));
|
||||
offload_func_kq(tmpq);
|
||||
|
||||
struct ggml_tensor * tmpk = ggml_view_3d(
|
||||
struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d(
|
||||
ctx0, cur, n_embd_head, n_head_kv, N,
|
||||
wsize * n_embd_head,
|
||||
wsize * n_embd_head * (n_head + 2 * n_head_kv),
|
||||
wsize * n_embd_head * n_head);
|
||||
wsize * n_embd_head * n_head));
|
||||
offload_func_kq(tmpk);
|
||||
|
||||
struct ggml_tensor * tmpv = ggml_view_3d(
|
||||
@@ -2831,7 +2892,6 @@ static bool llama_eval_internal(
|
||||
|
||||
GGML_ASSERT(n_tokens > 0);
|
||||
GGML_ASSERT(n_past >= 0);
|
||||
GGML_ASSERT(n_threads > 0);
|
||||
// TODO: keep the values of n_batch and n_ctx
|
||||
// GGML_ASSERT(n_tokens <= n_batch);
|
||||
// GGML_ASSERT(n_past + n_tokens <= n_ctx);
|
||||
@@ -2842,6 +2902,8 @@ static bool llama_eval_internal(
|
||||
ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
|
||||
#endif
|
||||
|
||||
GGML_ASSERT(n_threads > 0);
|
||||
|
||||
const int N = n_tokens;
|
||||
|
||||
const auto & model = lctx.model;
|
||||
@@ -3026,10 +3088,8 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
|
||||
return vocab.token_to_id.at(buf);
|
||||
}
|
||||
|
||||
static std::string llama_escape_whitespace(const std::string& text) {
|
||||
std::string result = text;
|
||||
replace_all(result, " ", "\xe2\x96\x81");
|
||||
return result;
|
||||
static void llama_escape_whitespace(std::string & text) {
|
||||
replace_all(text, " ", "\xe2\x96\x81");
|
||||
}
|
||||
|
||||
static void llama_unescape_whitespace(std::string & word) {
|
||||
@@ -3198,7 +3258,7 @@ private:
|
||||
|
||||
struct llm_bigram_bpe {
|
||||
struct comparator {
|
||||
bool operator()(llm_bigram_bpe & l, llm_bigram_bpe & r) {
|
||||
bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const {
|
||||
return l.rank > r.rank || (l.rank == r.rank && l.left > r.left);
|
||||
}
|
||||
};
|
||||
@@ -3306,9 +3366,15 @@ struct llm_tokenizer_bpe {
|
||||
std::string byte_str(1, *j);
|
||||
auto token_multibyte = vocab.token_to_id.find(byte_str);
|
||||
if (token_multibyte == vocab.token_to_id.end()) {
|
||||
fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
|
||||
try {
|
||||
llama_token token_byte = llama_byte_to_token(vocab, *j);
|
||||
output.push_back(token_byte);
|
||||
} catch (const std::out_of_range & err) {
|
||||
fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
|
||||
}
|
||||
} else {
|
||||
output.push_back((*token_multibyte).second);
|
||||
}
|
||||
output.push_back((*token_multibyte).second);
|
||||
}
|
||||
} else {
|
||||
output.push_back((*token).second);
|
||||
@@ -3346,23 +3412,22 @@ private:
|
||||
}
|
||||
|
||||
// probably not 100% correct
|
||||
// TODO: this is quite slow - how to make it more efficient?
|
||||
static std::vector<std::string> bpe_gpt2_preprocess(std::string text) {
|
||||
static std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
|
||||
std::vector<std::string> words;
|
||||
|
||||
// ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
|
||||
const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
|
||||
const std::regex re(pattern);
|
||||
std::smatch m;
|
||||
|
||||
while (std::regex_search(text, m, re)) {
|
||||
for (auto x : m) {
|
||||
words.push_back(x);
|
||||
}
|
||||
text = m.suffix();
|
||||
auto words_begin = std::sregex_iterator(text.begin(), text.end(), re);
|
||||
auto words_end = std::sregex_iterator();
|
||||
auto n_words = std::distance(words_begin, words_end);
|
||||
words.reserve(n_words);
|
||||
for (auto it = words_begin; it != words_end; ++it) {
|
||||
words.push_back(it->str());
|
||||
}
|
||||
|
||||
return words;
|
||||
|
||||
}
|
||||
|
||||
const llama_vocab & vocab;
|
||||
@@ -3373,22 +3438,31 @@ private:
|
||||
llm_bigram_bpe::queue work_queue;
|
||||
};
|
||||
|
||||
static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos) {
|
||||
static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos) {
|
||||
std::vector<llama_vocab::id> output;
|
||||
|
||||
if (raw_text.empty()) {
|
||||
return output;
|
||||
}
|
||||
// OG tokenizer behavior:
|
||||
//
|
||||
// tokenizer.encode('', add_bos=True) returns [1]
|
||||
// tokenizer.encode('', add_bos=False) returns []
|
||||
|
||||
if (bos && vocab.special_bos_id != -1) {
|
||||
output.push_back(vocab.special_bos_id);
|
||||
}
|
||||
|
||||
if (raw_text.empty()) {
|
||||
return output;
|
||||
}
|
||||
|
||||
switch (vocab.type) {
|
||||
case LLAMA_VOCAB_TYPE_SPM:
|
||||
{
|
||||
// without adding this leading whitespace, we do not get the same results as the original tokenizer
|
||||
raw_text = " " + raw_text;
|
||||
|
||||
llm_tokenizer_spm tokenizer(vocab);
|
||||
tokenizer.tokenize(llama_escape_whitespace(raw_text), output);
|
||||
llama_escape_whitespace(raw_text);
|
||||
tokenizer.tokenize(raw_text, output);
|
||||
} break;
|
||||
case LLAMA_VOCAB_TYPE_BPE:
|
||||
{
|
||||
@@ -3574,7 +3648,7 @@ static void llama_grammar_advance_stack(
|
||||
std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
|
||||
|
||||
if (stack.empty()) {
|
||||
new_stacks.push_back(stack);
|
||||
new_stacks.emplace_back(stack);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -3611,7 +3685,7 @@ static void llama_grammar_advance_stack(
|
||||
}
|
||||
case LLAMA_GRETYPE_CHAR:
|
||||
case LLAMA_GRETYPE_CHAR_NOT:
|
||||
new_stacks.push_back(stack);
|
||||
new_stacks.emplace_back(stack);
|
||||
break;
|
||||
default:
|
||||
// end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
|
||||
@@ -4078,16 +4152,16 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
|
||||
std::vector<llama_grammar_candidate> candidates_grammar;
|
||||
|
||||
for (size_t i = 0; i < candidates->size; ++i) {
|
||||
const llama_token id = candidates->data[i].id;
|
||||
const std::string text = llama_token_to_text(ctx, id);
|
||||
const llama_token id = candidates->data[i].id;
|
||||
const std::string piece = llama_token_to_str(ctx, id);
|
||||
if (id == eos) {
|
||||
if (!allow_eos) {
|
||||
candidates->data[i].logit = -INFINITY;
|
||||
}
|
||||
} else if (text.empty() || text[0] == 0) {
|
||||
} else if (piece.empty() || piece[0] == 0) {
|
||||
candidates->data[i].logit = -INFINITY;
|
||||
} else {
|
||||
candidates_decoded.push_back(decode_utf8(text.c_str(), grammar->partial_utf8));
|
||||
candidates_decoded.push_back(decode_utf8(piece.c_str(), grammar->partial_utf8));
|
||||
candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
|
||||
}
|
||||
}
|
||||
@@ -4291,10 +4365,10 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
const std::string text = llama_token_to_text(ctx, token);
|
||||
const std::string piece = llama_token_to_str(ctx, token);
|
||||
|
||||
// Note terminating 0 in decoded string
|
||||
const auto decoded = decode_utf8(text.c_str(), grammar->partial_utf8);
|
||||
const auto decoded = decode_utf8(piece.c_str(), grammar->partial_utf8);
|
||||
const auto & code_points = decoded.first;
|
||||
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
|
||||
grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
|
||||
@@ -4367,7 +4441,7 @@ struct llama_logit_info {
|
||||
}
|
||||
return min_heap;
|
||||
}
|
||||
float probability_from_logit(float logit) {
|
||||
float probability_from_logit(float logit) const {
|
||||
return normalizer * std::exp(logit - max_l);
|
||||
}
|
||||
};
|
||||
@@ -4657,6 +4731,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
llm_load_arch(*ml, model);
|
||||
llm_load_hparams(*ml, model, 0, 0, 0);
|
||||
|
||||
if (params->only_copy) {
|
||||
ftype = model.ftype;
|
||||
}
|
||||
|
||||
const size_t align = GGUF_DEFAULT_ALIGNMENT;
|
||||
struct gguf_context * ctx_out = gguf_init_empty();
|
||||
|
||||
@@ -4743,18 +4821,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
// quantize only 2D tensors
|
||||
quantize &= (tensor->n_dims == 2);
|
||||
quantize &= params->quantize_output_tensor || name != "output.weight";
|
||||
quantize &= quantized_type != tensor->type;
|
||||
quantize &= !params->only_copy;
|
||||
|
||||
enum ggml_type new_type;
|
||||
void * new_data;
|
||||
size_t new_size;
|
||||
|
||||
if (!quantize) {
|
||||
new_type = tensor->type;
|
||||
new_data = tensor->data;
|
||||
new_size = ggml_nbytes(tensor);
|
||||
LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
|
||||
} else {
|
||||
if (quantize) {
|
||||
new_type = quantized_type;
|
||||
#ifdef GGML_USE_K_QUANTS
|
||||
// TODO: avoid hardcoded tensor names - use the TN_* constants
|
||||
@@ -4762,7 +4835,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
|
||||
if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
|
||||
int nx = tensor->ne[0];
|
||||
if (nx % QK_K == 0) {
|
||||
if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
|
||||
new_type = GGML_TYPE_Q8_0;
|
||||
}
|
||||
else if (new_type != GGML_TYPE_Q8_0) {
|
||||
new_type = GGML_TYPE_Q6_K;
|
||||
}
|
||||
} else if (name.find("attn_v.weight") != std::string::npos) {
|
||||
@@ -4786,17 +4862,39 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
} else if (name.find("ffn_down.weight") != std::string::npos) {
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
|
||||
new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
|
||||
new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
|
||||
: model.arch != LLM_ARCH_FALCON || use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
|
||||
: GGML_TYPE_Q3_K;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
|
||||
new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
|
||||
if (model.arch == LLM_ARCH_FALCON) {
|
||||
new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
|
||||
use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
|
||||
} else {
|
||||
if (use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
|
||||
}
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && i_feed_forward_w2 < 4) {
|
||||
new_type = GGML_TYPE_Q5_K;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
||||
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
|
||||
use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < 4) new_type = GGML_TYPE_Q5_K;
|
||||
++i_feed_forward_w2;
|
||||
} else if (name.find("attn_output.weight") != std::string::npos) {
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
||||
if (model.arch != LLM_ARCH_FALCON) {
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
||||
} else {
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
|
||||
}
|
||||
}
|
||||
else if (name.find("attn_qkv.weight") != std::string::npos) {
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
|
||||
}
|
||||
else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
||||
@@ -4828,7 +4926,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// If we've decided to quantize to the same type the tensor is already
|
||||
// in then there's nothing to do.
|
||||
quantize = tensor->type != new_type;
|
||||
}
|
||||
if (!quantize) {
|
||||
new_type = tensor->type;
|
||||
new_data = tensor->data;
|
||||
new_size = ggml_nbytes(tensor);
|
||||
LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
|
||||
} else {
|
||||
const size_t nelements = ggml_nelements(tensor);
|
||||
|
||||
float * f32_data;
|
||||
@@ -5233,7 +5340,7 @@ struct llama_context_params llama_context_default_params() {
|
||||
/*.seed =*/ LLAMA_DEFAULT_SEED,
|
||||
/*.n_ctx =*/ 512,
|
||||
/*.n_batch =*/ 512,
|
||||
/*.gpu_layers =*/ 0,
|
||||
/*.n_gpu_layers =*/ 0,
|
||||
/*.main_gpu =*/ 0,
|
||||
/*.tensor_split =*/ nullptr,
|
||||
/*.rope_freq_base =*/ 10000.0f,
|
||||
@@ -5241,7 +5348,7 @@ struct llama_context_params llama_context_default_params() {
|
||||
/*.progress_callback =*/ nullptr,
|
||||
/*.progress_callback_user_data =*/ nullptr,
|
||||
/*.low_vram =*/ false,
|
||||
/*.mul_mat_q =*/ false,
|
||||
/*.mul_mat_q =*/ true,
|
||||
/*.f16_kv =*/ true,
|
||||
/*.logits_all =*/ false,
|
||||
/*.vocab_only =*/ false,
|
||||
@@ -5250,6 +5357,10 @@ struct llama_context_params llama_context_default_params() {
|
||||
/*.embedding =*/ false,
|
||||
};
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
result.n_gpu_layers = 1;
|
||||
#endif
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
@@ -5259,6 +5370,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
|
||||
/*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
|
||||
/*.allow_requantize =*/ false,
|
||||
/*.quantize_output_tensor =*/ true,
|
||||
/*.only_copy =*/ false,
|
||||
};
|
||||
|
||||
return result;
|
||||
@@ -5441,43 +5553,43 @@ struct llama_context * llama_new_context_with_model(
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
if (params.n_gpu_layers > 0) {
|
||||
// this allocates all Metal resources and memory buffers
|
||||
if (params.n_gpu_layers > 0) {
|
||||
// this allocates all Metal resources and memory buffers
|
||||
|
||||
void * data_ptr = NULL;
|
||||
size_t data_size = 0;
|
||||
void * data_ptr = NULL;
|
||||
size_t data_size = 0;
|
||||
|
||||
if (params.use_mmap) {
|
||||
data_ptr = ctx->model.mapping->addr;
|
||||
data_size = ctx->model.mapping->size;
|
||||
} else {
|
||||
data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
|
||||
data_size = ggml_get_mem_size (ctx->model.ctx);
|
||||
}
|
||||
if (params.use_mmap) {
|
||||
data_ptr = ctx->model.mapping->addr;
|
||||
data_size = ctx->model.mapping->size;
|
||||
} else {
|
||||
data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
|
||||
data_size = ggml_get_mem_size (ctx->model.ctx);
|
||||
}
|
||||
|
||||
const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
|
||||
const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
|
||||
|
||||
LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
|
||||
LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
|
||||
|
||||
#define LLAMA_METAL_CHECK_BUF(result) \
|
||||
if (!(result)) { \
|
||||
LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \
|
||||
llama_free(ctx); \
|
||||
return NULL; \
|
||||
}
|
||||
if (!(result)) { \
|
||||
LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \
|
||||
llama_free(ctx); \
|
||||
return NULL; \
|
||||
}
|
||||
|
||||
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
|
||||
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
|
||||
|
||||
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0));
|
||||
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
|
||||
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0));
|
||||
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
|
||||
|
||||
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
|
||||
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
|
||||
#undef LLAMA_METAL_CHECK_BUF
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef GGML_USE_MPI
|
||||
ctx->ctx_mpi = ggml_mpi_init();
|
||||
@@ -6101,12 +6213,12 @@ int llama_tokenize_with_model(
|
||||
return res.size();
|
||||
}
|
||||
|
||||
int llama_token_to_str(const struct llama_context * ctx, llama_token token, char * buf, int length) {
|
||||
return llama_token_to_str_with_model(&ctx->model, token, buf, length);
|
||||
int llama_token_to_piece(const struct llama_context * ctx, llama_token token, char * buf, int length) {
|
||||
return llama_token_to_piece_with_model(&ctx->model, token, buf, length);
|
||||
}
|
||||
|
||||
// does not write null-terminator to str
|
||||
int llama_token_to_str_with_model(const struct llama_model * model, llama_token token, char * buf, int length) {
|
||||
// does not write null-terminator to buf
|
||||
int llama_token_to_piece_with_model(const struct llama_model * model, llama_token token, char * buf, int length) {
|
||||
if (0 <= token && token < llama_model_n_vocab(model)) {
|
||||
if (llama_is_normal_token(model->vocab, token)) {
|
||||
std::string result = model->vocab.id_to_token[token].text;
|
||||
@@ -6194,11 +6306,40 @@ const char * llama_print_system_info(void) {
|
||||
s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
|
||||
s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
|
||||
s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
|
||||
s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
|
||||
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
|
||||
|
||||
return s.c_str();
|
||||
}
|
||||
|
||||
void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
|
||||
fprintf(stream, "\n");
|
||||
fprintf(stream, "###########\n");
|
||||
fprintf(stream, "# Timings #\n");
|
||||
fprintf(stream, "###########\n");
|
||||
fprintf(stream, "\n");
|
||||
|
||||
fprintf(stream, "mst_eval: %.2f # ms / token during generation\n",
|
||||
1.0e-3 * ctx->t_eval_us / ctx->n_eval);
|
||||
fprintf(stream, "mst_p_eval: %.2f # ms / token during prompt processing\n",
|
||||
1.0e-3 * ctx->t_p_eval_us / ctx->n_p_eval);
|
||||
fprintf(stream, "mst_sample: %.2f # ms / token during sampling\n",
|
||||
1.0e-3 * ctx->t_sample_us / ctx->n_sample);
|
||||
fprintf(stream, "n_eval: %d # number of tokens generated (excluding the first one)\n", ctx->n_eval);
|
||||
fprintf(stream, "n_p_eval: %d # number of tokens processed in batches at the beginning\n", ctx->n_p_eval);
|
||||
fprintf(stream, "n_sample: %d # number of sampled tokens\n", ctx->n_sample);
|
||||
fprintf(stream, "t_eval_us: %" PRId64 " # total microseconds spent generating tokens\n", ctx->t_eval_us);
|
||||
fprintf(stream, "t_load_us: %" PRId64 " # total microseconds spent loading the model\n", ctx->t_load_us);
|
||||
fprintf(stream, "t_p_eval_us: %" PRId64 " # total microseconds spent prompt processing\n", ctx->t_p_eval_us);
|
||||
fprintf(stream, "t_sample_us: %" PRId64 " # total microseconds spent sampling\n", ctx->t_sample_us);
|
||||
fprintf(stream, "ts_eval: %.2f # tokens / second during generation\n",
|
||||
1.0e6 * ctx->n_eval / ctx->t_eval_us);
|
||||
fprintf(stream, "ts_p_eval: %.2f # tokens / second during prompt processing\n",
|
||||
1.0e6 * ctx->n_p_eval / ctx->t_p_eval_us);
|
||||
fprintf(stream, "ts_sample: %.2f # tokens / second during sampling\n",
|
||||
1.0e6 * ctx->n_sample / ctx->t_sample_us);
|
||||
}
|
||||
|
||||
// For internal test use
|
||||
const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
|
||||
return ctx->model.tensors_by_name;
|
||||
@@ -6209,10 +6350,6 @@ void llama_log_set(llama_log_callback log_callback, void * user_data) {
|
||||
g_state.log_callback_user_data = user_data;
|
||||
}
|
||||
|
||||
#if defined(_MSC_VER) && !defined(vsnprintf)
|
||||
#define vsnprintf _vsnprintf
|
||||
#endif
|
||||
|
||||
static void llama_log_internal_v(llama_log_level level, const char * format, va_list args) {
|
||||
va_list args_copy;
|
||||
va_copy(args_copy, args);
|
||||
|
||||
16
llama.h
16
llama.h
@@ -10,6 +10,7 @@
|
||||
#endif // GGML_USE_CUBLAS
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#ifdef LLAMA_SHARED
|
||||
@@ -163,6 +164,7 @@ extern "C" {
|
||||
enum llama_ftype ftype; // quantize to this llama_ftype
|
||||
bool allow_requantize; // allow quantizing non-f32/f16 tensors
|
||||
bool quantize_output_tensor; // quantize output.weight
|
||||
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
|
||||
} llama_model_quantize_params;
|
||||
|
||||
// grammar types
|
||||
@@ -381,15 +383,17 @@ extern "C" {
|
||||
int n_max_tokens,
|
||||
bool add_bos);
|
||||
|
||||
// Token Id -> String. Uses the vocabulary in the provided context
|
||||
// Does not write null terminator to the buffer
|
||||
LLAMA_API int llama_token_to_str(
|
||||
// Token Id -> Piece.
|
||||
// Uses the vocabulary in the provided context.
|
||||
// Does not write null terminator to the buffer.
|
||||
// User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
|
||||
LLAMA_API int llama_token_to_piece(
|
||||
const struct llama_context * ctx,
|
||||
llama_token token,
|
||||
char * buf,
|
||||
int length);
|
||||
|
||||
LLAMA_API int llama_token_to_str_with_model(
|
||||
LLAMA_API int llama_token_to_piece_with_model(
|
||||
const struct llama_model * model,
|
||||
llama_token token,
|
||||
char * buf,
|
||||
@@ -494,7 +498,7 @@ extern "C" {
|
||||
// Type of pointer to the beam_search_callback function.
|
||||
// void* callback_data is any custom data passed to llama_beam_search, that is subsequently
|
||||
// passed back to beam_search_callback. This avoids having to use global variables in the callback.
|
||||
typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, llama_beams_state);
|
||||
typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, struct llama_beams_state);
|
||||
|
||||
/// @details Deterministically returns entire sentence constructed by a beam search.
|
||||
/// @param ctx Pointer to the llama_context.
|
||||
@@ -518,6 +522,8 @@ extern "C" {
|
||||
// If this is not called, or NULL is supplied, everything is output on stderr.
|
||||
LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
|
||||
|
||||
LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
5
mypy.ini
Normal file
5
mypy.ini
Normal file
@@ -0,0 +1,5 @@
|
||||
[mypy]
|
||||
strict = true
|
||||
allow_untyped_calls = true
|
||||
allow_untyped_defs = true
|
||||
allow_incomplete_defs = true
|
||||
140
run_with_preset.py
Executable file
140
run_with_preset.py
Executable file
@@ -0,0 +1,140 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
import yaml
|
||||
|
||||
CLI_ARGS_MAIN_PERPLEXITY = [
|
||||
"batch-size", "cfg-negative-prompt", "cfg-scale", "chunks", "color", "ctx-size", "escape",
|
||||
"export", "file", "frequency-penalty", "grammar", "grammar-file", "hellaswag",
|
||||
"hellaswag-tasks", "ignore-eos", "in-prefix", "in-prefix-bos", "in-suffix", "instruct",
|
||||
"interactive", "interactive-first", "keep", "logdir", "logit-bias", "lora", "lora-base",
|
||||
"low-vram", "main-gpu", "memory-f32", "mirostat", "mirostat-ent", "mirostat-lr", "mlock",
|
||||
"model", "mtest", "multiline-input", "n-gpu-layers", "n-predict", "no-mmap", "no-mul-mat-q",
|
||||
"np-penalize-nl", "numa", "ppl-output-type", "ppl-stride", "presence-penalty", "prompt",
|
||||
"prompt-cache", "prompt-cache-all", "prompt-cache-ro", "random-prompt", "repeat-last-n",
|
||||
"repeat-penalty", "reverse-prompt", "rope-freq-base", "rope-freq-scale", "rope-scale", "seed",
|
||||
"simple-io", "tensor-split", "threads", "temp", "tfs", "top-k", "top-p", "typical",
|
||||
"verbose-prompt"
|
||||
]
|
||||
|
||||
CLI_ARGS_LLAMA_BENCH = [
|
||||
"batch-size", "memory-f32", "low-vram", "model", "mul-mat-q", "n-gen", "n-gpu-layers",
|
||||
"n-prompt", "output", "repetitions", "tensor-split", "threads", "verbose"
|
||||
]
|
||||
|
||||
CLI_ARGS_SERVER = [
|
||||
"alias", "batch-size", "ctx-size", "embedding", "host", "memory-f32", "lora", "lora-base",
|
||||
"low-vram", "main-gpu", "mlock", "model", "n-gpu-layers", "n-probs", "no-mmap", "no-mul-mat-q",
|
||||
"numa", "path", "port", "rope-freq-base", "timeout", "rope-freq-scale", "tensor-split",
|
||||
"threads", "verbose"
|
||||
]
|
||||
|
||||
description = """Run llama.cpp binaries with presets from YAML file(s).
|
||||
To specify which binary should be run, specify the "binary" property (main, perplexity, llama-bench, and server are supported).
|
||||
To get a preset file template, run a llama.cpp binary with the "--logdir" CLI argument.
|
||||
|
||||
Formatting considerations:
|
||||
- The YAML property names are the same as the CLI argument names of the corresponding binary.
|
||||
- Properties must use the long name of their corresponding llama.cpp CLI arguments.
|
||||
- Like the llama.cpp binaries the property names do not differentiate between hyphens and underscores.
|
||||
- Flags must be defined as "<PROPERTY_NAME>: true" to be effective.
|
||||
- To define the logit_bias property, the expected format is "<TOKEN_ID>: <BIAS>" in the "logit_bias" namespace.
|
||||
- To define multiple "reverse_prompt" properties simultaneously the expected format is a list of strings.
|
||||
- To define a tensor split, pass a list of floats.
|
||||
"""
|
||||
usage = "run_with_preset.py [-h] [yaml_files ...] [--<ARG_NAME> <ARG_VALUE> ...]"
|
||||
epilog = (" --<ARG_NAME> specify additional CLI ars to be passed to the binary (override all preset files). "
|
||||
"Unknown args will be ignored.")
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description=description, usage=usage, epilog=epilog, formatter_class=argparse.RawTextHelpFormatter)
|
||||
parser.add_argument("-bin", "--binary", help="The binary to run.")
|
||||
parser.add_argument("yaml_files", nargs="*",
|
||||
help="Arbitrary number of YAML files from which to read preset values. "
|
||||
"If two files specify the same values the later one will be used.")
|
||||
|
||||
known_args, unknown_args = parser.parse_known_args()
|
||||
|
||||
if not known_args.yaml_files and not unknown_args:
|
||||
parser.print_help()
|
||||
sys.exit(0)
|
||||
|
||||
props = dict()
|
||||
|
||||
for yaml_file in known_args.yaml_files:
|
||||
with open(yaml_file, "r") as f:
|
||||
props.update(yaml.load(f, yaml.SafeLoader))
|
||||
|
||||
props = {prop.replace("_", "-"): val for prop, val in props.items()}
|
||||
|
||||
binary = props.pop("binary", "main")
|
||||
if known_args.binary:
|
||||
binary = known_args.binary
|
||||
|
||||
if os.path.exists(f"./{binary}"):
|
||||
binary = f"./{binary}"
|
||||
|
||||
if binary.lower().endswith("main") or binary.lower().endswith("perplexity"):
|
||||
cli_args = CLI_ARGS_MAIN_PERPLEXITY
|
||||
elif binary.lower().endswith("llama-bench"):
|
||||
cli_args = CLI_ARGS_LLAMA_BENCH
|
||||
elif binary.lower().endswith("server"):
|
||||
cli_args = CLI_ARGS_SERVER
|
||||
else:
|
||||
print(f"Unknown binary: {binary}")
|
||||
sys.exit(1)
|
||||
|
||||
command_list = [binary]
|
||||
|
||||
for cli_arg in cli_args:
|
||||
value = props.pop(cli_arg, None)
|
||||
|
||||
if not value or value == -1:
|
||||
continue
|
||||
|
||||
if cli_arg == "logit-bias":
|
||||
for token, bias in value.items():
|
||||
command_list.append("--logit-bias")
|
||||
command_list.append(f"{token}{bias:+}")
|
||||
continue
|
||||
|
||||
if cli_arg == "reverse-prompt" and not isinstance(value, str):
|
||||
for rp in value:
|
||||
command_list.append("--reverse-prompt")
|
||||
command_list.append(str(rp))
|
||||
continue
|
||||
|
||||
command_list.append(f"--{cli_arg}")
|
||||
|
||||
if cli_arg == "tensor-split":
|
||||
command_list.append(",".join([str(v) for v in value]))
|
||||
continue
|
||||
|
||||
value = str(value)
|
||||
|
||||
if value != "True":
|
||||
command_list.append(str(value))
|
||||
|
||||
num_unused = len(props)
|
||||
if num_unused > 10:
|
||||
print(f"The preset file contained a total of {num_unused} unused properties.")
|
||||
elif num_unused > 0:
|
||||
print("The preset file contained the following unused properties:")
|
||||
for prop, value in props.items():
|
||||
print(f" {prop}: {value}")
|
||||
|
||||
command_list += unknown_args
|
||||
|
||||
sp = subprocess.Popen(command_list)
|
||||
|
||||
while sp.returncode is None:
|
||||
try:
|
||||
sp.wait()
|
||||
except KeyboardInterrupt:
|
||||
pass
|
||||
|
||||
sys.exit(sp.returncode)
|
||||
26
scripts/convert-gg.sh
Executable file
26
scripts/convert-gg.sh
Executable file
@@ -0,0 +1,26 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
|
||||
# LLaMA v1
|
||||
python3 convert.py ../llama1/7B --outfile models/llama-7b/ggml-model-f16.gguf --outtype f16
|
||||
python3 convert.py ../llama1/13B --outfile models/llama-13b/ggml-model-f16.gguf --outtype f16
|
||||
python3 convert.py ../llama1/30B --outfile models/llama-30b/ggml-model-f16.gguf --outtype f16
|
||||
python3 convert.py ../llama1/65B --outfile models/llama-65b/ggml-model-f16.gguf --outtype f16
|
||||
|
||||
# LLaMA v2
|
||||
python3 convert.py ../llama2/llama-2-7b --outfile models/llama-7b-v2/ggml-model-f16.gguf --outtype f16
|
||||
python3 convert.py ../llama2/llama-2-13b --outfile models/llama-13b-v2/ggml-model-f16.gguf --outtype f16
|
||||
python3 convert.py ../llama2/llama-2-70b --outfile models/llama-70b-v2/ggml-model-f16.gguf --outtype f16
|
||||
|
||||
# Code Llama
|
||||
python3 convert.py ../codellama/CodeLlama-7b/ --outfile models/codellama-7b/ggml-model-f16.gguf --outtype f16
|
||||
python3 convert.py ../codellama/CodeLlama-13b/ --outfile models/codellama-13b/ggml-model-f16.gguf --outtype f16
|
||||
python3 convert.py ../codellama/CodeLlama-34b/ --outfile models/codellama-34b/ggml-model-f16.gguf --outtype f16
|
||||
|
||||
# Falcon
|
||||
python3 convert-falcon-hf-to-gguf.py ../falcon/falcon-7b 1
|
||||
mv -v ../falcon/falcon-7b/ggml-model-f16.gguf models/falcon-7b/ggml-model-f16.gguf
|
||||
|
||||
python3 convert-falcon-hf-to-gguf.py ../falcon/falcon-40b 1
|
||||
mv -v ../falcon/falcon-40b/ggml-model-f16.gguf models/falcon-40b/ggml-model-f16.gguf
|
||||
@@ -20,6 +20,9 @@ fi
|
||||
model="$1"
|
||||
out="../tmp/results-${model}"
|
||||
|
||||
set -o pipefail
|
||||
set -e
|
||||
|
||||
mkdir -p ${out}
|
||||
|
||||
for q in ${qnt[@]}; do
|
||||
|
||||
@@ -20,6 +20,9 @@ fi
|
||||
model="$1"
|
||||
out="../tmp/results-${model}"
|
||||
|
||||
set -o pipefail
|
||||
set -e
|
||||
|
||||
mkdir -p ${out}
|
||||
|
||||
mstr=""
|
||||
|
||||
@@ -17,6 +17,9 @@ if [ ! -z "$3" ]; then
|
||||
args="$3"
|
||||
fi
|
||||
|
||||
set -o pipefail
|
||||
set -e
|
||||
|
||||
model="$1"
|
||||
out="../tmp/results-${model}"
|
||||
|
||||
|
||||
@@ -25,8 +25,10 @@ endfunction()
|
||||
llama_build_and_test_executable(test-quantize-fns.cpp)
|
||||
llama_build_and_test_executable(test-quantize-perf.cpp)
|
||||
llama_build_and_test_executable(test-sampling.cpp)
|
||||
llama_build_executable(test-tokenizer-0.cpp)
|
||||
llama_test_executable (test-tokenizer-0.llama test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
|
||||
llama_build_executable(test-tokenizer-0-llama.cpp)
|
||||
llama_test_executable (test-tokenizer-0-llama test-tokenizer-0-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
|
||||
llama_build_executable(test-tokenizer-0-falcon.cpp)
|
||||
#llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
||||
llama_build_executable(test-tokenizer-1.cpp)
|
||||
# test-tokenizer-1 requires a BPE vocab. re-enable when we have one.
|
||||
#llama_test_executable (test-tokenizer-1.llama test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
||||
@@ -35,3 +37,8 @@ llama_build_and_test_executable(test-grammar-parser.cpp)
|
||||
llama_build_and_test_executable(test-llama-grammar.cpp)
|
||||
llama_build_and_test_executable(test-grad0.cpp) # SLOW
|
||||
# llama_build_and_test_executable(test-opt.cpp) # SLOW
|
||||
|
||||
# dummy executable - not installed
|
||||
get_filename_component(TEST_TARGET test-c.c NAME_WE)
|
||||
add_executable(${TEST_TARGET} test-c.c)
|
||||
target_link_libraries(${TEST_TARGET} PRIVATE llama)
|
||||
|
||||
3
tests/test-c.c
Normal file
3
tests/test-c.c
Normal file
@@ -0,0 +1,3 @@
|
||||
#include "llama.h"
|
||||
|
||||
int main(void) {}
|
||||
@@ -275,14 +275,14 @@ static bool check_gradient(
|
||||
|
||||
ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
|
||||
|
||||
const float f0 = ggml_get_f32_1d(f, 0);
|
||||
const double f0 = ggml_get_f32_1d(f, 0);
|
||||
|
||||
ggml_set_f32_1d(x[i], k, xm);
|
||||
|
||||
ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
|
||||
|
||||
const float f1 = ggml_get_f32_1d(f, 0);
|
||||
const float g0 = (f0 - f1)/(2.0f*eps);
|
||||
const double f1 = ggml_get_f32_1d(f, 0);
|
||||
const double g0 = (f0 - f1)/(2.0*(double) eps);
|
||||
|
||||
ggml_set_f32_1d(x[i], k, x0);
|
||||
|
||||
@@ -292,10 +292,10 @@ static bool check_gradient(
|
||||
|
||||
ggml_graph_compute_with_ctx(ctx0, &gb, n_threads);
|
||||
|
||||
const float g1 = ggml_get_f32_1d(x[i]->grad, k);
|
||||
const double g1 = ggml_get_f32_1d(x[i]->grad, k);
|
||||
|
||||
const float error_abs = fabsf(g0 - g1);
|
||||
const float error_rel = g0 != 0 ? fabsf(g0 - g1)/fabsf(g0) : 0;
|
||||
const double error_abs = fabs(g0 - g1);
|
||||
const double error_rel = g0 != 0 ? fabs(g0 - g1)/fabs(g0) : 0;
|
||||
|
||||
if (error_abs > max_error_abs || error_rel > max_error_rel) {
|
||||
printf("%s: ndims=%d, i=%d, k=%d, x0=%f, xm=%f, xp=%f, f0=%f, f1=%f, g0=%f, g1=%f, eps=%f, error_abs=%f, error_rel=%f\n",
|
||||
@@ -531,7 +531,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqrt(ctx0, x[0]));
|
||||
|
||||
check_gradient("sqrt", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-1f);
|
||||
check_gradient("sqrt", ctx0, x, f, ndims, nargs, 1e-3f, 2e-2f, 1e-1f);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1345,9 +1345,18 @@ int main(int argc, const char ** argv) {
|
||||
x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
|
||||
ggml_set_param(ctx0, x[0]);
|
||||
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_soft_max(ctx0, x[0]));
|
||||
float eps = 1e-6f;
|
||||
// dont use only sum as aggregation, because sum of softmax is always 1 -> finite differences should not work
|
||||
// instead use sum(log(soft_max()*(1-eps)+eps)); use eps to avoid log(0)
|
||||
struct ggml_tensor * f = ggml_sum(ctx0,
|
||||
ggml_log(ctx0,
|
||||
ggml_add1(ctx0,
|
||||
ggml_scale(ctx0,
|
||||
ggml_soft_max(ctx0, x[0]),
|
||||
ggml_new_f32(ctx0, 1.0f - eps)),
|
||||
ggml_new_f32(ctx0, eps))));
|
||||
|
||||
check_gradient("softmax", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
|
||||
check_gradient("softmax", ctx0, x, f, ndims, nargs, 1e-3f, 2e-1f, INFINITY);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1358,15 +1367,26 @@ int main(int argc, const char ** argv) {
|
||||
int64_t ne2[4];
|
||||
get_random_dims(ne2, 4);
|
||||
|
||||
for (int ndims = 1; ndims <= 3; ++ndims) {
|
||||
x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
|
||||
for (int ndims = 1; ndims <= 4; ++ndims) {
|
||||
x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -0.1f, 0.1f);
|
||||
x[1] = get_random_tensor_f32(ctx0, ndims, ne2, 0.0f, 1.0f);
|
||||
// the second argument to cross_entropy_loss must sum up to 1 for each row
|
||||
int nr = ggml_nrows(x[1]);
|
||||
int nc = ggml_nelements(x[1]) / nr;
|
||||
for (int ir = 0; ir < nr; ++ir) {
|
||||
float sum = 0;
|
||||
for (int ic = 0; ic < nc; ++ic) {
|
||||
sum += ((float *) x[1]->data)[ic + ir*nc];
|
||||
}
|
||||
for (int ic = 0; ic < nc; ++ic) {
|
||||
((float *) x[1]->data)[ic + ir*nc] /= sum;
|
||||
}
|
||||
}
|
||||
ggml_set_param(ctx0, x[0]);
|
||||
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_cross_entropy_loss(ctx0, x[0], x[1]));
|
||||
struct ggml_tensor * f = ggml_cross_entropy_loss(ctx0, x[0], x[1]);
|
||||
|
||||
check_gradient("cross_entropy_loss", ctx0, x, f, ndims, nargs, 1e-1f, 1e-2f, INFINITY);
|
||||
// finite differences regularly fails!
|
||||
check_gradient("cross_entropy_loss", ctx0, x, f, ndims, nargs, 1e-4f, 1e-3f, INFINITY);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1473,7 +1493,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
|
||||
|
||||
check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, INFINITY, 3.5f);
|
||||
check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1514,7 +1534,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
|
||||
|
||||
check_gradient("flash_attn f16", ctx0, x, f, ndims, nargs, 1.5e-4f, INFINITY, 3.5f);
|
||||
check_gradient("flash_attn f16", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
178
tests/test-tokenizer-0-falcon.cpp
Normal file
178
tests/test-tokenizer-0-falcon.cpp
Normal file
@@ -0,0 +1,178 @@
|
||||
#include "llama.h"
|
||||
#include "common.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <vector>
|
||||
#include <fstream>
|
||||
|
||||
// generate using test-tokenizer-0-falcon.py
|
||||
static const std::map<std::string, std::vector<llama_token>> & k_tests() {
|
||||
static std::map<std::string, std::vector<llama_token>> _k_tests = {
|
||||
{ "" , { }, },
|
||||
{ " " , { 204, }, },
|
||||
{ " " , { 258, }, },
|
||||
{ " " , { 466, }, },
|
||||
{ "\t" , { 192, }, },
|
||||
{ "\n" , { 193, }, },
|
||||
{ "\t\n" , { 19125, }, },
|
||||
{ "Hello world" , { 9856, 1079, }, },
|
||||
{ " Hello world" , { 23090, 1079, }, },
|
||||
{ "Hello World" , { 9856, 2889, }, },
|
||||
{ " Hello World" , { 23090, 2889, }, },
|
||||
{ " Hello World!" , { 23090, 2889, 12, }, },
|
||||
{ "Hello, world!" , { 9856, 23, 1079, 12, }, },
|
||||
{ " Hello, world!" , { 23090, 23, 1079, 12, }, },
|
||||
{ " this is 🦙.cpp" , { 414, 304, 3346, 111, 231, 25, 29247, }, },
|
||||
{ "w048 7tuijk dsdfhu" , { 98, 55866, 204, 34, 16682, 7149, 36190, 6869, 11481, }, },
|
||||
{ "нещо на Български" , { 150, 133, 6207, 151, 215, 150, 134, 5052, 133, 6279, 5052, 223, 151, 216, 49679, 123, 53110, 47043, 7795, }, },
|
||||
{ "កាន់តែពិសេសអាចខលចេញ" , { 38154, 206, 38154, 126, 38154, 225, 167, 237, 217, 38154, 221, 167, 237, 208, 38154, 228, 38154, 127, 38154, 237, 167, 237, 207, 38154, 237, 38154, 107, 38154, 126, 38154, 211, 38154, 207, 38154, 233, 38154, 211, 167, 237, 207, 38154, 215, }, },
|
||||
{ "🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", { 2571, 232, 206, 204, 19, 11003, 20, 8196, 126, 283, 219, 48778, 116, 13392, 204, 19, 51831, 732, 63209, 1741, 7955, 522, 20, 22438, 211, 204, 19, 7927, 53360, 325, 504, 701, 946, 10930, 20, }, },
|
||||
{ "Hello" , { 9856, }, },
|
||||
{ " Hello" , { 23090, }, },
|
||||
{ " Hello" , { 204, 23090, }, },
|
||||
{ " Hello" , { 258, 23090, }, },
|
||||
{ " Hello" , { 466, 23090, }, },
|
||||
{ " Hello\n Hello" , { 466, 23090, 742, 23090, }, },
|
||||
};
|
||||
|
||||
return _k_tests;
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
if (argc < 2) {
|
||||
fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
const std::string fname = argv[1];
|
||||
|
||||
std::string fname_text;
|
||||
if (argc > 2) {
|
||||
fname_text = argv[2];
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
|
||||
|
||||
llama_model * model;
|
||||
llama_context * ctx;
|
||||
|
||||
llama_backend_init(false);
|
||||
|
||||
// load the vocab
|
||||
{
|
||||
auto lparams = llama_context_default_params();
|
||||
|
||||
lparams.vocab_only = true;
|
||||
|
||||
model = llama_load_model_from_file(fname.c_str(), lparams);
|
||||
|
||||
if (model == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
|
||||
return 1;
|
||||
}
|
||||
|
||||
ctx = llama_new_context_with_model(model, lparams);
|
||||
|
||||
if (ctx == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
|
||||
llama_free_model(model);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
if (llama_vocab_type(ctx) != LLAMA_VOCAB_TYPE_BPE) {
|
||||
fprintf(stderr, "%s : error: vocab type is not SPM\n", __func__);
|
||||
llama_free_model(model);
|
||||
llama_free(ctx);
|
||||
return 2;
|
||||
}
|
||||
|
||||
bool success = true;
|
||||
|
||||
for (const auto & test_kv : k_tests()) {
|
||||
const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, false);
|
||||
|
||||
printf("\n");
|
||||
printf("src: '%s'\n", test_kv.first.c_str());
|
||||
printf("res: '%s'\n", llama_detokenize_bpe(ctx, res).c_str());
|
||||
printf("tok: ");
|
||||
for (const auto & tok : res) {
|
||||
printf("%d ", tok);
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
bool correct = res.size() == test_kv.second.size();
|
||||
|
||||
for (int i = 0; i < (int) res.size() && correct; ++i) {
|
||||
if (test_kv.second[i] != res[i]) {
|
||||
correct = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (!correct) {
|
||||
fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
|
||||
fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
|
||||
llama_detokenize_bpe(ctx, res).c_str(),
|
||||
llama_detokenize_bpe(ctx, test_kv.second).c_str());
|
||||
fprintf(stderr, "%s : expected tokens: ", __func__);
|
||||
for (const auto & t : test_kv.second) {
|
||||
fprintf(stderr, "%6d, ", t);
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s : got tokens: ", __func__);
|
||||
for (const auto & t : res) {
|
||||
fprintf(stderr, "%6d, ", t);
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
success = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (!fname_text.empty()) {
|
||||
fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());
|
||||
|
||||
std::string text;
|
||||
{
|
||||
std::ifstream ifs(fname_text);
|
||||
if (!ifs) {
|
||||
fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str());
|
||||
return 1;
|
||||
}
|
||||
text = std::string(std::istreambuf_iterator<char>(ifs), std::istreambuf_iterator<char>());
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
|
||||
|
||||
const std::vector<llama_token> res = llama_tokenize(ctx, text, true);
|
||||
|
||||
fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
|
||||
|
||||
{
|
||||
const std::string fname_out = fname_text + ".tokcpp";
|
||||
|
||||
std::ofstream ofs(fname_out);
|
||||
if (!ofs) {
|
||||
fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
|
||||
return 1;
|
||||
}
|
||||
|
||||
for (const auto & tok : res) {
|
||||
ofs << tok << " ";
|
||||
}
|
||||
|
||||
ofs << "\n";
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
|
||||
}
|
||||
|
||||
llama_free_model(model);
|
||||
llama_free(ctx);
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
return success ? 0 : 3;
|
||||
}
|
||||
83
tests/test-tokenizer-0-falcon.py
Normal file
83
tests/test-tokenizer-0-falcon.py
Normal file
@@ -0,0 +1,83 @@
|
||||
# tests with BPE tokenizer
|
||||
|
||||
import os
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
|
||||
parser.add_argument("--fname-tok", help="path to a text file to tokenize")
|
||||
args = parser.parse_args()
|
||||
|
||||
dir_tokenizer = args.dir_tokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)
|
||||
|
||||
tests = [
|
||||
"",
|
||||
" ",
|
||||
" ",
|
||||
" ",
|
||||
"\t",
|
||||
"\n",
|
||||
"\t\n",
|
||||
"Hello world",
|
||||
" Hello world",
|
||||
"Hello World",
|
||||
" Hello World",
|
||||
" Hello World!",
|
||||
"Hello, world!",
|
||||
" Hello, world!",
|
||||
" this is 🦙.cpp",
|
||||
"w048 7tuijk dsdfhu",
|
||||
"нещо на Български",
|
||||
"កាន់តែពិសេសអាចខលចេញ",
|
||||
"🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
|
||||
"Hello",
|
||||
" Hello",
|
||||
" Hello",
|
||||
" Hello",
|
||||
" Hello",
|
||||
" Hello\n Hello",
|
||||
]
|
||||
|
||||
for text in tests:
|
||||
print('text: ', text)
|
||||
print(tokenizer.encode(text))
|
||||
print(tokenizer.decode(tokenizer.encode(text)))
|
||||
|
||||
print("\n\ntests for C++:\n")
|
||||
for text in tests:
|
||||
res = tokenizer.encode(text)
|
||||
|
||||
k = text.replace('\n', '\\n')
|
||||
k = k.replace('\t', '\\t')
|
||||
k = '"' + k + '"'
|
||||
print("{ %-24s, { " % k, end='')
|
||||
for x in res:
|
||||
print("%7d," % x, end='')
|
||||
print(" }, },")
|
||||
|
||||
print(tokenizer.encode('hello'))
|
||||
print(tokenizer.encode('world'))
|
||||
print(tokenizer.encode(' world'))
|
||||
print(tokenizer.encode('hello world'))
|
||||
|
||||
fname_tok = args.fname_tok
|
||||
if fname_tok:
|
||||
print('tokenizing file: ', fname_tok)
|
||||
fname_out = fname_tok + '.tok'
|
||||
with open(fname_tok, 'r') as f:
|
||||
lines = f.readlines()
|
||||
s = ''.join(lines)
|
||||
res = tokenizer.encode(s)
|
||||
# write to file
|
||||
with open(fname_out, 'w') as f:
|
||||
for x in res:
|
||||
f.write(str(x) + ' ')
|
||||
f.write('\n')
|
||||
print('len(res): ', len(res))
|
||||
print('len(lines): ', len(lines))
|
||||
print('results written to: ', fname_out)
|
||||
182
tests/test-tokenizer-0-llama.cpp
Normal file
182
tests/test-tokenizer-0-llama.cpp
Normal file
@@ -0,0 +1,182 @@
|
||||
#include "llama.h"
|
||||
#include "common.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <vector>
|
||||
#include <fstream>
|
||||
|
||||
// generate using test-tokenizer-0-llama.py
|
||||
static const std::map<std::string, std::vector<llama_token>> & k_tests() {
|
||||
static std::map<std::string, std::vector<llama_token>> _k_tests = {
|
||||
{ "" , { }, },
|
||||
{ " " , { 259, }, },
|
||||
{ " " , { 1678, }, },
|
||||
{ " " , { 268, }, },
|
||||
{ "\t" , { 29871, 12, }, },
|
||||
{ "\n" , { 29871, 13, }, },
|
||||
{ "\t\n" , { 29871, 12, 13, }, },
|
||||
{ "Hello world" , { 15043, 3186, }, },
|
||||
{ " Hello world" , { 29871, 15043, 3186, }, },
|
||||
{ "Hello World" , { 15043, 2787, }, },
|
||||
{ " Hello World" , { 29871, 15043, 2787, }, },
|
||||
{ " Hello World!" , { 29871, 15043, 2787, 29991, }, },
|
||||
{ "Hello, world!" , { 15043, 29892, 3186, 29991, }, },
|
||||
{ " Hello, world!" , { 29871, 15043, 29892, 3186, 29991, }, },
|
||||
{ " this is 🦙.cpp" , { 29871, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, },
|
||||
{ "w048 7tuijk dsdfhu" , { 281, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, },
|
||||
{ "нещо на Български" , { 1538, 4851, 665, 1386, 29713, 1305, }, },
|
||||
{ "កាន់តែពិសេសអាចខលចេញ" , { 29871, 31849, 31324, 31934, 228, 162, 142, 228, 161, 146, 228, 162, 133, 228, 161, 153, 228, 161, 186, 31708, 228, 162, 132, 31708, 228, 161, 165, 31324, 228, 161, 136, 228, 161, 132, 228, 161, 158, 228, 161, 136, 228, 162, 132, 228, 161, 140, }, },
|
||||
{ "🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", { 29871, 243, 162, 157, 131, 313, 8945, 29897, 29871, 243, 162, 155, 185, 30722, 243, 162, 143, 174, 30598, 313, 20787, 953, 3848, 275, 16125, 630, 29897, 29871, 31681, 313, 6194, 953, 29877, 2397, 393, 756, 967, 1914, 5993, 29897, }, },
|
||||
{ "Hello" , { 15043, }, },
|
||||
{ " Hello" , { 29871, 15043, }, },
|
||||
{ " Hello" , { 259, 15043, }, },
|
||||
{ " Hello" , { 1678, 15043, }, },
|
||||
{ " Hello" , { 268, 15043, }, },
|
||||
{ " Hello\n Hello" , { 268, 15043, 13, 1678, 15043, }, },
|
||||
};
|
||||
|
||||
return _k_tests;
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
if (argc < 2) {
|
||||
fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
const std::string fname = argv[1];
|
||||
|
||||
std::string fname_text;
|
||||
if (argc > 2) {
|
||||
fname_text = argv[2];
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
|
||||
|
||||
llama_model * model;
|
||||
llama_context * ctx;
|
||||
|
||||
llama_backend_init(false);
|
||||
|
||||
// load the vocab
|
||||
{
|
||||
auto lparams = llama_context_default_params();
|
||||
|
||||
lparams.vocab_only = true;
|
||||
|
||||
model = llama_load_model_from_file(fname.c_str(), lparams);
|
||||
|
||||
if (model == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
|
||||
return 1;
|
||||
}
|
||||
|
||||
ctx = llama_new_context_with_model(model, lparams);
|
||||
|
||||
if (ctx == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
|
||||
llama_free_model(model);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
if (llama_vocab_type(ctx) != LLAMA_VOCAB_TYPE_SPM) {
|
||||
fprintf(stderr, "%s : error: vocab type is not SPM\n", __func__);
|
||||
llama_free_model(model);
|
||||
llama_free(ctx);
|
||||
return 2;
|
||||
}
|
||||
|
||||
bool success = true;
|
||||
|
||||
for (const auto & test_kv : k_tests()) {
|
||||
const std::vector<llama_token> res_bos = llama_tokenize(ctx, test_kv.first, true);
|
||||
const std::vector<llama_token> res_nobos = llama_tokenize(ctx, test_kv.first, false);
|
||||
|
||||
printf("\n");
|
||||
printf("src: '%s'\n", test_kv.first.c_str());
|
||||
printf("res: '%s'\n", llama_detokenize_spm(ctx, res_bos).c_str());
|
||||
printf("tok: ");
|
||||
for (const auto & tok : res_bos) {
|
||||
printf("%d ", tok);
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
bool correct = res_nobos.size() == test_kv.second.size() && res_bos.size() == res_nobos.size() + 1 && res_bos[0] == 1;
|
||||
|
||||
for (int i = 0; i < (int) res_nobos.size() && correct; ++i) {
|
||||
if (test_kv.second[i] != res_bos[i + 1]) {
|
||||
correct = false;
|
||||
}
|
||||
if (test_kv.second[i] != res_nobos[i]) {
|
||||
correct = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (!correct) {
|
||||
fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
|
||||
fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
|
||||
llama_detokenize_spm(ctx, res_nobos).c_str(),
|
||||
llama_detokenize_spm(ctx, test_kv.second).c_str());
|
||||
fprintf(stderr, "%s : expected tokens: ", __func__);
|
||||
for (const auto & t : test_kv.second) {
|
||||
fprintf(stderr, "%6d, ", t);
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s : got tokens: ", __func__);
|
||||
for (const auto & t : res_nobos) {
|
||||
fprintf(stderr, "%6d, ", t);
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
success = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (!fname_text.empty()) {
|
||||
fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());
|
||||
|
||||
std::string text;
|
||||
{
|
||||
std::ifstream ifs(fname_text);
|
||||
if (!ifs) {
|
||||
fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str());
|
||||
return 1;
|
||||
}
|
||||
text = std::string(std::istreambuf_iterator<char>(ifs), std::istreambuf_iterator<char>());
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
|
||||
|
||||
const std::vector<llama_token> res = llama_tokenize(ctx, text, true);
|
||||
|
||||
fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
|
||||
|
||||
{
|
||||
const std::string fname_out = fname_text + ".tokcpp";
|
||||
|
||||
std::ofstream ofs(fname_out);
|
||||
if (!ofs) {
|
||||
fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
|
||||
return 1;
|
||||
}
|
||||
|
||||
for (const auto & tok : res) {
|
||||
ofs << tok << " ";
|
||||
}
|
||||
|
||||
ofs << "\n";
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
|
||||
}
|
||||
|
||||
llama_free_model(model);
|
||||
llama_free(ctx);
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
return success ? 0 : 3;
|
||||
}
|
||||
95
tests/test-tokenizer-0-llama.py
Normal file
95
tests/test-tokenizer-0-llama.py
Normal file
@@ -0,0 +1,95 @@
|
||||
# tests with SPM tokenizer
|
||||
|
||||
import os
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
from sentencepiece import SentencePieceProcessor
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
|
||||
parser.add_argument("--fname-tok", help="path to a text file to tokenize")
|
||||
args = parser.parse_args()
|
||||
|
||||
dir_tokenizer = args.dir_tokenizer
|
||||
|
||||
tokenizer = SentencePieceProcessor(dir_tokenizer + '/tokenizer.model')
|
||||
|
||||
tests = [
|
||||
"",
|
||||
" ",
|
||||
" ",
|
||||
" ",
|
||||
"\t",
|
||||
"\n",
|
||||
"\t\n",
|
||||
"Hello world",
|
||||
" Hello world",
|
||||
"Hello World",
|
||||
" Hello World",
|
||||
" Hello World!",
|
||||
"Hello, world!",
|
||||
" Hello, world!",
|
||||
" this is 🦙.cpp",
|
||||
"w048 7tuijk dsdfhu",
|
||||
"нещо на Български",
|
||||
"កាន់តែពិសេសអាចខលចេញ",
|
||||
"🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
|
||||
"Hello",
|
||||
" Hello",
|
||||
" Hello",
|
||||
" Hello",
|
||||
" Hello",
|
||||
" Hello\n Hello",
|
||||
]
|
||||
|
||||
|
||||
for text in tests:
|
||||
print('text: ', text)
|
||||
print('\nwith bos:')
|
||||
print(tokenizer.encode(text, add_bos=True))
|
||||
print(tokenizer.decode(tokenizer.encode(text, add_bos=True)))
|
||||
print('\nwithout bos:')
|
||||
print(tokenizer.encode(text, add_bos=False))
|
||||
print(tokenizer.decode(tokenizer.encode(text, add_bos=False)))
|
||||
|
||||
print("'" + tokenizer.id_to_piece(15043) + "'") # '_Hello'
|
||||
print("'" + tokenizer.id_to_piece(29871) + "'") # '_'
|
||||
print("'" + tokenizer.decode([15043]) + "'") # 'Hello'
|
||||
print("'" + tokenizer.decode([15043, 15043]) + "'") # 'Hello Hello'
|
||||
print("'" + tokenizer.decode([29871, 15043]) + "'") # ' Hello'
|
||||
print("'" + tokenizer.decode([29871, 15043, 29871, 15043]) + "'") # ' Hello Hello'
|
||||
|
||||
print("\n\ntests for C++:\n")
|
||||
for text in tests:
|
||||
res = tokenizer.encode(text, add_bos=False)
|
||||
|
||||
k = text.replace('\n', '\\n')
|
||||
k = k.replace('\t', '\\t')
|
||||
k = '"' + k + '"'
|
||||
print("{ %-24s, { " % k, end='')
|
||||
for x in res:
|
||||
print("%7d," % x, end='')
|
||||
print(" }, },")
|
||||
|
||||
print(tokenizer.encode('hello'))
|
||||
print(tokenizer.encode('world'))
|
||||
print(tokenizer.encode(' world'))
|
||||
print(tokenizer.encode('hello world'))
|
||||
|
||||
fname_tok = args.fname_tok
|
||||
if fname_tok:
|
||||
print('tokenizing file: ', fname_tok)
|
||||
fname_out = fname_tok + '.tok'
|
||||
with open(fname_tok, 'r') as f:
|
||||
lines = f.readlines()
|
||||
s = ''.join(lines)
|
||||
res = tokenizer.encode(s, add_bos=True)
|
||||
# write to file
|
||||
with open(fname_out, 'w') as f:
|
||||
for x in res:
|
||||
f.write(str(x) + ' ')
|
||||
f.write('\n')
|
||||
print('len(res): ', len(res))
|
||||
print('len(lines): ', len(lines))
|
||||
print('results written to: ', fname_out)
|
||||
@@ -1,141 +0,0 @@
|
||||
#include "llama.h"
|
||||
#include "common.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <vector>
|
||||
|
||||
static std::string unescape_whitespace(llama_context* ctx, const std::vector<llama_token>& tokens) {
|
||||
std::string result;
|
||||
for (size_t i = 0; i < tokens.size(); ++i) {
|
||||
result += llama_token_to_str(ctx, tokens[i]);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static const std::map<std::string, std::vector<llama_token>> & k_tests() {
|
||||
static std::map<std::string, std::vector<llama_token>> _k_tests = {
|
||||
{ " ", {1, 259, }, },
|
||||
{ " ", { 1, 1678, }, },
|
||||
{ " ", { 1, 268, }, },
|
||||
{ "\t", { 1, 29871, 12, }, },
|
||||
{ "\n", { 1, 29871, 13, }, },
|
||||
{ "\t\n", { 1, 29871, 12, 13, }, },
|
||||
{ "Hello world", { 1, 15043, 3186, }, },
|
||||
{ " Hello world", { 1, 29871, 15043, 3186, }, },
|
||||
{ "Hello World", { 1, 15043, 2787, }, },
|
||||
{ " Hello World", { 1, 29871, 15043, 2787, }, },
|
||||
{ " Hello World!", { 1, 29871, 15043, 2787, 29991, }, },
|
||||
{ " this is 🦙.cpp", { 1, 29871, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, },
|
||||
{ "w048 7tuijk dsdfhu", { 1, 281, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, },
|
||||
{ "нещо на Български", { 1, 1538, 4851, 665, 1386, 29713, 1305, }, },
|
||||
{ "កាន់តែពិសេសអាចខលចេញ", { 1, 29871, 31849, 31324, 31934, 228, 162, 142, 228, 161,
|
||||
146, 228, 162, 133, 228, 161, 153, 228, 161, 186,
|
||||
31708, 228, 162, 132, 31708, 228, 161, 165, 31324, 228,
|
||||
161, 136, 228, 161, 132, 228, 161, 158, 228, 161,
|
||||
136, 228, 162, 132, 228, 161, 140, }, },
|
||||
{ "🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
|
||||
{ 1, 29871, 243, 162, 157, 131, 313, 8945, 29897, 29871,
|
||||
243, 162, 155, 185, 30722, 243, 162, 143, 174, 30598,
|
||||
313, 20787, 953, 3848, 275, 16125, 630, 29897, 29871, 31681,
|
||||
313, 6194, 953, 29877, 2397, 393, 756, 967, 1914, 5993, 29897, }, },
|
||||
{ "Hello", { 1, 15043 }, },
|
||||
{ " Hello", { 1, 29871, 15043 }, },
|
||||
{ " Hello", { 1, 259, 15043 }, },
|
||||
{ " Hello", { 1, 1678, 15043 }, },
|
||||
{ " Hello", { 1, 268, 15043 }, },
|
||||
{ " Hello\n Hello", { 1, 268, 15043, 13, 1678, 15043 }, },
|
||||
};
|
||||
|
||||
return _k_tests;
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
if (argc < 2) {
|
||||
fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
const std::string fname = argv[1];
|
||||
|
||||
fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
|
||||
|
||||
llama_model * model;
|
||||
llama_context * ctx;
|
||||
|
||||
llama_backend_init(false);
|
||||
|
||||
// load the vocab
|
||||
{
|
||||
auto lparams = llama_context_default_params();
|
||||
|
||||
lparams.vocab_only = true;
|
||||
|
||||
model = llama_load_model_from_file(fname.c_str(), lparams);
|
||||
|
||||
if (model == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
|
||||
return 1;
|
||||
}
|
||||
|
||||
ctx = llama_new_context_with_model(model, lparams);
|
||||
|
||||
if (ctx == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
|
||||
llama_free_model(model);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
const int n_vocab = llama_n_vocab(ctx);
|
||||
|
||||
if (n_vocab != 32000) {
|
||||
fprintf(stderr, "%s : expected 32000 tokens, got %d\n", __func__, n_vocab);
|
||||
llama_free_model(model);
|
||||
llama_free(ctx);
|
||||
return 2;
|
||||
}
|
||||
|
||||
bool success = true;
|
||||
|
||||
for (const auto & test_kv : k_tests()) {
|
||||
// Add a space in front of the first character to match OG llama tokenizer behavior
|
||||
std::vector<llama_token> res = llama_tokenize(ctx, " " + test_kv.first, true);
|
||||
fprintf(stderr, "%s : '%s' tokenized to '%s'\n",
|
||||
__func__, test_kv.first.c_str(), unescape_whitespace(ctx, res).c_str());
|
||||
|
||||
bool correct = res.size() == test_kv.second.size();
|
||||
|
||||
for (int i = 0; i < (int) res.size() && correct; ++i) {
|
||||
if (res[i] != test_kv.second[i]) {
|
||||
correct = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (!correct) {
|
||||
fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
|
||||
fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
|
||||
unescape_whitespace(ctx, res).c_str(), unescape_whitespace(ctx, test_kv.second).c_str());
|
||||
fprintf(stderr, "%s : expected tokens: ", __func__);
|
||||
for (const auto & t : test_kv.second) {
|
||||
fprintf(stderr, "%6d, ", t);
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s : got tokens: ", __func__);
|
||||
for (const auto & t : res) {
|
||||
fprintf(stderr, "%6d, ", t);
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
success = false;
|
||||
}
|
||||
}
|
||||
|
||||
llama_free_model(model);
|
||||
llama_free(ctx);
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
return success ? 0 : 3;
|
||||
}
|
||||
@@ -22,14 +22,6 @@ static std::string escape_whitespace(const std::string& text) {
|
||||
return result;
|
||||
}
|
||||
|
||||
static std::string unescape_whitespace(llama_context * ctx, const std::vector<llama_token> & tokens) {
|
||||
std::string result;
|
||||
for (size_t i = 0; i < tokens.size(); ++i) {
|
||||
result += llama_token_to_str(ctx, tokens[i]);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
if (argc < 2) {
|
||||
fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
|
||||
@@ -72,13 +64,13 @@ int main(int argc, char **argv) {
|
||||
const int n_vocab = llama_n_vocab(ctx);
|
||||
|
||||
for (int i = 0; i < n_vocab; ++i) {
|
||||
std::string forward = llama_token_to_str(ctx, i);
|
||||
std::string forward = llama_token_to_piece(ctx, i);
|
||||
std::vector<llama_token> tokens = llama_tokenize(ctx, forward, false);
|
||||
if (tokens.size() == 1) {
|
||||
if (i != tokens[0]) {
|
||||
std::string backward = llama_token_to_str(ctx, tokens[0]);
|
||||
std::string backward = llama_token_to_piece(ctx, tokens[0]);
|
||||
fprintf(stderr, "%s : error: token %d is string %s but bpe returns token %d %s\n",
|
||||
__func__, i, llama_token_to_str(ctx, i).c_str(), tokens[0], backward.c_str());
|
||||
__func__, i, llama_token_to_piece(ctx, i).c_str(), tokens[0], backward.c_str());
|
||||
return 2;
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user