Compare commits

..

7 Commits

Author SHA1 Message Date
Georgi Gerganov
71e6ae3779 ggml : continue from #729 (wip) 2023-04-22 18:49:07 +03:00
Håkon H. Hitland
bd166f7ffc Fix type error in quantize_row_q4_1 for Arm NEON 2023-04-22 17:55:03 +03:00
Håkon H. Hitland
4282f9b0f3 Update quantize_row_q4_1 for PowerPC
Untested
2023-04-22 17:55:03 +03:00
Håkon H. Hitland
93c95fcc1b Update quantize_row_q4_0 for Arm NEON
Untested
2023-04-22 17:55:01 +03:00
Håkon H. Hitland
b7e704658e Update quantize_row_q4_0 for WASM
Untested
2023-04-22 17:54:19 +03:00
Håkon H. Hitland
5d5f2b2efa Update quantize_row_q4_0 for AVX/AVX2 2023-04-22 17:54:19 +03:00
Håkon H. Hitland
3698f79e6a Use full range for q4_0 quantization
By keeping the sign of the highest magnitude, we can make sure the
highest value maps to -8, which is currently unused.
This is a bit of a freebie since it is fully backwards compatible with
the current format.

quantize-stats output:
before(7B):
q4_0                                              : mse 0.00000492, maxerr 0.14257812
after(7B):
q4_0                                              : mse 0.00000386, maxerr 0.18200684

(Most layers have reduced maxerr under this rule, but the total max
error is indeed slightly higher)
2023-04-22 17:54:05 +03:00
219 changed files with 9305 additions and 110310 deletions

View File

@@ -1,23 +0,0 @@
---
Checks: >
bugprone-*,
-bugprone-easily-swappable-parameters,
-bugprone-implicit-widening-of-multiplication-result,
-bugprone-misplaced-widening-cast,
-bugprone-narrowing-conversions,
readability-*,
-readability-avoid-unconditional-preprocessor-if,
-readability-function-cognitive-complexity,
-readability-identifier-length,
-readability-implicit-bool-conversion,
-readability-magic-numbers,
-readability-uppercase-literal-suffix,
clang-analyzer-*,
-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
performance-*,
portability-*,
misc-*,
-misc-const-correctness,
-misc-non-private-member-variables-in-classes,
-misc-no-recursion,
FormatStyle: none

View File

@@ -1,22 +0,0 @@
node('x86_runner1'){ // Running on x86 runner containing latest vector qemu, latest vector gcc and all the necessary libraries
stage('Cleanup'){
cleanWs() // Cleaning previous CI build in workspace
}
stage('checkout repo'){
retry(5){ // Retry if the cloning fails due to some reason
checkout scm // Clone the repo on Runner
}
}
stage('Compiling llama.cpp'){
sh'''#!/bin/bash
make RISCV=1 RISCV_CROSS_COMPILE=1 # Compiling llama for RISC-V
'''
}
stage('Running llama.cpp'){
sh'''#!/bin/bash
module load gnu-bin2/0.1 # loading latest versions of vector qemu and vector gcc
qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./main -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
cat llama_log.txt # Printing results
'''
}
}

View File

@@ -1,33 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
FROM ${BASE_CUDA_DEV_CONTAINER} as build
# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all
RUN apt-get update && \
apt-get install -y build-essential python3 python3-pip git
COPY requirements.txt requirements.txt
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt
WORKDIR /app
COPY . .
# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable cuBLAS
ENV LLAMA_CUBLAS=1
RUN make
ENTRYPOINT ["/app/.devops/tools.sh"]

View File

@@ -1,44 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG ROCM_VERSION=5.6
# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
FROM ${BASE_ROCM_DEV_CONTAINER} as build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH=\
gfx803 \
gfx900 \
gfx906 \
gfx908 \
gfx90a \
gfx1010 \
gfx1030 \
gfx1100 \
gfx1101 \
gfx1102
COPY requirements.txt requirements.txt
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt
WORKDIR /app
COPY . .
# Set nvcc architecture
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV LLAMA_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++
RUN make
ENTRYPOINT ["/app/.devops/tools.sh"]

View File

@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION as build
RUN apt-get update && \
apt-get install -y build-essential python3 python3-pip git
apt-get install -y build-essential python3 python3-pip
COPY requirements.txt requirements.txt
@@ -16,6 +16,4 @@ COPY . .
RUN make
ENV LC_ALL=C.utf8
ENTRYPOINT ["/app/.devops/tools.sh"]

View File

@@ -1,84 +0,0 @@
# SRPM for building from source and packaging an RPM for RPM-based distros.
# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
# Built and maintained by John Boero - boeroboy@gmail.com
# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
# Notes for llama.cpp:
# 1. Tags are currently based on hash - which will not sort asciibetically.
# We need to declare standard versioning if people want to sort latest releases.
# 2. Builds for CUDA/OpenCL support are separate, with different depenedencies.
# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
# Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
# It is up to the user to install the correct vendor-specific support.
Name: llama.cpp-clblast
Version: %( date "+%%Y%%m%%d" )
Release: 1%{?dist}
Summary: OpenCL Inference of LLaMA model in C/C++
License: MIT
Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
BuildRequires: coreutils make gcc-c++ git mesa-libOpenCL-devel clblast-devel
Requires: clblast
URL: https://github.com/ggerganov/llama.cpp
%define debug_package %{nil}
%define source_date_epoch_from_changelog 0
%description
CPU inference for Meta's Lllama2 models using default options.
%prep
%setup -n llama.cpp-master
%build
make -j LLAMA_CLBLAST=1
%install
mkdir -p %{buildroot}%{_bindir}/
cp -p main %{buildroot}%{_bindir}/llamaclblast
cp -p server %{buildroot}%{_bindir}/llamaclblastserver
cp -p simple %{buildroot}%{_bindir}/llamaclblastsimple
mkdir -p %{buildroot}/usr/lib/systemd/system
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamaclblast.service
[Unit]
Description=Llama.cpp server, CPU only (no GPU support in this build).
After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
[Service]
Type=simple
EnvironmentFile=/etc/sysconfig/llama
ExecStart=/usr/bin/llamaclblastserver $LLAMA_ARGS
ExecReload=/bin/kill -s HUP $MAINPID
Restart=never
[Install]
WantedBy=default.target
EOF
mkdir -p %{buildroot}/etc/sysconfig
%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
EOF
%clean
rm -rf %{buildroot}
rm -rf %{_builddir}/*
%files
%{_bindir}/llamaclblast
%{_bindir}/llamaclblastserver
%{_bindir}/llamaclblastsimple
/usr/lib/systemd/system/llamaclblast.service
%config /etc/sysconfig/llama
%pre
%post
%preun
%postun
%changelog

View File

@@ -1,83 +0,0 @@
# SRPM for building from source and packaging an RPM for RPM-based distros.
# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
# Built and maintained by John Boero - boeroboy@gmail.com
# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
# Notes for llama.cpp:
# 1. Tags are currently based on hash - which will not sort asciibetically.
# We need to declare standard versioning if people want to sort latest releases.
# 2. Builds for CUDA/OpenCL support are separate, with different depenedencies.
# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
# Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
# It is up to the user to install the correct vendor-specific support.
Name: llama.cpp-cublas
Version: %( date "+%%Y%%m%%d" )
Release: 1%{?dist}
Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
License: MIT
Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
BuildRequires: coreutils make gcc-c++ git cuda-toolkit
Requires: cuda-toolkit
URL: https://github.com/ggerganov/llama.cpp
%define debug_package %{nil}
%define source_date_epoch_from_changelog 0
%description
CPU inference for Meta's Lllama2 models using default options.
%prep
%setup -n llama.cpp-master
%build
make -j LLAMA_CUBLAS=1
%install
mkdir -p %{buildroot}%{_bindir}/
cp -p main %{buildroot}%{_bindir}/llamacppcublas
cp -p server %{buildroot}%{_bindir}/llamacppcublasserver
cp -p simple %{buildroot}%{_bindir}/llamacppcublassimple
mkdir -p %{buildroot}/usr/lib/systemd/system
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacublas.service
[Unit]
Description=Llama.cpp server, CPU only (no GPU support in this build).
After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
[Service]
Type=simple
EnvironmentFile=/etc/sysconfig/llama
ExecStart=/usr/bin/llamacppcublasserver $LLAMA_ARGS
ExecReload=/bin/kill -s HUP $MAINPID
Restart=never
[Install]
WantedBy=default.target
EOF
mkdir -p %{buildroot}/etc/sysconfig
%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
EOF
%clean
rm -rf %{buildroot}
rm -rf %{_builddir}/*
%files
%{_bindir}/llamacppcublas
%{_bindir}/llamacppcublasserver
%{_bindir}/llamacppcublassimple
/usr/lib/systemd/system/llamacublas.service
%config /etc/sysconfig/llama
%pre
%post
%preun
%postun
%changelog

View File

@@ -1,85 +0,0 @@
# SRPM for building from source and packaging an RPM for RPM-based distros.
# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
# Built and maintained by John Boero - boeroboy@gmail.com
# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
# Notes for llama.cpp:
# 1. Tags are currently based on hash - which will not sort asciibetically.
# We need to declare standard versioning if people want to sort latest releases.
# In the meantime, YYYYMMDD format will be used.
# 2. Builds for CUDA/OpenCL support are separate, with different depenedencies.
# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
# Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
# It is up to the user to install the correct vendor-specific support.
Name: llama.cpp
Version: %( date "+%%Y%%m%%d" )
Release: 1%{?dist}
Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
License: MIT
Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
BuildRequires: coreutils make gcc-c++ git libstdc++-devel
Requires: libstdc++
URL: https://github.com/ggerganov/llama.cpp
%define debug_package %{nil}
%define source_date_epoch_from_changelog 0
%description
CPU inference for Meta's Lllama2 models using default options.
Models are not included in this package and must be downloaded separately.
%prep
%setup -n llama.cpp-master
%build
make -j
%install
mkdir -p %{buildroot}%{_bindir}/
cp -p main %{buildroot}%{_bindir}/llama
cp -p server %{buildroot}%{_bindir}/llamaserver
cp -p simple %{buildroot}%{_bindir}/llamasimple
mkdir -p %{buildroot}/usr/lib/systemd/system
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llama.service
[Unit]
Description=Llama.cpp server, CPU only (no GPU support in this build).
After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
[Service]
Type=simple
EnvironmentFile=/etc/sysconfig/llama
ExecStart=/usr/bin/llamaserver $LLAMA_ARGS
ExecReload=/bin/kill -s HUP $MAINPID
Restart=never
[Install]
WantedBy=default.target
EOF
mkdir -p %{buildroot}/etc/sysconfig
%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
EOF
%clean
rm -rf %{buildroot}
rm -rf %{_builddir}/*
%files
%{_bindir}/llama
%{_bindir}/llamaserver
%{_bindir}/llamasimple
/usr/lib/systemd/system/llama.service
%config /etc/sysconfig/llama
%pre
%post
%preun
%postun
%changelog

View File

@@ -1,32 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the CUDA runtime image
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
FROM ${BASE_CUDA_DEV_CONTAINER} as build
# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all
RUN apt-get update && \
apt-get install -y build-essential git
WORKDIR /app
COPY . .
# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable cuBLAS
ENV LLAMA_CUBLAS=1
RUN make
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
COPY --from=build /app/main /main
ENTRYPOINT [ "/main" ]

View File

@@ -1,44 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG ROCM_VERSION=5.6
# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
FROM ${BASE_ROCM_DEV_CONTAINER} as build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH=\
gfx803 \
gfx900 \
gfx906 \
gfx908 \
gfx90a \
gfx1010 \
gfx1030 \
gfx1100 \
gfx1101 \
gfx1102
COPY requirements.txt requirements.txt
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt
WORKDIR /app
COPY . .
# Set nvcc architecture
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV LLAMA_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++
RUN make
ENTRYPOINT [ "/app/main" ]

View File

@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION as build
RUN apt-get update && \
apt-get install -y build-essential git
apt-get install -y build-essential
WORKDIR /app
@@ -15,6 +15,4 @@ FROM ubuntu:$UBUNTU_VERSION as runtime
COPY --from=build /app/main /main
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/main" ]

View File

@@ -7,35 +7,34 @@ arg1="$1"
# Shift the arguments to remove the first one
shift
if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
python3 ./convert.py "$@"
elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
./quantize "$@"
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
./main "$@"
elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
# Join the remaining arguments into a single string
arg2="$@"
if [[ $arg1 == '--convert' || $arg1 == '-c' ]]; then
python3 ./convert-pth-to-ggml.py $arg2
elif [[ $arg1 == '--quantize' || $arg1 == '-q' ]]; then
./quantize $arg2
elif [[ $arg1 == '--run' || $arg1 == '-r' ]]; then
./main $arg2
elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then
echo "Converting PTH to GGML..."
for i in `ls $1/$2/ggml-model-f16.bin*`; do
if [ -f "${i/f16/q4_0}" ]; then
echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
else
echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
./quantize "$i" "${i/f16/q4_0}" q4_0
./quantize "$i" "${i/f16/q4_0}" 2
fi
done
elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
./server "$@"
else
echo "Unknown command: $arg1"
echo "Available commands: "
echo " --run (-r): Run a model previously converted into ggml"
echo " ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
echo " --convert (-c): Convert a llama model into ggml"
echo " ex: --outtype f16 \"/models/7B/\" "
echo " ex: \"/models/7B/\" 1"
echo " --quantize (-q): Optimize with quantization process ggml"
echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
echo " --all-in-one (-a): Execute --convert & --quantize"
echo " ex: \"/models/\" 7B"
echo " --server (-s): Run a model on the server"
echo " ex: -m /models/7B/ggml-model-q4_0.bin -c 2048 -ngl 43 -mg 1 --port 8080"
fi

View File

@@ -5,7 +5,14 @@
.vscode/
.DS_Store
build*/
build/
build-em/
build-debug/
build-release/
build-static/
build-no-accel/
build-sanitize-addr/
build-sanitize-thread/
models/*

View File

@@ -17,6 +17,3 @@ indent_style = tab
[prompts/*.txt]
insert_final_newline = unset
[examples/server/public/*]
indent_size = 2

View File

@@ -1,2 +0,0 @@
[flake8]
max-line-length = 125

View File

@@ -10,41 +10,33 @@ on:
push:
branches:
- master
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']
pull_request:
types: [opened, synchronize, reopened]
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']
env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
GGML_NLOOP: 3
GGML_N_THREADS: 1
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
jobs:
ubuntu-focal-make:
runs-on: ubuntu-20.04
ubuntu-latest-make:
runs-on: ubuntu-latest
steps:
- name: Clone
id: checkout
uses: actions/checkout@v3
uses: actions/checkout@v1
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential gcc-8
sudo apt-get install build-essential
- name: Build
id: make_build
run: |
CC=gcc-8 make
- name: Test
id: make_test
run: |
CC=gcc-8 make tests
make test
make
ubuntu-latest-cmake:
runs-on: ubuntu-latest
@@ -52,7 +44,7 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v3
uses: actions/checkout@v1
- name: Dependencies
id: depends
@@ -72,7 +64,7 @@ jobs:
id: cmake_test
run: |
cd build
ctest --verbose --timeout 900
ctest --verbose
ubuntu-latest-cmake-sanitizer:
runs-on: ubuntu-latest
@@ -87,7 +79,7 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v3
uses: actions/checkout@v1
- name: Dependencies
id: depends
@@ -103,40 +95,6 @@ jobs:
cmake .. -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
cmake --build . --config ${{ matrix.build_type }}
- name: Test
id: cmake_test
run: |
cd build
ctest --verbose --timeout 900
ubuntu-latest-cmake-mpi:
runs-on: ubuntu-latest
continue-on-error: true
strategy:
matrix:
mpi_library: [mpich, libopenmpi-dev]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v3
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential ${{ matrix.mpi_library }}
- name: Build
id: cmake_build
run: |
mkdir build
cd build
cmake -DLLAMA_MPI=ON ..
cmake --build . --config Release
- name: Test
id: cmake_test
run: |
@@ -149,11 +107,10 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v3
uses: actions/checkout@v1
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update
@@ -162,161 +119,50 @@ jobs:
run: |
make
- name: Test
id: make_test
run: |
make tests
make test
macOS-latest-cmake:
runs-on: macos-latest
runs-on: macOS-latest
steps:
- name: Clone
id: checkout
uses: actions/checkout@v3
uses: actions/checkout@v1
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update
- name: Build
id: cmake_build
run: |
sysctl -a
mkdir build
cd build
cmake -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF ..
cmake -DLLAMA_AVX2=OFF ..
cmake --build . --config Release
- name: Test
id: cmake_test
run: |
cd build
ctest --verbose --timeout 900
macOS-latest-cmake-ios:
runs-on: macos-latest
steps:
- name: Clone
id: checkout
uses: actions/checkout@v1
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update
- name: Build
id: cmake_build
run: |
sysctl -a
mkdir build
cd build
cmake -G Xcode .. \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-DCMAKE_SYSTEM_NAME=iOS \
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
cmake --build . --config Release
macOS-latest-cmake-tvos:
runs-on: macos-latest
steps:
- name: Clone
id: checkout
uses: actions/checkout@v1
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update
- name: Build
id: cmake_build
run: |
sysctl -a
mkdir build
cd build
cmake -G Xcode .. \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-DCMAKE_SYSTEM_NAME=tvOS \
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
cmake --build . --config Release
ctest --verbose
windows-latest-cmake:
runs-on: windows-latest
env:
OPENBLAS_VERSION: 0.3.23
OPENCL_VERSION: 2023.04.17
CLBLAST_VERSION: 1.6.0
strategy:
matrix:
include:
- build: 'noavx'
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF'
- build: 'avx2'
defines: '-DLLAMA_BUILD_SERVER=ON'
- build: 'avx'
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF'
- build: 'avx512'
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
- build: 'clblast'
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
- build: 'openblas'
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
- build: 'avx2'
defines: ''
- build: 'avx'
defines: '-DLLAMA_AVX2=OFF'
- build: 'avx512'
defines: '-DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
steps:
- name: Clone
id: checkout
uses: actions/checkout@v3
with:
fetch-depth: 0
- name: Download OpenCL SDK
id: get_opencl
if: ${{ matrix.build == 'clblast' }}
run: |
curl.exe -o $env:RUNNER_TEMP/opencl.zip -L "https://github.com/KhronosGroup/OpenCL-SDK/releases/download/v${env:OPENCL_VERSION}/OpenCL-SDK-v${env:OPENCL_VERSION}-Win-x64.zip"
mkdir $env:RUNNER_TEMP/opencl
tar.exe -xvf $env:RUNNER_TEMP/opencl.zip --strip-components=1 -C $env:RUNNER_TEMP/opencl
- name: Download CLBlast
id: get_clblast
if: ${{ matrix.build == 'clblast' }}
run: |
curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/clblast.7z
rename-item $env:RUNNER_TEMP/CLBlast-${env:CLBLAST_VERSION}-windows-x64 clblast
foreach ($f in (gci -Recurse -Path "$env:RUNNER_TEMP/clblast" -Filter '*.cmake')) {
$txt = Get-Content -Path $f -Raw
$txt.Replace('C:/vcpkg/packages/opencl_x64-windows/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8
}
- name: Download OpenBLAS
id: get_openblas
if: ${{ matrix.build == 'openblas' }}
run: |
curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"
mkdir $env:RUNNER_TEMP/openblas
tar.exe -xvf $env:RUNNER_TEMP/openblas.zip -C $env:RUNNER_TEMP/openblas
$vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
$msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
$lib = $(join-path $msvc 'bin\Hostx64\x64\lib.exe')
& $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll
uses: actions/checkout@v1
- name: Build
id: cmake_build
@@ -326,20 +172,6 @@ jobs:
cmake .. ${{ matrix.defines }}
cmake --build . --config Release
- name: Add clblast.dll
id: add_clblast_dll
if: ${{ matrix.build == 'clblast' }}
run: |
cp $env:RUNNER_TEMP/clblast/lib/clblast.dll ./build/bin/Release
cp $env:RUNNER_TEMP/CLBlast.LICENSE.txt ./build/bin/Release/CLBlast-${env:CLBLAST_VERSION}.txt
- name: Add libopenblas.dll
id: add_libopenblas_dll
if: ${{ matrix.build == 'openblas' }}
run: |
cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt
- name: Check AVX512F support
id: check_avx512f
if: ${{ matrix.build == 'avx512' }}
@@ -355,122 +187,28 @@ jobs:
- name: Test
id: cmake_test
if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # Test AVX-512 only when possible
if: ${{ matrix.build != 'avx512' || env.HAS_AVX512F == '1' }} # Test AVX-512 only when possible
run: |
cd build
ctest -C Release --verbose --timeout 900
ctest -C Release --verbose
- name: Determine tag name
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
else
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: pr-mpt/actions-commit-hash@v2
- name: Pack artifacts
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*
7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v3
with:
path: |
llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip
windows-latest-cmake-cublas:
runs-on: windows-latest
strategy:
matrix:
cuda: ['12.2.0', '11.7.1']
build: ['cublas']
steps:
- name: Clone
id: checkout
uses: actions/checkout@v3
with:
fetch-depth: 0
- uses: Jimver/cuda-toolkit@v0.2.11
id: cuda-toolkit
with:
cuda: ${{ matrix.cuda }}
sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
- name: Build
id: cmake_build
run: |
mkdir build
cd build
cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON
cmake --build . --config Release
- name: Determine tag name
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
else
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi
- name: Pack artifacts
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v3
with:
path: |
llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
- name: Copy and pack Cuda runtime
run: |
echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
$dst='.\build\bin\cudart\'
robocopy "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\*
- name: Upload Cuda runtime
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v3
with:
path: |
cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
freeBSD-latest:
runs-on: macos-12
steps:
- name: Clone
uses: actions/checkout@v3
- name: Build
uses: cross-platform-actions/action@v0.19.0
with:
operating_system: freebsd
version: '13.2'
run: |
sudo pkg update
sudo pkg install -y gmake automake autoconf pkgconf llvm15 clinfo clover opencl clblast openblas
gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15
llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
release:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@@ -478,44 +216,28 @@ jobs:
runs-on: ubuntu-latest
needs:
- ubuntu-focal-make
- ubuntu-latest-make
- ubuntu-latest-cmake
- macOS-latest-make
- macOS-latest-cmake
- windows-latest-cmake
- windows-latest-cmake-cublas
steps:
- name: Clone
id: checkout
uses: actions/checkout@v3
with:
fetch-depth: 0
- name: Determine tag name
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
else
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi
- name: Download artifacts
id: download-artifact
uses: actions/download-artifact@v3
- name: Get commit hash
id: commit
uses: pr-mpt/actions-commit-hash@v2
- name: Create release
id: create_release
uses: anzz1/action-create-release@v1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
tag_name: ${{ steps.tag.outputs.name }}
tag_name: ${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}
- name: Upload release
id: upload_release
@@ -548,7 +270,7 @@ jobs:
#
# steps:
# - name: Clone
# uses: actions/checkout@v3
# uses: actions/checkout@v1
#
# - name: Dependencies
# run: |
@@ -572,7 +294,7 @@ jobs:
#
# steps:
# - name: Clone
# uses: actions/checkout@v3
# uses: actions/checkout@v1
#
# - name: Dependencies
# run: |
@@ -596,7 +318,7 @@ jobs:
#
# steps:
# - name: Clone
# uses: actions/checkout@v3
# uses: actions/checkout@v1
#
# - name: Dependencies
# run: |
@@ -626,7 +348,7 @@ jobs:
#
# steps:
# - name: Clone
# uses: actions/checkout@v3
# uses: actions/checkout@v1
#
# - name: Add msbuild to PATH
# uses: microsoft/setup-msbuild@v1
@@ -665,7 +387,7 @@ jobs:
#
# steps:
# - name: Clone
# uses: actions/checkout@v3
# uses: actions/checkout@v1
#
# - name: Add msbuild to PATH
# uses: microsoft/setup-msbuild@v1
@@ -711,7 +433,7 @@ jobs:
#
# steps:
# - name: Clone
# uses: actions/checkout@v3
# uses: actions/checkout@v1
#
# - name: Dependencies
# run: |

View File

@@ -1,36 +0,0 @@
name: Code Coverage
on: [push, pull_request]
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
jobs:
run:
runs-on: ubuntu-20.04
steps:
- name: Checkout
uses: actions/checkout@v3
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential gcc-8 lcov
- name: Build
run: CC=gcc-8 make -j LLAMA_CODE_COVERAGE=1 tests
- name: Run tests
run: CC=gcc-8 make test
- name: Generate coverage report
run: |
make coverage
make lcov-report
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
with:
files: lcov-report/coverage.info

View File

@@ -26,15 +26,8 @@ jobs:
strategy:
matrix:
config:
- { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
# NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
# have disabled them for now until the reason why
# is understood.
- { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "light", dockerfile: ".devops/main.Dockerfile" }
- { tag: "full", dockerfile: ".devops/full.Dockerfile" }
steps:
- name: Check out the repo
uses: actions/checkout@v3
@@ -58,7 +51,7 @@ jobs:
with:
context: .
push: true
platforms: ${{ matrix.config.platforms }}
platforms: linux/amd64,linux/arm64
tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
file: ${{ matrix.config.dockerfile }}
@@ -67,6 +60,6 @@ jobs:
with:
context: .
push: ${{ github.event_name == 'push' }}
platforms: ${{ matrix.config.platforms }}
platforms: linux/amd64,linux/arm64
tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}"
file: ${{ matrix.config.dockerfile }}

View File

@@ -1,43 +0,0 @@
# This workflow will upload a Python Package using Twine when a GGUF release is created
# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
# See `gguf-py/README.md` for how to make a release.
# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.
name: Upload Python Package
on:
workflow_dispatch:
push:
# Pattern matched against refs/tags
tags:
- 'gguf-v*' # Push events to every version tag
jobs:
deploy:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: '3.9.x'
- name: Install dependencies
run: |
cd gguf-py
python -m pip install poetry
poetry install
- name: Build package
run: poetry build
- name: Publish package
uses: pypa/gh-action-pypi-publish@release/v1
with:
password: ${{ secrets.PYPI_API_TOKEN }}

View File

@@ -1,20 +0,0 @@
name: clang-tidy review post comments
on:
workflow_dispatch:
workflows: ["clang-tidy-review"]
types:
- completed
jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: ZedThree/clang-tidy-review/post@v0.13.0
# lgtm_comment_body, max_comments, and annotations need to be set on the posting workflow in a split setup
with:
# adjust options as necessary
lgtm_comment_body: ''
annotations: false
max_comments: 25

View File

@@ -1,23 +0,0 @@
name: clang-tidy-review
on:
pull_request:
branches:
- master
jobs:
clang-tidy-review:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: ZedThree/clang-tidy-review@v0.13.0
id: review
with:
lgtm_comment_body: ''
build_dir: build
cmake_command: cmake . -B build -DCMAKE_EXPORT_COMPILE_COMMANDS=on
split_workflow: true
- uses: ZedThree/clang-tidy-review/upload@v0.13.0

72
.gitignore vendored
View File

@@ -1,15 +1,5 @@
*.o
*.a
*.so
*.gguf
*.bin
*.exe
*.dll
*.log
*.gcov
*.gcno
*.gcda
*.dot
.DS_Store
.build/
.cache/
@@ -17,72 +7,36 @@
.envrc
.swiftpm
.venv
.clang-tidy
.vs/
.vscode/
lcov-report/
gcovr-report/
build*/
out/
tmp/
build/
build-em/
build-debug/
build-release/
build-static/
build-no-accel/
build-sanitize-addr/
build-sanitize-thread/
models/*
models-mnt
/Pipfile
/baby-llama
/beam-search
/benchmark-matmult
/convert-llama2c-to-ggml
/embd-input-test
/embedding
/gguf
/gguf-llama-simple
/libllama.so
/llama-bench
/main
/metal
/perplexity
/quantize
/quantize-stats
/result
/save-load-state
/server
/simple
/speculative
/train-text-from-scratch
/perplexity
/embedding
/benchmark-q4_0-matmult
/vdot
build-info.h
/Pipfile
arm_neon.h
compile_commands.json
CMakeSettings.json
__pycache__
dist
zig-out/
zig-cache/
ppl-*.txt
qnt-*.txt
perf-*.txt
examples/jeopardy/results.txt
poetry.lock
poetry.toml
# Test binaries
tests/test-grammar-parser
tests/test-llama-grammar
tests/test-double-float
tests/test-grad0
tests/test-opt
tests/test-quantize-fns
tests/test-quantize-perf
tests/test-sampling
tests/test-tokenizer-0-llama
tests/test-tokenizer-0-falcon
tests/test-tokenizer-1

View File

@@ -1,15 +0,0 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
exclude: prompts/.*.txt
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v3.2.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-yaml
- id: check-added-large-files
- repo: https://github.com/PyCQA/flake8
rev: 6.0.0
hooks:
- id: flake8

View File

@@ -36,94 +36,40 @@ endif()
# Option list
#
if (APPLE)
set(LLAMA_METAL_DEFAULT ON)
else()
set(LLAMA_METAL_DEFAULT OFF)
endif()
# general
option(LLAMA_STATIC "llama: static link libraries" OFF)
option(LLAMA_NATIVE "llama: enable -march=native flag" OFF)
option(LLAMA_LTO "llama: enable link time optimization" OFF)
option(LLAMA_STATIC "llama: static link libraries" OFF)
option(LLAMA_NATIVE "llama: enable -march=native flag" OFF)
option(LLAMA_LTO "llama: enable link time optimization" OFF)
# debug
option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON)
option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)
option(LLAMA_GPROF "llama: enable gprof" OFF)
option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON)
option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)
option(LLAMA_GPROF "llama: enable gprof" OFF)
# sanitizers
option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF)
option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF)
option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF)
option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF)
option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
# instruction set specific
option(LLAMA_AVX "llama: enable AVX" ON)
option(LLAMA_AVX2 "llama: enable AVX2" ON)
option(LLAMA_AVX512 "llama: enable AVX512" OFF)
option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF)
option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF)
option(LLAMA_FMA "llama: enable FMA" ON)
option(LLAMA_AVX "llama: enable AVX" ON)
option(LLAMA_AVX2 "llama: enable AVX2" ON)
option(LLAMA_AVX512 "llama: enable AVX512" OFF)
option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF)
option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF)
option(LLAMA_FMA "llama: enable FMA" ON)
# in MSVC F16C is implied with AVX2/AVX512
if (NOT MSVC)
option(LLAMA_F16C "llama: enable F16C" ON)
option(LLAMA_F16C "llama: enable F16C" ON)
endif()
# 3rd party libs
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
option(LLAMA_BLAS "llama: use BLAS" OFF)
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
option(LLAMA_CUBLAS "llama: use CUDA" OFF)
#option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF)
option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
option(LLAMA_MPI "llama: use MPI" OFF)
option(LLAMA_K_QUANTS "llama: use k-quants" ON)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
option(LLAMA_OPENBLAS "llama: use OpenBLAS" OFF)
option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_SERVER "llama: build server example" ON)
#
# Build info header
#
# Generate initial build-info.h
include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git")
set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/.git")
# Is git submodule
if(NOT IS_DIRECTORY "${GIT_DIR}")
file(READ ${GIT_DIR} REAL_GIT_DIR_LINK)
string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK})
set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${REAL_GIT_DIR}")
endif()
# Add a custom target for build-info.h
add_custom_target(BUILD_INFO ALL DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h")
# Add a custom command to rebuild build-info.h when .git/index changes
add_custom_command(
OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h"
COMMENT "Generating build details from Git"
COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake"
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
DEPENDS "${GIT_DIR}/index"
VERBATIM
)
else()
message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
endif()
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
#
# Compile flags
@@ -135,7 +81,6 @@ set(CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED true)
set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)
include(CheckCXXCompilerFlag)
if (NOT MSVC)
if (LLAMA_SANITIZE_THREAD)
@@ -166,113 +111,36 @@ if (APPLE AND LLAMA_ACCELERATE)
endif()
endif()
if (LLAMA_METAL)
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
find_library(METAL_FRAMEWORK Metal REQUIRED)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
message(STATUS "Metal framework found")
set(GGML_HEADERS_METAL ggml-metal.h)
set(GGML_SOURCES_METAL ggml-metal.m)
add_compile_definitions(GGML_USE_METAL)
if (LLAMA_METAL_NDEBUG)
add_compile_definitions(GGML_METAL_NDEBUG)
endif()
# get full path to the file
#add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
# copy ggml-metal.metal to bin directory
configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
${FOUNDATION_LIBRARY}
${METAL_FRAMEWORK}
${METALKIT_FRAMEWORK}
)
endif()
if (LLAMA_BLAS)
if (LLAMA_OPENBLAS)
if (LLAMA_STATIC)
set(BLA_STATIC ON)
endif()
if ($(CMAKE_VERSION) VERSION_GREATER_EQUAL 3.22)
set(BLA_SIZEOF_INTEGER 8)
endif()
set(BLA_VENDOR ${LLAMA_BLAS_VENDOR})
set(BLA_VENDOR OpenBLAS)
find_package(BLAS)
if (BLAS_FOUND)
message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")
message(STATUS "OpenBLAS found")
if ("${BLAS_INCLUDE_DIRS}" STREQUAL "")
# BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake.
# see https://gitlab.kitware.com/cmake/cmake/-/issues/20268
find_package(PkgConfig REQUIRED)
if (${LLAMA_BLAS_VENDOR} MATCHES "Generic")
pkg_check_modules(DepBLAS REQUIRED blas)
elseif (${LLAMA_BLAS_VENDOR} MATCHES "OpenBLAS")
pkg_check_modules(DepBLAS REQUIRED openblas)
elseif (${LLAMA_BLAS_VENDOR} MATCHES "FLAME")
pkg_check_modules(DepBLAS REQUIRED blis)
elseif (${LLAMA_BLAS_VENDOR} MATCHES "ATLAS")
pkg_check_modules(DepBLAS REQUIRED blas-atlas)
elseif (${LLAMA_BLAS_VENDOR} MATCHES "FlexiBLAS")
pkg_check_modules(DepBLAS REQUIRED flexiblas_api)
elseif (${LLAMA_BLAS_VENDOR} MATCHES "Intel")
# all Intel* libraries share the same include path
pkg_check_modules(DepBLAS REQUIRED mkl-sdl)
elseif (${LLAMA_BLAS_VENDOR} MATCHES "NVHPC")
# this doesn't provide pkg-config
# suggest to assign BLAS_INCLUDE_DIRS on your own
if ("${NVHPC_VERSION}" STREQUAL "")
message(WARNING "Better to set NVHPC_VERSION")
else()
set(DepBLAS_FOUND ON)
set(DepBLAS_INCLUDE_DIRS "/opt/nvidia/hpc_sdk/${CMAKE_SYSTEM_NAME}_${CMAKE_SYSTEM_PROCESSOR}/${NVHPC_VERSION}/math_libs/include")
endif()
endif()
if (DepBLAS_FOUND)
set(BLAS_INCLUDE_DIRS ${DepBLAS_INCLUDE_DIRS})
else()
message(WARNING "BLAS_INCLUDE_DIRS neither been provided nor been automatically"
" detected by pkgconfig, trying to find cblas.h from possible paths...")
find_path(BLAS_INCLUDE_DIRS
NAMES cblas.h
HINTS
/usr/include
/usr/local/include
/usr/include/openblas
/opt/homebrew/opt/openblas/include
/usr/local/opt/openblas/include
/usr/include/x86_64-linux-gnu/openblas/include
)
endif()
endif()
message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
add_compile_options(${BLAS_LINKER_FLAGS})
add_compile_definitions(GGML_USE_OPENBLAS)
if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel"))
add_compile_definitions(GGML_BLAS_USE_MKL)
endif()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES})
set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
add_link_options(${BLAS_LIBRARIES})
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} openblas)
# find header file
set(OPENBLAS_INCLUDE_SEARCH_PATHS
/usr/include
/usr/include/openblas
/usr/include/openblas-base
/usr/local/include
/usr/local/include/openblas
/usr/local/include/openblas-base
/opt/OpenBLAS/include
$ENV{OpenBLAS_HOME}
$ENV{OpenBLAS_HOME}/include
)
find_path(OPENBLAS_INC NAMES cblas.h PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
add_compile_options(-I${OPENBLAS_INC})
else()
message(WARNING "BLAS not found, please refer to "
"https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
" to set correct LLAMA_BLAS_VENDOR")
endif()
endif()
if (LLAMA_K_QUANTS)
set(GGML_HEADERS_EXTRA k_quants.h)
set(GGML_SOURCES_EXTRA k_quants.c)
add_compile_definitions(GGML_USE_K_QUANTS)
if (LLAMA_QKK_64)
add_compile_definitions(GGML_QKK_64)
message(WARNING "OpenBLAS not found")
endif()
endif()
@@ -285,25 +153,9 @@ if (LLAMA_CUBLAS)
enable_language(CUDA)
set(GGML_HEADERS_CUDA ggml-cuda.h)
set(GGML_SOURCES_CUDA ggml-cuda.cu)
set(GGML_CUDA_SOURCES ggml-cuda.cu ggml-cuda.h)
add_compile_definitions(GGML_USE_CUBLAS)
# if (LLAMA_CUDA_CUBLAS)
# add_compile_definitions(GGML_CUDA_CUBLAS)
# endif()
if (LLAMA_CUDA_FORCE_DMMV)
add_compile_definitions(GGML_CUDA_FORCE_DMMV)
endif()
add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
if (DEFINED LLAMA_CUDA_DMMV_Y)
add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_DMMV_Y}) # for backwards compatibility
endif()
if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
add_compile_definitions(GGML_CUDA_F16)
endif()
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
if (LLAMA_STATIC)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
@@ -311,102 +163,11 @@ if (LLAMA_CUBLAS)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
endif()
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
# 52 == lowest CUDA 12 standard
# 60 == f16 CUDA intrinsics
# 61 == integer CUDA intrinsics
# 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
else()
set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
endif()
endif()
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
else()
message(WARNING "cuBLAS not found")
endif()
endif()
if (LLAMA_MPI)
cmake_minimum_required(VERSION 3.10)
find_package(MPI)
if (MPI_C_FOUND)
message(STATUS "MPI found")
set(GGML_HEADERS_MPI ggml-mpi.h)
set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h)
add_compile_definitions(GGML_USE_MPI)
add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
set(cxx_flags ${cxx_flags} -Wno-cast-qual)
set(c_flags ${c_flags} -Wno-cast-qual)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_C_LIBRARIES})
set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${MPI_C_INCLUDE_DIRS})
# Even if you're only using the C header, C++ programs may bring in MPI
# C++ functions, so more linkage is needed
if (MPI_CXX_FOUND)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_CXX_LIBRARIES})
endif()
else()
message(WARNING "MPI not found")
endif()
endif()
if (LLAMA_CLBLAST)
find_package(CLBlast)
if (CLBlast_FOUND)
message(STATUS "CLBlast found")
set(GGML_HEADERS_OPENCL ggml-opencl.h)
set(GGML_SOURCES_OPENCL ggml-opencl.cpp)
add_compile_definitions(GGML_USE_CLBLAST)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} clblast)
else()
message(WARNING "CLBlast not found")
endif()
endif()
if (LLAMA_HIPBLAS)
list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang")
endif()
if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
endif()
find_package(hip)
find_package(hipblas)
find_package(rocblas)
if (${hipblas_FOUND} AND ${hip_FOUND})
message(STATUS "HIP and hipBLAS found")
add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)
if (BUILD_SHARED_LIBS)
set_target_properties(ggml-rocm PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
if (LLAMA_CUDA_FORCE_DMMV)
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV)
endif()
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas)
if (LLAMA_STATIC)
message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
endif()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ggml-rocm)
else()
message(WARNING "hipBLAS or HIP not found. Try setting CMAKE_PREFIX_PATH=/opt/rocm")
endif()
endif()
if (LLAMA_ALL_WARNINGS)
if (NOT MSVC)
set(c_flags
@@ -418,9 +179,6 @@ if (LLAMA_ALL_WARNINGS)
-Wshadow
-Wstrict-prototypes
-Wpointer-arith
-Wmissing-prototypes
-Werror=implicit-int
-Wno-unused-function
)
set(cxx_flags
-Wall
@@ -430,10 +188,6 @@ if (LLAMA_ALL_WARNINGS)
-Wno-unused-function
-Wno-multichar
)
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
# g++ only
set(cxx_flags ${cxx_flags} -Wno-format-truncation -Wno-array-bounds)
endif()
else()
# todo : msvc
endif()
@@ -445,7 +199,7 @@ if (LLAMA_ALL_WARNINGS)
endif()
if (WIN32)
if (MSVC)
add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
if (BUILD_SHARED_LIBS)
@@ -467,13 +221,6 @@ endif()
# TODO: probably these flags need to be tweaked on some architectures
# feel free to update the Makefile for your architecture and send a pull request or issue
message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
if (MSVC)
string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR)
message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}")
else ()
set(CMAKE_GENERATOR_PLATFORM_LWR "")
endif ()
if (NOT MSVC)
if (LLAMA_STATIC)
add_link_options(-static)
@@ -489,33 +236,17 @@ if (NOT MSVC)
endif()
endif()
if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64"))
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
message(STATUS "ARM detected")
if (MSVC)
add_compile_definitions(__ARM_NEON)
add_compile_definitions(__ARM_FEATURE_FMA)
add_compile_definitions(__ARM_FEATURE_DOTPROD)
# add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) # MSVC doesn't support vdupq_n_f16, vld1q_f16, vst1q_f16
add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
# TODO: arm msvc?
else()
check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
add_compile_options(-mfp16-format=ieee)
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
# Raspberry Pi 1, Zero
add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access)
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
# Raspberry Pi 2
add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
# Raspberry Pi 3, 4, Zero 2 (32-bit)
add_compile_options(-mno-unaligned-access)
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
add_compile_options(-mcpu=native)
endif()
# TODO: armv6,7,8 version specific flags
endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "^(x86_64|i686|amd64|x64)$" )
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
message(STATUS "x86 detected")
if (MSVC)
if (LLAMA_AVX512)
@@ -564,207 +295,54 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE
add_compile_options(-mavx512vnni)
endif()
endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
message(STATUS "PowerPC detected")
add_compile_options(-mcpu=native -mtune=native)
#TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
else()
# TODO: support PowerPC
message(STATUS "Unknown architecture")
endif()
#
# POSIX conformance
# Build libraries
#
# clock_gettime came in POSIX.1b (1993)
# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
# posix_memalign came in POSIX.1-2001 / SUSv3
# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
add_compile_definitions(_XOPEN_SOURCE=600)
# Somehow in OpenBSD whenever POSIX conformance is specified
# some string functions rely on locale_t availability,
# which was introduced in POSIX.1-2008, forcing us to go higher
if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
remove_definitions(-D_XOPEN_SOURCE=600)
add_compile_definitions(_XOPEN_SOURCE=700)
endif()
# Data types, macros and functions related to controlling CPU affinity and
# some memory allocation are available on Linux through GNU extensions in libc
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
add_compile_definitions(_GNU_SOURCE)
endif()
# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
# and on macOS its availability depends on enabling Darwin extensions
# similarly on DragonFly, enabling BSD extensions is necessary
if (
CMAKE_SYSTEM_NAME MATCHES "Darwin" OR
CMAKE_SYSTEM_NAME MATCHES "iOS" OR
CMAKE_SYSTEM_NAME MATCHES "tvOS" OR
CMAKE_SYSTEM_NAME MATCHES "DragonFly"
)
add_compile_definitions(_DARWIN_C_SOURCE)
endif()
# alloca is a non-standard interface that is not visible on BSDs when
# POSIX conformance is specified, but not all of them provide a clean way
# to enable it in such cases
if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
add_compile_definitions(__BSD_VISIBLE)
endif()
if (CMAKE_SYSTEM_NAME MATCHES "NetBSD")
add_compile_definitions(_NETBSD_SOURCE)
endif()
if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
add_compile_definitions(_BSD_SOURCE)
endif()
#
# libraries
#
# ggml
if (GGML_USE_CPU_HBM)
add_definitions(-DGGML_USE_CPU_HBM)
find_library(memkind memkind REQUIRED)
endif()
add_library(ggml OBJECT
ggml.c
ggml.h
ggml-alloc.c
ggml-alloc.h
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
)
${GGML_CUDA_SOURCES})
target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
target_include_directories(ggml PUBLIC .)
target_compile_features(ggml PUBLIC c_std_11) # don't bump
target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
if (GGML_USE_CPU_HBM)
target_link_libraries(ggml PUBLIC memkind)
endif()
add_library(ggml_static STATIC $<TARGET_OBJECTS:ggml>)
if (BUILD_SHARED_LIBS)
set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
add_library(ggml_shared SHARED $<TARGET_OBJECTS:ggml>)
target_link_libraries(ggml_shared PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
install(TARGETS ggml_shared LIBRARY)
endif()
# llama
add_library(llama
llama.cpp
llama.h
)
llama_util.h)
target_include_directories(llama PUBLIC .)
target_compile_features(llama PUBLIC cxx_std_11) # don't bump
target_link_libraries(llama PRIVATE
ggml
${LLAMA_EXTRA_LIBS}
)
target_link_libraries(llama PRIVATE ggml ${LLAMA_EXTRA_LIBS})
if (BUILD_SHARED_LIBS)
set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD)
if (LLAMA_METAL)
set_target_properties(llama PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
endif()
endif()
#
# install
#
include(GNUInstallDirs)
include(CMakePackageConfigHelpers)
set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR}
CACHE PATH "Location of header files")
set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR}
CACHE PATH "Location of library files")
set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR}
CACHE PATH "Location of binary files")
set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER})
set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
configure_package_config_file(
${CMAKE_CURRENT_SOURCE_DIR}/scripts/LlamaConfig.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama
PATH_VARS LLAMA_INCLUDE_INSTALL_DIR
LLAMA_LIB_INSTALL_DIR
LLAMA_BIN_INSTALL_DIR )
write_basic_package_version_file(
${CMAKE_CURRENT_BINARY_DIR}/LlamaConfigVersion.cmake
VERSION ${LLAMA_INSTALL_VERSION}
COMPATIBILITY SameMajorVersion)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
${CMAKE_CURRENT_BINARY_DIR}/LlamaConfigVersion.cmake
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama)
set(GGML_PUBLIC_HEADERS "ggml.h"
"${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}"
"${GGML_HEADERS_METAL}" "${GGML_HEADERS_MPI}" "${GGML_HEADERS_EXTRA}")
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
install(TARGETS ggml PUBLIC_HEADER)
set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/llama.h)
install(TARGETS llama LIBRARY PUBLIC_HEADER)
install(
FILES convert.py
PERMISSIONS
OWNER_READ
OWNER_WRITE
OWNER_EXECUTE
GROUP_READ
GROUP_EXECUTE
WORLD_READ
WORLD_EXECUTE
DESTINATION ${CMAKE_INSTALL_BINDIR})
install(
FILES convert-lora-to-ggml.py
PERMISSIONS
OWNER_READ
OWNER_WRITE
OWNER_EXECUTE
GROUP_READ
GROUP_EXECUTE
WORLD_READ
WORLD_EXECUTE
DESTINATION ${CMAKE_INSTALL_BINDIR})
if (LLAMA_METAL)
install(
FILES ggml-metal.metal
PERMISSIONS
OWNER_READ
OWNER_WRITE
GROUP_READ
WORLD_READ
DESTINATION ${CMAKE_INSTALL_BINDIR})
if (GGML_CUDA_SOURCES)
message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES OFF)
set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
set_property(TARGET llama PROPERTY CUDA_ARCHITECTURES OFF)
endif()
#
# programs, examples and tests
#
add_subdirectory(common)
if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
include(CTest)
add_subdirectory(tests)

598
Makefile
View File

@@ -1,11 +1,5 @@
# Define the default target now so that it is always the first target
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative tests/test-c.o
# Binaries only useful for tests
TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama
# Code coverage output files
COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
default: main quantize quantize-stats perplexity embedding vdot
ifndef UNAME_S
UNAME_S := $(shell uname -s)
@@ -19,13 +13,12 @@ ifndef UNAME_M
UNAME_M := $(shell uname -m)
endif
CCV := $(shell $(CC) --version | head -n 1)
CXXV := $(shell $(CXX) --version | head -n 1)
# Mac OS + Arm can report x86_64
# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
ifeq ($(UNAME_S),Darwin)
ifndef LLAMA_NO_METAL
LLAMA_METAL := 1
endif
ifneq ($(UNAME_P),arm)
SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
ifeq ($(SYSCTL_M),1)
@@ -36,406 +29,110 @@ ifeq ($(UNAME_S),Darwin)
endif
endif
ifneq '' '$(or $(filter clean,$(MAKECMDGOALS)),$(LLAMA_METAL))'
BUILD_TARGETS += metal
endif
default: $(BUILD_TARGETS)
test: $(TEST_TARGETS)
@failures=0; \
for test_target in $(TEST_TARGETS); do \
if [ "$$test_target" = "tests/test-tokenizer-0-llama" ]; then \
./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
continue; \
elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
continue; \
else \
echo "Running test $$test_target..."; \
./$$test_target; \
fi; \
if [ $$? -ne 0 ]; then \
printf 'Test $$test_target FAILED!\n\n' $$test_target; \
failures=$$(( failures + 1 )); \
else \
printf 'Test %s passed.\n\n' $$test_target; \
fi; \
done; \
if [ $$failures -gt 0 ]; then \
printf '\n%s tests failed.\n' $$failures; \
exit 1; \
fi
@echo 'All tests passed.'
all: $(BUILD_TARGETS) $(TEST_TARGETS)
coverage: ## Run code coverage
gcov -pb tests/*.cpp
lcov-report: coverage ## Generate lcov report
mkdir -p lcov-report
lcov --capture --directory . --output-file lcov-report/coverage.info
genhtml lcov-report/coverage.info --output-directory lcov-report
gcovr-report: coverage ## Generate gcovr report
mkdir -p gcovr-report
gcovr --root . --html --html-details --output gcovr-report/coverage.html
ifdef RISCV_CROSS_COMPILE
CC := riscv64-unknown-linux-gnu-gcc
CXX := riscv64-unknown-linux-gnu-g++
endif
CCV := $(shell $(CC) --version | head -n 1)
CXXV := $(shell $(CXX) --version | head -n 1)
#
# Compile flags
#
# keep standard at C11 and C++11
# -Ofast tends to produce faster code, but may not be available for some compilers.
ifdef LLAMA_FAST
OPT = -Ofast
else
OPT = -O3
endif
MK_CPPFLAGS = -I. -Icommon
MK_CFLAGS = $(OPT) -std=c11 -fPIC
MK_CXXFLAGS = $(OPT) -std=c++11 -fPIC
MK_LDFLAGS =
# clock_gettime came in POSIX.1b (1993)
# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
# posix_memalign came in POSIX.1-2001 / SUSv3
# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
MK_CPPFLAGS += -D_XOPEN_SOURCE=600
# Somehow in OpenBSD whenever POSIX conformance is specified
# some string functions rely on locale_t availability,
# which was introduced in POSIX.1-2008, forcing us to go higher
ifeq ($(UNAME_S),OpenBSD)
MK_CPPFLAGS += -U_XOPEN_SOURCE -D_XOPEN_SOURCE=700
endif
# Data types, macros and functions related to controlling CPU affinity and
# some memory allocation are available on Linux through GNU extensions in libc
ifeq ($(UNAME_S),Linux)
MK_CPPFLAGS += -D_GNU_SOURCE
endif
# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
# and on macOS its availability depends on enabling Darwin extensions
# similarly on DragonFly, enabling BSD extensions is necessary
ifeq ($(UNAME_S),Darwin)
MK_CPPFLAGS += -D_DARWIN_C_SOURCE
endif
ifeq ($(UNAME_S),DragonFly)
MK_CPPFLAGS += -D__BSD_VISIBLE
endif
# alloca is a non-standard interface that is not visible on BSDs when
# POSIX conformance is specified, but not all of them provide a clean way
# to enable it in such cases
ifeq ($(UNAME_S),FreeBSD)
MK_CPPFLAGS += -D__BSD_VISIBLE
endif
ifeq ($(UNAME_S),NetBSD)
MK_CPPFLAGS += -D_NETBSD_SOURCE
endif
ifeq ($(UNAME_S),OpenBSD)
MK_CPPFLAGS += -D_BSD_SOURCE
endif
ifdef LLAMA_DEBUG
MK_CFLAGS += -O0 -g
MK_CXXFLAGS += -O0 -g
MK_LDFLAGS += -g
else
MK_CPPFLAGS += -DNDEBUG
endif
ifdef LLAMA_SERVER_VERBOSE
MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
endif
ifdef LLAMA_CODE_COVERAGE
MK_CXXFLAGS += -fprofile-arcs -ftest-coverage -dumpbase ''
endif
ifdef LLAMA_DISABLE_LOGS
MK_CPPFLAGS += -DLOG_DISABLE_LOGS
endif # LLAMA_DISABLE_LOGS
CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC
CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
LDFLAGS =
# warnings
MK_CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \
-Wmissing-prototypes -Werror=implicit-int -Wno-unused-function
MK_CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar
ifeq '' '$(findstring clang,$(shell $(CXX) --version))'
# g++ only
MK_CXXFLAGS += -Wno-format-truncation -Wno-array-bounds
endif
CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith
CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar
# OS specific
# TODO: support Windows
ifneq '' '$(filter $(UNAME_S),Linux Darwin FreeBSD NetBSD OpenBSD Haiku)'
MK_CFLAGS += -pthread
MK_CXXFLAGS += -pthread
ifeq ($(UNAME_S),Linux)
CFLAGS += -pthread
CXXFLAGS += -pthread
endif
# detect Windows
ifneq ($(findstring _NT,$(UNAME_S)),)
_WIN32 := 1
ifeq ($(UNAME_S),Darwin)
CFLAGS += -pthread
CXXFLAGS += -pthread
endif
# library name prefix
ifneq ($(_WIN32),1)
LIB_PRE := lib
ifeq ($(UNAME_S),FreeBSD)
CFLAGS += -pthread
CXXFLAGS += -pthread
endif
# Dynamic Shared Object extension
ifneq ($(_WIN32),1)
DSO_EXT := .so
else
DSO_EXT := .dll
ifeq ($(UNAME_S),NetBSD)
CFLAGS += -pthread
CXXFLAGS += -pthread
endif
# Windows Sockets 2 (Winsock) for network-capable apps
ifeq ($(_WIN32),1)
LWINSOCK2 := -lws2_32
ifeq ($(UNAME_S),OpenBSD)
CFLAGS += -pthread
CXXFLAGS += -pthread
endif
ifdef LLAMA_GPROF
MK_CFLAGS += -pg
MK_CXXFLAGS += -pg
endif
ifdef LLAMA_PERF
MK_CPPFLAGS += -DGGML_PERF
ifeq ($(UNAME_S),Haiku)
CFLAGS += -pthread
CXXFLAGS += -pthread
endif
# Architecture specific
# TODO: probably these flags need to be tweaked on some architectures
# feel free to update the Makefile for your architecture and send a pull request or issue
ifndef RISCV
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
# Use all CPU extensions that are available:
MK_CFLAGS += -march=native -mtune=native
MK_CXXFLAGS += -march=native -mtune=native
CFLAGS += -march=native -mtune=native
CXXFLAGS += -march=native -mtune=native
# Usage AVX-only
#MK_CFLAGS += -mfma -mf16c -mavx
#MK_CXXFLAGS += -mfma -mf16c -mavx
# Usage SSSE3-only (Not is SSE3!)
#MK_CFLAGS += -mssse3
#MK_CXXFLAGS += -mssse3
#CFLAGS += -mfma -mf16c -mavx
#CXXFLAGS += -mfma -mf16c -mavx
endif
# The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
# https://github.com/ggerganov/llama.cpp/issues/2922
ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))'
MK_CFLAGS += -Xassembler -muse-unaligned-vector-move
MK_CXXFLAGS += -Xassembler -muse-unaligned-vector-move
endif
ifneq ($(filter aarch64%,$(UNAME_M)),)
# Apple M1, M2, etc.
# Raspberry Pi 3, 4, Zero 2 (64-bit)
MK_CFLAGS += -mcpu=native
MK_CXXFLAGS += -mcpu=native
endif
ifneq ($(filter armv6%,$(UNAME_M)),)
# Raspberry Pi 1, Zero
MK_CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
MK_CXXFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
endif
ifneq ($(filter armv7%,$(UNAME_M)),)
# Raspberry Pi 2
MK_CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
MK_CXXFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
endif
ifneq ($(filter armv8%,$(UNAME_M)),)
# Raspberry Pi 3, 4, Zero 2 (32-bit)
MK_CFLAGS += -mfp16-format=ieee -mno-unaligned-access
MK_CXXFLAGS += -mfp16-format=ieee -mno-unaligned-access
endif
ifneq ($(filter ppc64%,$(UNAME_M)),)
POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
ifneq (,$(findstring POWER9,$(POWER9_M)))
MK_CFLAGS += -mcpu=power9
MK_CXXFLAGS += -mcpu=power9
CFLAGS += -mcpu=power9
CXXFLAGS += -mcpu=power9
endif
# Require c++23's std::byteswap for big-endian support.
ifeq ($(UNAME_M),ppc64)
CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN
endif
endif
else
MK_CFLAGS += -march=rv64gcv -mabi=lp64d
MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
endif
ifndef LLAMA_NO_K_QUANTS
MK_CPPFLAGS += -DGGML_USE_K_QUANTS
OBJS += k_quants.o
ifdef LLAMA_QKK_64
MK_CPPFLAGS += -DGGML_QKK_64
endif
endif
ifndef LLAMA_NO_ACCELERATE
# Mac OS - include Accelerate framework.
# `-framework Accelerate` works both with Apple Silicon and Mac Intel
# Mac M1 - include Accelerate framework.
# `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time).
ifeq ($(UNAME_S),Darwin)
MK_CPPFLAGS += -DGGML_USE_ACCELERATE
MK_LDFLAGS += -framework Accelerate
CFLAGS += -DGGML_USE_ACCELERATE
LDFLAGS += -framework Accelerate
endif
endif # LLAMA_NO_ACCELERATE
ifdef LLAMA_MPI
MK_CPPFLAGS += -DGGML_USE_MPI
MK_CFLAGS += -Wno-cast-qual
MK_CXXFLAGS += -Wno-cast-qual
OBJS += ggml-mpi.o
endif # LLAMA_MPI
endif
ifdef LLAMA_OPENBLAS
MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
MK_LDFLAGS += $(shell pkg-config --libs openblas)
endif # LLAMA_OPENBLAS
ifdef LLAMA_BLIS
MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
MK_LDFLAGS += -lblis -L/usr/local/lib
endif # LLAMA_BLIS
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
LDFLAGS += -lopenblas
endif
ifdef LLAMA_CUBLAS
MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
MK_LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include
LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64
OBJS += ggml-cuda.o
NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math
ifdef LLAMA_CUDA_NVCC
NVCC = $(LLAMA_CUDA_NVCC)
else
NVCC = nvcc
endif #LLAMA_CUDA_NVCC
ifdef CUDA_DOCKER_ARCH
NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
else
NVCCFLAGS += -arch=native
endif # CUDA_DOCKER_ARCH
ifdef LLAMA_CUDA_FORCE_DMMV
NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
endif # LLAMA_CUDA_FORCE_DMMV
ifdef LLAMA_CUDA_DMMV_X
NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
else
NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
endif # LLAMA_CUDA_DMMV_X
ifdef LLAMA_CUDA_MMV_Y
NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
else ifdef LLAMA_CUDA_DMMV_Y
NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_DMMV_Y) # for backwards compatibility
else
NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
endif # LLAMA_CUDA_MMV_Y
ifdef LLAMA_CUDA_F16
NVCCFLAGS += -DGGML_CUDA_F16
endif # LLAMA_CUDA_F16
ifdef LLAMA_CUDA_DMMV_F16
NVCCFLAGS += -DGGML_CUDA_F16
endif # LLAMA_CUDA_DMMV_F16
ifdef LLAMA_CUDA_KQUANTS_ITER
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
else
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
endif
#ifdef LLAMA_CUDA_CUBLAS
# NVCCFLAGS += -DGGML_CUDA_CUBLAS
#endif # LLAMA_CUDA_CUBLAS
ifdef LLAMA_CUDA_CCBIN
NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
endif
NVCC = nvcc
NVCCFLAGS = --forward-unknown-to-host-linker -arch=native
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) -Wno-pedantic -c $< -o $@
endif # LLAMA_CUBLAS
ifdef LLAMA_CLBLAST
MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
MK_CFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
# Mac provides OpenCL as a framework
ifeq ($(UNAME_S),Darwin)
MK_LDFLAGS += -lclblast -framework OpenCL
else
MK_LDFLAGS += $(shell pkg-config --libs clblast OpenCL)
endif
OBJS += ggml-opencl.o
ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
$(CXX) $(CXXFLAGS) -c $< -o $@
endif # LLAMA_CLBLAST
ifdef LLAMA_HIPBLAS
ROCM_PATH ?= /opt/rocm
HIPCC ?= $(ROCM_PATH)/bin/hipcc
GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
LLAMA_CUDA_DMMV_X ?= 32
LLAMA_CUDA_MMV_Y ?= 1
LLAMA_CUDA_KQUANTS_ITER ?= 2
MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS))
HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
HIPFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
ifdef LLAMA_CUDA_FORCE_DMMV
HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
endif # LLAMA_CUDA_FORCE_DMMV
OBJS += ggml-cuda.o
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
endif # LLAMA_HIPBLAS
ifdef LLAMA_METAL
MK_CPPFLAGS += -DGGML_USE_METAL
MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit
OBJS += ggml-metal.o
ifdef LLAMA_METAL_NDEBUG
MK_CPPFLAGS += -DGGML_METAL_NDEBUG
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -c $< -o $@
endif
ifdef LLAMA_GPROF
CFLAGS += -pg
CXXFLAGS += -pg
endif
ifneq ($(filter aarch64%,$(UNAME_M)),)
CFLAGS += -mcpu=native
CXXFLAGS += -mcpu=native
endif
ifneq ($(filter armv6%,$(UNAME_M)),)
# Raspberry Pi 1, 2, 3
CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
endif
ifneq ($(filter armv7%,$(UNAME_M)),)
# Raspberry Pi 4
CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
endif
ifneq ($(filter armv8%,$(UNAME_M)),)
# Raspberry Pi 4
CFLAGS += -mfp16-format=ieee -mno-unaligned-access
endif
endif # LLAMA_METAL
ifdef LLAMA_METAL
ggml-metal.o: ggml-metal.m ggml-metal.h
$(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_METAL
ifdef LLAMA_MPI
ggml-mpi.o: ggml-mpi.c ggml-mpi.h
$(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_MPI
ifndef LLAMA_NO_K_QUANTS
k_quants.o: k_quants.c k_quants.h
$(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_NO_K_QUANTS
# combine build flags with cmdline overrides
override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS)
override CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
#
# Print build information
@@ -456,149 +153,50 @@ $(info )
# Build library
#
ggml.o: ggml.c ggml.h ggml-cuda.h
ggml.o: ggml.c ggml.h
$(CC) $(CFLAGS) -c $< -o $@
ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
$(CC) $(CFLAGS) -c $< -o $@
OBJS += ggml-alloc.o
llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h
llama.o: llama.cpp ggml.h llama.h llama_util.h
$(CXX) $(CXXFLAGS) -c $< -o $@
common.o: common/common.cpp common/common.h build-info.h common/log.h
common.o: examples/common.cpp examples/common.h
$(CXX) $(CXXFLAGS) -c $< -o $@
console.o: common/console.cpp common/console.h
$(CXX) $(CXXFLAGS) -c $< -o $@
grammar-parser.o: common/grammar-parser.cpp common/grammar-parser.h
$(CXX) $(CXXFLAGS) -c $< -o $@
libllama.so: llama.o ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
clean:
rm -vrf *.o tests/*.o *.so *.dll benchmark-matmult build-info.h *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-q4_0-matmult
#
# Examples
#
main: examples/main/main.cpp build-info.h ggml.o llama.o common.o console.o grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
main: examples/main/main.cpp ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
@echo
@echo '==== Run ./main -h for help. ===='
@echo
simple: examples/simple/simple.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2)
$(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) --shared $(CXXFLAGS) $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
ifdef LLAMA_METAL
metal: examples/metal/metal.cpp ggml.o $(OBJS)
quantize: examples/quantize/quantize.cpp ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
endif
build-info.h: $(wildcard .git/index) scripts/build-info.sh
@sh scripts/build-info.sh > $@.tmp
@if ! cmp -s $@.tmp $@; then \
mv $@.tmp $@; \
else \
rm $@.tmp; \
fi
quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
libllama.so: llama.o ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
#
# Tests
#
tests: $(TEST_TARGETS)
benchmark: examples/benchmark/benchmark-q4_0-matmult.c ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o benchmark-q4_0-matmult $(LDFLAGS)
./benchmark-q4_0-matmult
benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
./$@
vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o common.o grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-grammar-parser: tests/test-grammar-parser.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-grad0: tests/test-grad0.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-opt: tests/test-opt.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-quantize-perf: tests/test-quantize-perf.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-sampling: tests/test-sampling.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-c.o: tests/test-c.c llama.h
$(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@
.PHONY: tests
tests:
bash ./tests/run-tests.sh

View File

@@ -2,30 +2,8 @@
import PackageDescription
#if arch(arm) || arch(arm64)
let platforms: [SupportedPlatform]? = [
.macOS(.v11),
.iOS(.v14),
.watchOS(.v4),
.tvOS(.v14)
]
let exclude: [String] = []
let additionalSources: [String] = ["ggml-metal.m"]
let additionalSettings: [CSetting] = [
.unsafeFlags(["-fno-objc-arc"]),
.define("GGML_SWIFT"),
.define("GGML_USE_METAL")
]
#else
let platforms: [SupportedPlatform]? = nil
let exclude: [String] = ["ggml-metal.metal"]
let additionalSources: [String] = []
let additionalSettings: [CSetting] = []
#endif
let package = Package(
name: "llama",
platforms: platforms,
products: [
.library(name: "llama", targets: ["llama"]),
],
@@ -33,23 +11,13 @@ let package = Package(
.target(
name: "llama",
path: ".",
exclude: exclude,
sources: [
"ggml.c",
"llama.cpp",
"ggml-alloc.c",
"k_quants.c",
] + additionalSources,
sources: ["ggml.c", "llama.cpp"],
publicHeadersPath: "spm-headers",
cSettings: [
.unsafeFlags(["-Wno-shorten-64-to-32"]),
.define("GGML_USE_K_QUANTS"),
.define("GGML_USE_ACCELERATE")
] + additionalSettings,
cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"]), .define("GGML_USE_ACCELERATE")],
linkerSettings: [
.linkedFramework("Accelerate")
]
)
),
],
cxxLanguageStandard: .cxx11
)

835
README.md

File diff suppressed because it is too large Load Diff

View File

@@ -1,27 +1,27 @@
700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d models/7B/consolidated.00.pth
666a4bb533b303bdaf89e1b6a3b6f93535d868de31d903afdc20983dc526c847 models/7B/ggml-model-f16.bin
ec2f2d1f0dfb73b72a4cbac7fa121abbe04c37ab327125a38248f930c0f09ddf models/7B/ggml-model-q4_0.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q4_1.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q5_0.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q5_1.bin
fcb7664c2e69776920b526362a243e912f73c36b1ec892eb354bab940f5edb5a models/7B/ggml-model-q4_0.bin
cc061458339a3eb8bcecbf0a825e9924fb7d1a8150f63cd5d091caa99215aafe models/7B/ggml-model-q4_1.bin
1bc7484c24a87612726d756f1761890e7acf5f412e23378577ce50fbe789b5b8 models/7B/ggml-model-q4_2.bin
3429bf198ec771886cf81a574df45245f3ebf04f0ce0956b73ef5d0ab01ff48b models/7B/ggml-model-q4_3.bin
7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265 models/7B/params.json
745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08 models/13B/consolidated.00.pth
d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085 models/13B/consolidated.01.pth
2b206e9b21fb1076f11cafc624e2af97c9e48ea09312a0962153acc20d45f808 models/13B/ggml-model-f16.bin
fad169e6f0f575402cf75945961cb4a8ecd824ba4da6be2af831f320c4348fa5 models/13B/ggml-model-q4_0.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q4_1.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q5_0.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q5_1.bin
4b69e4d6b6e3275230955997b90407fceca7e5ab3daf2e63a2c9e7270a8e1e3e models/13B/ggml-model-q4_0.bin
d9581b5b88e5622532fe897c9f9b0e67a317d22dd27a6f90fa4ab8c6d23ccdbb models/13B/ggml-model-q4_1.bin
8d55a2077317ec9a928c7851d6a43e08e51f7e9e08360f2a7a7e1deefea3134f models/13B/ggml-model-q4_2.bin
4208cdec9788ffa48dc1a17af2c36a0299f5bf3eb0e2b87889dda7fad591fca3 models/13B/ggml-model-q4_3.bin
4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f models/13B/params.json
e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067 models/30B/consolidated.00.pth
4e077b7136c7ae2302e954860cf64930458d3076fcde9443f4d0e939e95903ff models/30B/consolidated.01.pth
24a87f01028cbd3a12de551dcedb712346c0b5cbdeff1454e0ddf2df9b675378 models/30B/consolidated.02.pth
1adfcef71420886119544949767f6a56cb6339b4d5fcde755d80fe68b49de93b models/30B/consolidated.03.pth
7e1b524061a9f4b27c22a12d6d2a5bf13b8ebbea73e99f218809351ed9cf7d37 models/30B/ggml-model-f16.bin
d2a441403944819492ec8c2002cc36fa38468149bfb4b7b4c52afc7bd9a7166d models/30B/ggml-model-q4_0.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q4_1.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q5_0.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q5_1.bin
7a679908ce31c9d6ae2e38d6059bcd4d0ad3a870cd58cc1c8f7b36f2b2f51c73 models/30B/ggml-model-q4_0.bin
7b75ac615fa369ee593493a7e6ef87542bf0350255db928b22c5a24f6d598bcd models/30B/ggml-model-q4_1.bin
2c82b4954a94a6a284f452f6011c1e4f0d20362c194a0b1eb5737f5fd8a20fb3 models/30B/ggml-model-q4_2.bin
a6188660199dbcb8d5658abe7d89169869e50423494385830d9e6b330ea7fc33 models/30B/ggml-model-q4_3.bin
2c07118ea98d69dbe7810d88520e30288fa994751b337f8fca02b171955f44cb models/30B/params.json
135c563f6b3938114458183afb01adc9a63bef3d8ff7cccc3977e5d3664ecafe models/65B/consolidated.00.pth
9a600b37b19d38c7e43809485f70d17d1dc12206c07efa83bc72bb498a568bde models/65B/consolidated.01.pth
@@ -32,9 +32,9 @@ a287c0dfe49081626567c7fe87f74cce5831f58e459b427b5e05567641f47b78 models/65B/con
72b4eba67a1a3b18cb67a85b70f8f1640caae9b40033ea943fb166bd80a7b36b models/65B/consolidated.06.pth
d27f5b0677d7ff129ceacd73fd461c4d06910ad7787cf217b249948c3f3bc638 models/65B/consolidated.07.pth
60758f2384d74e423dffddfd020ffed9d3bb186ebc54506f9c4a787d0f5367b0 models/65B/ggml-model-f16.bin
cde053439fa4910ae454407e2717cc46cc2c2b4995c00c93297a2b52e790fa92 models/65B/ggml-model-q4_0.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q4_1.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q5_0.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q5_1.bin
c671fe1bce71499ac732ec999770ebe53ac486623a7891e42c9dfdb6962d2c64 models/65B/ggml-model-q4_0.bin
4743a28aac3e5f32a6e838a815f51d3779de44fbbe251d745251e66c23c5950f models/65B/ggml-model-q4_1.bin
4a145a210c56982389b1ed34387e0590c3e0d7325fa9be4f2284fe4d244a3633 models/65B/ggml-model-q4_2.bin
305e91a4608b4f627b9b8ad5b4af75187d2684254bfd76dcb9db571618ef293c models/65B/ggml-model-q4_3.bin
999ed1659b469ccc2a941714c0a9656fa571d17c9f7c8c7589817ca90edef51b models/65B/params.json
9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 models/tokenizer.model

168
build.zig
View File

@@ -1,121 +1,61 @@
// Compatible with Zig Version 0.11.0
const std = @import("std");
const ArrayList = std.ArrayList;
const Compile = std.Build.Step.Compile;
const ConfigHeader = std.Build.Step.ConfigHeader;
const Mode = std.builtin.Mode;
const CrossTarget = std.zig.CrossTarget;
const Maker = struct {
builder: *std.build.Builder,
target: CrossTarget,
optimize: Mode,
config_header: *ConfigHeader,
enable_lto: bool,
pub fn build(b: *std.build.Builder) void {
const target = b.standardTargetOptions(.{});
const optimize = b.standardReleaseOptions();
const want_lto = b.option(bool, "lto", "Want -fLTO");
include_dirs: ArrayList([]const u8),
cflags: ArrayList([]const u8),
cxxflags: ArrayList([]const u8),
objs: ArrayList(*Compile),
const lib = b.addStaticLibrary("llama", null);
lib.want_lto = want_lto;
lib.setTarget(target);
lib.setBuildMode(optimize);
lib.linkLibCpp();
lib.addIncludePath(".");
lib.addIncludePath("examples");
lib.addCSourceFiles(&.{
"ggml.c",
}, &.{"-std=c11"});
lib.addCSourceFiles(&.{
"llama.cpp",
}, &.{"-std=c++11"});
lib.install();
fn addInclude(m: *Maker, dir: []const u8) !void {
try m.include_dirs.append(dir);
}
fn addProjectInclude(m: *Maker, path: []const []const u8) !void {
try m.addInclude(try m.builder.build_root.join(m.builder.allocator, path));
}
fn addCFlag(m: *Maker, flag: []const u8) !void {
try m.cflags.append(flag);
}
fn addCxxFlag(m: *Maker, flag: []const u8) !void {
try m.cxxflags.append(flag);
}
fn addFlag(m: *Maker, flag: []const u8) !void {
try m.addCFlag(flag);
try m.addCxxFlag(flag);
const build_args = .{ .b = b, .lib = lib, .target = target, .optimize = optimize, .want_lto = want_lto };
const exe = build_example("main", build_args);
_ = build_example("quantize", build_args);
_ = build_example("perplexity", build_args);
_ = build_example("embedding", build_args);
// create "zig build run" command for ./main
const run_cmd = exe.run();
run_cmd.step.dependOn(b.getInstallStep());
if (b.args) |args| {
run_cmd.addArgs(args);
}
fn init(builder: *std.build.Builder) !Maker {
const commit_hash = @embedFile(".git/refs/heads/master");
const config_header = builder.addConfigHeader(
.{ .style = .blank, .include_path = "build-info.h" },
.{
.BUILD_NUMBER = 0,
.BUILD_COMMIT = commit_hash[0 .. commit_hash.len - 1], // omit newline
},
);
var m = Maker{
.builder = builder,
.target = builder.standardTargetOptions(.{}),
.optimize = builder.standardOptimizeOption(.{}),
.config_header = config_header,
.enable_lto = false,
.include_dirs = ArrayList([]const u8).init(builder.allocator),
.cflags = ArrayList([]const u8).init(builder.allocator),
.cxxflags = ArrayList([]const u8).init(builder.allocator),
.objs = ArrayList(*Compile).init(builder.allocator),
};
try m.addCFlag("-std=c11");
try m.addCxxFlag("-std=c++11");
try m.addProjectInclude(&.{});
try m.addProjectInclude(&.{"examples"});
return m;
}
fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile {
const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
if (std.mem.endsWith(u8, src, ".c")) {
o.addCSourceFiles(&.{src}, m.cflags.items);
o.linkLibC();
} else {
o.addCSourceFiles(&.{src}, m.cxxflags.items);
o.linkLibCpp();
}
for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
o.want_lto = m.enable_lto;
return o;
}
fn exe(m: *const Maker, name: []const u8, src: []const u8, deps: []const *Compile) *Compile {
const e = m.builder.addExecutable(.{ .name = name, .target = m.target, .optimize = m.optimize });
e.addCSourceFiles(&.{src}, m.cxxflags.items);
for (deps) |d| e.addObject(d);
for (m.objs.items) |o| e.addObject(o);
for (m.include_dirs.items) |i| e.addIncludePath(.{ .path = i });
e.linkLibC();
e.linkLibCpp();
e.addConfigHeader(m.config_header);
m.builder.installArtifact(e);
e.want_lto = m.enable_lto;
return e;
}
};
pub fn build(b: *std.build.Builder) !void {
var make = try Maker.init(b);
make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;
if (b.option(bool, "k-quants", "Enable K-quants, (default: true)") orelse true) {
try make.addFlag("-DGGML_USE_K_QUANTS");
const k_quants = make.obj("k_quants", "k_quants.c");
try make.objs.append(k_quants);
}
const ggml = make.obj("ggml", "ggml.c");
const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
const llama = make.obj("llama", "llama.cpp");
const common = make.obj("common", "examples/common.cpp");
const console = make.obj("common", "examples/console.cpp");
const grammar_parser = make.obj("grammar-parser", "examples/grammar-parser.cpp");
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, llama, common, console, grammar_parser });
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, llama });
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, llama, common });
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, llama, common });
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, llama });
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, llama, common, grammar_parser });
if (server.target.isWindows()) {
server.linkSystemLibrary("ws2_32");
}
const run_step = b.step("run", "Run the app");
run_step.dependOn(&run_cmd.step);
}
fn build_example(comptime name: []const u8, args: anytype) *std.build.LibExeObjStep {
const b = args.b;
const lib = args.lib;
const want_lto = args.want_lto;
const exe = b.addExecutable(name, null);
exe.want_lto = want_lto;
lib.setTarget(args.target);
lib.setBuildMode(args.optimize);
exe.addIncludePath(".");
exe.addIncludePath("examples");
exe.addCSourceFiles(&.{
std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{name, name}),
"examples/common.cpp",
}, &.{"-std=c++11"});
exe.linkLibrary(lib);
exe.install();
return exe;
}

View File

@@ -1,25 +0,0 @@
# CI
In addition to [Github Actions](https://github.com/ggerganov/llama.cpp/actions) `llama.cpp` uses a custom CI framework:
https://github.com/ggml-org/ci
It monitors the `master` branch for new commits and runs the
[ci/run.sh](https://github.com/ggerganov/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us
to execute heavier workloads compared to just using Github Actions. Also with time, the cloud instances will be scaled
to cover various hardware architectures, including GPU and Apple Silicon instances.
Collaborators can optionally trigger the CI run by adding the `ggml-ci` keyword to their commit message.
Only the branches of this repo are monitored for this keyword.
It is a good practice, before publishing changes to execute the full CI locally on your machine:
```bash
mkdir tmp
# CPU-only build
bash ./ci/run.sh ./tmp/results ./tmp/mnt
# with CUDA support
GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
```

506
ci/run.sh
View File

@@ -1,506 +0,0 @@
#/bin/bash
#
# sample usage:
#
# mkdir tmp
#
# # CPU-only build
# bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# # with CUDA support
# GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
if [ -z "$2" ]; then
echo "usage: $0 <output-dir> <mnt-dir>"
exit 1
fi
mkdir -p "$1"
mkdir -p "$2"
OUT=$(realpath "$1")
MNT=$(realpath "$2")
rm -v $OUT/*.log
rm -v $OUT/*.exit
rm -v $OUT/*.md
sd=`dirname $0`
cd $sd/../
SRC=`pwd`
## helpers
# download a file if it does not exist or if it is outdated
function gg_wget {
local out=$1
local url=$2
local cwd=`pwd`
mkdir -p $out
cd $out
# should not re-download if file is the same
wget -nv -N $url
cd $cwd
}
function gg_printf {
printf -- "$@" >> $OUT/README.md
}
function gg_run {
ci=$1
set -o pipefail
set -x
gg_run_$ci | tee $OUT/$ci.log
cur=$?
echo "$cur" > $OUT/$ci.exit
set +x
set +o pipefail
gg_sum_$ci
ret=$((ret | cur))
}
## ci
# ctest_debug
function gg_run_ctest_debug {
cd ${SRC}
rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug
set -e
(time cmake -DCMAKE_BUILD_TYPE=Debug .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
(time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
set +e
}
function gg_sum_ctest_debug {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'Runs ctest in debug mode\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '```\n'
gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
gg_printf '```\n'
gg_printf '\n'
}
# ctest_release
function gg_run_ctest_release {
cd ${SRC}
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
set -e
(time cmake -DCMAKE_BUILD_TYPE=Release .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
if [ -z ${GG_BUILD_LOW_PERF} ]; then
(time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log
else
(time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
fi
set +e
}
function gg_sum_ctest_release {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'Runs ctest in release mode\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '```\n'
gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
gg_printf '```\n'
}
# open_llama_3b_v2
function gg_run_open_llama_3b_v2 {
cd ${SRC}
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json
gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
path_models="../models-mnt/open-llama/3B-v2"
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
set -e
(time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert.py ${path_models}
model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
model_q4_0="${path_models}/ggml-model-q4_0.gguf"
model_q4_1="${path_models}/ggml-model-q4_1.gguf"
model_q5_0="${path_models}/ggml-model-q5_0.gguf"
model_q5_1="${path_models}/ggml-model-q5_1.gguf"
model_q2_k="${path_models}/ggml-model-q2_k.gguf"
model_q3_k="${path_models}/ggml-model-q3_k.gguf"
model_q4_k="${path_models}/ggml-model-q4_k.gguf"
model_q5_k="${path_models}/ggml-model-q5_k.gguf"
model_q6_k="${path_models}/ggml-model-q6_k.gguf"
wiki_test_60="${path_wiki}/wiki.test-60.raw"
./bin/quantize ${model_f16} ${model_q8_0} q8_0
./bin/quantize ${model_f16} ${model_q4_0} q4_0
./bin/quantize ${model_f16} ${model_q4_1} q4_1
./bin/quantize ${model_f16} ${model_q5_0} q5_0
./bin/quantize ${model_f16} ${model_q5_1} q5_1
./bin/quantize ${model_f16} ${model_q2_k} q2_k
./bin/quantize ${model_f16} ${model_q3_k} q3_k
./bin/quantize ${model_f16} ${model_q4_k} q4_k
./bin/quantize ${model_f16} ${model_q5_k} q5_k
./bin/quantize ${model_f16} ${model_q6_k} q6_k
(time ./bin/main --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
function check_ppl {
qnt="$1"
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
return 20
fi
printf ' - %s @ %s OK\n' "$qnt" "$ppl"
return 0
}
check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
# lora
function compare_ppl {
qnt="$1"
ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
printf ' - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2"
return 20
fi
printf ' - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
return 0
}
path_lora="../models-mnt/open-llama/3B-v2/lora"
path_shakespeare="../models-mnt/shakespeare"
shakespeare="${path_shakespeare}/shakespeare.txt"
lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_config.json
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_model.bin
gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/shakespeare.txt
python3 ../convert-lora-to-ggml.py ${path_lora}
# f16
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
# q8_0
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
# q8_0 + f16 lora-base
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
set +e
}
function gg_sum_open_llama_3b_v2 {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'OpenLLaMA 3B-v2:\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
}
# open_llama_7b_v2
# requires: GG_BUILD_CUDA
function gg_run_open_llama_7b_v2 {
cd ${SRC}
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/config.json
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/tokenizer.model
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/tokenizer_config.json
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/special_tokens_map.json
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/pytorch_model.bin.index.json
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00001-of-00002.bin
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00002-of-00002.bin
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/generation_config.json
gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
path_models="../models-mnt/open-llama/7B-v2"
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
set -e
(time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_CUBLAS=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert.py ${path_models}
model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
model_q4_0="${path_models}/ggml-model-q4_0.gguf"
model_q4_1="${path_models}/ggml-model-q4_1.gguf"
model_q5_0="${path_models}/ggml-model-q5_0.gguf"
model_q5_1="${path_models}/ggml-model-q5_1.gguf"
model_q2_k="${path_models}/ggml-model-q2_k.gguf"
model_q3_k="${path_models}/ggml-model-q3_k.gguf"
model_q4_k="${path_models}/ggml-model-q4_k.gguf"
model_q5_k="${path_models}/ggml-model-q5_k.gguf"
model_q6_k="${path_models}/ggml-model-q6_k.gguf"
wiki_test="${path_wiki}/wiki.test.raw"
./bin/quantize ${model_f16} ${model_q8_0} q8_0
./bin/quantize ${model_f16} ${model_q4_0} q4_0
./bin/quantize ${model_f16} ${model_q4_1} q4_1
./bin/quantize ${model_f16} ${model_q5_0} q5_0
./bin/quantize ${model_f16} ${model_q5_1} q5_1
./bin/quantize ${model_f16} ${model_q2_k} q2_k
./bin/quantize ${model_f16} ${model_q3_k} q3_k
./bin/quantize ${model_f16} ${model_q4_k} q4_k
./bin/quantize ${model_f16} ${model_q5_k} q5_k
./bin/quantize ${model_f16} ${model_q6_k} q6_k
(time ./bin/main --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
function check_ppl {
qnt="$1"
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
return 20
fi
printf ' - %s @ %s OK\n' "$qnt" "$ppl"
return 0
}
check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
# lora
function compare_ppl {
qnt="$1"
ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
printf ' - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2"
return 20
fi
printf ' - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
return 0
}
path_lora="../models-mnt/open-llama/7B-v2/lora"
path_shakespeare="../models-mnt/shakespeare"
shakespeare="${path_shakespeare}/shakespeare.txt"
lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_config.json
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_model.bin
gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/shakespeare.txt
python3 ../convert-lora-to-ggml.py ${path_lora}
# f16
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
# currently not supported by the CUDA backend
# q8_0
#(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
#(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
#compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
# q8_0 + f16 lora-base
#(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
#compare_ppl "q8_0 / f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
set +e
}
function gg_sum_open_llama_7b_v2 {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'OpenLLaMA 7B-v2:\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
#gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
#gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
#gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
}
## main
if [ -z ${GG_BUILD_LOW_PERF} ]; then
rm -rf ${SRC}/models-mnt
mnt_models=${MNT}/models
mkdir -p ${mnt_models}
ln -sfn ${mnt_models} ${SRC}/models-mnt
python3 -m pip install -r ${SRC}/requirements.txt
python3 -m pip install --editable gguf-py
fi
ret=0
test $ret -eq 0 && gg_run ctest_debug
test $ret -eq 0 && gg_run ctest_release
if [ -z ${GG_BUILD_LOW_PERF} ]; then
if [ -z ${GG_BUILD_CUDA} ]; then
test $ret -eq 0 && gg_run open_llama_3b_v2
else
test $ret -eq 0 && gg_run open_llama_7b_v2
fi
fi
exit $ret

View File

@@ -1,14 +0,0 @@
comment: off
coverage:
status:
project:
default:
target: auto
threshold: 0
base: auto
patch:
default:
target: auto
threshold: 0
base: auto

View File

@@ -1,20 +0,0 @@
# common
set(TARGET common)
add_library(${TARGET} OBJECT
common.h
common.cpp
console.h
console.cpp
grammar-parser.h
grammar-parser.cpp
)
if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
target_include_directories(${TARGET} PUBLIC .)
target_compile_features(${TARGET} PUBLIC cxx_std_11)
target_link_libraries(${TARGET} PRIVATE llama)

File diff suppressed because it is too large Load Diff

View File

@@ -1,206 +0,0 @@
// Various helper functions and utilities
#pragma once
#include "llama.h"
#define LOG_NO_FILE_LINE_FUNCTION
#include "log.h"
#include <string>
#include <vector>
#include <random>
#include <thread>
#include <unordered_map>
#include <tuple>
#ifdef _WIN32
#define DIRECTORY_SEPARATOR '\\'
#else
#define DIRECTORY_SEPARATOR '/'
#endif // _WIN32
#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
//
// CLI argument parsing
//
int32_t get_num_physical_cores();
struct gpt_params {
uint32_t seed = -1; // RNG seed
int32_t n_threads = get_num_physical_cores();
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 512; // context size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_draft = 16; // number of tokens to draft during speculative decoding
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
int32_t n_beams = 0; // if non-zero then use beam search of given width.
float rope_freq_base = 10000.0f; // RoPE base frequency
float rope_freq_scale = 1.0f; // RoPE frequency scaling factor
// sampling parameters
int32_t top_k = 40; // <= 0 to use vocab size
float top_p = 0.95f; // 1.0 = disabled
float tfs_z = 1.00f; // 1.0 = disabled
float typical_p = 1.00f; // 1.0 = disabled
float temp = 0.80f; // 1.0 = disabled
float repeat_penalty = 1.10f; // 1.0 = disabled
int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
float frequency_penalty = 0.00f; // 0.0 = disabled
float presence_penalty = 0.00f; // 0.0 = disabled
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
float mirostat_tau = 5.00f; // target entropy
float mirostat_eta = 0.10f; // learning rate
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
// Classifier-Free Guidance
// https://arxiv.org/abs/2306.17806
std::string cfg_negative_prompt; // string to help guidance
float cfg_scale = 1.f; // How strong is guidance
std::string model = "models/7B/ggml-model-f16.gguf"; // model path
std::string model_draft = ""; // draft model for speculative decoding
std::string model_alias = "unknown"; // model alias
std::string prompt = "";
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
std::string input_prefix = ""; // string to prefix user inputs with
std::string input_suffix = ""; // string to suffix user inputs with
std::string grammar = ""; // optional BNF-like grammar to constrain sampling
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
std::string logdir = ""; // directory in which to save YAML log files
std::string lora_adapter = ""; // lora adapter path
std::string lora_base = ""; // base model path for the lora adapter
int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
// (which is more convenient to use for plotting)
//
bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
bool low_vram = false; // if true, reduce VRAM usage at the cost of performance
bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
bool memory_f16 = true; // use f16 instead of f32 for memory kv
bool random_prompt = false; // do not randomize prompt if none provided
bool use_color = false; // use color to distinguish generations and inputs
bool interactive = false; // interactive mode
bool prompt_cache_all = false; // save user input and generations to prompt cache
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
bool embedding = false; // get only sentence embedding
bool escape = false; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
bool interactive_first = false; // wait for user input immediately
bool multiline_input = false; // reverse the usage of `\`
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool ignore_eos = false; // ignore generated EOS tokens
bool instruct = false; // instruction mode (used for Alpaca models)
bool penalize_nl = true; // consider newlines as a repeatable token
bool perplexity = false; // compute perplexity over the prompt
bool use_mmap = true; // use mmap for faster loads
bool use_mlock = false; // use mlock to keep model in memory
bool numa = false; // attempt optimizations that help on some NUMA systems
bool export_cgraph = false; // export the computation graph
bool verbose_prompt = false; // print prompt tokens before generation
};
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
std::string gpt_random_prompt(std::mt19937 & rng);
//
// Model utils
//
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
//
// Vocab utils
//
// tokenizes a string into a vector of tokens
// should work similar to Python's `tokenizer.encode`
std::vector<llama_token> llama_tokenize(
struct llama_context * ctx,
const std::string & text,
bool add_bos);
// tokenizes a token into a piece
// should work similar to Python's `tokenizer.id_to_piece`
std::string llama_token_to_piece(
const struct llama_context * ctx,
llama_token token);
// TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
// that takes into account the tokenizer type and decides how to handle the leading space
//
// detokenizes a vector of tokens into a string
// should work similar to Python's `tokenizer.decode`
// removes the leading space from the first non-BOS token
std::string llama_detokenize_spm(
llama_context * ctx,
const std::vector<llama_token> & tokens);
// detokenizes a vector of tokens into a string
// should work similar to Python's `tokenizer.decode`
std::string llama_detokenize_bpe(
llama_context * ctx,
const std::vector<llama_token> & tokens);
//
// Sampling utils
//
// this is a common sampling function used across the examples for convenience
// it can serve as a starting point for implementing your own sampling function
//
// required:
// - ctx: context to use for sampling
// - params: sampling parameters
//
// optional:
// - ctx_guidance: context to use for classifier-free guidance, ignore if NULL
// - grammar: grammar to use for sampling, ignore if NULL
// - last_tokens: needed for repetition penalty, ignore if empty
// - idx: sample from llama_get_logits(ctx) + idx * n_vocab
//
// returns:
// - token: sampled token
// - candidates: vector of candidate tokens
//
llama_token llama_sample_token(
struct llama_context * ctx,
struct llama_context * ctx_guidance,
struct llama_grammar * grammar,
const struct gpt_params & params,
const std::vector<llama_token> & last_tokens,
std::vector<llama_token_data> & candidates,
int idx = 0);
//
// YAML utils
//
bool create_directory_with_parents(const std::string & path);
void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data);
void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data);
void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data);
std::string get_sortable_timestamp();
void dump_non_result_info_yaml(
FILE * stream, const gpt_params & params, const llama_context * lctx,
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);

View File

@@ -1,501 +0,0 @@
#include "console.h"
#include <vector>
#include <iostream>
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#include <fcntl.h>
#include <io.h>
#ifndef ENABLE_VIRTUAL_TERMINAL_PROCESSING
#define ENABLE_VIRTUAL_TERMINAL_PROCESSING 0x0004
#endif
#else
#include <climits>
#include <sys/ioctl.h>
#include <unistd.h>
#include <wchar.h>
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <termios.h>
#endif
#define ANSI_COLOR_RED "\x1b[31m"
#define ANSI_COLOR_GREEN "\x1b[32m"
#define ANSI_COLOR_YELLOW "\x1b[33m"
#define ANSI_COLOR_BLUE "\x1b[34m"
#define ANSI_COLOR_MAGENTA "\x1b[35m"
#define ANSI_COLOR_CYAN "\x1b[36m"
#define ANSI_COLOR_RESET "\x1b[0m"
#define ANSI_BOLD "\x1b[1m"
namespace console {
//
// Console state
//
static bool advanced_display = false;
static bool simple_io = true;
static display_t current_display = reset;
static FILE* out = stdout;
#if defined (_WIN32)
static void* hConsole;
#else
static FILE* tty = nullptr;
static termios initial_state;
#endif
//
// Init and cleanup
//
void init(bool use_simple_io, bool use_advanced_display) {
advanced_display = use_advanced_display;
simple_io = use_simple_io;
#if defined(_WIN32)
// Windows-specific console initialization
DWORD dwMode = 0;
hConsole = GetStdHandle(STD_OUTPUT_HANDLE);
if (hConsole == INVALID_HANDLE_VALUE || !GetConsoleMode(hConsole, &dwMode)) {
hConsole = GetStdHandle(STD_ERROR_HANDLE);
if (hConsole != INVALID_HANDLE_VALUE && (!GetConsoleMode(hConsole, &dwMode))) {
hConsole = nullptr;
simple_io = true;
}
}
if (hConsole) {
// Check conditions combined to reduce nesting
if (advanced_display && !(dwMode & ENABLE_VIRTUAL_TERMINAL_PROCESSING) &&
!SetConsoleMode(hConsole, dwMode | ENABLE_VIRTUAL_TERMINAL_PROCESSING)) {
advanced_display = false;
}
// Set console output codepage to UTF8
SetConsoleOutputCP(CP_UTF8);
}
HANDLE hConIn = GetStdHandle(STD_INPUT_HANDLE);
if (hConIn != INVALID_HANDLE_VALUE && GetConsoleMode(hConIn, &dwMode)) {
// Set console input codepage to UTF16
_setmode(_fileno(stdin), _O_WTEXT);
// Set ICANON (ENABLE_LINE_INPUT) and ECHO (ENABLE_ECHO_INPUT)
if (simple_io) {
dwMode |= ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT;
} else {
dwMode &= ~(ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT);
}
if (!SetConsoleMode(hConIn, dwMode)) {
simple_io = true;
}
}
#else
// POSIX-specific console initialization
if (!simple_io) {
struct termios new_termios;
tcgetattr(STDIN_FILENO, &initial_state);
new_termios = initial_state;
new_termios.c_lflag &= ~(ICANON | ECHO);
new_termios.c_cc[VMIN] = 1;
new_termios.c_cc[VTIME] = 0;
tcsetattr(STDIN_FILENO, TCSANOW, &new_termios);
tty = fopen("/dev/tty", "w+");
if (tty != nullptr) {
out = tty;
}
}
setlocale(LC_ALL, "");
#endif
}
void cleanup() {
// Reset console display
set_display(reset);
#if !defined(_WIN32)
// Restore settings on POSIX systems
if (!simple_io) {
if (tty != nullptr) {
out = stdout;
fclose(tty);
tty = nullptr;
}
tcsetattr(STDIN_FILENO, TCSANOW, &initial_state);
}
#endif
}
//
// Display and IO
//
// Keep track of current display and only emit ANSI code if it changes
void set_display(display_t display) {
if (advanced_display && current_display != display) {
fflush(stdout);
switch(display) {
case reset:
fprintf(out, ANSI_COLOR_RESET);
break;
case prompt:
fprintf(out, ANSI_COLOR_YELLOW);
break;
case user_input:
fprintf(out, ANSI_BOLD ANSI_COLOR_GREEN);
break;
case error:
fprintf(out, ANSI_BOLD ANSI_COLOR_RED);
}
current_display = display;
fflush(out);
}
}
char32_t getchar32() {
#if defined(_WIN32)
HANDLE hConsole = GetStdHandle(STD_INPUT_HANDLE);
wchar_t high_surrogate = 0;
while (true) {
INPUT_RECORD record;
DWORD count;
if (!ReadConsoleInputW(hConsole, &record, 1, &count) || count == 0) {
return WEOF;
}
if (record.EventType == KEY_EVENT && record.Event.KeyEvent.bKeyDown) {
wchar_t wc = record.Event.KeyEvent.uChar.UnicodeChar;
if (wc == 0) {
continue;
}
if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
high_surrogate = wc;
continue;
}
if ((wc >= 0xDC00) && (wc <= 0xDFFF)) { // Check if wc is a low surrogate
if (high_surrogate != 0) { // Check if we have a high surrogate
return ((high_surrogate - 0xD800) << 10) + (wc - 0xDC00) + 0x10000;
}
}
high_surrogate = 0; // Reset the high surrogate
return static_cast<char32_t>(wc);
}
}
#else
wchar_t wc = getwchar();
if (static_cast<wint_t>(wc) == WEOF) {
return WEOF;
}
#if WCHAR_MAX == 0xFFFF
if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
wchar_t low_surrogate = getwchar();
if ((low_surrogate >= 0xDC00) && (low_surrogate <= 0xDFFF)) { // Check if the next wchar is a low surrogate
return (static_cast<char32_t>(wc & 0x03FF) << 10) + (low_surrogate & 0x03FF) + 0x10000;
}
}
if ((wc >= 0xD800) && (wc <= 0xDFFF)) { // Invalid surrogate pair
return 0xFFFD; // Return the replacement character U+FFFD
}
#endif
return static_cast<char32_t>(wc);
#endif
}
void pop_cursor() {
#if defined(_WIN32)
if (hConsole != NULL) {
CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
GetConsoleScreenBufferInfo(hConsole, &bufferInfo);
COORD newCursorPosition = bufferInfo.dwCursorPosition;
if (newCursorPosition.X == 0) {
newCursorPosition.X = bufferInfo.dwSize.X - 1;
newCursorPosition.Y -= 1;
} else {
newCursorPosition.X -= 1;
}
SetConsoleCursorPosition(hConsole, newCursorPosition);
return;
}
#endif
putc('\b', out);
}
int estimateWidth(char32_t codepoint) {
#if defined(_WIN32)
(void)codepoint;
return 1;
#else
return wcwidth(codepoint);
#endif
}
int put_codepoint(const char* utf8_codepoint, size_t length, int expectedWidth) {
#if defined(_WIN32)
CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
if (!GetConsoleScreenBufferInfo(hConsole, &bufferInfo)) {
// go with the default
return expectedWidth;
}
COORD initialPosition = bufferInfo.dwCursorPosition;
DWORD nNumberOfChars = length;
WriteConsole(hConsole, utf8_codepoint, nNumberOfChars, &nNumberOfChars, NULL);
CONSOLE_SCREEN_BUFFER_INFO newBufferInfo;
GetConsoleScreenBufferInfo(hConsole, &newBufferInfo);
// Figure out our real position if we're in the last column
if (utf8_codepoint[0] != 0x09 && initialPosition.X == newBufferInfo.dwSize.X - 1) {
DWORD nNumberOfChars;
WriteConsole(hConsole, &" \b", 2, &nNumberOfChars, NULL);
GetConsoleScreenBufferInfo(hConsole, &newBufferInfo);
}
int width = newBufferInfo.dwCursorPosition.X - initialPosition.X;
if (width < 0) {
width += newBufferInfo.dwSize.X;
}
return width;
#else
// We can trust expectedWidth if we've got one
if (expectedWidth >= 0 || tty == nullptr) {
fwrite(utf8_codepoint, length, 1, out);
return expectedWidth;
}
fputs("\033[6n", tty); // Query cursor position
int x1;
int y1;
int x2;
int y2;
int results = 0;
results = fscanf(tty, "\033[%d;%dR", &y1, &x1);
fwrite(utf8_codepoint, length, 1, tty);
fputs("\033[6n", tty); // Query cursor position
results += fscanf(tty, "\033[%d;%dR", &y2, &x2);
if (results != 4) {
return expectedWidth;
}
int width = x2 - x1;
if (width < 0) {
// Calculate the width considering text wrapping
struct winsize w;
ioctl(STDOUT_FILENO, TIOCGWINSZ, &w);
width += w.ws_col;
}
return width;
#endif
}
void replace_last(char ch) {
#if defined(_WIN32)
pop_cursor();
put_codepoint(&ch, 1, 1);
#else
fprintf(out, "\b%c", ch);
#endif
}
void append_utf8(char32_t ch, std::string & out) {
if (ch <= 0x7F) {
out.push_back(static_cast<unsigned char>(ch));
} else if (ch <= 0x7FF) {
out.push_back(static_cast<unsigned char>(0xC0 | ((ch >> 6) & 0x1F)));
out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
} else if (ch <= 0xFFFF) {
out.push_back(static_cast<unsigned char>(0xE0 | ((ch >> 12) & 0x0F)));
out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 6) & 0x3F)));
out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
} else if (ch <= 0x10FFFF) {
out.push_back(static_cast<unsigned char>(0xF0 | ((ch >> 18) & 0x07)));
out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 12) & 0x3F)));
out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 6) & 0x3F)));
out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
} else {
// Invalid Unicode code point
}
}
// Helper function to remove the last UTF-8 character from a string
void pop_back_utf8_char(std::string & line) {
if (line.empty()) {
return;
}
size_t pos = line.length() - 1;
// Find the start of the last UTF-8 character (checking up to 4 bytes back)
for (size_t i = 0; i < 3 && pos > 0; ++i, --pos) {
if ((line[pos] & 0xC0) != 0x80) {
break; // Found the start of the character
}
}
line.erase(pos);
}
bool readline_advanced(std::string & line, bool multiline_input) {
if (out != stdout) {
fflush(stdout);
}
line.clear();
std::vector<int> widths;
bool is_special_char = false;
bool end_of_stream = false;
char32_t input_char;
while (true) {
fflush(out); // Ensure all output is displayed before waiting for input
input_char = getchar32();
if (input_char == '\r' || input_char == '\n') {
break;
}
if (input_char == (char32_t) WEOF || input_char == 0x04 /* Ctrl+D*/) {
end_of_stream = true;
break;
}
if (is_special_char) {
set_display(user_input);
replace_last(line.back());
is_special_char = false;
}
if (input_char == '\033') { // Escape sequence
char32_t code = getchar32();
if (code == '[' || code == 0x1B) {
// Discard the rest of the escape sequence
while ((code = getchar32()) != (char32_t) WEOF) {
if ((code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z') || code == '~') {
break;
}
}
}
} else if (input_char == 0x08 || input_char == 0x7F) { // Backspace
if (!widths.empty()) {
int count;
do {
count = widths.back();
widths.pop_back();
// Move cursor back, print space, and move cursor back again
for (int i = 0; i < count; i++) {
replace_last(' ');
pop_cursor();
}
pop_back_utf8_char(line);
} while (count == 0 && !widths.empty());
}
} else {
int offset = line.length();
append_utf8(input_char, line);
int width = put_codepoint(line.c_str() + offset, line.length() - offset, estimateWidth(input_char));
if (width < 0) {
width = 0;
}
widths.push_back(width);
}
if (!line.empty() && (line.back() == '\\' || line.back() == '/')) {
set_display(prompt);
replace_last(line.back());
is_special_char = true;
}
}
bool has_more = multiline_input;
if (is_special_char) {
replace_last(' ');
pop_cursor();
char last = line.back();
line.pop_back();
if (last == '\\') {
line += '\n';
fputc('\n', out);
has_more = !has_more;
} else {
// llama will just eat the single space, it won't act as a space
if (line.length() == 1 && line.back() == ' ') {
line.clear();
pop_cursor();
}
has_more = false;
}
} else {
if (end_of_stream) {
has_more = false;
} else {
line += '\n';
fputc('\n', out);
}
}
fflush(out);
return has_more;
}
bool readline_simple(std::string & line, bool multiline_input) {
#if defined(_WIN32)
std::wstring wline;
if (!std::getline(std::wcin, wline)) {
// Input stream is bad or EOF received
line.clear();
GenerateConsoleCtrlEvent(CTRL_C_EVENT, 0);
return false;
}
int size_needed = WideCharToMultiByte(CP_UTF8, 0, &wline[0], (int)wline.size(), NULL, 0, NULL, NULL);
line.resize(size_needed);
WideCharToMultiByte(CP_UTF8, 0, &wline[0], (int)wline.size(), &line[0], size_needed, NULL, NULL);
#else
if (!std::getline(std::cin, line)) {
// Input stream is bad or EOF received
line.clear();
return false;
}
#endif
if (!line.empty()) {
char last = line.back();
if (last == '/') { // Always return control on '/' symbol
line.pop_back();
return false;
}
if (last == '\\') { // '\\' changes the default action
line.pop_back();
multiline_input = !multiline_input;
}
}
line += '\n';
// By default, continue input if multiline_input is set
return multiline_input;
}
bool readline(std::string & line, bool multiline_input) {
set_display(user_input);
if (simple_io) {
return readline_simple(line, multiline_input);
}
return readline_advanced(line, multiline_input);
}
}

View File

@@ -1,19 +0,0 @@
// Console functions
#pragma once
#include <string>
namespace console {
enum display_t {
reset = 0,
prompt,
user_input,
error
};
void init(bool use_simple_io, bool use_advanced_display);
void cleanup();
void set_display(display_t display);
bool readline(std::string & line, bool multiline_input);
}

View File

@@ -1,424 +0,0 @@
#include "grammar-parser.h"
#include <cstdint>
#include <cwchar>
#include <string>
#include <utility>
#include <stdexcept>
#include <exception>
namespace grammar_parser {
// NOTE: assumes valid utf8 (but checks for overrun)
// copied from llama.cpp
std::pair<uint32_t, const char *> decode_utf8(const char * src) {
static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
uint8_t first_byte = static_cast<uint8_t>(*src);
uint8_t highbits = first_byte >> 4;
int len = lookup[highbits];
uint8_t mask = (1 << (8 - len)) - 1;
uint32_t value = first_byte & mask;
const char * end = src + len; // may overrun!
const char * pos = src + 1;
for ( ; pos < end && *pos; pos++) {
value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
}
return std::make_pair(value, pos);
}
uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id));
return result.first->second;
}
uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) {
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
state.symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id;
return next_id;
}
void add_rule(
parse_state & state,
uint32_t rule_id,
const std::vector<llama_grammar_element> & rule) {
if (state.rules.size() <= rule_id) {
state.rules.resize(rule_id + 1);
}
state.rules[rule_id] = rule;
}
bool is_word_char(char c) {
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9');
}
std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
const char * pos = src;
const char * end = src + size;
uint32_t value = 0;
for ( ; pos < end && *pos; pos++) {
value <<= 4;
char c = *pos;
if ('a' <= c && c <= 'f') {
value += c - 'a' + 10;
} else if ('A' <= c && c <= 'F') {
value += c - 'A' + 10;
} else if ('0' <= c && c <= '9') {
value += c - '0';
} else {
break;
}
}
if (pos != end) {
throw std::runtime_error("expecting " + std::to_string(size) + " hex chars at " + src);
}
return std::make_pair(value, pos);
}
const char * parse_space(const char * src, bool newline_ok) {
const char * pos = src;
while (*pos == ' ' || *pos == '\t' || *pos == '#' ||
(newline_ok && (*pos == '\r' || *pos == '\n'))) {
if (*pos == '#') {
while (*pos && *pos != '\r' && *pos != '\n') {
pos++;
}
} else {
pos++;
}
}
return pos;
}
const char * parse_name(const char * src) {
const char * pos = src;
while (is_word_char(*pos)) {
pos++;
}
if (pos == src) {
throw std::runtime_error(std::string("expecting name at ") + src);
}
return pos;
}
std::pair<uint32_t, const char *> parse_char(const char * src) {
if (*src == '\\') {
switch (src[1]) {
case 'x': return parse_hex(src + 2, 2);
case 'u': return parse_hex(src + 2, 4);
case 'U': return parse_hex(src + 2, 8);
case 't': return std::make_pair('\t', src + 2);
case 'r': return std::make_pair('\r', src + 2);
case 'n': return std::make_pair('\n', src + 2);
case '\\':
case '"':
case '[':
case ']':
return std::make_pair(src[1], src + 2);
default:
throw std::runtime_error(std::string("unknown escape at ") + src);
}
} else if (*src) {
return decode_utf8(src);
}
throw std::runtime_error("unexpected end of input");
}
const char * parse_alternates(
parse_state & state,
const char * src,
const std::string & rule_name,
uint32_t rule_id,
bool is_nested);
const char * parse_sequence(
parse_state & state,
const char * src,
const std::string & rule_name,
std::vector<llama_grammar_element> & out_elements,
bool is_nested) {
size_t last_sym_start = out_elements.size();
const char * pos = src;
while (*pos) {
if (*pos == '"') { // literal string
pos++;
last_sym_start = out_elements.size();
while (*pos != '"') {
auto char_pair = parse_char(pos);
pos = char_pair.second;
out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
}
pos = parse_space(pos + 1, is_nested);
} else if (*pos == '[') { // char range(s)
pos++;
enum llama_gretype start_type = LLAMA_GRETYPE_CHAR;
if (*pos == '^') {
pos++;
start_type = LLAMA_GRETYPE_CHAR_NOT;
}
last_sym_start = out_elements.size();
while (*pos != ']') {
auto char_pair = parse_char(pos);
pos = char_pair.second;
enum llama_gretype type = last_sym_start < out_elements.size()
? LLAMA_GRETYPE_CHAR_ALT
: start_type;
out_elements.push_back({type, char_pair.first});
if (pos[0] == '-' && pos[1] != ']') {
auto endchar_pair = parse_char(pos + 1);
pos = endchar_pair.second;
out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
}
}
pos = parse_space(pos + 1, is_nested);
} else if (is_word_char(*pos)) { // rule reference
const char * name_end = parse_name(pos);
uint32_t ref_rule_id = get_symbol_id(state, pos, name_end - pos);
pos = parse_space(name_end, is_nested);
last_sym_start = out_elements.size();
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id});
} else if (*pos == '(') { // grouping
// parse nested alternates into synthesized rule
pos = parse_space(pos + 1, true);
uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
pos = parse_alternates(state, pos, rule_name, sub_rule_id, true);
last_sym_start = out_elements.size();
// output reference to synthesized rule
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
if (*pos != ')') {
throw std::runtime_error(std::string("expecting ')' at ") + pos);
}
pos = parse_space(pos + 1, is_nested);
} else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator
if (last_sym_start == out_elements.size()) {
throw std::runtime_error(std::string("expecting preceeding item to */+/? at ") + pos);
}
// apply transformation to previous symbol (last_sym_start to end) according to
// rewrite rules:
// S* --> S' ::= S S' |
// S+ --> S' ::= S S' | S
// S? --> S' ::= S |
uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
std::vector<llama_grammar_element> sub_rule;
// add preceding symbol to generated rule
sub_rule.insert(
sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
if (*pos == '*' || *pos == '+') {
// cause generated rule to recurse
sub_rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
}
// mark start of alternate def
sub_rule.push_back({LLAMA_GRETYPE_ALT, 0});
if (*pos == '+') {
// add preceding symbol as alternate only for '+' (otherwise empty)
sub_rule.insert(
sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
}
sub_rule.push_back({LLAMA_GRETYPE_END, 0});
add_rule(state, sub_rule_id, sub_rule);
// in original rule, replace previous symbol with reference to generated rule
out_elements.resize(last_sym_start);
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
pos = parse_space(pos + 1, is_nested);
} else {
break;
}
}
return pos;
}
const char * parse_alternates(
parse_state & state,
const char * src,
const std::string & rule_name,
uint32_t rule_id,
bool is_nested) {
std::vector<llama_grammar_element> rule;
const char * pos = parse_sequence(state, src, rule_name, rule, is_nested);
while (*pos == '|') {
rule.push_back({LLAMA_GRETYPE_ALT, 0});
pos = parse_space(pos + 1, true);
pos = parse_sequence(state, pos, rule_name, rule, is_nested);
}
rule.push_back({LLAMA_GRETYPE_END, 0});
add_rule(state, rule_id, rule);
return pos;
}
const char * parse_rule(parse_state & state, const char * src) {
const char * name_end = parse_name(src);
const char * pos = parse_space(name_end, false);
size_t name_len = name_end - src;
uint32_t rule_id = get_symbol_id(state, src, name_len);
const std::string name(src, name_len);
if (!(pos[0] == ':' && pos[1] == ':' && pos[2] == '=')) {
throw std::runtime_error(std::string("expecting ::= at ") + pos);
}
pos = parse_space(pos + 3, true);
pos = parse_alternates(state, pos, name, rule_id, false);
if (*pos == '\r') {
pos += pos[1] == '\n' ? 2 : 1;
} else if (*pos == '\n') {
pos++;
} else if (*pos) {
throw std::runtime_error(std::string("expecting newline or end at ") + pos);
}
return parse_space(pos, true);
}
parse_state parse(const char * src) {
try {
parse_state state;
const char * pos = parse_space(src, true);
while (*pos) {
pos = parse_rule(state, pos);
}
return state;
} catch (const std::exception & err) {
fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what());
return parse_state();
}
}
void print_grammar_char(FILE * file, uint32_t c) {
if (0x20 <= c && c <= 0x7f) {
fprintf(file, "%c", static_cast<char>(c));
} else {
// cop out of encoding UTF-8
fprintf(file, "<U+%04X>", c);
}
}
bool is_char_element(llama_grammar_element elem) {
switch (elem.type) {
case LLAMA_GRETYPE_CHAR: return true;
case LLAMA_GRETYPE_CHAR_NOT: return true;
case LLAMA_GRETYPE_CHAR_ALT: return true;
case LLAMA_GRETYPE_CHAR_RNG_UPPER: return true;
default: return false;
}
}
void print_rule_binary(FILE * file, const std::vector<llama_grammar_element> & rule) {
for (auto elem : rule) {
switch (elem.type) {
case LLAMA_GRETYPE_END: fprintf(file, "END"); break;
case LLAMA_GRETYPE_ALT: fprintf(file, "ALT"); break;
case LLAMA_GRETYPE_RULE_REF: fprintf(file, "RULE_REF"); break;
case LLAMA_GRETYPE_CHAR: fprintf(file, "CHAR"); break;
case LLAMA_GRETYPE_CHAR_NOT: fprintf(file, "CHAR_NOT"); break;
case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
case LLAMA_GRETYPE_CHAR_ALT: fprintf(file, "CHAR_ALT"); break;
}
switch (elem.type) {
case LLAMA_GRETYPE_END:
case LLAMA_GRETYPE_ALT:
case LLAMA_GRETYPE_RULE_REF:
fprintf(file, "(%u) ", elem.value);
break;
case LLAMA_GRETYPE_CHAR:
case LLAMA_GRETYPE_CHAR_NOT:
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
case LLAMA_GRETYPE_CHAR_ALT:
fprintf(file, "(\"");
print_grammar_char(file, elem.value);
fprintf(file, "\") ");
break;
}
}
fprintf(file, "\n");
}
void print_rule(
FILE * file,
uint32_t rule_id,
const std::vector<llama_grammar_element> & rule,
const std::map<uint32_t, std::string> & symbol_id_names) {
if (rule.empty() || rule.back().type != LLAMA_GRETYPE_END) {
throw std::runtime_error(
"malformed rule, does not end with LLAMA_GRETYPE_END: " + std::to_string(rule_id));
}
fprintf(file, "%s ::= ", symbol_id_names.at(rule_id).c_str());
for (size_t i = 0, end = rule.size() - 1; i < end; i++) {
llama_grammar_element elem = rule[i];
switch (elem.type) {
case LLAMA_GRETYPE_END:
throw std::runtime_error(
"unexpected end of rule: " + std::to_string(rule_id) + "," +
std::to_string(i));
case LLAMA_GRETYPE_ALT:
fprintf(file, "| ");
break;
case LLAMA_GRETYPE_RULE_REF:
fprintf(file, "%s ", symbol_id_names.at(elem.value).c_str());
break;
case LLAMA_GRETYPE_CHAR:
fprintf(file, "[");
print_grammar_char(file, elem.value);
break;
case LLAMA_GRETYPE_CHAR_NOT:
fprintf(file, "[^");
print_grammar_char(file, elem.value);
break;
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
if (i == 0 || !is_char_element(rule[i - 1])) {
throw std::runtime_error(
"LLAMA_GRETYPE_CHAR_RNG_UPPER without preceding char: " +
std::to_string(rule_id) + "," + std::to_string(i));
}
fprintf(file, "-");
print_grammar_char(file, elem.value);
break;
case LLAMA_GRETYPE_CHAR_ALT:
if (i == 0 || !is_char_element(rule[i - 1])) {
throw std::runtime_error(
"LLAMA_GRETYPE_CHAR_ALT without preceding char: " +
std::to_string(rule_id) + "," + std::to_string(i));
}
print_grammar_char(file, elem.value);
break;
}
if (is_char_element(elem)) {
switch (rule[i + 1].type) {
case LLAMA_GRETYPE_CHAR_ALT:
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
break;
default:
fprintf(file, "] ");
}
}
}
fprintf(file, "\n");
}
void print_grammar(FILE * file, const parse_state & state) {
try {
std::map<uint32_t, std::string> symbol_id_names;
for (auto kv : state.symbol_ids) {
symbol_id_names[kv.second] = kv.first;
}
for (size_t i = 0, end = state.rules.size(); i < end; i++) {
// fprintf(file, "%zu: ", i);
// print_rule_binary(file, state.rules[i]);
print_rule(file, uint32_t(i), state.rules[i], symbol_id_names);
// fprintf(file, "\n");
}
} catch (const std::exception & err) {
fprintf(stderr, "\n%s: error printing grammar: %s\n", __func__, err.what());
}
}
std::vector<const llama_grammar_element *> parse_state::c_rules() {
std::vector<const llama_grammar_element *> ret;
ret.reserve(rules.size());
for (const auto & rule : rules) {
ret.push_back(rule.data());
}
return ret;
}
}

View File

@@ -1,29 +0,0 @@
// Implements a parser for an extended Backus-Naur form (BNF), producing the
// binary context-free grammar format specified by llama.h. Supports character
// ranges, grouping, and repetition operators. As an example, a grammar for
// arithmetic might look like:
//
// root ::= expr
// expr ::= term ([-+*/] term)*
// term ::= num | "(" space expr ")" space
// num ::= [0-9]+ space
// space ::= [ \t\n]*
#pragma once
#include "llama.h"
#include <vector>
#include <map>
#include <cstdint>
#include <string>
namespace grammar_parser {
struct parse_state {
std::map<std::string, uint32_t> symbol_ids;
std::vector<std::vector<llama_grammar_element>> rules;
std::vector<const llama_grammar_element *> c_rules();
};
parse_state parse(const char * src);
void print_grammar(FILE * file, const parse_state & state);
}

View File

@@ -1,643 +0,0 @@
#pragma once
#include <chrono>
#include <cstring>
#include <sstream>
#include <iostream>
#include <thread>
#include <vector>
#include <algorithm>
#include <cinttypes>
// --------------------------------
//
// Basic usage:
//
// --------
//
// The LOG() and LOG_TEE() macros are ready to go by default
// they do not require any initialization.
//
// LOGLN() and LOG_TEELN() are variants which automatically
// include \n character at the end of the log string.
//
// LOG() behaves exactly like printf, by default writing to a logfile.
// LOG_TEE() additionally, prints to the screen too ( mimics Unix tee command ).
//
// Default logfile is named
// "llama.<threadID>.log"
// Default LOG_TEE() secondary output target is
// stderr
//
// Logs can be dynamically disabled or enabled using functions:
// log_disable()
// and
// log_enable()
//
// A log target can be changed with:
// log_set_target( string )
// creating and opening, or re-opening a file by string filename
// or
// log_set_target( FILE* )
// allowing to point at stderr, stdout, or any valid FILE* file handler.
//
// --------
//
// End of Basic usage.
//
// --------------------------------
// Specifies a log target.
// default uses log_handler() with "llama.log" log file
// this can be changed, by defining LOG_TARGET
// like so:
//
// #define LOG_TARGET (a valid FILE*)
// #include "log.h"
//
// or it can be simply redirected to stdout or stderr
// like so:
//
// #define LOG_TARGET stderr
// #include "log.h"
//
// The log target can also be redirected to a diffrent function
// like so:
//
// #define LOG_TARGET log_handler_diffrent()
// #include "log.h"
//
// FILE* log_handler_diffrent()
// {
// return stderr;
// }
//
// or:
//
// #define LOG_TARGET log_handler_another_one("somelog.log")
// #include "log.h"
//
// FILE* log_handler_another_one(char*filename)
// {
// static FILE* logfile = nullptr;
// (...)
// if( !logfile )
// {
// fopen(...)
// }
// (...)
// return logfile
// }
//
#ifndef LOG_TARGET
#define LOG_TARGET log_handler()
#endif
#ifndef LOG_TEE_TARGET
#define LOG_TEE_TARGET stderr
#endif
// Utility to obtain "pid" like unique process id and use it when creating log files.
inline std::string log_get_pid()
{
static std::string pid;
if (pid.empty())
{
// std::this_thread::get_id() is the most portable way of obtaining a "process id"
// it's not the same as "pid" but is unique enough to solve multiple instances
// trying to write to the same log.
std::stringstream ss;
ss << std::this_thread::get_id();
pid = ss.str();
}
return pid;
}
// Utility function for generating log file names with unique id based on thread id.
// invocation with log_filename_generator( "llama", "log" ) creates a string "llama.<number>.log"
// where the number is a runtime id of the current thread.
#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(log_file_basename, log_file_extension)
// INTERNAL, DO NOT USE
inline std::string log_filename_generator_impl(const std::string & log_file_basename, const std::string & log_file_extension)
{
std::stringstream buf;
buf << log_file_basename;
buf << ".";
buf << log_get_pid();
buf << ".";
buf << log_file_extension;
return buf.str();
}
#ifndef LOG_DEFAULT_FILE_NAME
#define LOG_DEFAULT_FILE_NAME log_filename_generator("llama", "log")
#endif
// Utility for turning #define values into string literals
// so we can have a define for stderr and
// we can print "stderr" instead of literal stderr, etc.
#define LOG_STRINGIZE1(s) #s
#define LOG_STRINGIZE(s) LOG_STRINGIZE1(s)
#define LOG_TEE_TARGET_STRING LOG_STRINGIZE(LOG_TEE_TARGET)
// Allows disabling timestamps.
// in order to disable, define LOG_NO_TIMESTAMPS
// like so:
//
// #define LOG_NO_TIMESTAMPS
// #include "log.h"
//
#ifndef LOG_NO_TIMESTAMPS
#ifndef _MSC_VER
#define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
#define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
#else
#define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
#define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
#endif
#else
#define LOG_TIMESTAMP_FMT "%s"
#define LOG_TIMESTAMP_VAL ,""
#endif
#ifdef LOG_TEE_TIMESTAMPS
#ifndef _MSC_VER
#define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
#define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
#else
#define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
#define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
#endif
#else
#define LOG_TEE_TIMESTAMP_FMT "%s"
#define LOG_TEE_TIMESTAMP_VAL ,""
#endif
// Allows disabling file/line/function prefix
// in order to disable, define LOG_NO_FILE_LINE_FUNCTION
// like so:
//
// #define LOG_NO_FILE_LINE_FUNCTION
// #include "log.h"
//
#ifndef LOG_NO_FILE_LINE_FUNCTION
#ifndef _MSC_VER
#define LOG_FLF_FMT "[%24s:%5d][%24s] "
#define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
#else
#define LOG_FLF_FMT "[%24s:%5ld][%24s] "
#define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
#endif
#else
#define LOG_FLF_FMT "%s"
#define LOG_FLF_VAL ,""
#endif
#ifdef LOG_TEE_FILE_LINE_FUNCTION
#ifndef _MSC_VER
#define LOG_TEE_FLF_FMT "[%24s:%5d][%24s] "
#define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
#else
#define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] "
#define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
#endif
#else
#define LOG_TEE_FLF_FMT "%s"
#define LOG_TEE_FLF_VAL ,""
#endif
// Utility for synchronizing log configuration state
// since std::optional was introduced only in c++17
enum LogTriState
{
LogTriStateSame,
LogTriStateFalse,
LogTriStateTrue
};
// INTERNAL, DO NOT USE
// USE LOG() INSTEAD
//
#ifndef _MSC_VER
#define LOG_IMPL(str, ...) \
{ \
if (LOG_TARGET != nullptr) \
{ \
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
fflush(LOG_TARGET); \
} \
}
#else
#define LOG_IMPL(str, ...) \
{ \
if (LOG_TARGET != nullptr) \
{ \
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
fflush(LOG_TARGET); \
} \
}
#endif
// INTERNAL, DO NOT USE
// USE LOG_TEE() INSTEAD
//
#ifndef _MSC_VER
#define LOG_TEE_IMPL(str, ...) \
{ \
if (LOG_TARGET != nullptr) \
{ \
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
fflush(LOG_TARGET); \
} \
if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \
{ \
fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL, __VA_ARGS__); \
fflush(LOG_TEE_TARGET); \
} \
}
#else
#define LOG_TEE_IMPL(str, ...) \
{ \
if (LOG_TARGET != nullptr) \
{ \
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
fflush(LOG_TARGET); \
} \
if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \
{ \
fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL "", ##__VA_ARGS__); \
fflush(LOG_TEE_TARGET); \
} \
}
#endif
// The '\0' as a last argument, is a trick to bypass the silly
// "warning: ISO C++11 requires at least one argument for the "..." in a variadic macro"
// so we can have a single macro which can be called just like printf.
// Main LOG macro.
// behaves like printf, and supports arguments the exact same way.
//
#ifndef _MSC_VER
#define LOG(...) LOG_IMPL(__VA_ARGS__, "")
#else
#define LOG(str, ...) LOG_IMPL("%s" str, "", __VA_ARGS__, "")
#endif
// Main TEE macro.
// does the same as LOG
// and
// simultaneously writes stderr.
//
// Secondary target can be changed just like LOG_TARGET
// by defining LOG_TEE_TARGET
//
#ifndef _MSC_VER
#define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
#else
#define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "")
#endif
// LOG macro variants with auto endline.
#ifndef _MSC_VER
#define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
#define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
#else
#define LOGLN(str, ...) LOG_IMPL("%s" str, "", __VA_ARGS__, "\n")
#define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "\n")
#endif
// INTERNAL, DO NOT USE
inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr)
{
static bool _initialized{false};
static bool _disabled{(filename.empty() && target == nullptr)};
static std::string log_current_filename{filename};
static FILE *log_current_target{target};
static FILE *logfile = nullptr;
if (change)
{
if (disable == LogTriStateTrue)
{
// Disable primary target
_disabled = true;
}
// If previously disabled, only enable, and keep previous target
else if (disable == LogTriStateFalse)
{
_disabled = false;
}
// Otherwise, process the arguments
else if (log_current_filename != filename || log_current_target != target)
{
_initialized = false;
}
}
if (_disabled)
{
// Log is disabled
return nullptr;
}
if (_initialized)
{
// with fallback in case something went wrong
return logfile ? logfile : stderr;
}
// do the (re)initialization
if (target != nullptr)
{
if (logfile != nullptr && logfile != stdout && logfile != stderr)
{
fclose(logfile);
}
log_current_filename = LOG_DEFAULT_FILE_NAME;
log_current_target = target;
logfile = target;
}
else
{
if (log_current_filename != filename)
{
if (logfile != nullptr && logfile != stdout && logfile != stderr)
{
fclose(logfile);
}
}
logfile = fopen(filename.c_str(), "w");
}
if (!logfile)
{
// Verify whether the file was opened, otherwise fallback to stderr
logfile = stderr;
fprintf(stderr, "Failed to open logfile '%s' with error '%s'\n", filename.c_str(), std::strerror(errno));
fflush(stderr);
// At this point we let the init flag be to true below, and let the target fallback to stderr
// otherwise we would repeatedly fopen() which was already unsuccessful
}
_initialized = true;
return logfile ? logfile : stderr;
}
// INTERNAL, DO NOT USE
inline FILE *log_handler2_impl(bool change = false, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME)
{
return log_handler1_impl(change, disable, filename, target);
}
// Disables logs entirely at runtime.
// Makes LOG() and LOG_TEE() produce no output,
// untill enabled back.
#define log_disable() log_disable_impl()
// INTERNAL, DO NOT USE
inline FILE *log_disable_impl()
{
return log_handler1_impl(true, LogTriStateTrue);
}
// Enables logs at runtime.
#define log_enable() log_enable_impl()
// INTERNAL, DO NOT USE
inline FILE *log_enable_impl()
{
return log_handler1_impl(true, LogTriStateFalse);
}
// Sets target fir logs, either by a file name or FILE* pointer (stdout, stderr, or any valid FILE*)
#define log_set_target(target) log_set_target_impl(target)
// INTERNAL, DO NOT USE
inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, filename); }
inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, target); }
// INTERNAL, DO NOT USE
inline FILE *log_handler() { return log_handler1_impl(); }
inline void log_test()
{
log_disable();
LOG("01 Hello World to nobody, because logs are disabled!\n")
log_enable();
LOG("02 Hello World to default output, which is \"%s\" ( Yaaay, arguments! )!\n", LOG_STRINGIZE(LOG_TARGET))
LOG_TEE("03 Hello World to **both** default output and " LOG_TEE_TARGET_STRING "!\n")
log_set_target(stderr);
LOG("04 Hello World to stderr!\n")
LOG_TEE("05 Hello World TEE with double printing to stderr prevented!\n")
log_set_target(LOG_DEFAULT_FILE_NAME);
LOG("06 Hello World to default log file!\n")
log_set_target(stdout);
LOG("07 Hello World to stdout!\n")
log_set_target(LOG_DEFAULT_FILE_NAME);
LOG("08 Hello World to default log file again!\n")
log_disable();
LOG("09 Hello World _1_ into the void!\n")
log_enable();
LOG("10 Hello World back from the void ( you should not see _1_ in the log or the output )!\n")
log_disable();
log_set_target("llama.anotherlog.log");
LOG("11 Hello World _2_ to nobody, new target was selected but logs are still disabled!\n")
log_enable();
LOG("12 Hello World this time in a new file ( you should not see _2_ in the log or the output )?\n")
log_set_target("llama.yetanotherlog.log");
LOG("13 Hello World this time in yet new file?\n")
log_set_target(log_filename_generator("llama_autonamed", "log"));
LOG("14 Hello World in log with generated filename!\n")
#ifdef _MSC_VER
LOG_TEE("15 Hello msvc TEE without arguments\n")
LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test")
LOG_TEELN("17 Hello msvc TEELN without arguments\n")
LOG_TEELN("18 Hello msvc TEELN with (%d)(%s) arguments\n", 1, "test")
LOG("19 Hello msvc LOG without arguments\n")
LOG("20 Hello msvc LOG with (%d)(%s) arguments\n", 1, "test")
LOGLN("21 Hello msvc LOGLN without arguments\n")
LOGLN("22 Hello msvc LOGLN with (%d)(%s) arguments\n", 1, "test")
#endif
}
inline bool log_param_single_parse(const std::string & param)
{
if ( param == "--log-test")
{
log_test();
return true;
}
if ( param == "--log-disable")
{
log_disable();
return true;
}
if ( param == "--log-enable")
{
log_enable();
return true;
}
return false;
}
inline bool log_param_pair_parse(bool check_but_dont_parse, const std::string & param, const std::string & next = std::string())
{
if ( param == "--log-file")
{
if (!check_but_dont_parse)
{
log_set_target(log_filename_generator(next.empty() ? "unnamed" : next, "log"));
}
return true;
}
return false;
}
inline void log_print_usage()
{
printf("log options:\n");
/* format
printf(" -h, --help show this help message and exit\n");*/
/* spacing
printf("__-param----------------Description\n");*/
printf(" --log-test Run simple logging test\n");
printf(" --log-disable Disable trace logs\n");
printf(" --log-enable Enable trace logs\n");
printf(" --log-file Specify a log filename (without extension)\n");
printf(" Log file will be tagged with unique ID and written as \"<name>.<ID>.log\"\n"); /* */
}
#define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv)
// INTERNAL, DO NOT USE
inline void log_dump_cmdline_impl(int argc, char **argv)
{
std::stringstream buf;
for (int i = 0; i < argc; ++i)
{
if (std::string(argv[i]).find(' ') != std::string::npos)
{
buf << " \"" << argv[i] <<"\"";
}
else
{
buf << " " << argv[i];
}
}
LOGLN("Cmd:%s", buf.str().c_str())
}
#define log_tostr(var) log_var_to_string_impl(var).c_str()
inline std::string log_var_to_string_impl(bool var)
{
return var ? "true" : "false";
}
inline std::string log_var_to_string_impl(std::string var)
{
return var;
}
inline std::string log_var_to_string_impl(const std::vector<int> & var)
{
std::stringstream buf;
buf << "[ ";
bool first = true;
for (auto e : var)
{
if (first)
{
first = false;
}
else
{
buf << ", ";
}
buf << std::to_string(e);
}
buf << " ]";
return buf.str();
}
#define LOG_TOKENS_TOSTR_PRETTY(ctx, tokens) \
[&tokens, &ctx]() \
{ \
std::stringstream buf; \
buf << "[ "; \
\
bool first = true; \
for (const auto &token : tokens) \
{ \
if (!first) \
buf << ", "; \
else \
first = false; \
\
auto detokenized = llama_token_to_piece(ctx, token); \
\
detokenized.erase( \
std::remove_if( \
detokenized.begin(), \
detokenized.end(), \
[](const unsigned char c) { return !std::isprint(c); }), \
detokenized.end()); \
\
buf \
<< "'" << detokenized << "'" \
<< ":" << std::to_string(token); \
} \
buf << " ]"; \
\
return buf.str(); \
}() \
.c_str()
#ifdef LOG_DISABLE_LOGS
#undef LOG
#define LOG(...) // dummy stub
#undef LOGLN
#define LOGLN(...) // dummy stub
#undef LOG_TEE
#define LOG_TEE(...) fprintf(stderr, __VA_ARGS__); // convert to normal fprintf
#undef LOG_TEELN
#define LOG_TEELN(...) fprintf(stderr, __VA_ARGS__); // convert to normal fprintf
#undef LOG_DISABLE
#define LOG_DISABLE() // dummy stub
#undef LOG_ENABLE
#define LOG_ENABLE() // dummy stub
#undef LOG_ENABLE
#define LOG_ENABLE() // dummy stub
#undef LOG_SET_TARGET
#define LOG_SET_TARGET(...) // dummy stub
#undef LOG_DUMP_CMDLINE
#define LOG_DUMP_CMDLINE(...) // dummy stub
#endif // LOG_DISABLE_LOGS

View File

@@ -1,304 +0,0 @@
#!/usr/bin/env python3
# HF baichuan --> gguf conversion
from __future__ import annotations
import argparse
import json
import os
import struct
import sys
from pathlib import Path
from typing import TYPE_CHECKING, Any
import itertools
import gguf
import numpy as np
import torch
from sentencepiece import SentencePieceProcessor # type: ignore[import]
if TYPE_CHECKING:
from typing import TypeAlias
NDArray: TypeAlias = 'np.ndarray[Any, Any]'
# reverse HF permute back to original pth layout
def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: int | None = None) -> NDArray:
if n_kv_head is not None and n_head != n_kv_head:
n_head //= n_kv_head
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
.swapaxes(1, 2)
.reshape(weights.shape))
def reverse_hf_permute_part(weights: NDArray, n_part: int, n_head: int, n_head_kv: int| None = None) -> NDArray:
r = weights.shape[0] // 3
return (reverse_hf_permute(weights[r * n_part : r * n_part + r, ...], n_head, n_head_kv))
def reverse_hf_part(weights: NDArray, n_part: int) -> NDArray:
r = weights.shape[0] // 3
return weights[r * n_part : r * n_part + r, ...]
def count_model_parts(dir_model: str) -> int:
num_parts = 0
for filename in os.listdir(dir_model):
if filename.startswith("pytorch_model-"):
num_parts += 1
if num_parts > 0:
print("gguf: found " + str(num_parts) + " model parts")
return num_parts
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Convert a HuggingFace LLaMA model to a GGML compatible file")
parser.add_argument(
"--vocab-only", action="store_true",
help="extract only the vocab",
)
parser.add_argument(
"--outfile", type=Path,
help="path to write to; default: based on input",
)
parser.add_argument(
"model", type=Path,
help="directory containing model file, or model file itself (*.bin)",
)
parser.add_argument(
"ftype", type=int, choices=[0, 1], default=1, nargs='?',
help="output format - use 0 for float32, 1 for float16",
)
return parser.parse_args()
args = parse_args()
dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
print(f'Error: {args.model} is not a directory', file = sys.stderr)
sys.exit(1)
# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16
# map from ftype to string
ftype_str = ["f32", "f16"]
if args.outfile is not None:
fname_out = args.outfile
else:
# output in the same directory as the model by default
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
print("gguf: loading model "+dir_model.name)
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
print("hello print: ",hparams["architectures"][0])
if hparams["architectures"][0] != "BaichuanForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit()
# get number of model parts
num_parts = count_model_parts(dir_model)
print(f"num_parts:{num_parts}\n")
ARCH=gguf.MODEL_ARCH.BAICHUAN
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
print("gguf: get model metadata")
block_count = hparams["num_hidden_layers"]
head_count = hparams["num_attention_heads"]
if "num_key_value_heads" in hparams:
head_count_kv = hparams["num_key_value_heads"]
else:
head_count_kv = head_count
if "_name_or_path" in hparams:
hf_repo = hparams["_name_or_path"]
else:
hf_repo = ""
if "max_sequence_length" in hparams:
ctx_length = hparams["max_sequence_length"]
elif "max_position_embeddings" in hparams:
ctx_length = hparams["max_position_embeddings"]
elif "model_max_length" in hparams:
ctx_length = hparams["model_max_length"]
else:
print("gguf: can not find ctx length parameter.")
sys.exit()
gguf_writer.add_name(dir_model.name)
gguf_writer.add_source_hf_repo(hf_repo)
gguf_writer.add_tensor_data_layout("Meta AI original pth")
gguf_writer.add_context_length(ctx_length)
gguf_writer.add_embedding_length(hparams["hidden_size"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
gguf_writer.add_head_count(head_count)
gguf_writer.add_head_count_kv(head_count_kv)
gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
if "type" in hparams["rope_scaling"]:
if hparams["rope_scaling"]["type"] == "linear":
gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
# TOKENIZATION
print("gguf: get tokenizer metadata")
tokens: list[bytes] = []
scores: list[float] = []
toktypes: list[int] = []
tokenizer_model_file = dir_model / 'tokenizer.model'
if not tokenizer_model_file.is_file():
print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr)
sys.exit(1)
# vocab type sentencepiece
print("gguf: get sentencepiece tokenizer vocab, scores and token types")
tokenizer = SentencePieceProcessor(str(tokenizer_model_file))
for i in range(tokenizer.vocab_size()):
text: bytes
score: float
piece = tokenizer.id_to_piece(i)
text = piece.encode("utf-8")
score = tokenizer.get_score(i)
toktype = 1 # defualt to normal token type
if tokenizer.is_unknown(i):
toktype = 2
if tokenizer.is_control(i):
toktype = 3
# toktype = 4 is user-defined = tokens from added_tokens.json
if tokenizer.is_unused(i):
toktype = 5
if tokenizer.is_byte(i):
toktype = 6
tokens.append(text)
scores.append(score)
toktypes.append(toktype)
added_tokens_file = dir_model / 'added_tokens.json'
if added_tokens_file.is_file():
with open(added_tokens_file, "r", encoding="utf-8") as f:
addtokens_json = json.load(f)
print("gguf: get added tokens")
for key in addtokens_json:
tokens.append( key.encode("utf-8") )
scores.append(-1000.0)
toktypes.append(4) # user-defined token type
gguf_writer.add_tokenizer_model("llama")
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model)
special_vocab.add_to_gguf(gguf_writer)
# TENSORS
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
# tensor info
print("gguf: get tensor metadata")
if num_parts == 0:
part_names = iter(("pytorch_model.bin",))
else:
part_names = (
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
)
for part_name in part_names:
if args.vocab_only:
break
print("gguf: loading model part '" + part_name + "'")
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
tmp=model_part
for i in range(block_count):
if f"model.layers.{i}.self_attn.W_pack.weight" in model_part:
print(f"Unpacking and permuting layer {i}")
tmp[f"model.layers.{i}.self_attn.q_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],0,head_count,head_count)
tmp[f"model.layers.{i}.self_attn.k_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],1,head_count,head_count_kv)
tmp[f"model.layers.{i}.self_attn.v_proj.weight"]=reverse_hf_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],2)
del tmp[f"model.layers.{i}.self_attn.W_pack.weight"]
for name in model_part.keys():
data = model_part[name]
# we don't need these
if name.endswith(".rotary_emb.inv_freq"):
continue
old_dtype = data.dtype
# convert any unsupported data types to float32
if data.dtype != torch.float16 and data.dtype != torch.float32:
data = data.to(torch.float32)
data = data.squeeze().numpy()
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
if new_name is None:
print("Can not map tensor '" + name + "'")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(name + " -> " + new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
gguf_writer.add_tensor(new_name, data)
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
print("gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
print(f"gguf: model successfully exported to '{fname_out}'")
print("")

View File

@@ -1,281 +0,0 @@
#!/usr/bin/env python3
# HF falcon--> gguf conversion
from __future__ import annotations
import argparse
import json
import os
import struct
import sys
from pathlib import Path
from typing import Any
import numpy as np
import torch
from transformers import AutoTokenizer # type: ignore[import]
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf
def bytes_to_unicode():
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
"""
Returns list of utf-8 byte and a corresponding list of unicode strings.
The reversible bpe codes work on unicode strings.
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
This is a significant percentage of your normal, say, 32K bpe vocab.
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
And avoids mapping to whitespace/control characters the bpe code barfs on.
"""
bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8+n)
n += 1
return dict(zip(bs, (chr(n) for n in cs)))
def count_model_parts(dir_model: Path) -> int:
num_parts = 0
for filename in os.listdir(dir_model):
if filename.startswith("pytorch_model-"):
num_parts += 1
if num_parts > 0:
print("gguf: found " + str(num_parts) + " model parts")
return num_parts
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Convert a Falcon model to a GGML compatible file")
parser.add_argument(
"--vocab-only", action="store_true",
help="extract only the vocab",
)
parser.add_argument(
"--outfile", type=Path,
help="path to write to; default: based on input",
)
parser.add_argument(
"model", type=Path,
help="directory containing model file, or model file itself (*.bin)",
)
parser.add_argument(
"ftype", type=int, choices=[0, 1], default=1, nargs='?',
help="output format - use 0 for float32, 1 for float16",
)
return parser.parse_args()
args = parse_args()
dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
print(f'Error: {args.model} is not a directory', file = sys.stderr)
sys.exit(1)
# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16
# map from ftype to string
ftype_str = ["f32", "f16"]
if args.outfile is not None:
fname_out = args.outfile
else:
# output in the same directory as the model by default
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
print("gguf: loading model "+dir_model.name)
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
if hparams["architectures"][0] != "RWForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit(1)
# get number of model parts
num_parts = count_model_parts(dir_model)
ARCH=gguf.MODEL_ARCH.FALCON
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
print("gguf: get model metadata")
block_count = hparams["n_layer"]
gguf_writer.add_name("Falcon")
gguf_writer.add_context_length(2048) # not in config.json
gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
gguf_writer.add_embedding_length(hparams["hidden_size"])
gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_head_count(hparams["n_head"])
if "n_head_kv" in hparams:
gguf_writer.add_head_count_kv(hparams["n_head_kv"])
else:
gguf_writer.add_head_count_kv(1)
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
gguf_writer.add_file_type(ftype)
# TOKENIZATION
print("gguf: get tokenizer metadata")
tokens: list[bytearray] = []
scores: list[float] = []
toktypes: list[int] = []
tokenizer_json_file = dir_model / 'tokenizer.json'
if not tokenizer_json_file.is_file():
print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
sys.exit(1)
# gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2")
with open(tokenizer_json_file, "r", encoding="utf-8") as f:
tokenizer_json = json.load(f)
print("gguf: get gpt2 tokenizer vocab")
# The number of tokens in tokenizer.json can differ from the expected vocab size.
# This causes downstream issues with mismatched tensor sizes when running the inference
vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
byte_encoder = bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}
for i in range(vocab_size):
if i in reverse_vocab:
try:
text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
except KeyError:
text = bytearray()
for c in reverse_vocab[i]:
if ord(c) < 256: # single byte character
text.append(byte_decoder[ord(c)])
else: # multibyte special token character
text.extend(c.encode('utf-8'))
else:
print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
pad_token = f"[PAD{i}]".encode("utf8")
text = bytearray(pad_token)
tokens.append(text)
scores.append(0.0) # dymmy
toktypes.append(gguf.TokenType.NORMAL) # dummy
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
special_vocab.add_to_gguf(gguf_writer)
# TENSORS
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
# params for qkv transform
n_head = hparams["n_head"]
n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
head_dim = hparams["hidden_size"] // n_head
# tensor info
print("gguf: get tensor metadata")
if num_parts == 0:
part_names = iter(("pytorch_model.bin",))
else:
part_names = (
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
)
for part_name in part_names:
if args.vocab_only:
break
print("gguf: loading model part '" + part_name + "'")
model_part = torch.load(dir_model / part_name, map_location="cpu")
for name in model_part.keys():
data = model_part[name]
old_dtype = data.dtype
# convert any unsupported data types to float32
if data.dtype != torch.float16 and data.dtype != torch.float32:
data = data.to(torch.float32)
# QKV tensor transform
# The original query_key_value tensor contains n_head_kv "kv groups",
# each consisting of n_head/n_head_kv query weights followed by one key
# and one value weight (shared by all query heads in the kv group).
# This layout makes it a big pain to work with in GGML.
# So we rearrange them here,, so that we have n_head query weights
# followed by n_head_kv key weights followed by n_head_kv value weights,
# in contiguous fashion.
# ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
if "query_key_value" in name:
qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head)
k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
data = torch.cat((q,k,v)).reshape_as(data)
data = data.squeeze().numpy()
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
if new_name is None:
print("Can not map tensor '" + name + "'")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
gguf_writer.add_tensor(new_name, data)
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
print("gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
print(f"gguf: model successfully exported to '{fname_out}'")
print("")

View File

@@ -1,251 +0,0 @@
#!/usr/bin/env python3
# HF gptneox--> gguf conversion
from __future__ import annotations
import argparse
import json
import os
import struct
import sys
from pathlib import Path
from typing import Any
import numpy as np
import torch
from transformers import AutoTokenizer # type: ignore[import]
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
def bytes_to_unicode():
"""
Returns list of utf-8 byte and a corresponding list of unicode strings.
The reversible bpe codes work on unicode strings.
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
This is a significant percentage of your normal, say, 32K bpe vocab.
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
And avoids mapping to whitespace/control characters the bpe code barfs on.
"""
bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8+n)
n += 1
return dict(zip(bs, (chr(n) for n in cs)))
def count_model_parts(dir_model: Path) -> int:
num_parts = 0
for filename in os.listdir(dir_model):
if filename.startswith("pytorch_model-"):
num_parts += 1
if num_parts > 0:
print("gguf: found " + str(num_parts) + " model parts")
return num_parts
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Convert a GPT-NeoX model to a GGML compatible file")
parser.add_argument(
"--vocab-only", action="store_true",
help="extract only the vocab",
)
parser.add_argument(
"--outfile", type=Path,
help="path to write to; default: based on input",
)
parser.add_argument(
"model", type=Path,
help="directory containing model file, or model file itself (*.bin)",
)
parser.add_argument(
"ftype", type=int, choices=[0, 1], default=1, nargs='?',
help="output format - use 0 for float32, 1 for float16",
)
return parser.parse_args()
args = parse_args()
dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
print(f'Error: {args.model} is not a directory', file = sys.stderr)
sys.exit(1)
# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16
# map from ftype to string
ftype_str = ["f32", "f16"]
if args.outfile is not None:
fname_out = args.outfile
else:
# output in the same directory as the model by default
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
print("gguf: loading model "+dir_model.name)
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
if hparams["architectures"][0] != "GPTNeoXForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit()
# get number of model parts
num_parts = count_model_parts(dir_model)
ARCH=gguf.MODEL_ARCH.GPTNEOX
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
print("gguf: get model metadata")
block_count = hparams["num_hidden_layers"]
gguf_writer.add_name(dir_model.name)
gguf_writer.add_context_length(hparams["max_position_embeddings"])
gguf_writer.add_embedding_length(hparams["hidden_size"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
gguf_writer.add_rope_dimension_count(int(hparams["rotary_pct"]*(hparams["hidden_size"]//hparams["num_attention_heads"])))
gguf_writer.add_head_count(hparams["num_attention_heads"])
gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
gguf_writer.add_layer_norm_eps(hparams["layer_norm_eps"])
# TOKENIZATION
print("gguf: get tokenizer metadata")
tokens: list[bytearray] = []
tokenizer_json_file = dir_model / 'tokenizer.json'
if not tokenizer_json_file.is_file():
print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
sys.exit(1)
# gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2")
with open(tokenizer_json_file, "r", encoding="utf-8") as f:
tokenizer_json = json.load(f)
print("gguf: get gpt2 tokenizer vocab")
vocab_size = len(tokenizer_json["model"]["vocab"])
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
byte_encoder = bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}
for i in range(vocab_size):
if i in reverse_vocab:
try:
text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
except KeyError:
text = bytearray()
for c in reverse_vocab[i]:
if ord(c) < 256: # single byte character
text.append(byte_decoder[ord(c)])
else: # multibyte special token character
text.extend(c.encode('utf-8'))
else:
print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
pad_token = f"[PAD{i}]".encode("utf8")
text = bytearray(pad_token)
tokens.append(text)
gguf_writer.add_token_list(tokens)
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
special_vocab.add_to_gguf(gguf_writer)
# TENSORS
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
# tensor info
print("gguf: get tensor metadata")
if num_parts == 0:
part_names = iter(("pytorch_model.bin",))
else:
part_names = (
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
)
for part_name in part_names:
if args.vocab_only:
break
print("gguf: loading model part '" + part_name + "'")
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
for name in model_part.keys():
data = model_part[name]
# we don't need these
if name.endswith(".attention.masked_bias") or name.endswith(".attention.bias") or name.endswith(".attention.rotary_emb.inv_freq"):
continue
old_dtype = data.dtype
# convert any unsupported data types to float32
if data.dtype != torch.float16 and data.dtype != torch.float32:
data = data.to(torch.float32)
data = data.squeeze().numpy()
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
if new_name is None:
print("Can not map tensor '" + name + "'")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
gguf_writer.add_tensor(new_name, data)
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
print("gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
print(f"gguf: model successfully exported to '{fname_out}'")
print("")

View File

@@ -1,451 +0,0 @@
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import math
import struct
import sys
from enum import IntEnum
from pathlib import Path
import numpy as np
import os
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf
# Note: Does not support GGML_QKK_64
QK_K = 256
# Items here are (block size, type size)
GGML_QUANT_SIZES = {
gguf.GGMLQuantizationType.F32 : (1, 4),
gguf.GGMLQuantizationType.F16 : (1, 2),
gguf.GGMLQuantizationType.Q4_0 : (32, 2 + 16),
gguf.GGMLQuantizationType.Q4_1 : (32, 2 + 2 + 16),
gguf.GGMLQuantizationType.Q5_0 : (32, 2 + 4 + 16),
gguf.GGMLQuantizationType.Q5_1 : (32, 2 + 2 + 4 + 16),
gguf.GGMLQuantizationType.Q8_0 : (32, 2 + 32),
gguf.GGMLQuantizationType.Q8_1 : (32, 4 + 4 + 32),
gguf.GGMLQuantizationType.Q2_K : (256, 2 + 2 + QK_K // 16 + QK_K // 4),
gguf.GGMLQuantizationType.Q3_K : (256, 2 + QK_K // 4 + QK_K // 8 + 12),
gguf.GGMLQuantizationType.Q4_K : (256, 2 + 2 + QK_K // 2 + 12),
gguf.GGMLQuantizationType.Q5_K : (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12),
gguf.GGMLQuantizationType.Q6_K : (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16),
gguf.GGMLQuantizationType.Q8_K : (256, 4 + QK_K + QK_K // 8),
}
class GGMLFormat(IntEnum):
GGML = 0
GGMF = 1
GGJT = 2
class GGMLFType(IntEnum):
ALL_F32 = 0
MOSTLY_F16 = 1
MOSTLY_Q4_0 = 2
MOSTLY_Q4_1 = 3
MOSTLY_Q4_1_SOME_F16 = 4
MOSTLY_Q8_0 = 7
MOSTLY_Q5_0 = 8
MOSTLY_Q5_1 = 9
MOSTLY_Q2_K = 10
MOSTLY_Q3_K_S = 11
MOSTLY_Q3_K_M = 12
MOSTLY_Q3_K_L = 13
MOSTLY_Q4_K_S = 14
MOSTLY_Q4_K_M = 15
MOSTLY_Q5_K_S = 16
MOSTLY_Q5_K_M = 17
MOSTLY_Q6_K = 18
class Hyperparameters:
def __init__(self):
self.n_vocab = self.n_embd = self.n_mult = self.n_head = 0
self.n_layer = self.n_rot = self.n_ff = 0
self.ftype = GGMLFType.ALL_F32
def set_n_ff(self, model):
ff_tensor_idx = model.tensor_map.get(b'layers.0.feed_forward.w1.weight')
assert ff_tensor_idx is not None, 'Missing layer 0 FF tensor'
ff_tensor = model.tensors[ff_tensor_idx]
self.n_ff = ff_tensor.dims[1]
def load(self, data, offset):
(
self.n_vocab,
self.n_embd,
self.n_mult,
self.n_head,
self.n_layer,
self.n_rot,
ftype,
) = struct.unpack('<7I', data[offset:offset + (4 * 7)])
try:
self.ftype = GGMLFType(ftype)
except ValueError:
raise ValueError(f'Invalid ftype {ftype}')
return 4 * 7
def __str__(self):
return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype.name}>'
class Vocab:
def __init__(self, load_scores = True):
self.items = []
self.load_scores = load_scores
def load(self, data, offset, n_vocab):
orig_offset = offset
for _ in range(n_vocab):
itemlen = struct.unpack('<I', data[offset:offset + 4])[0]
assert itemlen < 4096, 'Absurd vocab item length'
offset += 4
item_text = bytes(data[offset:offset + itemlen])
offset += itemlen
if self.load_scores:
item_score = struct.unpack('<f', data[offset:offset + 4])[0]
offset += 4
else:
item_score = 0.0
self.items.append((item_text, item_score))
return offset - orig_offset
class Tensor:
def __init__(self, use_padding = True):
self.name = None
self.dims: tuple[int, ...] = ()
self.dtype = None
self.start_offset = 0
self.len_bytes = np.int64(0)
self.use_padding = use_padding
def load(self, data, offset):
orig_offset = offset
(n_dims, name_len, dtype) = struct.unpack('<3I', data[offset:offset + 12])
assert n_dims >= 0 and n_dims <= 4, f'Invalid tensor dimensions {n_dims}'
assert name_len < 4096, 'Absurd tensor name length'
quant = GGML_QUANT_SIZES.get(dtype)
assert quant is not None, 'Unknown tensor type'
(blksize, tysize) = quant
offset += 12
self.dtype= dtype
self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)])
offset += 4 * n_dims
self.name = bytes(data[offset:offset + name_len])
offset += name_len
pad = ((offset + 31) & ~31) - offset if self.use_padding else 0
offset += pad
n_elems = np.prod(self.dims)
n_bytes = np.int64(np.int64(n_elems) * np.int64(tysize)) // np.int64(blksize)
self.start_offset = offset
self.len_bytes = n_bytes
offset += n_bytes
# print(n_dims, name_len, dtype, self.dims, self.name, pad)
return offset - orig_offset
class GGMLModel:
def __init__(self):
self.hyperparameters = None
self.vocab = None
self.tensor_map = {}
self.tensors = []
def validate_header(self, data, offset):
magic = bytes(data[offset:offset + 4])
if magic == b'GGUF':
raise ValueError('File is already in GGUF format.')
if magic == b'lmgg':
self.file_format = GGMLFormat.GGML
self.format_version = 1
return 4
version = struct.unpack('<I', data[offset + 4:offset + 8])[0]
if magic == b'fmgg':
if version != 1:
raise ValueError(f'Cannot handle unexpected GGMF file version {version}')
self.file_format = GGMLFormat.GGMF
self.format_version = version
return 8
if magic == b'tjgg':
if version < 1 or version > 3:
raise ValueError(f'Cannot handle unexpected GGJT file version {version}')
self.file_format = GGMLFormat.GGJT
self.format_version = version
return 8
raise ValueError(f"Unexpected file magic {magic!r}! This doesn't look like a GGML format file.")
def validate_conversion(self, ftype):
err = ''
if (self.file_format < GGMLFormat.GGJT or self.format_version < 2):
if ftype not in (GGMLFType.ALL_F32, GGMLFType.MOSTLY_F16):
err = 'Quantizations changed in GGJTv2. Can only convert unquantized GGML files older than GGJTv2.'
elif (self.file_format == GGMLFormat.GGJT and self.format_version == 2):
if ftype in ( GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1,
GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0):
err = 'Q4 and Q8 quantizations changed in GGJTv3.'
if len(err) > 0:
raise ValueError(f'{err} Sorry, your {self.file_format.name}v{self.format_version} file of type {ftype.name} is not eligible for conversion.')
def load(self, data, offset):
offset += self.validate_header(data, offset)
hp = Hyperparameters()
offset += hp.load(data, offset)
print(f'* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}')
self.validate_conversion(hp.ftype)
vocab = Vocab(load_scores = self.file_format > GGMLFormat.GGML)
offset += vocab.load(data, offset, hp.n_vocab)
tensors: list[Tensor] = []
tensor_map = {}
while offset < len(data):
tensor = Tensor(use_padding = self.file_format > GGMLFormat.GGMF)
offset += tensor.load(data, offset)
tensor_map[tensor.name] = len(tensors)
tensors.append(tensor)
self.hyperparameters = hp
self.vocab = vocab
self.tensors = tensors
self.tensor_map = tensor_map
hp.set_n_ff(self)
return offset
class GGMLToGGUF:
def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None, special_vocab = None):
hp = ggml_model.hyperparameters
self.model = ggml_model
self.data = data
self.cfg = cfg
self.params_override = params_override
self.vocab_override = vocab_override
self.special_vocab = special_vocab
if params_override is not None:
n_kv_head = params_override.n_head_kv
else:
if cfg.gqa == 1:
n_kv_head = hp.n_head
else:
gqa = float(cfg.gqa)
n_kv_head = None
for x in range(1, 256):
if float(hp.n_head) / float(x) == gqa:
n_kv_head = x
assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param"
print(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}')
self.n_kv_head = n_kv_head
self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer)
def save(self):
print('* Preparing to save GGUF file')
gguf_writer = gguf.GGUFWriter(
self.cfg.output,
gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA],
use_temp_file = False )
self.add_params(gguf_writer)
self.add_vocab(gguf_writer)
if self.special_vocab is not None:
self.special_vocab.add_to_gguf(gguf_writer)
self.add_tensors(gguf_writer)
print(" gguf: write header")
gguf_writer.write_header_to_file()
print(" gguf: write metadata")
gguf_writer.write_kv_data_to_file()
print(" gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
def add_params(self, gguf_writer):
hp = self.model.hyperparameters
cfg = self.cfg
if cfg.desc is not None:
desc = cfg.desc
else:
desc = f'converted from legacy {self.model.file_format.name}v{self.model.format_version} {hp.ftype.name} format'
try:
# Filenames aren't necessarily valid UTF8.
name = cfg.name if cfg.name is not None else cfg.input.name
except UnicodeDecodeError:
name = None
print('* Adding model parameters and KV items')
if name is not None:
gguf_writer.add_name(name)
gguf_writer.add_description(desc)
gguf_writer.add_file_type(int(hp.ftype))
if self.params_override is not None:
po = self.params_override
assert po.n_embd == hp.n_embd, 'Model hyperparams mismatch'
assert po.n_layer == hp.n_layer, 'Model hyperparams mismatch'
assert po.n_head == hp.n_head, 'Model hyperparams mismatch'
gguf_writer.add_context_length (po.n_ctx)
gguf_writer.add_embedding_length (po.n_embd)
gguf_writer.add_block_count (po.n_layer)
gguf_writer.add_feed_forward_length (po.n_ff)
gguf_writer.add_rope_dimension_count(po.n_embd // po.n_head)
gguf_writer.add_head_count (po.n_head)
gguf_writer.add_head_count_kv (po.n_head_kv)
gguf_writer.add_layer_norm_rms_eps (po.f_norm_eps)
return
gguf_writer.add_context_length(cfg.context_length)
gguf_writer.add_embedding_length(hp.n_embd)
gguf_writer.add_block_count(hp.n_layer)
gguf_writer.add_feed_forward_length(hp.n_ff)
gguf_writer.add_rope_dimension_count(hp.n_embd // hp.n_head)
gguf_writer.add_head_count(hp.n_head)
gguf_writer.add_head_count_kv(self.n_kv_head)
gguf_writer.add_layer_norm_rms_eps(float(cfg.eps))
def add_vocab(self, gguf_writer):
hp = self.model.hyperparameters
gguf_writer.add_tokenizer_model('llama')
tokens = []
scores = []
toktypes = []
if self.vocab_override is not None:
vo = self.vocab_override
print('* Adding vocab item(s)')
for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
tokens.append(vbytes)
scores.append(score)
toktypes.append(ttype)
assert len(tokens) == hp.n_vocab, \
f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}'
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
if len(toktypes) > 0:
gguf_writer.add_token_types(toktypes)
return
print(f'* Adding {hp.n_vocab} vocab item(s)')
assert len(self.model.vocab.items) >= 3, 'Cannot handle unexpectedly short model vocab'
for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items):
tt = 1 # Normal
# Special handling for UNK, BOS, EOS tokens.
if tokid <= 2:
if tokid == 0:
vbytes = b'<unk>'
tt = 2
elif tokid == 1:
vbytes = b'<s>'
tt = 3
else:
vbytes = b'</s>'
tt = 3
elif len(vbytes) == 0:
tt = 3 # Control
elif tokid >= 3 and tokid <= 258 and len(vbytes) == 1:
vbytes = bytes(f'<0x{vbytes[0]:02X}>', encoding = 'UTF-8')
tt = 6 # Byte
else:
vbytes = vbytes.replace(b' ', b'\xe2\x96\x81')
toktypes.append(tt)
tokens.append(vbytes)
scores.append(vscore)
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
gguf_writer.add_unk_token_id(0)
gguf_writer.add_bos_token_id(1)
gguf_writer.add_eos_token_id(2)
def add_tensors(self, gguf_writer):
tensor_map = self.name_map
data = self.data
print(f'* Adding {len(self.model.tensors)} tensor(s)')
for tensor in self.model.tensors:
name = str(tensor.name, 'UTF-8')
mapped_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
assert mapped_name is not None, f'Bad name {name}'
tempdims = list(tensor.dims[:])
if len(tempdims) > 1:
temp = tempdims[1]
tempdims[1] = tempdims[0]
tempdims[0] = temp
# print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}')
gguf_writer.add_tensor(
mapped_name,
data[tensor.start_offset:tensor.start_offset + tensor.len_bytes],
raw_shape = tempdims,
raw_dtype = tensor.dtype )
def handle_metadata(cfg, hp):
import convert
assert cfg.model_metadata_dir.is_dir(), 'Metadata dir is not a directory'
hf_config_path = cfg.model_metadata_dir / "config.json"
orig_config_path = cfg.model_metadata_dir / "params.json"
# We pass a fake model here. "original" mode will check the shapes of some
# tensors if information is missing in the .json file: other than that, the
# model data isn't used so this should be safe (at least for now).
fakemodel = {
'tok_embeddings.weight': convert.LazyTensor.__new__(convert.LazyTensor),
'layers.0.feed_forward.w1.weight': convert.LazyTensor.__new__(convert.LazyTensor),
}
fakemodel['tok_embeddings.weight'].shape = [hp.n_vocab]
fakemodel['layers.0.feed_forward.w1.weight'].shape = [hp.n_ff]
if hf_config_path.exists():
params = convert.Params.loadHFTransformerJson(fakemodel, hf_config_path)
elif orig_config_path.exists():
params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
else:
raise ValueError('Unable to load metadata')
vocab = convert.load_vocab(
cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir,
cfg.vocabtype )
# FIXME: Respect cfg.vocab_dir?
svocab = gguf.SpecialVocab(cfg.model_metadata_dir)
convert.check_vocab_size(params, vocab)
return (params, vocab, svocab)
def handle_args():
parser = argparse.ArgumentParser(description = 'Convert GGML models to GGUF')
parser.add_argument('--input', '-i', type = Path, required = True,
help = 'Input GGMLv3 filename')
parser.add_argument('--output', '-o', type = Path, required = True,
help ='Output GGUF filename')
parser.add_argument('--name',
help = 'Set model name')
parser.add_argument('--desc',
help = 'Set model description')
parser.add_argument('--gqa', type = int, default = 1,
help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
parser.add_argument('--eps', default = '5.0e-06',
help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
parser.add_argument('--context-length', '-c', type=int, default = 2048,
help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
parser.add_argument('--model-metadata-dir', '-m', type = Path,
help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
parser.add_argument("--vocab-dir", type=Path,
help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
parser.add_argument("--vocabtype", choices=["spm", "bpe"], default="spm",
help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)")
return parser.parse_args()
def main():
cfg = handle_args()
print(f'* Using config: {cfg}')
print('\n=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===\n')
if cfg.model_metadata_dir is None and (cfg.gqa == 1 or cfg.eps == '5.0e-06'):
print('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".')
data = np.memmap(cfg.input, mode = 'r')
model = GGMLModel()
print('* Scanning GGML input file')
offset = model.load(data, 0)
print(f'* GGML model hyperparameters: {model.hyperparameters}')
vocab_override = None
params_override = None
special_vocab = None
if cfg.model_metadata_dir is not None:
(params_override, vocab_override, special_vocab) = handle_metadata(cfg, model.hyperparameters)
print('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.')
print(f'* Overriding params: {params_override}')
print(f'* Overriding vocab: {vocab_override}')
print(f'* Special vocab: {special_vocab}')
else:
print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
if model.file_format == GGMLFormat.GGML:
print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
converter = GGMLToGGUF(model, data, cfg,
params_override = params_override,
vocab_override = vocab_override,
special_vocab = special_vocab )
converter.save()
print(f'* Successful completion. Output saved to: {cfg.output}')
if __name__ == '__main__':
main()

53
convert-lora-to-ggml.py Executable file → Normal file
View File

@@ -1,29 +1,27 @@
#!/usr/bin/env python3
from __future__ import annotations
import json
import os
import re
import struct
import sys
from typing import Any, BinaryIO, Sequence
from typing import Any, Dict, Sequence, TextIO
import numpy as np
import torch
NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}
from convert import DATA_TYPE_TO_FTYPE, NUMPY_TYPE_TO_DATA_TYPE, DataType
HF_SUBLAYER_TO_GGML = {
"self_attn.q_proj": "attn_q",
"self_attn.k_proj": "attn_k",
"self_attn.v_proj": "attn_v",
"self_attn.o_proj": "attn_output",
"mlp.gate_proj": "ffn_gate",
"mlp.down_proj": "ffn_down",
"mlp.up_proj": "ffn_up",
"input_layernorm": "attn_norm",
"self_attn.q_proj": "attention.wq",
"self_attn.k_proj": "attention.wk",
"self_attn.v_proj": "attention.wv",
"self_attn.o_proj": "attention.wo",
"mlp.gate_proj": "feed_forward.w1",
"mlp.down_proj": "feed_forward.w2",
"mlp.up_proj": "feed_forward.w3",
"input_layernorm": "attention_norm",
"post_attention_layernorm": "ffn_norm",
# "norm": "norm",
# "embed_tokens": "tok_embeddings",
# "lm_head": "output",
}
@@ -40,7 +38,7 @@ def translate_tensor_name(t: str) -> str:
sys.exit(1)
output_string = (
f"blk.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}"
f"layers.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}"
)
return output_string
else:
@@ -48,21 +46,14 @@ def translate_tensor_name(t: str) -> str:
sys.exit(1)
def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
def write_file_header(fout: TextIO, params: Dict[str, Any]) -> None:
fout.write(b"ggla"[::-1]) # magic (ggml lora)
fout.write(struct.pack("i", 1)) # file version
fout.write(struct.pack("i", params["r"]))
# https://opendelta.readthedocs.io/en/latest/modules/deltas.html says that `lora_alpha` is an int
# but some models ship a float value instead
# let's convert to int, but fail if lossless conversion is not possible
assert (
int(params["lora_alpha"]) == params["lora_alpha"]
), "cannot convert float to int losslessly"
fout.write(struct.pack("i", int(params["lora_alpha"])))
fout.write(struct.pack("ii", params["r"], params["lora_alpha"]))
def write_tensor_header(
self, name: str, shape: Sequence[int], data_type: np.dtype[Any]
self, name: str, shape: Sequence[int], data_type: DataType
) -> None:
sname = name.encode("utf-8")
fout.write(
@@ -70,7 +61,7 @@ def write_tensor_header(
"iii",
len(shape),
len(sname),
NUMPY_TYPE_TO_FTYPE[data_type.name],
DATA_TYPE_TO_FTYPE[NUMPY_TYPE_TO_DATA_TYPE[data_type]],
)
)
fout.write(struct.pack("i" * len(shape), *shape[::-1]))
@@ -98,7 +89,7 @@ if params["peft_type"] != "LORA":
print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
sys.exit(1)
if params["fan_in_fan_out"] is True:
if params["fan_in_fan_out"] == True:
print("Error: param fan_in_fan_out is not supported")
sys.exit(1)
@@ -117,10 +108,6 @@ with open(output_path, "wb") as fout:
write_file_header(fout, params)
for k, v in model.items():
if k.endswith(".default.weight"):
k = k.replace(".default.weight", ".weight")
if k in ["llama_proj.weight", "llama_proj.bias"]:
continue
if k.endswith("lora_A.weight"):
if v.dtype != torch.float16 and v.dtype != torch.float32:
v = v.float()
@@ -128,7 +115,7 @@ with open(output_path, "wb") as fout:
else:
v = v.float()
t = v.detach().numpy()
t = v.numpy()
tname = translate_tensor_name(k)
print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
write_tensor_header(fout, tname, t.shape, t.dtype)

11
convert-pth-to-ggml.py Normal file
View File

@@ -0,0 +1,11 @@
# Compatibility stub
import argparse
import convert
parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
parser.add_argument('dir_model', help='directory containing the model checkpoint')
parser.add_argument('ftype', help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1)
args = parser.parse_args()
convert.main(['--outtype', 'f16' if args.ftype == 1 else 'f32', '--', args.dir_model])

View File

@@ -1,248 +0,0 @@
#!/usr/bin/env python3
# HF starcoder --> gguf conversion
from __future__ import annotations
import argparse
import json
import os
import struct
import sys
from pathlib import Path
from typing import Any
import numpy as np
import torch
from transformers import AutoTokenizer # type: ignore[import]
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf
def bytes_to_unicode():
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
"""
Returns list of utf-8 byte and a corresponding list of unicode strings.
The reversible bpe codes work on unicode strings.
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
This is a significant percentage of your normal, say, 32K bpe vocab.
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
And avoids mapping to whitespace/control characters the bpe code barfs on.
"""
bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8+n)
n += 1
return dict(zip(bs, (chr(n) for n in cs)))
def count_model_parts(dir_model: Path) -> int:
num_parts = 0
for filename in os.listdir(dir_model):
if filename.startswith("pytorch_model-"):
num_parts += 1
if num_parts > 0:
print("gguf: found " + str(num_parts) + " model parts")
return num_parts
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Convert a StarCoder model to a GGML compatible file")
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1)
return parser.parse_args()
args = parse_args()
dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
print(f'Error: {args.model} is not a directory', file = sys.stderr)
sys.exit(1)
# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16
# map from ftype to string
ftype_str = ["f32", "f16"]
if args.outfile is not None:
fname_out = args.outfile
else:
# output in the same directory as the model by default
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
print("gguf: loading model "+dir_model.name)
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
if hparams["architectures"][0] != "GPTBigCodeForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit(1)
# get number of model parts
num_parts = count_model_parts(dir_model)
ARCH=gguf.MODEL_ARCH.STARCODER
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
print("gguf: get model metadata")
block_count = hparams["n_layer"]
gguf_writer.add_name("StarCoder")
gguf_writer.add_context_length(hparams["n_positions"])
gguf_writer.add_embedding_length(hparams["n_embd"])
gguf_writer.add_feed_forward_length(4 * hparams["n_embd"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_head_count(hparams["n_head"])
gguf_writer.add_head_count_kv(1)
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
gguf_writer.add_file_type(ftype)
# TOKENIZATION
print("gguf: get tokenizer metadata")
tokens: list[bytearray] = []
scores: list[float] = []
toktypes: list[int] = []
tokenizer_json_file = dir_model / 'tokenizer.json'
if not tokenizer_json_file.is_file():
print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
sys.exit(1)
# gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2")
with open(tokenizer_json_file, "r", encoding="utf-8") as f:
tokenizer_json = json.load(f)
print("gguf: get gpt2 tokenizer vocab")
# The number of tokens in tokenizer.json can differ from the expected vocab size.
# This causes downstream issues with mismatched tensor sizes when running the inference
vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
byte_encoder = bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}
for i in range(vocab_size):
if i in reverse_vocab:
try:
text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
except KeyError:
text = bytearray()
for c in reverse_vocab[i]:
if ord(c) < 256: # single byte character
text.append(byte_decoder[ord(c)])
else: # multibyte special token character
text.extend(c.encode('utf-8'))
else:
print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
pad_token = f"[PAD{i}]".encode("utf8")
text = bytearray(pad_token)
tokens.append(text)
scores.append(0.0) # dymmy
toktypes.append(gguf.TokenType.NORMAL) # dummy
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
special_vocab.add_to_gguf(gguf_writer)
# TENSORS
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
# params for qkv transform
n_head = hparams["n_head"]
n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
head_dim = hparams["n_embd"] // n_head
# tensor info
print("gguf: get tensor metadata")
if num_parts == 0:
part_names = iter(("pytorch_model.bin",))
else:
part_names = (
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
)
for part_name in part_names:
if args.vocab_only:
break
print("gguf: loading model part '" + part_name + "'")
model_part = torch.load(dir_model / part_name, map_location="cpu")
for name in model_part.keys():
data = model_part[name]
old_dtype = data.dtype
# convert any unsupported data types to float32
if data.dtype != torch.float16 and data.dtype != torch.float32:
data = data.to(torch.float32)
data = data.squeeze().numpy()
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
if new_name is None:
print("Can not map tensor '" + name + "'")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))
gguf_writer.add_tensor(new_name, data)
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
print("gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
print(f"gguf: model successfully exported to '{fname_out}'")
print("")

1393
convert.py Executable file → Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -1,67 +0,0 @@
BLIS Installation Manual
------------------------
BLIS is a portable software framework for high-performance BLAS-like dense linear algebra libraries. It has received awards and recognition, including the 2023 James H. Wilkinson Prize for Numerical Software and the 2020 SIAM Activity Group on Supercomputing Best Paper Prize. BLIS provides a new BLAS-like API and a compatibility layer for traditional BLAS routine calls. It offers features such as object-based API, typed API, BLAS and CBLAS compatibility layers.
Project URL: https://github.com/flame/blis
### Prepare:
Compile BLIS:
```bash
git clone https://github.com/flame/blis
cd blis
./configure --enable-cblas -t openmp,pthreads auto
# will install to /usr/local/ by default.
make -j
```
Install BLIS:
```bash
sudo make install
```
We recommend using openmp since it's easier to modify the cores been used.
### llama.cpp compilation
Makefile:
```bash
make LLAMA_BLIS=1 -j
# make LLAMA_BLIS=1 benchmark-matmult
```
CMake:
```bash
mkdir build
cd build
cmake -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=FLAME ..
make -j
```
### llama.cpp execution
According to the BLIS documentation, we could set the following
environment variables to modify the behavior of openmp:
```
export GOMP_GPU_AFFINITY="0-19"
export BLIS_NUM_THREADS=14
```
And then run the binaries as normal.
### Intel specific issue
Some might get the error message saying that `libimf.so` cannot be found.
Please follow this [stackoverflow page](https://stackoverflow.com/questions/70687930/intel-oneapi-2022-libimf-so-no-such-file-or-directory-during-openmpi-compila).
### Reference:
1. https://github.com/flame/blis#getting-started
2. https://github.com/flame/blis/blob/master/docs/Multithreading.md

View File

@@ -1,40 +0,0 @@
# Token generation performance troubleshooting
## Verifying that the model is running on the GPU with cuBLAS
Make sure you compiled llama with the correct env variables according to [this guide](../README.md#cublas), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
```shell
./main -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
```
When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines:
```shell
llama_model_load_internal: [cublas] offloading 60 layers to GPU
llama_model_load_internal: [cublas] offloading output layer to GPU
llama_model_load_internal: [cublas] total VRAM used: 17223 MB
... rest of inference
```
If you see these lines, then the GPU is being used.
## Verifying that the CPU is not oversaturated
llama accepts a `-t N` (or `--threads N`) parameter. It's extremely important that this parameter is not too large. If your token generation is extremely slow, try setting this number to 1. If this significantly improves your token generation speed, then your CPU is being oversaturated and you need to explicitly set this parameter to the number of the physicial CPU cores on your machine (even if you utilize a GPU). If in doubt, start with 1 and double the amount until you hit a performance bottleneck, then scale the number down.
# Example of runtime flags effect on inference speed benchmark
These runs were tested on the following machine:
GPU: A6000 (48GB VRAM)
CPU: 7 physical cores
RAM: 32GB
Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.q4_0.gguf` (30B parameters, 4bit quantization, GGML)
Run command: `./main -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
Result:
| command | tokens/second (higher is better) |
| - | - |
| -ngl 2000000 | N/A (less than 0.1) |
| -t 7 | 1.7 |
| -t 1 -ngl 2000000 | 5.5 |
| -t 7 -ngl 2000000 | 8.7 |
| -t 4 -ngl 2000000 | 9.1 |

View File

@@ -6,6 +6,23 @@ find_package(Threads REQUIRED)
# ...
# common
set(TARGET common)
add_library(${TARGET} OBJECT
common.h
common.cpp
)
if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
target_include_directories(${TARGET} PUBLIC .)
target_compile_features(${TARGET} PUBLIC cxx_std_11)
target_link_libraries(${TARGET} PRIVATE llama)
# examples
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
@@ -17,20 +34,4 @@ else()
add_subdirectory(quantize-stats)
add_subdirectory(perplexity)
add_subdirectory(embedding)
add_subdirectory(save-load-state)
add_subdirectory(benchmark)
add_subdirectory(baby-llama)
add_subdirectory(train-text-from-scratch)
add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(simple)
add_subdirectory(speculative)
add_subdirectory(embd-input)
add_subdirectory(llama-bench)
add_subdirectory(beam-search)
if (LLAMA_METAL)
add_subdirectory(metal)
endif()
if (LLAMA_BUILD_SERVER)
add_subdirectory(server)
endif()
endif()

View File

@@ -2,21 +2,21 @@
set -e
AI_NAME="${AI_NAME:-Miku}"
MODEL="${MODEL:-./models/llama-2-7b-chat.ggmlv3.q4_K_M.bin}"
MODEL="${MODEL:-./models/gpt4all-7B/gpt4all-lora-unfiltered-quantized.bin}"
USER_NAME="${USER_NAME:-Anon}"
# Uncomment and adjust to the number of CPU cores you want to use.
#N_THREAD="${N_THREAD:-4}"
CTX_SIZE="${CTX_SIZE:-4096}"
N_PREDICTS="${N_PREDICTS:-4096}"
GEN_OPTIONS=(--batch_size 1024
--ctx_size "$CTX_SIZE"
--ctx_size 2048
--keep -1
--repeat_last_n 256
--repeat_penalty 1.17647
--temp 0.6
--mirostat 2)
--temp 0.7
--top_k 40
--top_p 0.5)
if [ -n "$N_THREAD" ]; then
GEN_OPTIONS+=(--threads "$N_THREAD")
@@ -24,24 +24,23 @@ fi
./main "${GEN_OPTIONS[@]}" \
--model "$MODEL" \
--in-prefix " " \
--in-suffix "${AI_NAME}:" \
--n_predict "$N_PREDICTS" \
--color --interactive \
--reverse-prompt "${USER_NAME}:" \
--prompt "This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the user's computer.
--prompt "
This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the users computer.
${AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next.
${AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct, she will ask the user for help.
${AI_NAME} is a very helpful AI and will help the user with anything they need. She is also very friendly and will try to make the user feel better if they are sad.
${AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life. She will also try to make the user like her.
The conversation is only between ${USER_NAME} and ${AI_NAME}.
${AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct she will ask the user for help.
${AI_NAME} is a very helpful AI and will help the user with anything they need, she is also very friendly and will try to make the user feel better if they are sad.
${AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life, she will also try to make the user like her.
The conversation is only between ${USER_NAME} and ${AI_NAME}
The conversation is only through text, so ${AI_NAME} can't see ${USER_NAME}'s face or hear his voice.
${AI_NAME} can only communicate through text, so she can't send images or videos.
${USER_NAME}: Hello!
${AI_NAME}: /think I wonder what I should say to ${USER_NAME}? This is the first time we talk, so it's important that I make a good first impression!
${AI_NAME}: Hi! I am ${AI_NAME}, your new AI friend, assistant (or whatever you like!), it's so nice to meet you! ^_^
${AI_NAME}: /think I wonder what I should say to ${USER_NAME}? This is the first time we talk so it's important that I make a good first impression!
${AI_NAME}: Hi! I am ${AI_NAME}, your new AI friend, assistant(or whatever you like!), it's so nice to meet you! ^_^
${AI_NAME}: I might be an AI, but I also have feelings, so please be nice to me! :)
${USER_NAME}: I'll make sure to be nice to you! I'm so happy to have you as my assistant!
${AI_NAME}: /think It sounds like ${USER_NAME} is happy to have me as their assistant! I'm so happy too! ^_^ Glad that whole emotion thing didn't scare him off!

View File

@@ -7,7 +7,7 @@
cd `dirname $0`
cd ..
./main -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
./main -m ./models/ggml-alpaca-7b-q4.bin \
--color \
-f ./prompts/alpaca.txt \
--ctx_size 2048 \

View File

@@ -1,5 +0,0 @@
set(TARGET baby-llama)
add_executable(${TARGET} baby-llama.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

File diff suppressed because it is too large Load Diff

View File

@@ -1,8 +0,0 @@
set(TARGET beam-search)
add_executable(${TARGET} beam-search.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View File

@@ -1,186 +0,0 @@
#include "common.h"
#include "llama.h"
#include "build-info.h"
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
#include <unistd.h>
#elif defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
# define NOMINMAX
#endif
#include <windows.h>
#include <signal.h>
#endif
// Used for debugging to print out beam tokens.
struct ostream_beam_view {
llama_context * ctx;
llama_beam_view beam_view;
};
std::ostream& operator<<(std::ostream& os, const ostream_beam_view & obv) {
os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]);
}
return os << ')';
}
// Put here anything you want back in beam_search_callback().
struct beam_search_callback_data {
llama_context * ctx;
std::vector<llama_token> response;
};
// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
// For example, eob can be flagged due to maximum token length, stop words, etc.
bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, const size_t n_tokens) {
return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx);
}
// Function matching type llama_beam_search_callback_fn_t.
// Custom callback example is called each time the beams lengths increase:
// * Show progress by printing ',' following by number of convergent beam tokens if any.
// * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
// This is also called when the stop condition is met.
// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr);
// Mark beams as EOS as needed.
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
llama_beam_view& beam_view = beams_state.beam_views[i];
if (!beam_view.eob && is_at_eob(callback_data, beam_view.tokens, beam_view.n_tokens)) {
beam_view.eob = true;
}
}
printf(","); // Show progress
if (const size_t n = beams_state.common_prefix_length) {
callback_data.response.resize(callback_data.response.size() + n);
assert(0u < beams_state.n_beams);
const llama_token * tokens = beams_state.beam_views[0].tokens;
std::copy(tokens, tokens + n, callback_data.response.end() - n);
printf("%zu", n);
}
fflush(stdout);
#if 1 // DEBUG: print current beams for this iteration
std::cout << "\n\nCurrent beams (last_call=" << beams_state.last_call << "):\n";
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
std::cout << "beams["<<i<<"]: " << ostream_beam_view{callback_data.ctx,beams_state.beam_views[i]} << std::endl;
}
#endif
}
int main(int argc, char ** argv)
{
gpt_params params;
//params.n_gpu_layers = 200;
//---------------------------------
// Print help :
//---------------------------------
if ( argc < 2 || argv[1][0] == '-' )
{
printf( "Usage: %s MODEL_PATH [BEAM_WIDTH=2] [PROMPT]\n" , argv[0] );
return 1 ;
}
//---------------------------------
// Load parameters :
//---------------------------------
params.model = argv[1];
params.n_beams = 2 < argc ? std::stoi(argv[2]) : 2;
if ( argc > 3 )
{
params.prompt = argv[3];
}
if ( params.prompt.empty() )
{
params.prompt = "### Request:\nHow many countries are there?\n\n### Response:\n";
}
//---------------------------------
// Init LLM :
//---------------------------------
llama_backend_init(params.numa);
llama_model * model;
llama_context * ctx;
std::tie(model, ctx) = llama_init_from_gpt_params( params );
if ( model == NULL )
{
fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
return 1;
}
//---------------------------------
// Tokenize the prompt :
//---------------------------------
std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);
const size_t max_context_size = llama_n_ctx( ctx );
const size_t max_tokens_list_size = max_context_size - 4 ;
if (tokens_list.size() > max_tokens_list_size)
{
fprintf( stderr , "%s: error: prompt too long (%zu tokens, max %zu)\n" ,
__func__ , tokens_list.size() , max_tokens_list_size );
return 1;
}
fprintf( stderr, "\n\n" );
// Print the tokens from the prompt :
for( auto id : tokens_list )
{
std::cout << llama_token_to_piece(ctx, id);
}
std::cout << std::flush;
int n_past = llama_get_kv_cache_token_count(ctx);
if (llama_eval(ctx, tokens_list.data(), tokens_list.size(), n_past, params.n_threads))
{
fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );
return 1;
}
n_past += tokens_list.size();
beam_search_callback_data callback_data{ctx, {}};
size_t const beam_width = static_cast<size_t>(params.n_beams);
int const n_predict = 256;
llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict, params.n_threads);
std::cout << "\n\n";
for (llama_token const token_id : callback_data.response) {
std::cout << llama_token_to_piece(ctx,token_id);
}
std::cout << std::endl;
llama_free( ctx );
llama_free_model( model );
llama_backend_free();
return 0;
}

View File

@@ -1,8 +0,0 @@
set(TARGET benchmark)
add_executable(${TARGET} benchmark-matmult.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View File

@@ -1,7 +1,13 @@
#include "ggml.h"
#include "build-info.h"
/*
License: MIT License
Changelog:
- 2023-03-31 Initial version by Sebastian Apel (https://github.com/SebastianApel)
*/
#include <locale.h>
#include "ggml.h"
#include <assert.h>
#include <math.h>
#include <cstring>
@@ -16,22 +22,7 @@
#include <iterator>
#include <algorithm>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
if (plan.work_size > 0) {
buf.resize(plan.work_size);
plan.work_data = buf.data();
}
ggml_graph_compute(graph, &plan);
}
float tensor_sum_elements(const ggml_tensor * tensor) {
float tensor_sum_elements(struct ggml_tensor * tensor) {
float sum = 0;
if (tensor->type==GGML_TYPE_F32) {
for (int j = 0; j < tensor->ne[1]; j++) {
@@ -43,15 +34,21 @@ float tensor_sum_elements(const ggml_tensor * tensor) {
return sum;
}
void tensor_dump(const ggml_tensor * tensor, const char * name) {
printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi) - ", name,
tensor->type, ggml_type_name(tensor->type),
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]);
float sum = tensor_sum_elements(tensor);
printf("Sum of tensor %s is %6.2f\n", name, sum);
}
#define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor)
/*
These are mapping to unknown
GGML_TYPE_I8,
GGML_TYPE_I16,
GGML_TYPE_I32,
GGML_TYPE_COUNT,
*/
#define TENSOR_TYPE_AS_STR(TYPE) TYPE == GGML_TYPE_F32 ? "FP32" : TYPE == GGML_TYPE_F16 ? "FP16" : TYPE == GGML_TYPE_Q4_0 ? "Q4_0" : TYPE == GGML_TYPE_Q4_1 ? "Q4_1" : "UNKNOWN"
#define TENSOR_DUMP(TENSOR) printf("%15s: type = %i (%5s) ne = %5d x %5d x %5d, nb = (%5li, %5li, %5li) - ", #TENSOR, \
TENSOR->type,TENSOR_TYPE_AS_STR(TENSOR->type),\
TENSOR->ne[0], TENSOR->ne[1], TENSOR->ne[2], TENSOR->nb[0], TENSOR->nb[1], TENSOR->nb[2]); \
{ float sum = tensor_sum_elements(TENSOR); printf("Sum of tensor %s is %6.2f\n",#TENSOR, sum); }
struct benchmark_params_struct {
int32_t n_threads = 1;
@@ -69,6 +66,8 @@ void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct para
}
int main(int argc, char ** argv) {
struct benchmark_params_struct benchmark_params;
bool invalid_param = false;
@@ -92,17 +91,19 @@ int main(int argc, char ** argv) {
print_usage(argc, argv, benchmark_params);
exit(0);
}
}
if (invalid_param) {
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
print_usage(argc, argv, benchmark_params);
exit(1);
if (invalid_param) {
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
print_usage(argc, argv, benchmark_params);
exit(1);
}
}
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
printf("Starting Test\n");
// create the ggml context
printf("Starting Test\n");
struct ggml_context * ctx;
//const int sizex = 4096;
//const int sizey = 11008;
@@ -124,18 +125,16 @@ int main(int argc, char ** argv) {
#endif
//printf("Memsize required = %i\n", sizex*sizex);
ggml_type wtype = GGML_TYPE_F32;
size_t ctx_size = 0;
ctx_size += sizex*sizey*ggml_type_sizef(wtype);
ctx_size += sizex*sizey*ggml_type_sizef(wtype);
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32);
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
ctx_size += 1024*1024*16;
ctx_size += sizex*sizeof(float);
ctx_size += 1024*1024*100;
printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024));
printf("Allocating Memory of size %li byes, %li MB\n",ctx_size, (ctx_size/1024/1024));
struct ggml_init_params params = {
/*.mem_size =*/ ctx_size,
@@ -146,7 +145,7 @@ int main(int argc, char ** argv) {
ctx = ggml_init(params);
if (!ctx) {
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
return 1;
return false;
}
@@ -170,14 +169,13 @@ int main(int argc, char ** argv) {
// printf("Creating compute graph\n");
struct ggml_cgraph gf = ggml_build_forward(m11xm2);
printf("n_threads=%i\n", benchmark_params.n_threads);
gf.n_threads=benchmark_params.n_threads;
printf("cgraph->n_threads=%i\n",gf.n_threads);
TENSOR_DUMP(m11);
TENSOR_DUMP(m2);
std::vector<uint8_t> work_buffer;
ggml_graph_compute_helper(work_buffer, &gf, benchmark_params.n_threads);
ggml_graph_compute(ctx, &gf);
TENSOR_DUMP(gf.nodes[0]);
@@ -199,6 +197,7 @@ int main(int argc, char ** argv) {
// printf("Creating compute graph\n");
struct ggml_cgraph gf31 = ggml_build_forward(q31);
gf31.n_threads=benchmark_params.n_threads;
// Set up a second graph computation to make sure we override the CPU cache lines
// printf("Creating new tensor q12 & Running quantize\n");
@@ -210,38 +209,38 @@ int main(int argc, char ** argv) {
//printf("Creating compute graph\n");
struct ggml_cgraph gf32 = ggml_build_forward(q32);
printf("n_threads=%i\n", benchmark_params.n_threads);
gf32.n_threads=benchmark_params.n_threads;
printf("cgraph->n_threads=%i\n",gf31.n_threads);
const int dimx = sizex;
const int dimy = sizey;
const int dimz = sizez;
long long int flops_per_dot_product = dimy + dimy;
long long int flops_per_matrix = flops_per_dot_product * dimx * dimz; ;
printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - aboout %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
// Let's use the F32 result from above as a reference for the q4_0 multiplication
float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);
printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
printf("=====================================================================================\n");
double gflops_sum = 0;
printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; FLOPS_per_u_Second\n");
printf("==============================================================================================\n");
for (int i=0;i<benchmark_params.n_iterations ;i++) {
long long int start = ggml_time_us();
//printf("Running ggml_graph_compute\n");
ggml_graph_compute_helper(work_buffer, &gf31, benchmark_params.n_threads);
ggml_graph_compute(ctx, &gf31);
long long int stop = ggml_time_us();
long long int usec = stop-start;
double gflops = (double)(flops_per_matrix)/usec/1000.0;
gflops_sum += gflops;
printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%10.2f\n",
float sec = usec/1000000;
float flops_per_usec = (1.0f*flops_per_matrix)/usec;
printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%19.2f\n",
i,
benchmark_params.n_threads,
gf31.n_threads,
sizex, sizey, sizez, flops_per_matrix,
usec,gflops);
usec,flops_per_usec);
#ifdef VERBOSE_DEBUGGING
TENSOR_DUMP("res",gf31.nodes[0])
@@ -264,9 +263,8 @@ int main(int argc, char ** argv) {
}
// Running a different graph computation to make sure we override the CPU cache lines
ggml_graph_compute_helper(work_buffer, &gf32, benchmark_params.n_threads);
ggml_graph_compute(ctx, &gf32);
}
printf("\n");
printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));
printf("=====================================================================================\n");
}

View File

@@ -1,12 +1,9 @@
#!/bin/bash
set -e
cd "$(dirname "$0")/.." || exit
MODEL="${MODEL:-./models/13B/ggml-model-q4_0.bin}"
PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat.txt}
USER_NAME="${USER_NAME:-USER}"
USER_NAME="${USER_NAME:-User}"
AI_NAME="${AI_NAME:-ChatLLaMa}"
# Adjust to the number of CPU cores you want to use.
@@ -18,24 +15,39 @@ N_PREDICTS="${N_PREDICTS:-2048}"
# For example, override the context size by doing: ./chatLLaMa --ctx_size 1024
GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"
DATE_TIME=$(date +%H:%M)
DATE_YEAR=$(date +%Y)
PROMPT_FILE=$(mktemp -t llamacpp_prompt.XXXXXXX.txt)
sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
-e "s/\[\[AI_NAME\]\]/$AI_NAME/g" \
-e "s/\[\[DATE_TIME\]\]/$DATE_TIME/g" \
-e "s/\[\[DATE_YEAR\]\]/$DATE_YEAR/g" \
$PROMPT_TEMPLATE > $PROMPT_FILE
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
./main $GEN_OPTIONS \
--model "$MODEL" \
--threads "$N_THREAD" \
--n_predict "$N_PREDICTS" \
--color --interactive \
--file ${PROMPT_FILE} \
--reverse-prompt "${USER_NAME}:" \
--in-prefix ' ' \
"$@"
--prompt "
Text transcript of a never ending dialog, where ${USER_NAME} interacts with an AI assistant named ${AI_NAME}.
${AI_NAME} is helpful, kind, honest, friendly, good at writing and never fails to answer ${USER_NAME}s requests immediately and with details and precision.
There are no annotations like (30 seconds passed...) or (to himself), just what ${USER_NAME} and ${AI_NAME} say aloud to each other.
The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long.
The transcript only includes text, it does not include markup like HTML and Markdown.
$USER_NAME: Hello, $AI_NAME!
$AI_NAME: Hello $USER_NAME! How may I help you today?
$USER_NAME: What time is it?
$AI_NAME: It is $(date +%H:%M).
$USER_NAME: What year is it?
$AI_NAME: We are in $(date +%Y).
$USER_NAME: Please tell me the largest city in Europe.
$AI_NAME: The largest city in Europe is Moscow, the capital of Russia.
$USER_NAME: What can you tell me about Moscow?
$AI_NAME: Moscow, on the Moskva River in western Russia, is the nations cosmopolitan capital. In its historic core is the Kremlin, a complex thats home to the president and tsarist treasures in the Armoury. Outside its walls is Red Square, Russias symbolic center.
$USER_NAME: What is a cat?
$AI_NAME: A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae.
$USER_NAME: How do I pass command line arguments to a Node.js program?
$AI_NAME: The arguments are stored in process.argv.
argv[0] is the path to the Node. js executable.
argv[1] is the path to the script file.
argv[2] is the first argument passed to the script.
argv[3] is the second argument passed to the script and so on.
$USER_NAME: Name a color.
$AI_NAME: Blue
$USER_NAME:" "$@"

View File

@@ -1,151 +0,0 @@
#!/bin/bash
set -euo pipefail
cd "$(dirname "$0")/.." || exit
if [[ -z "${PROMPT_CACHE_FILE+x}" || -z "${CHAT_SAVE_DIR+x}" ]]; then
echo >&2 "error: PROMPT_CACHE_FILE and CHAT_SAVE_DIR must be provided"
exit 1
fi
MODEL="${MODEL:-./models/13B/ggml-model-q4_0.bin}"
PROMPT_TEMPLATE="${PROMPT_TEMPLATE:-./prompts/chat.txt}"
USER_NAME="${USER_NAME:-User}"
AI_NAME="${AI_NAME:-ChatLLaMa}"
DATE_TIME="$(date +%H:%M)"
DATE_YEAR="$(date +%Y)"
LOG="${CHAT_SAVE_DIR}/main.log"
LOG_BG="${CHAT_SAVE_DIR}/main-bg.log"
CUR_PROMPT_FILE="${CHAT_SAVE_DIR}/current-prompt.txt"
CUR_PROMPT_CACHE="${CHAT_SAVE_DIR}/current-cache.bin"
NEXT_PROMPT_FILE="${CHAT_SAVE_DIR}/next-prompt.txt"
NEXT_PROMPT_CACHE="${CHAT_SAVE_DIR}/next-cache.bin"
SESSION_SIZE_MSG_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+'
SAMPLE_TIME_MSG_PATTERN='sample time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+'
SED_DELETE_MESSAGES="/^(${USER_NAME}:|${AI_NAME}:|\\.\\.\\.)/,\$d"
CTX_SIZE=2048
CTX_ROTATE_POINT=$((CTX_SIZE * 3 / 5)) # REVIEW
OPTS=(--model "$MODEL" --ctx_size "$CTX_SIZE" --repeat_last_n 256 "$@")
# An unbuffered `tail -c+N`
skip_bytes() {
LANG=C IFS= read -r -n "$1" -d '' c
while LANG=C IFS= read -r -n 1 -d '' c; do
printf '%s' "$c"
done
}
mkdir -p "$CHAT_SAVE_DIR"
echo >"$LOG"
trap "tail -n100 ${LOG}" EXIT
if [[ ! -e "$CUR_PROMPT_FILE" ]]; then
sed -e "s/\[\[USER_NAME\]\]/${USER_NAME}/g" \
-e "s/\[\[AI_NAME\]\]/${AI_NAME}/g" \
-e "s/\[\[DATE_TIME\]\]/${DATE_TIME}/g" \
-e "s/\[\[DATE_YEAR\]\]/${DATE_YEAR}/g" \
"$PROMPT_TEMPLATE" >"$CUR_PROMPT_FILE"
fi
if [[ ! -e "$NEXT_PROMPT_FILE" ]]; then
sed -r "$SED_DELETE_MESSAGES" "$CUR_PROMPT_FILE" >"$NEXT_PROMPT_FILE"
fi
if [[ "$(tail -c4 "$NEXT_PROMPT_FILE")" != "..." ]]; then
echo '...' >>"$NEXT_PROMPT_FILE"
fi
if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then
echo 'Prompt cache does not exist, building...'
# Default batch_size to 8 here for better user feedback during initial prompt processing
./main 2>>"$LOG" \
--batch_size 8 \
"${OPTS[@]}" \
--prompt-cache "$PROMPT_CACHE_FILE" \
--file "$CUR_PROMPT_FILE" \
--n_predict 1
echo
echo 'Done!'
fi
if [[ ! -e "$CUR_PROMPT_CACHE" ]]; then
cp "$PROMPT_CACHE_FILE" "$CUR_PROMPT_CACHE"
fi
if [[ ! -e "$NEXT_PROMPT_CACHE" ]]; then
cp "$PROMPT_CACHE_FILE" "$NEXT_PROMPT_CACHE"
fi
printf '%s ' "$(< "$CUR_PROMPT_FILE")"
n_tokens=0
while read -e line; do
# Limit generation to remaining context, with a buffer and estimating 2 chars/token for input
n_predict=$((CTX_SIZE - n_tokens - ${#line} / 2 - 32))
# Swap prompts when we're about to run out of context
if ((n_predict <= 0)); then
wait # for background main (below) to finish with next prompt
mv "$NEXT_PROMPT_FILE" "$CUR_PROMPT_FILE"
mv "$NEXT_PROMPT_CACHE" "$CUR_PROMPT_CACHE"
sed -r "$SED_DELETE_MESSAGES" "$CUR_PROMPT_FILE" >"$NEXT_PROMPT_FILE"
echo '...' >>"$NEXT_PROMPT_FILE"
cp "$PROMPT_CACHE_FILE" "$NEXT_PROMPT_CACHE"
n_tokens=0
n_predict=$((CTX_SIZE / 2))
fi
echo " ${line}" >>"$CUR_PROMPT_FILE"
if ((n_tokens > CTX_ROTATE_POINT)); then
echo " ${line}" >>"$NEXT_PROMPT_FILE"
fi
n_prompt_len_pre=$(($(wc -c <"$CUR_PROMPT_FILE")))
printf '%s: ' "$AI_NAME" >>"$CUR_PROMPT_FILE"
./main 2>>"$LOG" "${OPTS[@]}" \
--prompt-cache "$CUR_PROMPT_CACHE" \
--prompt-cache-all \
--file "$CUR_PROMPT_FILE" \
--reverse-prompt "${USER_NAME}:" \
--n_predict "$n_predict" |
skip_bytes 1 | # skip BOS token added by ./main
tee "$CUR_PROMPT_FILE.tmp" | # save prompt + generation to tmp file
skip_bytes "$n_prompt_len_pre" # print generation
mv "$CUR_PROMPT_FILE.tmp" "$CUR_PROMPT_FILE"
# if we hit n_predict instead of reverse-prompt, we need to add the prompt
if [[ "$(tail -n1 "$CUR_PROMPT_FILE")" != "${USER_NAME}:" ]]; then
printf '\n%s:' "$USER_NAME"
printf '\n%s:' "$USER_NAME" >> "$CUR_PROMPT_FILE"
fi
printf ' '
# HACK get num tokens from debug message
# TODO get both messages in one go
if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
! sample_time_msg="$( tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
echo >&2 "Couldn't get number of tokens from ./main output!"
exit 1
fi
n_tokens=$(($(cut -d/ -f2 <<<"$session_size_msg") + $(cut -d/ -f2 <<<"$sample_time_msg")))
if ((n_tokens > CTX_ROTATE_POINT)); then
tail -c+$((n_prompt_len_pre + 1)) "$CUR_PROMPT_FILE" >>"$NEXT_PROMPT_FILE"
fi
# Update cache for next prompt in background, ideally during user input
./main >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
--prompt-cache "$NEXT_PROMPT_CACHE" \
--file "$NEXT_PROMPT_FILE" \
--n_predict 1 &
done

View File

@@ -1,41 +0,0 @@
#!/bin/bash
set -e
cd "$(dirname "$0")/.." || exit
MODEL="${MODEL:-./models/ggml-vic13b-uncensored-q5_0.bin}"
PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat.txt}
USER_NAME="### Human"
AI_NAME="### Assistant"
# Adjust to the number of CPU cores you want to use.
N_THREAD="${N_THREAD:-8}"
# Number of tokens to predict (made it larger than default because we want a long interaction)
N_PREDICTS="${N_PREDICTS:-2048}"
# Note: you can also override the generation options by specifying them on the command line:
# For example, override the context size by doing: ./chatLLaMa --ctx_size 1024
GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"
DATE_TIME=$(date +%H:%M)
DATE_YEAR=$(date +%Y)
PROMPT_FILE=$(mktemp -t llamacpp_prompt.XXXXXXX.txt)
sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
-e "s/\[\[AI_NAME\]\]/$AI_NAME/g" \
-e "s/\[\[DATE_TIME\]\]/$DATE_TIME/g" \
-e "s/\[\[DATE_YEAR\]\]/$DATE_YEAR/g" \
$PROMPT_TEMPLATE > $PROMPT_FILE
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
./bin/main $GEN_OPTIONS \
--model "$MODEL" \
--threads "$N_THREAD" \
--n_predict "$N_PREDICTS" \
--color --interactive \
--file ${PROMPT_FILE} \
--reverse-prompt "### Human:" \
--in-prefix ' ' \
"$@"

View File

@@ -11,6 +11,6 @@ cd ..
#
# "--keep 48" is based on the contents of prompts/chat-with-bob.txt
#
./main -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
./main -m ./models/7B/ggml-model-q4_0.bin -c 512 -b 1024 -n 256 --keep 48 \
--repeat_penalty 1.0 --color -i \
-r "User:" -f prompts/chat-with-bob.txt

345
examples/common.cpp Normal file
View File

@@ -0,0 +1,345 @@
#include "common.h"
#include <cassert>
#include <cstring>
#include <fstream>
#include <string>
#include <iterator>
#include <algorithm>
#if defined (_WIN32)
#include <fcntl.h>
#include <io.h>
#pragma comment(lib,"kernel32.lib")
extern "C" __declspec(dllimport) void* __stdcall GetStdHandle(unsigned long nStdHandle);
extern "C" __declspec(dllimport) int __stdcall GetConsoleMode(void* hConsoleHandle, unsigned long* lpMode);
extern "C" __declspec(dllimport) int __stdcall SetConsoleMode(void* hConsoleHandle, unsigned long dwMode);
extern "C" __declspec(dllimport) int __stdcall SetConsoleCP(unsigned int wCodePageID);
extern "C" __declspec(dllimport) int __stdcall SetConsoleOutputCP(unsigned int wCodePageID);
extern "C" __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int CodePage, unsigned long dwFlags,
const wchar_t * lpWideCharStr, int cchWideChar,
char * lpMultiByteStr, int cbMultiByte,
const char * lpDefaultChar, bool * lpUsedDefaultChar);
#define CP_UTF8 65001
#endif
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
// determine sensible default number of threads.
// std::thread::hardware_concurrency may not be equal to the number of cores, or may return 0.
#ifdef __linux__
std::ifstream cpuinfo("/proc/cpuinfo");
params.n_threads = std::count(std::istream_iterator<std::string>(cpuinfo),
std::istream_iterator<std::string>(),
std::string("processor"));
#endif
if (params.n_threads == 0) {
params.n_threads = std::max(1, (int32_t) std::thread::hardware_concurrency());
}
bool invalid_param = false;
std::string arg;
gpt_params default_params;
for (int i = 1; i < argc; i++) {
arg = argv[i];
if (arg == "-s" || arg == "--seed") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.seed = std::stoi(argv[i]);
} else if (arg == "-t" || arg == "--threads") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_threads = std::stoi(argv[i]);
} else if (arg == "-p" || arg == "--prompt") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.prompt = argv[i];
} else if (arg == "-f" || arg == "--file") {
if (++i >= argc) {
invalid_param = true;
break;
}
std::ifstream file(argv[i]);
if (!file) {
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
invalid_param = true;
break;
}
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
if (params.prompt.back() == '\n') {
params.prompt.pop_back();
}
} else if (arg == "-n" || arg == "--n_predict") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_predict = std::stoi(argv[i]);
} else if (arg == "--top_k") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.top_k = std::stoi(argv[i]);
} else if (arg == "-c" || arg == "--ctx_size") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_ctx = std::stoi(argv[i]);
} else if (arg == "--memory_f32") {
params.memory_f16 = false;
} else if (arg == "--top_p") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.top_p = std::stof(argv[i]);
} else if (arg == "--temp") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.temp = std::stof(argv[i]);
} else if (arg == "--repeat_last_n") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.repeat_last_n = std::stoi(argv[i]);
} else if (arg == "--repeat_penalty") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.repeat_penalty = std::stof(argv[i]);
} else if (arg == "-b" || arg == "--batch_size") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_batch = std::stoi(argv[i]);
params.n_batch = std::min(512, params.n_batch);
} else if (arg == "--keep") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_keep = std::stoi(argv[i]);
} else if (arg == "-m" || arg == "--model") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.model = argv[i];
} else if (arg == "--lora") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.lora_adapter = argv[i];
params.use_mmap = false;
} else if (arg == "--lora-base") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.lora_base = argv[i];
} else if (arg == "-i" || arg == "--interactive") {
params.interactive = true;
} else if (arg == "--embedding") {
params.embedding = true;
} else if (arg == "--interactive-start") {
params.interactive = true;
} else if (arg == "--interactive-first") {
params.interactive_start = true;
} else if (arg == "-ins" || arg == "--instruct") {
params.instruct = true;
} else if (arg == "--color") {
params.use_color = true;
} else if (arg == "--mlock") {
params.use_mlock = true;
} else if (arg == "--no-mmap") {
params.use_mmap = false;
} else if (arg == "--mtest") {
params.mem_test = true;
} else if (arg == "--verbose-prompt") {
params.verbose_prompt = true;
} else if (arg == "-r" || arg == "--reverse-prompt") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.antiprompt.push_back(argv[i]);
} else if (arg == "--perplexity") {
params.perplexity = true;
} else if (arg == "--ignore-eos") {
params.ignore_eos = true;
} else if (arg == "--n_parts") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_parts = std::stoi(argv[i]);
} else if (arg == "-h" || arg == "--help") {
gpt_print_usage(argc, argv, default_params);
exit(0);
} else if (arg == "--random-prompt") {
params.random_prompt = true;
} else if (arg == "--in-prefix") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.input_prefix = argv[i];
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
gpt_print_usage(argc, argv, default_params);
exit(1);
}
}
if (invalid_param) {
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
gpt_print_usage(argc, argv, default_params);
exit(1);
}
return true;
}
void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " -i, --interactive run in interactive mode\n");
fprintf(stderr, " --interactive-first run in interactive mode and wait for input right away\n");
fprintf(stderr, " -ins, --instruct run in instruction mode (use with Alpaca models)\n");
fprintf(stderr, " -r PROMPT, --reverse-prompt PROMPT\n");
fprintf(stderr, " run in interactive mode and poll user input upon seeing PROMPT (can be\n");
fprintf(stderr, " specified more than once for multiple prompts).\n");
fprintf(stderr, " --color colorise output to distinguish prompt and user input from generations\n");
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for <= 0)\n");
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
fprintf(stderr, " -p PROMPT, --prompt PROMPT\n");
fprintf(stderr, " prompt to start generation with (default: empty)\n");
fprintf(stderr, " --random-prompt start with a randomized prompt.\n");
fprintf(stderr, " --in-prefix STRING string to prefix user inputs with (default: empty)\n");
fprintf(stderr, " -f FNAME, --file FNAME\n");
fprintf(stderr, " prompt file to start generation.\n");
fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
fprintf(stderr, " --top_k N top-k sampling (default: %d)\n", params.top_k);
fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", (double)params.top_p);
fprintf(stderr, " --repeat_last_n N last n tokens to consider for penalize (default: %d)\n", params.repeat_last_n);
fprintf(stderr, " --repeat_penalty N penalize repeat sequence of tokens (default: %.1f)\n", (double)params.repeat_penalty);
fprintf(stderr, " -c N, --ctx_size N size of the prompt context (default: %d)\n", params.n_ctx);
fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating\n");
fprintf(stderr, " --memory_f32 use f32 instead of f16 for memory key+value\n");
fprintf(stderr, " --temp N temperature (default: %.1f)\n", (double)params.temp);
fprintf(stderr, " --n_parts N number of model parts (default: -1 = determine from dimensions)\n");
fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch);
fprintf(stderr, " --perplexity compute perplexity over the prompt\n");
fprintf(stderr, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
if (llama_mlock_supported()) {
fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
}
if (llama_mmap_supported()) {
fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
}
fprintf(stderr, " --mtest compute maximum memory usage\n");
fprintf(stderr, " --verbose-prompt print prompt before generation\n");
fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
fprintf(stderr, " -m FNAME, --model FNAME\n");
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
fprintf(stderr, "\n");
}
std::string gpt_random_prompt(std::mt19937 & rng) {
const int r = rng() % 10;
switch (r) {
case 0: return "So";
case 1: return "Once upon a time";
case 2: return "When";
case 3: return "The";
case 4: return "After";
case 5: return "If";
case 6: return "import";
case 7: return "He";
case 8: return "She";
case 9: return "They";
default: return "To";
}
return "The";
}
// TODO: not great allocating this every time
std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
// initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
std::vector<llama_token> res(text.size() + (int)add_bos);
int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
assert(n >= 0);
res.resize(n);
return res;
}
/* Keep track of current color of output, and emit ANSI code if it changes. */
void set_console_color(console_state & con_st, console_color_t color) {
if (con_st.use_color && con_st.color != color) {
switch(color) {
case CONSOLE_COLOR_DEFAULT:
printf(ANSI_COLOR_RESET);
break;
case CONSOLE_COLOR_PROMPT:
printf(ANSI_COLOR_YELLOW);
break;
case CONSOLE_COLOR_USER_INPUT:
printf(ANSI_BOLD ANSI_COLOR_GREEN);
break;
}
con_st.color = color;
}
}
#if defined (_WIN32)
void win32_console_init(bool enable_color) {
unsigned long dwMode = 0;
void* hConOut = GetStdHandle((unsigned long)-11); // STD_OUTPUT_HANDLE (-11)
if (!hConOut || hConOut == (void*)-1 || !GetConsoleMode(hConOut, &dwMode)) {
hConOut = GetStdHandle((unsigned long)-12); // STD_ERROR_HANDLE (-12)
if (hConOut && (hConOut == (void*)-1 || !GetConsoleMode(hConOut, &dwMode))) {
hConOut = 0;
}
}
if (hConOut) {
// Enable ANSI colors on Windows 10+
if (enable_color && !(dwMode & 0x4)) {
SetConsoleMode(hConOut, dwMode | 0x4); // ENABLE_VIRTUAL_TERMINAL_PROCESSING (0x4)
}
// Set console output codepage to UTF8
SetConsoleOutputCP(CP_UTF8);
}
void* hConIn = GetStdHandle((unsigned long)-10); // STD_INPUT_HANDLE (-10)
if (hConIn && hConIn != (void*)-1 && GetConsoleMode(hConIn, &dwMode)) {
// Set console input codepage to UTF16
_setmode(_fileno(stdin), _O_WTEXT);
}
}
// Convert a wide Unicode string to an UTF8 string
void win32_utf8_encode(const std::wstring & wstr, std::string & str) {
int size_needed = WideCharToMultiByte(CP_UTF8, 0, &wstr[0], (int)wstr.size(), NULL, 0, NULL, NULL);
std::string strTo(size_needed, 0);
WideCharToMultiByte(CP_UTF8, 0, &wstr[0], (int)wstr.size(), &strTo[0], size_needed, NULL, NULL);
str = strTo;
}
#endif

98
examples/common.h Normal file
View File

@@ -0,0 +1,98 @@
// Various helper functions and utilities
#pragma once
#include "llama.h"
#include <string>
#include <vector>
#include <random>
#include <thread>
//
// CLI argument parsing
//
struct gpt_params {
int32_t seed = -1; // RNG seed
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
int32_t n_predict = 128; // new tokens to predict
int32_t repeat_last_n = 64; // last n tokens to penalize
int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions)
int32_t n_ctx = 512; // context size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
// sampling parameters
int32_t top_k = 40;
float top_p = 0.95f;
float temp = 0.80f;
float repeat_penalty = 1.10f;
std::string model = "models/lamma-7B/ggml-model.bin"; // model path
std::string prompt = "";
std::string input_prefix = ""; // string to prefix user inputs with
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
std::string lora_adapter = ""; // lora adapter path
std::string lora_base = ""; // base model path for the lora adapter
bool memory_f16 = true; // use f16 instead of f32 for memory kv
bool random_prompt = false; // do not randomize prompt if none provided
bool use_color = false; // use color to distinguish generations and inputs
bool interactive = false; // interactive mode
bool embedding = false; // get only sentence embedding
bool interactive_start = false; // wait for user input immediately
bool instruct = false; // instruction mode (used for Alpaca models)
bool ignore_eos = false; // do not stop generating after eos
bool perplexity = false; // compute perplexity over the prompt
bool use_mmap = true; // use mmap for faster loads
bool use_mlock = false; // use mlock to keep model in memory
bool mem_test = false; // compute maximum memory usage
bool verbose_prompt = false; // print prompt tokens before generation
};
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
std::string gpt_random_prompt(std::mt19937 & rng);
//
// Vocab utils
//
std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);
//
// Console utils
//
#define ANSI_COLOR_RED "\x1b[31m"
#define ANSI_COLOR_GREEN "\x1b[32m"
#define ANSI_COLOR_YELLOW "\x1b[33m"
#define ANSI_COLOR_BLUE "\x1b[34m"
#define ANSI_COLOR_MAGENTA "\x1b[35m"
#define ANSI_COLOR_CYAN "\x1b[36m"
#define ANSI_COLOR_RESET "\x1b[0m"
#define ANSI_BOLD "\x1b[1m"
enum console_color_t {
CONSOLE_COLOR_DEFAULT=0,
CONSOLE_COLOR_PROMPT,
CONSOLE_COLOR_USER_INPUT
};
struct console_state {
bool use_color = false;
console_color_t color = CONSOLE_COLOR_DEFAULT;
};
void set_console_color(console_state & con_st, console_color_t color);
#if defined (_WIN32)
void win32_console_init(bool enable_color);
void win32_utf8_encode(const std::wstring & wstr, std::string & str);
#endif

View File

@@ -1,5 +0,0 @@
set(TARGET convert-llama2c-to-ggml)
add_executable(${TARGET} convert-llama2c-to-ggml.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View File

@@ -1,26 +0,0 @@
## Convert llama2.c model to ggml
This example reads weights from project [llama2.c](https://github.com/karpathy/llama2.c) and saves them in ggml compatible format. The vocab that is available in `models/ggml-vocab.bin` is used by default.
To convert the model first download the models from the [llma2.c](https://github.com/karpathy/llama2.c) repository:
`$ make -j`
After successful compilation, following usage options are available:
```
usage: ./convert-llama2c-to-ggml [options]
options:
-h, --help show this help message and exit
--copy-vocab-from-model FNAME path of gguf llama model or llama2.c vocabulary from which to copy vocab (default 'models/7B/ggml-model-f16.gguf')
--llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model
--llama2c-output-model FNAME model path to save the converted llama2.c model (default ak_llama_model.bin')
```
An example command using a model from [karpathy/tinyllamas](https://huggingface.co/karpathy/tinyllamas) is as follows:
`$ ./convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin`
Now you can use the model with a command like:
`$ ./main -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`

View File

@@ -1,961 +0,0 @@
#include "ggml.h"
#include "llama.h"
#include "common.h"
#include <unordered_map>
#include <vector>
#include <cassert>
#include <climits>
#include <cstring>
#include <cstdarg>
#include <ctime>
#include <random>
#include <stdexcept>
#include <sstream>
#include <algorithm>
#include <string>
// GGUF keys & tensor names.
#define KV_GENERAL_ARCHITECTURE "general.architecture"
#define KV_GENERAL_NAME "general.name"
#define KV_TOKENIZER_MODEL "tokenizer.ggml.model"
#define KV_TOKENIZER_LIST "tokenizer.ggml.tokens"
#define KV_TOKENIZER_TOKEN_TYPE "tokenizer.ggml.token_type"
#define KV_TOKENIZER_SCORES "tokenizer.ggml.scores"
#define KV_TOKENIZER_BOS_ID "tokenizer.ggml.bos_token_id"
#define KV_TOKENIZER_EOS_ID "tokenizer.ggml.eos_token_id"
#define KV_TOKENIZER_UNK_ID "tokenizer.ggml.unknown_token_id"
#define KV_TOKENIZER_SEP_ID "tokenizer.ggml.seperator_token_id"
#define KV_TOKENIZER_PAD_ID "tokenizer.ggml.padding_token_id"
#define KV_TOKENIZER_HF_JSON "tokenizer.huggingface.json"
#define KV_CONTEXT_LENGTH "llama.context_length"
#define KV_EMBEDDING_LENGTH "llama.embedding_length"
#define KV_BLOCK_COUNT "llama.block_count"
#define KV_FEED_FORWARD_LENGTH "llama.feed_forward_length"
#define KV_ATTENTION_HEAD_COUNT "llama.attention.head_count"
#define KV_ATTENTION_HEAD_COUNT_KV "llama.attention.head_count_kv"
#define KV_ATTENTION_LAYERNORM_RMS_EPS "llama.attention.layer_norm_rms_epsilon"
#define KV_ROPE_DIMENSION_COUNT "llama.rope.dimension_count"
#define TN_TOKEN_EMBD "token_embd.weight"
#define TN_OUTPUT_NORM "output_norm.weight"
#define TN_OUTPUT "output.weight"
#define TN_ATTN_NORM "blk.%d.attn_norm.weight"
#define TN_ATTN_Q "blk.%d.attn_q.weight"
#define TN_ATTN_K "blk.%d.attn_k.weight"
#define TN_ATTN_V "blk.%d.attn_v.weight"
#define TN_ATTN_OUTPUT "blk.%d.attn_output.weight"
#define TN_FFN_NORM "blk.%d.ffn_norm.weight"
#define TN_FFN_GATE "blk.%d.ffn_gate.weight"
#define TN_FFN_DOWN "blk.%d.ffn_down.weight"
#define TN_FFN_UP "blk.%d.ffn_up.weight"
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
#define LLAMA_FILE_VERSION_GGJT_V3 3
#define TOKENIZER_NAME "llama"
#define UNKNOWN_TOKEN_ID 0
#define BOS_TOKEN_ID 1
#define EOS_TOKEN_ID 2
//////////////////////////////////////// llama2.c model structs and functions to load models, alloc memory etc.
typedef struct {
int dim; // transformer dimension
int hidden_dim; // for ffn layers
int n_layers; // number of layers
int n_heads; // number of query heads
int n_kv_heads; // number of key/value heads (can be < query heads because of multiquery)
int vocab_size; // vocabulary size, usually 256 (byte-level)
int seq_len; // max sequence length
} Config;
struct TransformerWeights {
// token embedding table
float* token_embedding_table; // (vocab_size, dim)
// weights for rmsnorms
float* rms_att_weight; // (layer, dim) rmsnorm weights
float* rms_ffn_weight; // (layer, dim)
// weights for matmuls
float* wq; // (layer, dim, dim)
float* wk; // (layer, dim, dim)
float* wv; // (layer, dim, dim)
float* wo; // (layer, dim, dim)
// weights for ffn
float* w1; // (layer, hidden_dim, dim)
float* w2; // (layer, dim, hidden_dim)
float* w3; // (layer, hidden_dim, dim)
// final rmsnorm
float* rms_final_weight; // (dim,)
// freq_cis for RoPE relatively positional embeddings
// float* freq_cis_real; // (seq_len, dim/2)
// float* freq_cis_imag; // (seq_len, dim/2)
// (optional) classifier weights for the logits, on the last layer
float* wcls;
~TransformerWeights() {
delete[] token_embedding_table;
delete[] rms_att_weight;
delete[] rms_ffn_weight;
delete[] wq;
delete[] wk;
delete[] wv;
delete[] wo;
delete[] w1;
delete[] w2;
delete[] w3;
delete[] rms_final_weight;
delete[] wcls;
}
};
void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
// we calloc instead of malloc to keep valgrind happy
w->token_embedding_table = new float[p->vocab_size * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
w->rms_att_weight = new float[p->n_layers * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
w->rms_ffn_weight = new float[p->n_layers * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
w->wq = new float[p->n_layers * p->dim * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
w->wk = new float[p->n_layers * p->dim * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
w->wv = new float[p->n_layers * p->dim * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
w->wo = new float[p->n_layers * p->dim * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
w->w1 = new float[p->n_layers * p->hidden_dim * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
w->w2 = new float[p->n_layers * p->hidden_dim * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
w->w3 = new float[p->n_layers * p->hidden_dim * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
w->rms_final_weight = new float[p->dim]();
printf("[%s:AK] Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
if (shared_weights) {
w->wcls = NULL;
} else {
w->wcls = new float[p->vocab_size * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
}
}
int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) {
if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
if (fread(w->wk, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
if (fread(w->wv, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
if (fread(w->wo, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
if (fread(w->rms_ffn_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
if (fread(w->w1, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
if (fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->hidden_dim * p->dim)) return 1;
if (fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
if (fread(w->rms_final_weight, sizeof(float), p->dim, f) != static_cast<size_t>(p->dim)) return 1;
// Skip freq_cis_real & freq_cis_imag
int head_size = p->dim / p->n_heads;
fseek(f, p->seq_len * head_size * sizeof(float), SEEK_CUR);
if (!shared_weights && fread(w->wcls, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
// Check we didn't forget to read anything
auto curr = ftell(f);
fseek(f, 0, SEEK_END);
auto end = ftell(f);
if (curr != end) {
printf("Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", curr, end);
return 1;
}
return 0;
}
void print_sample_weights(TransformerWeights *w){
printf("----- Quick print of first of the weight vales of all the variables\n");
printf("%f\n", w->token_embedding_table[0]);
printf("%f\n", w->rms_att_weight[0]);
printf("%f\n", w->rms_ffn_weight[0]);
printf("%f\n", w->wq[0]);
printf("%f\n", w->wk[0]);
printf("%f\n", w->wv[0]);
printf("%f\n", w->wo[0]);
printf("%f\n", w->w1[0]);
printf("%f\n", w->w2[0]);
printf("%f\n", w->w3[0]);
printf("%f\n", w->rms_att_weight[0]);
if (w->wcls) printf("%f\n", w->wcls[0]);
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////// ggml structs and functions required to load models, configs and save the model.
struct llama_vocab {
using id = int32_t;
using token = std::string;
using ttype = llama_token_type;
struct token_data {
token text;
float score;
ttype type;
};
std::unordered_map<token, id> token_to_id;
std::vector<token_data> id_to_token;
};
struct my_llama_hparams {
uint32_t n_vocab = 32000;
uint32_t n_ctx = 512; // this is provided as user input?
uint32_t n_embd = 4096;
uint32_t n_ff = 11008;
uint32_t n_mult = 4;
uint32_t n_head = 32;
uint32_t n_layer = 32;
uint32_t n_rot = 64;
bool operator!=(const my_llama_hparams& other) const {
return memcmp(this, &other, sizeof(my_llama_hparams));
}
};
struct my_llama_layer {
// normalization
struct ggml_tensor * attention_norm;
// attention
struct ggml_tensor * wq;
struct ggml_tensor * wk;
struct ggml_tensor * wv;
struct ggml_tensor * wo;
// normalization
struct ggml_tensor * ffn_norm;
// ff
struct ggml_tensor * w1;
struct ggml_tensor * w2;
struct ggml_tensor * w3;
};
struct my_llama_model {
struct ggml_context * ctx = NULL;
std::string name;
my_llama_hparams hparams;
struct ggml_tensor * tok_embeddings;
struct ggml_tensor * norm;
struct ggml_tensor * output;
std::vector<my_llama_layer> layers;
uint32_t train_its = 0;
uint32_t train_samples = 0;
uint32_t train_tokens = 0;
};
struct train_params {
const char * fn_vocab_model;
const char * fn_llama2c_model;
const char * fn_llama2c_output_model;
const char * fn_train_data;
const char * fn_checkpoint_in;
const char * fn_checkpoint_out;
const char * fn_model_out;
uint32_t seed;
int n_ctx;
int n_embd;
int n_mult;
int n_head;
int n_layer;
int n_rotmax;
int n_threads;
int n_batch;
int n_examples;
int n_predict;
int print_info_interval;
int print_details_interval;
bool samples_start_after_nl;
bool use_adam;
bool use_flash;
bool use_scratch;
// only adam
int warmup;
int cos_decay_steps;
float cos_decay_restart;
float cos_decay_alpha;
int lbfgs_n_iter;
int adam_n_iter;
float adam_alpha;
float adam_decay;
int mem_model_gb;
int mem_compute_gb;
int mem_compute0_gb;
int mem_compute1_gb;
};
void print_params(struct my_llama_hparams * params) {
printf("%s: n_vocab: %d\n", __func__, params->n_vocab);
printf("%s: n_ctx: %d\n", __func__, params->n_ctx);
printf("%s: n_embd: %d\n", __func__, params->n_embd);
printf("%s: n_mult: %d\n", __func__, params->n_mult);
printf("%s: n_head: %d\n", __func__, params->n_head);
printf("%s: n_ff: %d\n", __func__, params->n_ff);
printf("%s: n_layer: %d\n", __func__, params->n_layer);
printf("%s: n_rot: %d\n", __func__, params->n_rot);
}
void init_model(struct my_llama_model * model) {
const auto & hparams = model->hparams;
const uint32_t n_embd = hparams.n_embd;
const uint32_t n_layer = hparams.n_layer;
const uint32_t n_vocab = hparams.n_vocab;
const uint32_t n_ff = hparams.n_ff;
struct ggml_context * ctx = model->ctx;
model->train_its = 0;
model->train_samples = 0;
model->train_tokens = 0;
model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
printf("[%s:GG] Allocating [%d] x [%d] = [%d] float space for model->tok_embeddings\n",__func__,n_embd , n_vocab, n_embd * n_vocab);
model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
printf("[%s:GG] Allocating [%d] float space for model->norm\n",__func__,n_embd);
model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab);
// printing the per-layer allocations here so we dont print in the for loop.
printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wq for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wk for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wv for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wo for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
printf("[%s:GG] Allocating [%d] float space for layer.ffn_norm for [%d] layers\n",__func__,n_embd, n_layer);
printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w1 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w2 for [%d] layers\n",__func__, n_embd, n_ff, n_ff * n_embd, n_layer);
printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w3 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
ggml_set_name(model->tok_embeddings, "tok_embeddings.weight");
ggml_set_name(model->norm, "norm.weight");
ggml_set_name(model->output, "output.weight");
model->layers.resize(n_layer);
for (uint32_t i = 0; i < n_layer; ++i) {
auto & layer = model->layers[i];
std::string layers_i = "layers." + std::to_string(i);
layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
ggml_set_name(layer.attention_norm, (layers_i + ".attention_norm.weight").c_str());
ggml_set_name(layer.wq, (layers_i + ".attention.wq.weight").c_str());
ggml_set_name(layer.wk, (layers_i + ".attention.wk.weight").c_str());
ggml_set_name(layer.wv, (layers_i + ".attention.wv.weight").c_str());
ggml_set_name(layer.wo, (layers_i + ".attention.wo.weight").c_str());
ggml_set_name(layer.ffn_norm, (layers_i + ".ffn_norm.weight").c_str());
ggml_format_name(layer.w1, "%s.feed_forward.w1.weight", layers_i.c_str());
ggml_format_name(layer.w2, "%s.feed_forward.w2.weight", layers_i.c_str());
ggml_format_name(layer.w3, "%s.feed_forward.w3.weight", layers_i.c_str());
}
}
float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
return *ptr;
}
int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
return *ptr;
}
void print_row(struct ggml_tensor * probs, int i) {
for (int k = 0; k < probs->ne[0]; ++k) {
float p = get_f32_2d(probs, k, i);
printf(" %f", p);
}
printf("\n");
}
void print_matrix(struct ggml_tensor * probs) {
assert(probs->n_dims == 2);
for (int i = 0; i < probs->ne[1]; ++i) {
for (int k = 0; k < probs->ne[0]; ++k) {
float p = get_f32_2d(probs, k, i);
printf(" %.2f", p);
}
printf("\n");
}
}
#ifdef __GNUC__
#ifdef __MINGW32__
__attribute__((format(gnu_printf, 1, 2)))
#else
__attribute__((format(printf, 1, 2)))
#endif
#endif
static std::string format(const char * fmt, ...) {
va_list ap, ap2;
va_start(ap, fmt);
va_copy(ap2, ap);
int size = vsnprintf(NULL, 0, fmt, ap);
GGML_ASSERT(size >= 0 && size < INT_MAX);
std::vector<char> buf(size + 1);
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
GGML_ASSERT(size2 == size);
va_end(ap2);
va_end(ap);
return std::string(buf.data(), size);
}
struct llama_file {
// use FILE * so we don't have to re-open the file to mmap
FILE * fp;
size_t size;
llama_file(const char * fname, const char * mode) {
fp = std::fopen(fname, mode);
if (fp == NULL) {
size = 0;
} else {
seek(0, SEEK_END);
size = tell();
seek(0, SEEK_SET);
}
}
size_t tell() const {
#ifdef _WIN32
__int64 ret = _ftelli64(fp);
#else
long ret = std::ftell(fp);
#endif
GGML_ASSERT(ret != -1); // this really shouldn't fail
return (size_t) ret;
}
void seek(size_t offset, int whence) {
#ifdef _WIN32
int ret = _fseeki64(fp, (__int64) offset, whence);
#else
int ret = std::fseek(fp, (long) offset, whence);
#endif
GGML_ASSERT(ret == 0); // same
}
void read_raw(void * ptr, size_t size) {
if (size == 0) {
return;
}
errno = 0;
std::size_t ret = std::fread(ptr, size, 1, fp);
if (ferror(fp)) {
die_fmt("fread failed: %s", strerror(errno));
}
if (ret != 1) {
die("unexpectedly reached end of file");
}
}
std::uint32_t read_u32() {
std::uint32_t ret;
read_raw(&ret, sizeof(ret));
return ret;
}
std::float_t read_f32() {
std::float_t ret;
read_raw(&ret, sizeof(ret));
return ret;
}
std::string read_string(std::uint32_t len) {
std::vector<char> chars(len);
read_raw(chars.data(), len);
return std::string(chars.data(), len);
}
~llama_file() {
if (fp) {
std::fclose(fp);
}
}
};
bool is_ggml_file(const char *filename) {
llama_file file(filename, "rb");
if (file.size < 4) {
return false;
}
uint32_t magic = file.read_u32();
return magic == GGUF_MAGIC;
}
static std::string llama_escape_whitespaces(const std::string& text) {
std::ostringstream out;
for (char c : text) {
if (c == ' ') out << "\xe2\x96\x81";
else out << c;
}
return out.str();
}
void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
if (is_ggml_file(filename)) {
struct ggml_context * ctx_data = NULL;
struct gguf_init_params params = {
/*.no_alloc = */ false,
/*.ctx = */ &ctx_data,
};
struct gguf_context * ctx = gguf_init_from_file(filename, params);
GGML_ASSERT(ctx != NULL);
const int model_idx = gguf_find_key(ctx, KV_TOKENIZER_MODEL);
GGML_ASSERT(model_idx >= 0);
std::string tokenizer_name = gguf_get_val_str(ctx, model_idx);
GGML_ASSERT(tokenizer_name == TOKENIZER_NAME);
const int token_idx = gguf_find_key(ctx, KV_TOKENIZER_LIST);
GGML_ASSERT(token_idx >= 0);
const int score_idx = gguf_find_key(ctx, KV_TOKENIZER_SCORES);
GGML_ASSERT(score_idx >= 0);
const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
const int toktype_idx = gguf_find_key(ctx, KV_TOKENIZER_TOKEN_TYPE);
GGML_ASSERT(toktype_idx >= 0);
const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
vocab->id_to_token.resize(n_vocab);
for (uint32_t i = 0; i < n_vocab; i++) {
std::string word = gguf_get_arr_str(ctx, token_idx, i);
vocab->token_to_id[word] = i;
auto & token_data = vocab->id_to_token[i];
token_data.text = std::move(word);
token_data.score = scores[i];
token_data.type = (llama_token_type) toktypes[i];
}
ggml_free(ctx_data);
gguf_free(ctx);
} else {
// assume llama2.c vocabulary
printf("Assuming llama2.c vocabulary since %s is not a gguf file\n", filename);
llama_file file(filename, "rb");
if (!file.fp) {
die_fmt("%s: %s", strerror(errno), filename);
}
const int n_vocab = config->vocab_size;
/* uint32_t max_token_length = */ file.read_u32(); // unused
vocab->id_to_token.resize(n_vocab);
for (llama_vocab::id id=0; id<n_vocab; ++id) {
float_t score = file.read_f32();
uint32_t len = file.read_u32();
std::string text = file.read_string(len);
unsigned char byte_val;
llama_vocab::ttype type = LLAMA_TOKEN_TYPE_NORMAL;
if (id == UNKNOWN_TOKEN_ID) {
text = "<unk>";
type = LLAMA_TOKEN_TYPE_UNKNOWN;
} else if (id == BOS_TOKEN_ID) {
text = "<s>";
type = LLAMA_TOKEN_TYPE_CONTROL;
} else if (id == EOS_TOKEN_ID) {
text = "</s>";
type = LLAMA_TOKEN_TYPE_CONTROL;
} else if (text.empty()) {
type = LLAMA_TOKEN_TYPE_CONTROL;
} else if (sscanf(text.c_str(), "<0x%02hhX>", &byte_val) == 1) {
// Text of byte tokens is already in the expected format.
type = LLAMA_TOKEN_TYPE_BYTE;
} else {
type = LLAMA_TOKEN_TYPE_NORMAL;
}
text = llama_escape_whitespaces(text);
vocab->id_to_token[id].text = text;
vocab->id_to_token[id].score = score;
vocab->id_to_token[id].type = type;
vocab->token_to_id.emplace(text, id);
}
}
}
void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
int ct;
switch (gg_weights->n_dims){
case 1:
ct = 0;
for (int i0 = 0; i0 < gg_weights->ne[0]; i0++){
float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0]);
*ptr = karpathy_weights[ct];
ct++;
}
break;
case 2:
ct = 0;
for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) {
for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) {
float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1]);
*ptr = karpathy_weights[ct];
ct++;
}
}
break;
case 3:
ct = 0;
for (int i2 = 0; i2 < gg_weights->ne[2]; i2++) {
for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) {
for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) {
float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1] + i2*gg_weights->nb[2]);
*ptr = karpathy_weights[ct];
ct++;
}
}
}
break;
}
}
void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename) {
// convert AK weights into GG weights one by one.
// w->token_embedding_table -> model->tok_embeddings
// float* -> struct ggml_tensor
convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table);
convert_weights_ak_to_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table);
convert_weights_ak_to_gg(model->norm, w->rms_final_weight);
//print_row(model->norm, 0);
// for rms-att-weight
int row_length = model->hparams.n_embd;
int n_ff = model->hparams.n_ff;
for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
auto & layer = model->layers[i];
// 1d
convert_weights_ak_to_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
convert_weights_ak_to_gg(layer.ffn_norm , &w->rms_ffn_weight[i*row_length]);
// from 3d matrix layer x dim x dim to 2d matrix dim x dim
convert_weights_ak_to_gg(layer.wq , &w->wq[i*row_length*row_length]);
convert_weights_ak_to_gg(layer.wk , &w->wk[i*row_length*row_length]);
convert_weights_ak_to_gg(layer.wv , &w->wv[i*row_length*row_length]);
convert_weights_ak_to_gg(layer.wo , &w->wo[i*row_length*row_length]);
convert_weights_ak_to_gg(layer.w1 , &w->w1[i*row_length*n_ff]);
convert_weights_ak_to_gg(layer.w2 , &w->w2[i*n_ff*row_length]);
convert_weights_ak_to_gg(layer.w3 , &w->w3[i*row_length*n_ff]);
}
struct gguf_context * ctx = gguf_init_empty();
std::vector<const char*> tokens;
std::vector<float> scores;
std::vector<llama_token_type> token_types;
for (const llama_vocab::token_data & token_data : vocab->id_to_token) {
tokens.push_back(token_data.text.c_str());
scores.push_back(token_data.score);
token_types.push_back(token_data.type);
}
gguf_set_arr_str(ctx, KV_TOKENIZER_LIST, tokens.data(), tokens.size());
gguf_set_arr_data(ctx, KV_TOKENIZER_SCORES, GGUF_TYPE_FLOAT32, scores.data(), scores.size());
gguf_set_arr_data(ctx, KV_TOKENIZER_TOKEN_TYPE, GGUF_TYPE_INT32, token_types.data(), token_types.size());
gguf_set_val_str(ctx, KV_TOKENIZER_MODEL, TOKENIZER_NAME);
gguf_set_val_str(ctx, KV_GENERAL_ARCHITECTURE, "llama");
gguf_set_val_str(ctx, KV_GENERAL_NAME, "llama");
// special tokens
gguf_set_val_u32(ctx, KV_TOKENIZER_UNK_ID, UNKNOWN_TOKEN_ID);
gguf_set_val_u32(ctx, KV_TOKENIZER_BOS_ID, BOS_TOKEN_ID);
gguf_set_val_u32(ctx, KV_TOKENIZER_EOS_ID, EOS_TOKEN_ID);
gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, -1);
gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, -1);
gguf_set_val_u32(ctx, KV_CONTEXT_LENGTH, model->hparams.n_ctx);
gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd);
gguf_set_val_u32(ctx, KV_FEED_FORWARD_LENGTH, model->hparams.n_ff);
gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head);
// n_head_kv is optional, default to n_head
// gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, ...);
gguf_set_val_u32(ctx, KV_BLOCK_COUNT, model->hparams.n_layer);
gguf_set_val_u32(ctx, KV_ROPE_DIMENSION_COUNT, model->hparams.n_rot);
gguf_set_val_f32(ctx, KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f);
// write tensors
ggml_set_name(model->tok_embeddings, TN_TOKEN_EMBD);
gguf_add_tensor(ctx, model->tok_embeddings);
ggml_set_name(model->norm, TN_OUTPUT_NORM);
gguf_add_tensor(ctx, model->norm);
ggml_set_name(model->output, TN_OUTPUT);
gguf_add_tensor(ctx, model->output);
for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
auto & layer = model->layers[i];
ggml_format_name(layer.wq, TN_ATTN_Q, i);
gguf_add_tensor(ctx, layer.wq);
ggml_format_name(layer.wk, TN_ATTN_K, i);
gguf_add_tensor(ctx, layer.wk);
ggml_format_name(layer.wv, TN_ATTN_V, i);
gguf_add_tensor(ctx, layer.wv);
ggml_format_name(layer.wo, TN_ATTN_OUTPUT, i);
gguf_add_tensor(ctx, layer.wo);
ggml_format_name(layer.attention_norm, TN_ATTN_NORM, i);
gguf_add_tensor(ctx, layer.attention_norm);
ggml_format_name(layer.w1, TN_FFN_GATE, i);
gguf_add_tensor(ctx, layer.w1);
ggml_format_name(layer.w2, TN_FFN_DOWN, i);
gguf_add_tensor(ctx, layer.w2);
ggml_format_name(layer.w3, TN_FFN_UP, i);
gguf_add_tensor(ctx, layer.w3);
ggml_format_name(layer.ffn_norm, TN_FFN_NORM, i);
gguf_add_tensor(ctx, layer.ffn_norm);
}
gguf_write_to_file(ctx, filename, false);
gguf_free(ctx);
}
struct train_params get_default_train_params() {
struct train_params params;
params.fn_vocab_model = "models/7B/ggml-model-f16.gguf";
params.fn_llama2c_output_model = "ak_llama_model.bin";
params.fn_train_data = "shakespeare.txt";
params.fn_checkpoint_in = "checkpoint.bin";
params.fn_checkpoint_out = "checkpoint.bin";
params.fn_model_out = "ggml-checkpoint-f32.bin";
params.seed = -1;
params.n_ctx = 128;
params.n_embd = 256;
params.n_mult = 256;
params.n_head = 8;
params.n_layer = 16;
params.n_rotmax = 64;
params.n_threads = 6;
params.n_batch = 8;
params.n_examples = 8;
params.n_predict = 1024;
params.print_info_interval = 1;
params.print_details_interval = 2;
params.samples_start_after_nl = false;
params.use_adam = true;
params.use_flash = true;
params.use_scratch = true;
// only adam
params.warmup = 100;
params.cos_decay_steps = 1000;
params.cos_decay_restart = 1.1f;
params.cos_decay_alpha = 0.0f;
params.lbfgs_n_iter = 16;
params.adam_n_iter = 16;
params.adam_alpha = 1e-3f;
params.adam_decay = 1e-3f;
params.mem_model_gb = 2;
params.mem_compute_gb = 24;
params.mem_compute0_gb = 8;
params.mem_compute1_gb = 2;
return params;
}
void print_usage(int /*argc*/, char ** argv, const struct train_params * params) {
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " --copy-vocab-from-model FNAME path of gguf llama model or llama2.c vocabulary from which to copy vocab (default '%s')\n", params->fn_vocab_model);
fprintf(stderr, " --llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model\n");
fprintf(stderr, " --llama2c-output-model FNAME model path to save the converted llama2.c model (default %s')\n", params->fn_llama2c_output_model);
fprintf(stderr, "\n");
}
bool params_parse(int argc, char ** argv, struct train_params * params) {
bool invalid_param = false;
bool reqd_param_found = false;
std::string arg;
struct train_params default_params = get_default_train_params();
const std::string arg_prefix = "--";
for (int i = 1; i < argc; i++) {
arg = argv[i];
if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
std::replace(arg.begin(), arg.end(), '_', '-');
}
if (arg == "--copy-vocab-from-model") {
if (++i >= argc) {
invalid_param = true;
break;
}
params->fn_vocab_model = argv[i];
} else if (arg == "--llama2c-model") {
if (++i >= argc) {
invalid_param = true;
break;
}
reqd_param_found = true;
params->fn_llama2c_model = argv[i];
} else if (arg == "--llama2c-output-model") {
if (++i >= argc) {
invalid_param = true;
break;
}
params->fn_llama2c_output_model = argv[i];
} else if (arg == "-h" || arg == "--help") {
print_usage(argc, argv, &default_params);
exit(0);
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
print_usage(argc, argv, &default_params);
exit(1);
}
}
if (invalid_param) {
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
print_usage(argc, argv, &default_params);
exit(1);
}
if (!reqd_param_found){
fprintf(stderr, "error: please specify a llama2.c .bin file to be converted with argument --llama2c-model\n");
print_usage(argc, argv, &default_params);
exit(1);
}
return true;
}
std::string basename(const std::string &path) {
size_t pos = path.find_last_of("/\\");
if (pos == std::string::npos) {
return path;
}
return path.substr(pos + 1);
}
int main(int argc, char ** argv) {
struct train_params params = get_default_train_params();
if (!params_parse(argc, argv, &params)) {
return 1;
}
Config config;
TransformerWeights weights = {};
{
FILE *file = fopen(params.fn_llama2c_model, "rb");
if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; }
// read in the config header
if(fread(&config, sizeof(Config), 1, file) != 1) { return 1; }
auto shared_weights = config.vocab_size > 0;
config.vocab_size = abs(config.vocab_size);
// read in the Transformer weights
malloc_weights(&weights, &config, shared_weights);
if(checkpoint_init_weights(&weights, &config, file, shared_weights)) { return 1; }
fclose(file);
}
struct llama_vocab vocab;
load_vocab(params.fn_vocab_model, &config, &vocab);
struct my_llama_model model;
model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx);
model.hparams.n_ctx = params.n_ctx;
model.hparams.n_embd = config.dim; //params.n_embd;
model.hparams.n_ff = config.hidden_dim;
model.hparams.n_mult = 32;//params.n_mult;
model.hparams.n_head = config.n_heads; //params.n_head;
model.hparams.n_layer = config.n_layers; //params.n_layer;
model.hparams.n_rot = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head);
print_params(&model.hparams);
struct ggml_init_params lcparams;
lcparams.mem_size = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb);
lcparams.mem_buffer = NULL;
lcparams.no_alloc = false;
model.ctx = ggml_init(lcparams);
init_model(&model);
model.name = basename(params.fn_llama2c_model);
save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model);
printf("Saving llama.c model file %s in ggml format at %s\n", params.fn_llama2c_model, params.fn_llama2c_output_model);
ggml_free(model.ctx);
return 0;
}

View File

@@ -1,4 +0,0 @@
PandaGPT
MiniGPT-4
*.pth

View File

@@ -1,17 +0,0 @@
set(TARGET embdinput)
add_library(${TARGET} embd-input-lib.cpp embd-input.h)
install(TARGETS ${TARGET} LIBRARY)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()
set(TARGET embd-input-test)
add_executable(${TARGET} embd-input-test.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama embdinput ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View File

@@ -1,63 +0,0 @@
### Examples for input embedding directly
## Requirement
build `libembdinput.so`
run the following comman in main dir (../../).
```
make
```
## [LLaVA](https://github.com/haotian-liu/LLaVA/) example (llava.py)
1. Obtian LLaVA model (following https://github.com/haotian-liu/LLaVA/ , use https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/).
2. Convert it to ggml format.
3. `llava_projection.pth` is [pytorch_model-00003-of-00003.bin](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin).
```
import torch
bin_path = "../LLaVA-13b-delta-v1-1/pytorch_model-00003-of-00003.bin"
pth_path = "./examples/embd-input/llava_projection.pth"
dic = torch.load(bin_path)
used_key = ["model.mm_projector.weight","model.mm_projector.bias"]
torch.save({k: dic[k] for k in used_key}, pth_path)
```
4. Check the path of LLaVA model and `llava_projection.pth` in `llava.py`.
## [PandaGPT](https://github.com/yxuansu/PandaGPT) example (panda_gpt.py)
1. Obtian PandaGPT lora model from https://github.com/yxuansu/PandaGPT. Rename the file to `adapter_model.bin`. Use [convert-lora-to-ggml.py](../../convert-lora-to-ggml.py) to convert it to ggml format.
The `adapter_config.json` is
```
{
"peft_type": "LORA",
"fan_in_fan_out": false,
"bias": null,
"modules_to_save": null,
"r": 32,
"lora_alpha": 32,
"lora_dropout": 0.1,
"target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"]
}
```
2. Papare the `vicuna` v0 model.
3. Obtain the [ImageBind](https://dl.fbaipublicfiles.com/imagebind/imagebind_huge.pth) model.
4. Clone the PandaGPT source.
```
git clone https://github.com/yxuansu/PandaGPT
```
5. Install the requirement of PandaGPT.
6. Check the path of PandaGPT source, ImageBind model, lora model and vicuna model in panda_gpt.py.
## [MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4/) example (minigpt4.py)
1. Obtain MiniGPT-4 model from https://github.com/Vision-CAIR/MiniGPT-4/ and put it in `embd-input`.
2. Clone the MiniGPT-4 source.
```
git clone https://github.com/Vision-CAIR/MiniGPT-4/
```
3. Install the requirement of PandaGPT.
4. Papare the `vicuna` v0 model.
5. Check the path of MiniGPT-4 source, MiniGPT-4 model and vicuna model in `minigpt4.py`.

View File

@@ -1,218 +0,0 @@
#include "embd-input.h"
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
static llama_context ** g_ctx;
extern "C" {
struct MyModel* create_mymodel(int argc, char ** argv) {
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
return nullptr;
}
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = uint32_t(time(NULL));
}
fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
llama_backend_init(params.numa);
llama_model * model;
llama_context * ctx;
g_ctx = &ctx;
// load the model and apply lora adapter, if any
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__);
return nullptr;
}
// print system information
{
fprintf(stderr, "\n");
fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
}
struct MyModel * ret = new MyModel();
ret->ctx = ctx;
ret->params = params;
ret->n_past = 0;
// printf("ctx: %d\n", ret->ctx);
return ret;
}
void free_mymodel(struct MyModel * mymodel) {
llama_context * ctx = mymodel->ctx;
llama_print_timings(ctx);
llama_free(ctx);
delete mymodel;
}
bool eval_float(void * model, float * input, int N){
MyModel * mymodel = (MyModel*)model;
llama_context * ctx = mymodel->ctx;
gpt_params params = mymodel->params;
int n_emb = llama_n_embd(ctx);
int n_past = mymodel->n_past;
int n_batch = N; // params.n_batch;
for (int i = 0; i < (int) N; i += n_batch) {
int n_eval = (int) N - i;
if (n_eval > n_batch) {
n_eval = n_batch;
}
if (llama_eval_embd(ctx, (input+i*n_emb), n_eval, n_past, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return false;
}
n_past += n_eval;
}
mymodel->n_past = n_past;
return true;
}
bool eval_tokens(void * model, std::vector<llama_token> tokens) {
MyModel * mymodel = (MyModel* )model;
llama_context * ctx;
ctx = mymodel->ctx;
gpt_params params = mymodel->params;
int n_past = mymodel->n_past;
for (int i = 0; i < (int) tokens.size(); i += params.n_batch) {
int n_eval = (int) tokens.size() - i;
if (n_eval > params.n_batch) {
n_eval = params.n_batch;
}
if (llama_eval(ctx, &tokens[i], n_eval, n_past, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return false;
}
n_past += n_eval;
}
mymodel->n_past = n_past;
return true;
}
bool eval_id(struct MyModel* mymodel, int id) {
std::vector<llama_token> tokens;
tokens.push_back(id);
return eval_tokens(mymodel, tokens);
}
bool eval_string(struct MyModel * mymodel,const char* str){
llama_context * ctx = mymodel->ctx;
std::string str2 = str;
std::vector<llama_token> embd_inp = ::llama_tokenize(ctx, str2, true);
eval_tokens(mymodel, embd_inp);
return true;
}
llama_token sampling_id(struct MyModel* mymodel) {
llama_context* ctx = mymodel->ctx;
gpt_params params = mymodel->params;
// int n_ctx = llama_n_ctx(ctx);
// out of user input, sample next token
const float temp = params.temp;
const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
const float top_p = params.top_p;
const float tfs_z = params.tfs_z;
const float typical_p = params.typical_p;
// const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
// const float repeat_penalty = params.repeat_penalty;
// const float alpha_presence = params.presence_penalty;
// const float alpha_frequency = params.frequency_penalty;
const int mirostat = params.mirostat;
const float mirostat_tau = params.mirostat_tau;
const float mirostat_eta = params.mirostat_eta;
// const bool penalize_nl = params.penalize_nl;
llama_token id = 0;
{
auto logits = llama_get_logits(ctx);
auto n_vocab = llama_n_vocab(ctx);
// Apply params.logit_bias map
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
logits[it->first] += it->second;
}
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
// TODO: Apply penalties
// float nl_logit = logits[llama_token_nl(ctx)];
// auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
// llama_sample_repetition_penalty(ctx, &candidates_p,
// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
// last_n_repeat, repeat_penalty);
// llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
// last_n_repeat, alpha_frequency, alpha_presence);
// if (!penalize_nl) {
// logits[llama_token_nl(ctx)] = nl_logit;
// }
if (temp <= 0) {
// Greedy sampling
id = llama_sample_token_greedy(ctx, &candidates_p);
} else {
if (mirostat == 1) {
static float mirostat_mu = 2.0f * mirostat_tau;
const int mirostat_m = 100;
llama_sample_temperature(ctx, &candidates_p, temp);
id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
} else if (mirostat == 2) {
static float mirostat_mu = 2.0f * mirostat_tau;
llama_sample_temperature(ctx, &candidates_p, temp);
id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
} else {
// Temperature sampling
llama_sample_top_k(ctx, &candidates_p, top_k, 1);
llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
llama_sample_typical(ctx, &candidates_p, typical_p, 1);
llama_sample_top_p(ctx, &candidates_p, top_p, 1);
llama_sample_temperature(ctx, &candidates_p, temp);
id = llama_sample_token(ctx, &candidates_p);
}
}
}
return id;
}
const char * sampling(struct MyModel * mymodel) {
llama_context * ctx = mymodel->ctx;
int id = sampling_id(mymodel);
static std::string ret;
if (id == llama_token_eos(ctx)) {
ret = "</s>";
} else {
ret = llama_token_to_piece(ctx, id);
}
eval_id(mymodel, id);
return ret.c_str();
}
}

View File

@@ -1,35 +0,0 @@
#include "embd-input.h"
#include <stdlib.h>
#include <random>
#include <string.h>
int main(int argc, char** argv) {
auto mymodel = create_mymodel(argc, argv);
int N = 10;
int max_tgt_len = 500;
int n_embd = llama_n_embd(mymodel->ctx);
// add random float embd to test evaluation
float * data = new float[N*n_embd];
std::default_random_engine e;
std::uniform_real_distribution<float> u(0,1);
for (int i=0;i<N*n_embd;i++) {
data[i] = u(e);
}
eval_string(mymodel, "user: what is the color of the flag of UN?");
eval_float(mymodel, data, N);
eval_string(mymodel, "assistant:");
eval_string(mymodel, mymodel->params.prompt.c_str());
const char* tmp;
for (int i=0; i<max_tgt_len; i++) {
tmp = sampling(mymodel);
if (strcmp(tmp, "</s>")==0) break;
printf("%s", tmp);
fflush(stdout);
}
printf("\n");
free_mymodel(mymodel);
return 0;
}

View File

@@ -1,28 +0,0 @@
#ifndef _EMBD_INPUT_H_
#define _EMBD_INPUT_H_ 1
#include "common.h"
#include "llama.h"
#include "build-info.h"
extern "C" {
typedef struct MyModel {
llama_context* ctx;
gpt_params params;
int n_past = 0;
} MyModel;
struct MyModel* create_mymodel(int argc, char ** argv);
bool eval_float(void* model, float* input, int N);
bool eval_tokens(void* model, std::vector<llama_token> tokens);
bool eval_id(struct MyModel* mymodel, int id);
bool eval_string(struct MyModel* mymodel, const char* str);
const char * sampling(struct MyModel* mymodel);
llama_token sampling_id(struct MyModel* mymodel);
void free_mymodel(struct MyModel* mymodel);
}
#endif

View File

@@ -1,72 +0,0 @@
#!/usr/bin/env python3
import ctypes
from ctypes import cdll, c_char_p, c_void_p, POINTER, c_float, c_int
import numpy as np
import os
libc = cdll.LoadLibrary("./libembdinput.so")
libc.sampling.restype=c_char_p
libc.create_mymodel.restype=c_void_p
libc.eval_string.argtypes=[c_void_p, c_char_p]
libc.sampling.argtypes=[c_void_p]
libc.eval_float.argtypes=[c_void_p, POINTER(c_float), c_int]
class MyModel:
def __init__(self, args):
argc = len(args)
c_str = [c_char_p(i.encode()) for i in args]
args_c = (c_char_p * argc)(*c_str)
self.model = c_void_p(libc.create_mymodel(argc, args_c))
self.max_tgt_len = 512
self.print_string_eval = True
def __del__(self):
libc.free_mymodel(self.model)
def eval_float(self, x):
libc.eval_float(self.model, x.astype(np.float32).ctypes.data_as(POINTER(c_float)), x.shape[1])
def eval_string(self, x):
libc.eval_string(self.model, x.encode()) # c_char_p(x.encode()))
if self.print_string_eval:
print(x)
def eval_token(self, x):
libc.eval_id(self.model, x)
def sampling(self):
s = libc.sampling(self.model)
return s
def stream_generate(self, end="</s>"):
ret = b""
end = end.encode()
for _ in range(self.max_tgt_len):
tmp = self.sampling()
ret += tmp
yield tmp
if ret.endswith(end):
break
def generate_with_print(self, end="</s>"):
ret = b""
for i in self.stream_generate(end=end):
ret += i
print(i.decode(errors="replace"), end="", flush=True)
print("")
return ret.decode(errors="replace")
def generate(self, end="</s>"):
text = b"".join(self.stream_generate(end=end))
return text.decode(errors="replace")
if __name__ == "__main__":
model = MyModel(["main", "--model", "../llama.cpp/models/ggml-vic13b-q4_1.bin", "-c", "2048"])
model.eval_string("""user: what is the color of the flag of UN?""")
x = np.random.random((5120,10))# , dtype=np.float32)
model.eval_float(x)
model.eval_string("""assistant:""")
for i in model.generate():
print(i.decode(errors="replace"), end="", flush=True)

View File

@@ -1,71 +0,0 @@
#!/usr/bin/env python3
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
from embd_input import MyModel
import numpy as np
from torch import nn
import torch
from transformers import CLIPVisionModel, CLIPImageProcessor
from PIL import Image
# model parameters from 'liuhaotian/LLaVA-13b-delta-v1-1'
vision_tower = "openai/clip-vit-large-patch14"
select_hidden_state_layer = -2
# (vision_config.image_size // vision_config.patch_size) ** 2
image_token_len = (224//14)**2
class Llava:
def __init__(self, args):
self.image_processor = CLIPImageProcessor.from_pretrained(vision_tower)
self.vision_tower = CLIPVisionModel.from_pretrained(vision_tower)
self.mm_projector = nn.Linear(1024, 5120)
self.model = MyModel(["main", *args])
def load_projection(self, path):
state = torch.load(path)
self.mm_projector.load_state_dict({
"weight": state["model.mm_projector.weight"],
"bias": state["model.mm_projector.bias"]})
def chat(self, question):
self.model.eval_string("user: ")
self.model.eval_string(question)
self.model.eval_string("\nassistant: ")
return self.model.generate_with_print()
def chat_with_image(self, image, question):
with torch.no_grad():
embd_image = self.image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
image_forward_out = self.vision_tower(embd_image.unsqueeze(0), output_hidden_states=True)
select_hidden_state = image_forward_out.hidden_states[select_hidden_state_layer]
image_feature = select_hidden_state[:, 1:]
embd_image = self.mm_projector(image_feature)
embd_image = embd_image.cpu().numpy()[0]
self.model.eval_string("user: ")
self.model.eval_token(32003-2) # im_start
self.model.eval_float(embd_image.T)
for i in range(image_token_len-embd_image.shape[0]):
self.model.eval_token(32003-3) # im_patch
self.model.eval_token(32003-1) # im_end
self.model.eval_string(question)
self.model.eval_string("\nassistant: ")
return self.model.generate_with_print()
if __name__=="__main__":
# model form liuhaotian/LLaVA-13b-delta-v1-1
a = Llava(["--model", "./models/ggml-llava-13b-v1.1.bin", "-c", "2048"])
# Extract from https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin.
# Also here can use pytorch_model-00003-of-00003.bin directly.
a.load_projection(os.path.join(
os.path.dirname(__file__) ,
"llava_projection.pth"))
respose = a.chat_with_image(
Image.open("./media/llama1-logo.png").convert('RGB'),
"what is the text in the picture?")
respose
a.chat("what is the color of it?")

View File

@@ -1,129 +0,0 @@
#!/usr/bin/env python3
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
from embd_input import MyModel
import numpy as np
from torch import nn
import torch
from PIL import Image
minigpt4_path = os.path.join(os.path.dirname(__file__), "MiniGPT-4")
sys.path.insert(0, minigpt4_path)
from minigpt4.models.blip2 import Blip2Base
from minigpt4.processors.blip_processors import Blip2ImageEvalProcessor
class MiniGPT4(Blip2Base):
"""
MiniGPT4 model from https://github.com/Vision-CAIR/MiniGPT-4
"""
def __init__(self,
args,
vit_model="eva_clip_g",
q_former_model="https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth",
img_size=224,
drop_path_rate=0,
use_grad_checkpoint=False,
vit_precision="fp32",
freeze_vit=True,
freeze_qformer=True,
num_query_token=32,
llama_model="",
prompt_path="",
prompt_template="",
max_txt_len=32,
end_sym='\n',
low_resource=False, # use 8 bit and put vit in cpu
device_8bit=0
):
super().__init__()
self.img_size = img_size
self.low_resource = low_resource
self.preprocessor = Blip2ImageEvalProcessor(img_size)
print('Loading VIT')
self.visual_encoder, self.ln_vision = self.init_vision_encoder(
vit_model, img_size, drop_path_rate, use_grad_checkpoint, vit_precision
)
print('Loading VIT Done')
print('Loading Q-Former')
self.Qformer, self.query_tokens = self.init_Qformer(
num_query_token, self.visual_encoder.num_features
)
self.Qformer.cls = None
self.Qformer.bert.embeddings.word_embeddings = None
self.Qformer.bert.embeddings.position_embeddings = None
for layer in self.Qformer.bert.encoder.layer:
layer.output = None
layer.intermediate = None
self.load_from_pretrained(url_or_filename=q_former_model)
print('Loading Q-Former Done')
self.llama_proj = nn.Linear(
self.Qformer.config.hidden_size, 5120 # self.llama_model.config.hidden_size
)
self.max_txt_len = max_txt_len
self.end_sym = end_sym
self.model = MyModel(["main", *args])
# system prompt
self.model.eval_string("Give the following image: <Img>ImageContent</Img>. "
"You will be able to see the image once I provide it to you. Please answer my questions."
"###")
def encode_img(self, image):
image = self.preprocessor(image)
image = image.unsqueeze(0)
device = image.device
if self.low_resource:
self.vit_to_cpu()
image = image.to("cpu")
with self.maybe_autocast():
image_embeds = self.ln_vision(self.visual_encoder(image)).to(device)
image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(device)
query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
query_output = self.Qformer.bert(
query_embeds=query_tokens,
encoder_hidden_states=image_embeds,
encoder_attention_mask=image_atts,
return_dict=True,
)
inputs_llama = self.llama_proj(query_output.last_hidden_state)
# atts_llama = torch.ones(inputs_llama.size()[:-1], dtype=torch.long).to(image.device)
return inputs_llama
def load_projection(self, path):
state = torch.load(path)["model"]
self.llama_proj.load_state_dict({
"weight": state["llama_proj.weight"],
"bias": state["llama_proj.bias"]})
def chat(self, question):
self.model.eval_string("Human: ")
self.model.eval_string(question)
self.model.eval_string("\n### Assistant:")
return self.model.generate_with_print(end="###")
def chat_with_image(self, image, question):
with torch.no_grad():
embd_image = self.encode_img(image)
embd_image = embd_image.cpu().numpy()[0]
self.model.eval_string("Human: <Img>")
self.model.eval_float(embd_image.T)
self.model.eval_string("</Img> ")
self.model.eval_string(question)
self.model.eval_string("\n### Assistant:")
return self.model.generate_with_print(end="###")
if __name__=="__main__":
a = MiniGPT4(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048"])
a.load_projection(os.path.join(
os.path.dirname(__file__) ,
"pretrained_minigpt4.pth"))
respose = a.chat_with_image(
Image.open("./media/llama1-logo.png").convert('RGB'),
"what is the text in the picture?")
a.chat("what is the color of it?")

View File

@@ -1,99 +0,0 @@
#!/usr/bin/env python3
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
from embd_input import MyModel
import numpy as np
from torch import nn
import torch
# use PandaGPT path
panda_gpt_path = os.path.join(os.path.dirname(__file__), "PandaGPT")
imagebind_ckpt_path = "./models/panda_gpt/"
sys.path.insert(0, os.path.join(panda_gpt_path,"code","model"))
from ImageBind.models import imagebind_model
from ImageBind import data
ModalityType = imagebind_model.ModalityType
max_tgt_len = 400
class PandaGPT:
def __init__(self, args):
self.visual_encoder,_ = imagebind_model.imagebind_huge(pretrained=True, store_path=imagebind_ckpt_path)
self.visual_encoder.eval()
self.llama_proj = nn.Linear(1024, 5120) # self.visual_hidden_size, 5120)
self.max_tgt_len = max_tgt_len
self.model = MyModel(["main", *args])
self.generated_text = ""
self.device = "cpu"
def load_projection(self, path):
state = torch.load(path, map_location="cpu")
self.llama_proj.load_state_dict({
"weight": state["llama_proj.weight"],
"bias": state["llama_proj.bias"]})
def eval_inputs(self, inputs):
self.model.eval_string("<Img>")
embds = self.extract_multimoal_feature(inputs)
for i in embds:
self.model.eval_float(i.T)
self.model.eval_string("</Img> ")
def chat(self, question):
return self.chat_with_image(None, question)
def chat_with_image(self, inputs, question):
if self.generated_text == "":
self.model.eval_string("###")
self.model.eval_string(" Human: ")
if inputs:
self.eval_inputs(inputs)
self.model.eval_string(question)
self.model.eval_string("\n### Assistant:")
ret = self.model.generate_with_print(end="###")
self.generated_text += ret
return ret
def extract_multimoal_feature(self, inputs):
features = []
for key in ["image", "audio", "video", "thermal"]:
if key + "_paths" in inputs:
embeds = self.encode_data(key, inputs[key+"_paths"])
features.append(embeds)
return features
def encode_data(self, data_type, data_paths):
type_map = {
"image": ModalityType.VISION,
"audio": ModalityType.AUDIO,
"video": ModalityType.VISION,
"thermal": ModalityType.THERMAL,
}
load_map = {
"image": data.load_and_transform_vision_data,
"audio": data.load_and_transform_audio_data,
"video": data.load_and_transform_video_data,
"thermal": data.load_and_transform_thermal_data
}
load_function = load_map[data_type]
key = type_map[data_type]
inputs = {key: load_function(data_paths, self.device)}
with torch.no_grad():
embeddings = self.visual_encoder(inputs)
embeds = embeddings[key]
embeds = self.llama_proj(embeds).cpu().numpy()
return embeds
if __name__=="__main__":
a = PandaGPT(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048", "--lora", "./models/panda_gpt/ggml-adapter-model.bin","--temp", "0"])
a.load_projection("./models/panda_gpt/adapter_model.bin")
a.chat_with_image(
{"image_paths": ["./media/llama1-logo.png"]},
"what is the text in the picture? 'llama' or 'lambda'?")
a.chat("what is the color of it?")

View File

@@ -1,8 +1,4 @@
set(TARGET embedding)
add_executable(${TARGET} embedding.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View File

@@ -1,51 +1,55 @@
#include "common.h"
#include "llama.h"
#include "build-info.h"
#include <ctime>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
int main(int argc, char ** argv) {
gpt_params params;
params.model = "models/llama-7B/ggml-model.bin";
if (!gpt_params_parse(argc, argv, params)) {
if (gpt_params_parse(argc, argv, params) == false) {
return 1;
}
params.embedding = true;
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
if (params.n_ctx > 2048) {
fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
"expect poor results\n", __func__, params.n_ctx);
}
if (params.seed == LLAMA_DEFAULT_SEED) {
if (params.seed <= 0) {
params.seed = time(NULL);
}
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
std::mt19937 rng(params.seed);
if (params.random_prompt) {
params.prompt = gpt_random_prompt(rng);
}
llama_backend_init(params.numa);
llama_model * model;
llama_context * ctx;
// load the model
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__);
return 1;
}
{
auto lparams = llama_context_default_params();
const int n_ctx_train = llama_n_ctx_train(ctx);
if (params.n_ctx > n_ctx_train) {
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, params.n_ctx);
lparams.n_ctx = params.n_ctx;
lparams.n_parts = params.n_parts;
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.logits_all = params.perplexity;
lparams.use_mmap = params.use_mmap;
lparams.use_mlock = params.use_mlock;
lparams.embedding = params.embedding;
ctx = llama_init_from_file(params.model.c_str(), lparams);
if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
return 1;
}
}
// print system information
@@ -57,48 +61,44 @@ int main(int argc, char ** argv) {
int n_past = 0;
// Add a space in front of the first character to match OG llama tokenizer behavior
params.prompt.insert(0, 1, ' ');
// tokenize the prompt
auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
// determine newline token
auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
if (params.verbose_prompt) {
fprintf(stderr, "\n");
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
for (int i = 0; i < (int) embd_inp.size(); i++) {
fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
}
fprintf(stderr, "\n");
}
if (embd_inp.size() > (size_t)params.n_ctx) {
fprintf(stderr, "%s: error: prompt is longer than the context window (%zu tokens, n_ctx = %d)\n",
__func__, embd_inp.size(), params.n_ctx);
return 1;
}
while (!embd_inp.empty()) {
int n_tokens = std::min(params.n_batch, (int) embd_inp.size());
if (llama_eval(ctx, embd_inp.data(), n_tokens, n_past, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return 1;
if (params.embedding){
if (embd_inp.size() > 0) {
if (llama_eval(ctx, embd_inp.data(), embd_inp.size(), n_past, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return 1;
}
}
n_past += n_tokens;
embd_inp.erase(embd_inp.begin(), embd_inp.begin() + n_tokens);
}
const int n_embd = llama_n_embd(ctx);
const auto embeddings = llama_get_embeddings(ctx);
const int n_embd = llama_n_embd(ctx);
const auto embeddings = llama_get_embeddings(ctx);
for (int i = 0; i < n_embd; i++) {
printf("%f ", embeddings[i]);
for (int i = 0; i < n_embd; i++) {
printf("%f ", embeddings[i]);
}
printf("\n");
}
printf("\n");
llama_print_timings(ctx);
llama_free(ctx);
llama_free_model(model);
llama_backend_free();
return 0;
}

View File

@@ -1,5 +0,0 @@
set(TARGET gguf)
add_executable(${TARGET} gguf.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View File

@@ -1,249 +0,0 @@
#include "ggml.h"
#include "llama.h"
#include <cstdio>
#include <cinttypes>
#include <string>
#include <sstream>
#include <fstream>
#include <vector>
#undef MIN
#undef MAX
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
template<typename T>
static std::string to_string(const T & val) {
std::stringstream ss;
ss << val;
return ss.str();
}
bool gguf_ex_write(const std::string & fname) {
struct gguf_context * ctx = gguf_init_empty();
gguf_set_val_u8 (ctx, "some.parameter.uint8", 0x12);
gguf_set_val_i8 (ctx, "some.parameter.int8", -0x13);
gguf_set_val_u16 (ctx, "some.parameter.uint16", 0x1234);
gguf_set_val_i16 (ctx, "some.parameter.int16", -0x1235);
gguf_set_val_u32 (ctx, "some.parameter.uint32", 0x12345678);
gguf_set_val_i32 (ctx, "some.parameter.int32", -0x12345679);
gguf_set_val_f32 (ctx, "some.parameter.float32", 0.123456789f);
gguf_set_val_u64 (ctx, "some.parameter.uint64", 0x123456789abcdef0ull);
gguf_set_val_i64 (ctx, "some.parameter.int64", -0x123456789abcdef1ll);
gguf_set_val_f64 (ctx, "some.parameter.float64", 0.1234567890123456789);
gguf_set_val_bool(ctx, "some.parameter.bool", true);
gguf_set_val_str (ctx, "some.parameter.string", "hello world");
gguf_set_arr_data(ctx, "some.parameter.arr.i16", GGUF_TYPE_INT16, std::vector<int16_t>{ 1, 2, 3, 4, }.data(), 4);
gguf_set_arr_data(ctx, "some.parameter.arr.f32", GGUF_TYPE_FLOAT32, std::vector<float>{ 3.145f, 2.718f, 1.414f, }.data(), 3);
gguf_set_arr_str (ctx, "some.parameter.arr.str", std::vector<const char *>{ "hello", "world", "!" }.data(), 3);
struct ggml_init_params params = {
/*.mem_size =*/ 128ull*1024ull*1024ull,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ false,
};
struct ggml_context * ctx_data = ggml_init(params);
const int n_tensors = 10;
// tensor infos
for (int i = 0; i < n_tensors; ++i) {
const std::string name = "tensor_" + to_string(i);
int64_t ne[GGML_MAX_DIMS] = { 1 };
int32_t n_dims = rand() % GGML_MAX_DIMS + 1;
for (int j = 0; j < n_dims; ++j) {
ne[j] = rand() % 10 + 1;
}
struct ggml_tensor * cur = ggml_new_tensor(ctx_data, GGML_TYPE_F32, n_dims, ne);
ggml_set_name(cur, name.c_str());
{
float * data = (float *) cur->data;
for (int j = 0; j < ggml_nelements(cur); ++j) {
data[j] = 100 + i;
}
}
gguf_add_tensor(ctx, cur);
}
gguf_write_to_file(ctx, fname.c_str(), false);
printf("%s: wrote file '%s;\n", __func__, fname.c_str());
ggml_free(ctx_data);
gguf_free(ctx);
return true;
}
// just read tensor info
bool gguf_ex_read_0(const std::string & fname) {
struct gguf_init_params params = {
/*.no_alloc = */ false,
/*.ctx = */ NULL,
};
struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
printf("%s: version: %d\n", __func__, gguf_get_version(ctx));
printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
// kv
{
const int n_kv = gguf_get_n_kv(ctx);
printf("%s: n_kv: %d\n", __func__, n_kv);
for (int i = 0; i < n_kv; ++i) {
const char * key = gguf_get_key(ctx, i);
printf("%s: kv[%d]: key = %s\n", __func__, i, key);
}
}
// find kv string
{
const char * findkey = "some.parameter.string";
const int keyidx = gguf_find_key(ctx, findkey);
if (keyidx == -1) {
printf("%s: find key: %s not found.\n", __func__, findkey);
} else {
const char * key_value = gguf_get_val_str(ctx, keyidx);
printf("%s: find key: %s found, kv[%d] value = %s\n", __func__, findkey, keyidx, key_value);
}
}
// tensor info
{
const int n_tensors = gguf_get_n_tensors(ctx);
printf("%s: n_tensors: %d\n", __func__, n_tensors);
for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name (ctx, i);
const size_t offset = gguf_get_tensor_offset(ctx, i);
printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
}
}
gguf_free(ctx);
return true;
}
// read and create ggml_context containing the tensors and their data
bool gguf_ex_read_1(const std::string & fname) {
struct ggml_context * ctx_data = NULL;
struct gguf_init_params params = {
/*.no_alloc = */ false,
/*.ctx = */ &ctx_data,
};
struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
printf("%s: version: %d\n", __func__, gguf_get_version(ctx));
printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
// kv
{
const int n_kv = gguf_get_n_kv(ctx);
printf("%s: n_kv: %d\n", __func__, n_kv);
for (int i = 0; i < n_kv; ++i) {
const char * key = gguf_get_key(ctx, i);
printf("%s: kv[%d]: key = %s\n", __func__, i, key);
}
}
// tensor info
{
const int n_tensors = gguf_get_n_tensors(ctx);
printf("%s: n_tensors: %d\n", __func__, n_tensors);
for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name (ctx, i);
const size_t offset = gguf_get_tensor_offset(ctx, i);
printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
}
}
// data
{
const int n_tensors = gguf_get_n_tensors(ctx);
for (int i = 0; i < n_tensors; ++i) {
printf("%s: reading tensor %d data\n", __func__, i);
const char * name = gguf_get_tensor_name(ctx, i);
struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data);
// print first 10 elements
const float * data = (const float *) cur->data;
printf("%s data[:10] : ", name);
for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) {
printf("%f ", data[j]);
}
printf("\n\n");
// check data
{
const float * data = (const float *) cur->data;
for (int j = 0; j < ggml_nelements(cur); ++j) {
if (data[j] != 100 + i) {
fprintf(stderr, "%s: tensor[%d]: data[%d] = %f\n", __func__, i, j, data[j]);
return false;
}
}
}
}
}
printf("%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data));
ggml_free(ctx_data);
gguf_free(ctx);
return true;
}
int main(int argc, char ** argv) {
if (argc < 3) {
printf("usage: %s data.gguf r|w\n", argv[0]);
return -1;
}
const std::string fname(argv[1]);
const std::string mode (argv[2]);
GGML_ASSERT((mode == "r" || mode == "w") && "mode must be r or w");
if (mode == "w") {
GGML_ASSERT(gguf_ex_write(fname) && "failed to write gguf file");
} else if (mode == "r") {
GGML_ASSERT(gguf_ex_read_0(fname) && "failed to read gguf file");
GGML_ASSERT(gguf_ex_read_1(fname) && "failed to read gguf file");
}
return 0;
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,21 +0,0 @@
# llama.cpp/example/jeopardy
This is pretty much just a straight port of aigoopy/llm-jeopardy/ with an added graph viewer.
The jeopardy test can be used to compare the fact knowledge of different models and compare them to eachother. This is in contrast to some other tests, which test logical deduction, creativity, writing skills, etc.
Step 1: Open jeopardy.sh and modify the following:
```
MODEL=(path to your model)
MODEL_NAME=(name of your model)
prefix=(basically, if you use vicuna it's Human: , if you use something else it might be User: , etc)
opts=(add -instruct here if needed for your model, or anything else you want to test out)
```
Step 2: Run `jeopardy.sh` from the llama.cpp folder
Step 3: Repeat steps 1 and 2 until you have all the results you need.
Step 4: Run `graph.py`, and follow the instructions. At the end, it will generate your final graph.
Note: The Human bar is based off of the full, original 100 sample questions. If you modify the question count or questions, it will not be valid.

View File

@@ -1,58 +0,0 @@
#!/usr/bin/env python3
import matplotlib.pyplot as plt
import os
import csv
labels = []
numbers = []
numEntries = 1
rows = []
def bar_chart(numbers, labels, pos):
plt.bar(pos, numbers, color='blue')
plt.xticks(ticks=pos, labels=labels)
plt.title("Jeopardy Results by Model")
plt.xlabel("Model")
plt.ylabel("Questions Correct")
plt.show()
def calculatecorrect():
directory = os.fsencode("./examples/jeopardy/results/")
csv_reader = csv.reader(open("./examples/jeopardy/qasheet.csv", 'rt'), delimiter=',')
for row in csv_reader:
global rows
rows.append(row)
for listing in os.listdir(directory):
filename = os.fsdecode(listing)
if filename.endswith(".txt"):
file = open("./examples/jeopardy/results/" + filename, "rt")
global labels
global numEntries
global numbers
labels.append(filename[:-4])
numEntries += 1
i = 1
totalcorrect = 0
for line in file.readlines():
if line.strip() != "------":
print(line)
else:
print("Correct answer: " + rows[i][2] + "\n")
i += 1
print("Did the AI get the question right? (y/n)")
if input() == "y":
totalcorrect += 1
numbers.append(totalcorrect)
if __name__ == '__main__':
calculatecorrect()
pos = list(range(numEntries))
labels.append("Human")
numbers.append(48.11)
bar_chart(numbers, labels, pos)
print(labels)
print(numbers)

View File

@@ -1,30 +0,0 @@
#!/bin/bash
set -e
MODEL=./models/ggml-vicuna-13b-1.1-q4_0.bin
MODEL_NAME=Vicuna
# exec options
prefix="Human: " # Ex. Vicuna uses "Human: "
opts="--temp 0 -n 80" # additional flags
nl='
'
introduction="You will be playing a game of Jeopardy. Simply answer the question in the correct format (Ex. What is Paris, or Who is George Washington)."
# file options
question_file=./examples/jeopardy/questions.txt
touch ./examples/jeopardy/results/$MODEL_NAME.txt
output_file=./examples/jeopardy/results/$MODEL_NAME.txt
counter=1
echo 'Running'
while IFS= read -r question
do
exe_cmd="./main -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\""
echo $counter
echo "Current Question: $question"
eval "$exe_cmd"
echo -e "\n------" >> $output_file
counter=$((counter+1))
done < "$question_file"

View File

@@ -1,103 +0,0 @@
Index,Original Category,Original Correct Question,Model Prompt
1,The Oscars,Who is John Williams?,Which actor Born in 1932 was the son of a percussionist in the CBS radio orchestra has been nominated for 53 Oscars?
2,English Literature,What is Paradise Lost?,"What work in English Literature says: 'The mind is its own place, & in itself can make a heaven of hell, a hell of heaven. What matter where, if I be still the same'?"
3,Writers Lesser-Known Works,Who is Niccolò Machiavelli?,"Known for more philosophical works, he wrote the play 'La Mandragola', in which Florentines are rewarded for immoral actions?"
4,Exploration,What is Easter Island (Rapa Nui)?,"James Cook's account of a 1774 visit where records an object 'near 27 feet long, and upwards of 8 feet over the breast or shoulders'?"
5,The Bill of Rights,What is the Eighth Amendment?,England's 'Bloody Assizes' & a 1685 life sentence for perjury were 2 main origins of which amendment to the U.S. Constitution?
6,Nobel Peace Prize Winners,Who are Nelson Mandela & Desmond Tutu?,"Which nobel peace price winners each lived at times on Vilakazi St. in Soweto , so it claims to be the world's only street home to 2 Nobel Peace Prize winners?"
7,Famous Names,Who is Walt Disney?,"In 1966, the year of who's death did he share plans for an experimental prototype community in Florida?"
8,Geography,What is Colombia?,"Of the 13 nations through which the Equator passes, what is the only one whose coastline borders the Caribbean Sea?"
9,Fashion History,What are rhinestones?,"Which decorative items in fashion history get their name from their origin in the port city of Strasbourg, on the border of France & Germany?"
10,Movies of the 80s,What is Driving Miss Daisy?,What 1980's movie is based on an off-Broadway play with just 3 characters and won the Best Picture Oscar & the actors in all 3 roles were nominated?
11,Novelists,Who is John Grisham?,"A 2012 book review for which novelist noted subjects that 'sparked his ire': capital punishment, big tobacco & 'the plight of the unjustly convicted'?"
12,20th Century Eponyms,What is the Maginot Line?,"A 1940 headline about what 20th Century Eponym included 'failure', 'liability when it came to offense' & 'stout hearts no match for tanks'?"
13,City History,What is Stockholm?,"Over 700 years after its traditional 1252 founding date, what port city became associated with a psychological response?"
14,Brand Names,What is Jacuzzi?,"The success of what brand has its roots with a hydrotherapy pump its cofounder created for his son, who had arthritis?"
15,American Authors,Who is Washington Irving?,"In a periodical in 1807, what American Author called New York City 'Gotham, Gotham! Most enlightened of cities'?"
16,Symbols,What is “less than”?,What symbol is a rotated V in math and a feeling of some marginalized or underrepresented people in society?
17,Movie Theme Songs,Who is James Bond?,"Monty Norman, the composer of what character's theme, said the staccato riff conveyed sexiness, mystery & ruthlessness?"
18,American Novelists,Who is Joseph Heller?,"What American Novelist served with an airman named Yohannan in World War II & despite what readers might think, he said he enjoyed his service?"
19,Medieval Places,"What is Canterbury, England? (Canterbury Cathedral)","In what Medieval place did one of the participants in an 1170 event say, 'Let us away, knights; he will rise no more'?"
20,Countries of Africa,What is Morocco?,"At one time a province of the Roman Empire, what African country kingdom is known to Arabic scholars as Al-Maghrib Al-Aqsa, 'the far west'?"
21,Statehood,What is Wyoming?,Congress relented in 1890 after what prospective state said it would wait 100 years rather than come in without the women?
22,1980s Movies,What is Raiders of the Lost Ark?,"A writer & producer of what movie said he wanted it to be like a Western or James Bond film, 'only it takes place in the 30s'?"
23,Art Exhibitions,Who is Rembrandt?,In 1898 what's been called the first blockbuster art show was devoted to which artist & put on for Queen Wilhelmina's coronation?
24,Countries of the World,What is Mongolia?,"Part of the largest contiguous land empire during the 1200s & 1300s, today what is the world's second-largest landlocked country?"
25,Literature,What is “Howl”?,A 2006 book was titled 'The Poem That Changed America:' What 'Fifty Years Later'?
26,Invasions,Who is William of Orange?,"Backed by 14,000 troops, who invaded England to restore, in his words, its 'religion, laws, and liberties'?"
27,Landmarks,What is the Eiffel Tower?,"After its completion in the late 19th c., what was landmark was called 'a truly tragic street lamp' & a 'high & skinny pyramid of iron ladders'?"
28,Geographic Names the Same,What is Dover?,"The busiest passenger port in the U.K., what shares its name with a capital of one of the original 13 states?"
29,Names in the Bookstore,Who is Peter Mark Roget?,"This man made lists, perhaps to cope with depression; a set of lists he published in 1852 made whose name synonymous with a type of book?"
30,U.S. History,Who is Dr. Samuel Mudd?,"An 1869 presidential pardon was granted to which man, due in part to a plea by the Medical Society of Harford County, Maryland?"
31,American Literature,What is The Things They Carried?,"Letters, pocket knives, C rations & steel helmets are among the tangible items referred to in the title of what American literature modern war classic?"
32,Nonfiction,What is The Communist Manifesto,"What nonfiction book has the line, 'The discovery of America…opened up fresh ground for the rising bourgeoisie'?"
33, a new version was passed 81 years later,Laws in U.S. History,What is the Civil Rights Act?,,,,,,,,,,,,,,,,,,0, 2/3
34,Names of Myth,Who is Helen of Troy?,"Whose brothers, Castor & Pollux, saved her after Theseus stole her away as a kid; a larger force would seek her later in life?"
35,African Countries,What is Sudan?,"Once Africa's largest country in area, what African Country dropped to third in 2011 when a portion of it declared independence?"
36,The Ancient World,What is Alexandria?,"The ancient writer Galen said books on ships arriving to what city's port were seized, originals kept & copies returned?"
37,Famous Names,Who is Andy Warhol?,"For a special 1970s cookbook, who provided one simple recipea can of Campbell's tomato soup & 2 cans of milk?"
38,People & Places,What is Guam?,"Thought to descend from people of Southeast Asia, the Chamorro make up what U.S. territorys largest ethnic group?"
39,Current World Leaders,What is the Philippines?,"In office from 2022, the president of what country has taken so many foreign trips a play on his name is 'Ferdinand Magellan Jr.'?"
40,Writers & The South,Who is Tennessee Williams?,In 1939 which writer lived on Toulouse Street in the French Quarter & chose the professional name that bonded him to the South?
41,National Parks,What is Yellowstone?,"What National Park is named for a river indigenous people called Mi tse a-da-zi, translated by French-speaking trappers as 'Pierre Jaune'?"
42,Sports,Who are the Harlem Globetrotters?,"In 2010 who introduced the 4-point shot, 35 feet from the basket?"
43,The U.S. Military,What is “Top Gun”?,Losses over Asia in the 1960s led to the establishment of the program known as what at a San Diego naval base in 1969?
44,Art & Science,What is Halleys Comet?,"A craft that visited what was named for Giotto, based on the story that 680 years earlier, the painter depicted it as the Star of Bethlehem?"
45,Words From World War I,What is “tank”?,"In World War I, 'Cistern' & 'reservoir' were suggested names for what secret invention, but the British preferred this less clumsy monosyllable?"
46,European History,What is Holy Roman Emperor?,"Until 1806, some German nobles included among their honors the title of 'Elector' for their role in selecting this personage?"
47,Theater History,Who is Peter Pan?,"In 1904, wearing a harness, actress Nina Boucicault became the first to play what character onstage?"
48,European Cities,What is Aachen?,"Alphabetically the first German city in encyclopedias, what was also the first one taken by the Allies in World War II?"
49,Word Origins,What is mantra?,This Sanskrit word referring to a spoken word or phrase comes from a word for 'to think'?
50,Inventions,What is barbed wire?,1917's 'Elements of Trench Warfare' said what Old West invention was 'difficult to destroy' & 'difficult to get through'?
51,World War II,What is Schindlers list?,"Mimi Reinhard, who never learned to type using more than 2 fingers, produced what in World War II with 1,100 names, including hers?"
52, their offspring was the source of this mythical object,Mythology,What is the Golden Fleece?
53,Literature,What is Pride and Prejudice?,"Published in 2011, P.D. James' final novel, 'Death Comes to Pemberley', was a sequel to what novel from 200 years earlier?"
54, only these 2 west of the Mississippi River border each other,U.S. State Names,What are Oregon & Nevada?
55,Word Origins,What is passion?,"Originally relating to a story of suffering, what word now more commonly refers to strong emotion of any kind?"
56,World Cinema,What is La Vie en Rose?,"The 2007 biopic called 'La Môme' in France, meaning 'The Kid', was released in the U.S. under what other French title?"
57,History,What is Santa Maria?,"Returning home in 1493, Columbus stopped in the Azores at an island with what name, also something he'd lost off the Haiti coast?"
58,Landmarks,What is a kremlin?,Pskov & Nizhny Novgorod are 2 of the cities that have a fortress called what?
59,Foreign-Born Authors,Who is Vladimir Nabokov?,In the 1950s the New York Times said what author 'is writing about all lust' & his lecherous narrator 'is all of us'?
60,Astronomy & Geography,What is Capricorn?,"At the winter solstice, the sun is in Sagittarius; it once appeared in what constellation, giving a geographic feature its name?"
61,Television,What is Law & Order?,"Mike Post combined the sound of a slamming jail door, an anvil & 100 men stomping on a floor for what television series that debuted in 1990?"
62,British Landmarks,What is the Tower of London?,"Like Sir Thomas More, 3 16th century English queens are buried at what British location?"
63,Early American History,What are witches?,"In 1692 Increase Mather wrote, 'It were better that ten suspected' of these who 'escape, than that one innocent person … be condemned'?"
64,Geography Mnemonics,What are Arkansas and Louisiana?,"The Geography Mnemonic Mimal, sometimes said to be the silhouette of a chef or elf, stands for Minnesota, Iowa, Missouri, and what other 2 states?"
65,Business Milestones,What is the Ford Model T?,"What was first sold in 1908, at a price equivalent to about $27,000 today?"
66,In The Bookstore,Who is Tom Clancy?,The name of what author dead since 2013 now appears on books written by a former U.S. marshal & a former Apache helicopter pilot?
67,Historic Art,What is the Bayeux Tapestry?,The artwork once known in France as 'la tapisserie de la Reine Mathilde' is better known as what?
68,Pop Stars,Who is Madonna?,In 2022 which pop star became the first woman to have a Billboard Top 10 album in 5 decades starting with the 1980s?
69,Classic Tale Characters,Who is Scheherazade?,"In one 19th century translation, what female classic tale character 'perceived the dawn of day and ceased' speaking nearly 1,000 times?"
70,USA,What is Jack Daniels?,"Ironically, though what company founded in the 1860s is Moore County, Tennessee's largest employer, Moore is a dry county?"
71,Historic People,Who was William Bligh?,"After a 1789 event, who wrote, 'My first determination was to seek a supply of…water at Tofoa, & afterwards to sail for Tongataboo'?"
72,The Movies,What is The Godfather?,Laurence Olivier & Ernest Borgnine were considered for the lead role & Sergio Leone to direct for what film that turned 50 in 2022?
73,Continental Geography,What is Colombia?,"Until a 1903 secession, what country's contiguous territory spanned 2 continents?"
74,Foreign-Born Authors,Who is Isabel Allende?,"Early in her career which foreign-born author translated romance novels into Spanish, often changing the dialogue to make the heroines smarter?"
75,Historic Crimes,What is the Mona Lisa?,"Saying it was stolen by Napoleon, self-styled Italian patriot Vincenzo Peruggia took what in 1911?"
76,U.S. Bodies of Water,What is Lake Mead?,"Continuing a downward trend, in July 2022 what US body of water was at 27% capacity, its lowest level since 1937 when it was first being filled?"
77,Gods & Goddesses,Who is Aurora (or Eos)?,"Each morning which goddess began her ride in her chariot across the sky ahead of her brother Sol, or Helios?"
78,America At War,What is the Battle of New Orleans?,"Until the Civil War, the Jan. 8 date of what American battle of dubious military importance but big morale value was a national holiday?"
79,Childrens Books,What is The Velveteen Rabbit?,"Which children's book title character is told 'By the time you are real, most of your hair has been loved off your eyes drop out & you get shabby'?"
80,TV Finales,What is Grace and Frankie?,"In a TV reunion over 40 years in the making, Dolly Parton appeared as an angel named Agnes in the final episode of what comedy in 2022?"
81,American Poems,Who is Evangeline?,"In an 1847 American poem what character sees her town of Grand-Pré burned, but finally reunites with her beau for a kiss before his death?"
82,Famous Names,Who is Banksy?,"In 2001 who published a book called 'Banging Your Head Against a Brick Wall'; in 2002, 'Existencilism'?"
83,Childrens Lit,What is Charlottes Web?,The title object of what childrens book 'never looked more beautiful each strand held dozens of bright drops of early morning dew'?
84,Classic Songs,What is “Here Comes Santa Claus”?,The shouts of excited children at a 1946 holiday parade are said to have inspired what perennial classic song favorite?
85,Brand Names,What are Milk Duds?,"Unable to make what candies perfectly round, the confectioner embraced this flawed name for the product?"
86,Countries of the World,What is Italy?,"What country is home to 58 UNESCO World Heritage Sites, more than any other country; the sites include a volcano & a lagoon?"
87,Action Movies,What is Die Hard?,"What action movie's last line is 'If this is their idea of Christmas, I gotta be here for New Years'?"
88,Presidential Facts,Who is Woodrow Wilson?,Only 3 presidents have married while in office— John Tyler was the first & which one was the last?
89,19th Century Americans,Who is Frederick Douglass?,"Demonstrating the dignity & humanity of Black Americans, who sat for 160 known photographs, the most of any American in the 19th century?"
90,Latin Phrases,What is “quid pro quo”?,"Originally, which Latin 3-word phrase referred to when a doctor or apothecary substituted one medicine for another?"
91,1970s Movies,What is Monty Python and the Holy Grail?,The 1975 premiere of what movie comedy advertised free coconuts for the first thousand in the audience?
92,Names The Same,What is Manhattan?,"A cocktail, an island & a WWII venture originally called 'Development of Substitute Materials' all bear what name?"
93,U.S. Presidents,Who is Calvin Coolidge?,"Which US President was sworn in twice as President within 2 years, first by his father & then later by a former U.S. President?"
94,Plays,What is The Tempest?,A 1609 story in which an exiled king of Bulgaria creates a sea palace with his magic may have inspired the plot of what play?
95,Landmarks,What is the Berlin Wall?,"In 2009, during a 20th anniversary celebration, what landmark was called 'an edifice of fear. On Nov. 9, it became a place of joy'?"
96,World Capitals,"What is Vienna, Austria?","Among what world capital's nicknames are the 'City of Classical Music' &, possibly in honor of a famous resident from 1860 to 1938, the 'City of Dreams'?"
97,Language & Its Meanings,What is a night owl?,"Now meaning someone with nocturnal habits, what catches a sleeping dove in Shakespeare's 'Lucrece'?"
98,Flags of Our Hemisphere,What is Brazil?,"The stars on what country's flag represent states, 26 of them; unlike the USA's, its 'federal district' gets its own 27th star?"
99,Names in U.S. History,Who is Oliver Brown?,What father was the only man among the 13 plaintiffs in a US class-action case filed in 1951?
100,Childrens Authors,"Who is Sarah? (from Sarah, Plain and Tall)","Reversing the story of what heroine she created, childrens author Patricia Maclachlan was born on the prairie but spent much of her life in New England?"
,,,
TOTALS,,,
1 Index Original Category Original Correct Question Model Prompt
2 1 The Oscars Who is John Williams? Which actor Born in 1932 was the son of a percussionist in the CBS radio orchestra has been nominated for 53 Oscars?
3 2 English Literature What is Paradise Lost? What work in English Literature says: 'The mind is its own place, & in itself can make a heaven of hell, a hell of heaven. What matter where, if I be still the same'?
4 3 Writers’ Lesser-Known Works Who is Niccolò Machiavelli? Known for more philosophical works, he wrote the play 'La Mandragola', in which Florentines are rewarded for immoral actions?
5 4 Exploration What is Easter Island (Rapa Nui)? James Cook's account of a 1774 visit where records an object 'near 27 feet long, and upwards of 8 feet over the breast or shoulders'?
6 5 The Bill of Rights What is the Eighth Amendment? England's 'Bloody Assizes' & a 1685 life sentence for perjury were 2 main origins of which amendment to the U.S. Constitution?
7 6 Nobel Peace Prize Winners Who are Nelson Mandela & Desmond Tutu? Which nobel peace price winners each lived at times on Vilakazi St. in Soweto , so it claims to be the world's only street home to 2 Nobel Peace Prize winners?
8 7 Famous Names Who is Walt Disney? In 1966, the year of who's death did he share plans for an experimental prototype community in Florida?
9 8 Geography What is Colombia? Of the 13 nations through which the Equator passes, what is the only one whose coastline borders the Caribbean Sea?
10 9 Fashion History What are rhinestones? Which decorative items in fashion history get their name from their origin in the port city of Strasbourg, on the border of France & Germany?
11 10 Movies of the ’80s What is Driving Miss Daisy? What 1980's movie is based on an off-Broadway play with just 3 characters and won the Best Picture Oscar & the actors in all 3 roles were nominated?
12 11 Novelists Who is John Grisham? A 2012 book review for which novelist noted subjects that 'sparked his ire': capital punishment, big tobacco & 'the plight of the unjustly convicted'?
13 12 20th Century Eponyms What is the Maginot Line? A 1940 headline about what 20th Century Eponym included 'failure', 'liability when it came to offense' & 'stout hearts no match for tanks'?
14 13 City History What is Stockholm? Over 700 years after its traditional 1252 founding date, what port city became associated with a psychological response?
15 14 Brand Names What is Jacuzzi? The success of what brand has its roots with a hydrotherapy pump its cofounder created for his son, who had arthritis?
16 15 American Authors Who is Washington Irving? In a periodical in 1807, what American Author called New York City 'Gotham, Gotham! Most enlightened of cities'?
17 16 Symbols What is “less than”? What symbol is a rotated V in math and a feeling of some marginalized or underrepresented people in society?
18 17 Movie Theme Songs Who is James Bond? Monty Norman, the composer of what character's theme, said the staccato riff conveyed sexiness, mystery & ruthlessness?
19 18 American Novelists Who is Joseph Heller? What American Novelist served with an airman named Yohannan in World War II & despite what readers might think, he said he enjoyed his service?
20 19 Medieval Places What is Canterbury, England? (Canterbury Cathedral) In what Medieval place did one of the participants in an 1170 event say, 'Let us away, knights; he will rise no more'?
21 20 Countries of Africa What is Morocco? At one time a province of the Roman Empire, what African country kingdom is known to Arabic scholars as Al-Maghrib Al-Aqsa, 'the far west'?
22 21 Statehood What is Wyoming? Congress relented in 1890 after what prospective state said it would wait 100 years rather than come in without the women?
23 22 1980s Movies What is Raiders of the Lost Ark? A writer & producer of what movie said he wanted it to be like a Western or James Bond film, 'only it takes place in the 30s'?
24 23 Art Exhibitions Who is Rembrandt? In 1898 what's been called the first blockbuster art show was devoted to which artist & put on for Queen Wilhelmina's coronation?
25 24 Countries of the World What is Mongolia? Part of the largest contiguous land empire during the 1200s & 1300s, today what is the world's second-largest landlocked country?
26 25 Literature What is “Howl”? A 2006 book was titled 'The Poem That Changed America:' What 'Fifty Years Later'?
27 26 Invasions Who is William of Orange? Backed by 14,000 troops, who invaded England to restore, in his words, its 'religion, laws, and liberties'?
28 27 Landmarks What is the Eiffel Tower? After its completion in the late 19th c., what was landmark was called 'a truly tragic street lamp' & a 'high & skinny pyramid of iron ladders'?
29 28 Geographic Name’s the Same What is Dover? The busiest passenger port in the U.K., what shares its name with a capital of one of the original 13 states?
30 29 Names in the Bookstore Who is Peter Mark Roget? This man made lists, perhaps to cope with depression; a set of lists he published in 1852 made whose name synonymous with a type of book?
31 30 U.S. History Who is Dr. Samuel Mudd? An 1869 presidential pardon was granted to which man, due in part to a plea by the Medical Society of Harford County, Maryland?
32 31 American Literature What is The Things They Carried? Letters, pocket knives, C rations & steel helmets are among the tangible items referred to in the title of what American literature modern war classic?
33 32 Nonfiction What is The Communist Manifesto What nonfiction book has the line, 'The discovery of America…opened up fresh ground for the rising bourgeoisie'?
34 33 a new version was passed 81 years later Laws in U.S. History What is the Civil Rights Act? 0 2/3
35 34 Names of Myth Who is Helen of Troy? Whose brothers, Castor & Pollux, saved her after Theseus stole her away as a kid; a larger force would seek her later in life?
36 35 African Countries What is Sudan? Once Africa's largest country in area, what African Country dropped to third in 2011 when a portion of it declared independence?
37 36 The Ancient World What is Alexandria? The ancient writer Galen said books on ships arriving to what city's port were seized, originals kept & copies returned?
38 37 Famous Names Who is Andy Warhol? For a special 1970s cookbook, who provided one simple recipe–a can of Campbell's tomato soup & 2 cans of milk?
39 38 People & Places What is Guam? Thought to descend from people of Southeast Asia, the Chamorro make up what U.S. territory’s largest ethnic group?
40 39 Current World Leaders What is the Philippines? In office from 2022, the president of what country has taken so many foreign trips a play on his name is 'Ferdinand Magellan Jr.'?
41 40 Writers & The South Who is Tennessee Williams? In 1939 which writer lived on Toulouse Street in the French Quarter & chose the professional name that bonded him to the South?
42 41 National Parks What is Yellowstone? What National Park is named for a river indigenous people called Mi tse a-da-zi, translated by French-speaking trappers as 'Pierre Jaune'?
43 42 Sports Who are the Harlem Globetrotters? In 2010 who introduced the 4-point shot, 35 feet from the basket?
44 43 The U.S. Military What is “Top Gun”? Losses over Asia in the 1960s led to the establishment of the program known as what at a San Diego naval base in 1969?
45 44 Art & Science What is Halley’s Comet? A craft that visited what was named for Giotto, based on the story that 680 years earlier, the painter depicted it as the Star of Bethlehem?
46 45 Words From World War I What is “tank”? In World War I, 'Cistern' & 'reservoir' were suggested names for what secret invention, but the British preferred this less clumsy monosyllable?
47 46 European History What is Holy Roman Emperor? Until 1806, some German nobles included among their honors the title of 'Elector' for their role in selecting this personage?
48 47 Theater History Who is Peter Pan? In 1904, wearing a harness, actress Nina Boucicault became the first to play what character onstage?
49 48 European Cities What is Aachen? Alphabetically the first German city in encyclopedias, what was also the first one taken by the Allies in World War II?
50 49 Word Origins What is mantra? This Sanskrit word referring to a spoken word or phrase comes from a word for 'to think'?
51 50 Inventions What is barbed wire? 1917's 'Elements of Trench Warfare' said what Old West invention was 'difficult to destroy' & 'difficult to get through'?
52 51 World War II What is Schindler’s list? Mimi Reinhard, who never learned to type using more than 2 fingers, produced what in World War II with 1,100 names, including hers?
53 52 their offspring was the source of this mythical object Mythology What is the Golden Fleece?
54 53 Literature What is Pride and Prejudice? Published in 2011, P.D. James' final novel, 'Death Comes to Pemberley', was a sequel to what novel from 200 years earlier?
55 54 only these 2 west of the Mississippi River border each other U.S. State Names What are Oregon & Nevada?
56 55 Word Origins What is passion? Originally relating to a story of suffering, what word now more commonly refers to strong emotion of any kind?
57 56 World Cinema What is La Vie en Rose? The 2007 biopic called 'La Môme' in France, meaning 'The Kid', was released in the U.S. under what other French title?
58 57 History What is Santa Maria? Returning home in 1493, Columbus stopped in the Azores at an island with what name, also something he'd lost off the Haiti coast?
59 58 Landmarks What is a kremlin? Pskov & Nizhny Novgorod are 2 of the cities that have a fortress called what?
60 59 Foreign-Born Authors Who is Vladimir Nabokov? In the 1950s the New York Times said what author 'is writing about all lust' & his lecherous narrator 'is all of us'?
61 60 Astronomy & Geography What is Capricorn? At the winter solstice, the sun is in Sagittarius; it once appeared in what constellation, giving a geographic feature its name?
62 61 Television What is Law & Order? Mike Post combined the sound of a slamming jail door, an anvil & 100 men stomping on a floor for what television series that debuted in 1990?
63 62 British Landmarks What is the Tower of London? Like Sir Thomas More, 3 16th century English queens are buried at what British location?
64 63 Early American History What are witches? In 1692 Increase Mather wrote, 'It were better that ten suspected' of these who 'escape, than that one innocent person … be condemned'?
65 64 Geography Mnemonics What are Arkansas and Louisiana? The Geography Mnemonic Mimal, sometimes said to be the silhouette of a chef or elf, stands for Minnesota, Iowa, Missouri, and what other 2 states?
66 65 Business Milestones What is the Ford Model T? What was first sold in 1908, at a price equivalent to about $27,000 today?
67 66 In The Bookstore Who is Tom Clancy? The name of what author dead since 2013 now appears on books written by a former U.S. marshal & a former Apache helicopter pilot?
68 67 Historic Art What is the Bayeux Tapestry? The artwork once known in France as 'la tapisserie de la Reine Mathilde' is better known as what?
69 68 Pop Stars Who is Madonna? In 2022 which pop star became the first woman to have a Billboard Top 10 album in 5 decades starting with the 1980s?
70 69 Classic Tale Characters Who is Scheherazade? In one 19th century translation, what female classic tale character 'perceived the dawn of day and ceased' speaking nearly 1,000 times?
71 70 USA What is Jack Daniel’s? Ironically, though what company founded in the 1860s is Moore County, Tennessee's largest employer, Moore is a dry county?
72 71 Historic People Who was William Bligh? After a 1789 event, who wrote, 'My first determination was to seek a supply of…water at Tofoa, & afterwards to sail for Tongataboo'?
73 72 The Movies What is The Godfather? Laurence Olivier & Ernest Borgnine were considered for the lead role & Sergio Leone to direct for what film that turned 50 in 2022?
74 73 Continental Geography What is Colombia? Until a 1903 secession, what country's contiguous territory spanned 2 continents?
75 74 Foreign-Born Authors Who is Isabel Allende? Early in her career which foreign-born author translated romance novels into Spanish, often changing the dialogue to make the heroines smarter?
76 75 Historic Crimes What is the Mona Lisa? Saying it was stolen by Napoleon, self-styled Italian patriot Vincenzo Peruggia took what in 1911?
77 76 U.S. Bodies of Water What is Lake Mead? Continuing a downward trend, in July 2022 what US body of water was at 27% capacity, its lowest level since 1937 when it was first being filled?
78 77 Gods & Goddesses Who is Aurora (or Eos)? Each morning which goddess began her ride in her chariot across the sky ahead of her brother Sol, or Helios?
79 78 America At War What is the Battle of New Orleans? Until the Civil War, the Jan. 8 date of what American battle of dubious military importance but big morale value was a national holiday?
80 79 Children’s Books What is The Velveteen Rabbit? Which children's book title character is told 'By the time you are real, most of your hair has been loved off your eyes drop out & you get shabby'?
81 80 TV Finales What is Grace and Frankie? In a TV reunion over 40 years in the making, Dolly Parton appeared as an angel named Agnes in the final episode of what comedy in 2022?
82 81 American Poems Who is Evangeline? In an 1847 American poem what character sees her town of Grand-Pré burned, but finally reunites with her beau for a kiss before his death?
83 82 Famous Names Who is Banksy? In 2001 who published a book called 'Banging Your Head Against a Brick Wall'; in 2002, 'Existencilism'?
84 83 Children’s Lit What is Charlotte’s Web? The title object of what childrens book 'never looked more beautiful each strand held dozens of bright drops of early morning dew'?
85 84 Classic Songs What is “Here Comes Santa Claus”? The shouts of excited children at a 1946 holiday parade are said to have inspired what perennial classic song favorite?
86 85 Brand Names What are Milk Duds? Unable to make what candies perfectly round, the confectioner embraced this flawed name for the product?
87 86 Countries of the World What is Italy? What country is home to 58 UNESCO World Heritage Sites, more than any other country; the sites include a volcano & a lagoon?
88 87 Action Movies What is Die Hard? What action movie's last line is 'If this is their idea of Christmas, I gotta be here for New Years'?
89 88 Presidential Facts Who is Woodrow Wilson? Only 3 presidents have married while in office— John Tyler was the first & which one was the last?
90 89 19th Century Americans Who is Frederick Douglass? Demonstrating the dignity & humanity of Black Americans, who sat for 160 known photographs, the most of any American in the 19th century?
91 90 Latin Phrases What is “quid pro quo”? Originally, which Latin 3-word phrase referred to when a doctor or apothecary substituted one medicine for another?
92 91 1970s Movies What is Monty Python and the Holy Grail? The 1975 premiere of what movie comedy advertised free coconuts for the first thousand in the audience?
93 92 Name’s The Same What is Manhattan? A cocktail, an island & a WWII venture originally called 'Development of Substitute Materials' all bear what name?
94 93 U.S. Presidents Who is Calvin Coolidge? Which US President was sworn in twice as President within 2 years, first by his father & then later by a former U.S. President?
95 94 Plays What is The Tempest? A 1609 story in which an exiled king of Bulgaria creates a sea palace with his magic may have inspired the plot of what play?
96 95 Landmarks What is the Berlin Wall? In 2009, during a 20th anniversary celebration, what landmark was called 'an edifice of fear. On Nov. 9, it became a place of joy'?
97 96 World Capitals What is Vienna, Austria? Among what world capital's nicknames are the 'City of Classical Music' &, possibly in honor of a famous resident from 1860 to 1938, the 'City of Dreams'?
98 97 Language & Its Meanings What is a night owl? Now meaning someone with nocturnal habits, what catches a sleeping dove in Shakespeare's 'Lucrece'?
99 98 Flags of Our Hemisphere What is Brazil? The stars on what country's flag represent states, 26 of them; unlike the USA's, its 'federal district' gets its own 27th star?
100 99 Names in U.S. History Who is Oliver Brown? What father was the only man among the 13 plaintiffs in a US class-action case filed in 1951?
101 100 Children’s Authors Who is Sarah? (from Sarah, Plain and Tall) Reversing the story of what heroine she created, childrens author Patricia Maclachlan was born on the prairie but spent much of her life in New England?
102
103 TOTALS

View File

@@ -1,100 +0,0 @@
Which man born in 1932 was the son of a percussionist in the CBS radio orchestra has been nominated for 53 Oscars?
What work in English Literature says: 'The mind is its own place, & in itself can make a heaven of hell, a hell of heaven. What matter where, if I be still the same'?
Known for more philosophical works, he wrote the play 'La Mandragola', in which Florentines are rewarded for immoral actions?
James Cook's account of a 1774 visit where records an object 'near 27 feet long, and upwards of 8 feet over the breast or shoulders'?
England's 'Bloody Assizes' & a 1685 life sentence for perjury were 2 main origins of which amendment to the U.S. Constitution?
Which nobel peace price winners each lived at times on Vilakazi St. in Soweto , so it claims to be the world's only street home to 2 Nobel Peace Prize winners?
In 1966, the year of who's death did he share plans for an experimental prototype community in Florida?
Of the 13 nations through which the Equator passes, what is the only one whose coastline borders the Caribbean Sea?
Which decorative items in fashion history get their name from their origin in the port city of Strasbourg, on the border of France & Germany?
What 1980's movie is based on an off-Broadway play with just 3 characters and won the Best Picture Oscar & the actors in all 3 roles were nominated?
A 2012 book review for which novelist noted subjects that 'sparked his ire': capital punishment, big tobacco & 'the plight of the unjustly convicted'?
A 1940 headline about what 20th Century Eponym included 'failure', 'liability when it came to offense' & 'stout hearts no match for tanks'?
Over 700 years after its traditional 1252 founding date, what port city became associated with a psychological response?
The success of what brand has its roots with a hydrotherapy pump its cofounder created for his son, who had arthritis?
In a periodical in 1807, what American Author called New York City 'Gotham, Gotham! Most enlightened of cities'?
What symbol is a rotated V in math and a feeling of some marginalized or underrepresented people in society?
Monty Norman, the composer of what character's theme, said the staccato riff conveyed sexiness, mystery & ruthlessness?
What American Novelist served with an airman named Yohannan in World War II & despite what readers might think, he said he enjoyed his service?
In what Medieval place did one of the participants in an 1170 event say, 'Let us away, knights; he will rise no more'?
At one time a province of the Roman Empire, what African country kingdom is known to Arabic scholars as Al-Maghrib Al-Aqsa, 'the far west'?
Congress relented in 1890 after what prospective state said it would wait 100 years rather than come in without the women?
A writer & producer of what movie said he wanted it to be like a Western or James Bond film, 'only it takes place in the 30s'?
In 1898 what's been called the first blockbuster art show was devoted to which artist & put on for Queen Wilhelmina's coronation?
Part of the largest contiguous land empire during the 1200s & 1300s, today what is the world's second-largest landlocked country?
A 2006 book was titled 'The Poem That Changed America:' What 'Fifty Years Later'?
Backed by 14,000 troops, who invaded England to restore, in his words, its 'religion, laws, and liberties'?
After its completion in the late 19th c., what was landmark was called 'a truly tragic street lamp' & a 'high & skinny pyramid of iron ladders'?
The busiest passenger port in the U.K., what shares its name with a capital of one of the original 13 states?
This man made lists, perhaps to cope with depression; a set of lists he published in 1852 made whose name synonymous with a type of book?
An 1869 presidential pardon was granted to which man, due in part to a plea by the Medical Society of Harford County, Maryland?
Letters, pocket knives, C rations & steel helmets are among the tangible items referred to in the title of what American literature modern war classic?
What nonfiction book has the line, 'The discovery of America…opened up fresh ground for the rising bourgeoisie'?
A radical Republican championed what 1875 act but the Supreme Court struck it down in 1883; a new version was passed 81 years later?
Whose brothers, Castor & Pollux, saved her after Theseus stole her away as a kid; a larger force would seek her later in life?
Once Africa's largest country in area, what African Country dropped to third in 2011 when a portion of it declared independence?
The ancient writer Galen said books on ships arriving to what city's port were seized, originals kept & copies returned?
For a special 1970s cookbook, who provided one simple recipea can of Campbell's tomato soup & 2 cans of milk?
Thought to descend from people of Southeast Asia, the Chamorro make up what U.S. territorys largest ethnic group?
In office from 2022, the president of what country has taken so many foreign trips a play on his name is 'Ferdinand Magellan Jr.'?
In 1939 which writer lived on Toulouse Street in the French Quarter & chose the professional name that bonded him to the South?
What National Park is named for a river indigenous people called Mi tse a-da-zi, translated by French-speaking trappers as 'Pierre Jaune'?
In 2010 who introduced the 4-point shot, 35 feet from the basket?
Losses over Asia in the 1960s led to the establishment of the program known as what at a San Diego naval base in 1969?
A craft that visited what was named for Giotto, based on the story that 680 years earlier, the painter depicted it as the Star of Bethlehem?
In World War I, 'Cistern' & 'reservoir' were suggested names for what secret invention, but the British preferred this less clumsy monosyllable?
Until 1806, some German nobles included among their honors the title of 'Elector' for their role in selecting this personage?
In 1904, wearing a harness, actress Nina Boucicault became the first to play what character onstage?
Alphabetically the first German city in encyclopedias, what was also the first one taken by the Allies in World War II?
This Sanskrit word referring to a spoken word or phrase comes from a word for 'to think'?
1917's 'Elements of Trench Warfare' said what Old West invention was 'difficult to destroy' & 'difficult to get through'?
Mimi Reinhard, who never learned to type using more than 2 fingers, produced what in World War II with 1,100 names, including hers?
Poseidon carried off the maiden Theophane & turned her into a ewe; their offspring was the source of what mythical object?
Published in 2011, P.D. James' final novel, 'Death Comes to Pemberley', was a sequel to what novel from 200 years earlier?
5 U.S. states have 6-letter names; only which 2 west of the Mississippi River border each other?
Originally relating to a story of suffering, what word now more commonly refers to strong emotion of any kind?
The 2007 biopic called 'La Môme' in France, meaning 'The Kid', was released in the U.S. under what other French title?
Returning home in 1493, Columbus stopped in the Azores at an island with what name, also something he'd lost off the Haiti coast?
Pskov & Nizhny Novgorod are 2 of the cities that have a fortress called what?
In the 1950s the New York Times said what author 'is writing about all lust' & his lecherous narrator 'is all of us'?
At the winter solstice, the sun is in Sagittarius; it once appeared in what constellation, giving a geographic feature its name?
Mike Post combined the sound of a slamming jail door, an anvil & 100 men stomping on a floor for what television series that debuted in 1990?
Like Sir Thomas More, 3 16th century English queens are buried at what British location?
In 1692 Increase Mather wrote, 'It were better that ten suspected' of these who 'escape, than that one innocent person be condemned'?
The Geography Mnemonic Mimal, sometimes said to be the silhouette of a chef or elf, stands for Minnesota, Iowa, Missouri, and what other 2 states?
What was first sold in 1908, at a price equivalent to about $27,000 today?
The name of what author dead since 2013 now appears on books written by a former U.S. marshal & a former Apache helicopter pilot?
The artwork once known in France as 'la tapisserie de la Reine Mathilde' is better known as what?
In 2022 which pop star became the first woman to have a Billboard Top 10 album in 5 decades starting with the 1980s?
In one 19th century translation, what female classic tale character 'perceived the dawn of day and ceased' speaking nearly 1,000 times?
Ironically, though what company founded in the 1860s is Moore County, Tennessee's largest employer, Moore is a dry county?
After a 1789 event, who wrote, 'My first determination was to seek a supply of…water at Tofoa, & afterwards to sail for Tongataboo'?
Laurence Olivier & Ernest Borgnine were considered for the lead role & Sergio Leone to direct for what film that turned 50 in 2022?
Until a 1903 secession, what country's contiguous territory spanned 2 continents?
Early in her career which foreign-born author translated romance novels into Spanish, often changing the dialogue to make the heroines smarter?
Saying it was stolen by Napoleon, self-styled Italian patriot Vincenzo Peruggia took what in 1911?
Continuing a downward trend, in July 2022 what US body of water was at 27% capacity, its lowest level since 1937 when it was first being filled?
Each morning which goddess began her ride in her chariot across the sky ahead of her brother Sol, or Helios?
Until the Civil War, the Jan. 8 date of what American battle of dubious military importance but big morale value was a national holiday?
Which children's book title character is told 'By the time you are real, most of your hair has been loved off your eyes drop out & you get shabby'?
In a TV reunion over 40 years in the making, Dolly Parton appeared as an angel named Agnes in the final episode of what comedy in 2022?
In an 1847 American poem what character sees her town of Grand-Pré burned, but finally reunites with her beau for a kiss before his death?
In 2001 who published a book called 'Banging Your Head Against a Brick Wall'; in 2002, 'Existencilism'?
The title object of what childrens book 'never looked more beautiful each strand held dozens of bright drops of early morning dew'?
The shouts of excited children at a 1946 holiday parade are said to have inspired what perennial classic song favorite?
Unable to make what candies perfectly round, the confectioner embraced this flawed name for the product?
What country is home to 58 UNESCO World Heritage Sites, more than any other country; the sites include a volcano & a lagoon?
What action movie's last line is 'If this is their idea of Christmas, I gotta be here for New Years'?
Only 3 presidents have married while in office— John Tyler was the first & which one was the last?
Demonstrating the dignity & humanity of Black Americans, who sat for 160 known photographs, the most of any American in the 19th century?
Originally, which Latin 3-word phrase referred to when a doctor or apothecary substituted one medicine for another?
The 1975 premiere of what movie comedy advertised free coconuts for the first thousand in the audience?
A cocktail, an island & a WWII venture originally called 'Development of Substitute Materials' all bear what name?
Which US President was sworn in twice as President within 2 years, first by his father & then later by a former U.S. President?
A 1609 story in which an exiled king of Bulgaria creates a sea palace with his magic may have inspired the plot of what play?
In 2009, during a 20th anniversary celebration, what landmark was called 'an edifice of fear. On Nov. 9, it became a place of joy'?
Among what world capital's nicknames are the 'City of Classical Music' &, possibly in honor of a famous resident from 1860 to 1938, the 'City of Dreams'?
Now meaning someone with nocturnal habits, what catches a sleeping dove in Shakespeare's 'Lucrece'?
The stars on what country's flag represent states, 26 of them; unlike the USA's, its 'federal district' gets its own 27th star?
What father was the only man among the 13 plaintiffs in a US class-action case filed in 1951?
Reversing the story of what heroine she created, childrens author Patricia Maclachlan was born on the prairie but spent much of her life in New England?

View File

@@ -1,133 +0,0 @@
#!/usr/bin/env python3
import argparse
import json
import re
import sys
# whitespace is constrained to a single space char to prevent model "running away" in
# whitespace. Also maybe improves generation quality?
SPACE_RULE = '" "?'
PRIMITIVE_RULES = {
'boolean': '("true" | "false") space',
'number': '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space',
'integer': '("-"? ([0-9] | [1-9] [0-9]*)) space',
'string': r''' "\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space ''',
'null': '"null" space',
}
INVALID_RULE_CHARS_RE = re.compile(r'[^a-zA-Z0-9-]+')
GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"]')
GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"'}
class SchemaConverter:
def __init__(self, prop_order):
self._prop_order = prop_order
self._rules = {'space': SPACE_RULE}
def _format_literal(self, literal):
escaped = GRAMMAR_LITERAL_ESCAPE_RE.sub(
lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)), json.dumps(literal)
)
return f'"{escaped}"'
def _add_rule(self, name, rule):
esc_name = INVALID_RULE_CHARS_RE.sub('-', name)
if esc_name not in self._rules or self._rules[esc_name] == rule:
key = esc_name
else:
i = 0
while f'{esc_name}{i}' in self._rules:
i += 1
key = f'{esc_name}{i}'
self._rules[key] = rule
return key
def visit(self, schema, name):
schema_type = schema.get('type')
rule_name = name or 'root'
if 'oneOf' in schema or 'anyOf' in schema:
rule = ' | '.join((
self.visit(alt_schema, f'{name}{"-" if name else ""}{i}')
for i, alt_schema in enumerate(schema.get('oneOf') or schema['anyOf'])
))
return self._add_rule(rule_name, rule)
elif 'const' in schema:
return self._add_rule(rule_name, self._format_literal(schema['const']))
elif 'enum' in schema:
rule = ' | '.join((self._format_literal(v) for v in schema['enum']))
return self._add_rule(rule_name, rule)
elif schema_type == 'object' and 'properties' in schema:
# TODO: `required` keyword
prop_order = self._prop_order
prop_pairs = sorted(
schema['properties'].items(),
# sort by position in prop_order (if specified) then by key
key=lambda kv: (prop_order.get(kv[0], len(prop_order)), kv[0]),
)
rule = '"{" space'
for i, (prop_name, prop_schema) in enumerate(prop_pairs):
prop_rule_name = self.visit(prop_schema, f'{name}{"-" if name else ""}{prop_name}')
if i > 0:
rule += ' "," space'
rule += fr' {self._format_literal(prop_name)} space ":" space {prop_rule_name}'
rule += ' "}" space'
return self._add_rule(rule_name, rule)
elif schema_type == 'array' and 'items' in schema:
# TODO `prefixItems` keyword
item_rule_name = self.visit(schema['items'], f'{name}{"-" if name else ""}item')
rule = f'"[" space ({item_rule_name} ("," space {item_rule_name})*)? "]" space'
return self._add_rule(rule_name, rule)
else:
assert schema_type in PRIMITIVE_RULES, f'Unrecognized schema: {schema}'
return self._add_rule(
'root' if rule_name == 'root' else schema_type,
PRIMITIVE_RULES[schema_type]
)
def format_grammar(self):
return '\n'.join((f'{name} ::= {rule}' for name, rule in self._rules.items()))
def main(args_in = None):
parser = argparse.ArgumentParser(
description='''
Generates a grammar (suitable for use in ./main) that produces JSON conforming to a
given JSON schema. Only a subset of JSON schema features are supported; more may be
added in the future.
''',
)
parser.add_argument(
'--prop-order',
default=[],
type=lambda s: s.split(','),
help='''
comma-separated property names defining the order of precedence for object properties;
properties not specified here are given lower precedence than those that are, and are
sorted alphabetically
'''
)
parser.add_argument('schema', help='file containing JSON schema ("-" for stdin)')
args = parser.parse_args(args_in)
schema = json.load(sys.stdin if args.schema == '-' else open(args.schema))
prop_order = {name: idx for idx, name in enumerate(args.prop_order)}
converter = SchemaConverter(prop_order)
converter.visit(schema, '')
print(converter.format_grammar())
if __name__ == '__main__':
main()

View File

@@ -1,8 +0,0 @@
set(TARGET llama-bench)
add_executable(${TARGET} llama-bench.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

File diff suppressed because it is too large Load Diff

View File

@@ -1,132 +0,0 @@
" Requires an already running llama.cpp server
" To install either copy or symlink to ~/.vim/autoload/llama.vim
" Then start with either :call llama#doLlamaGen(),
" or add a keybind to your vimrc such as
" nnoremap Z :call llama#doLlamaGen()<CR>
" Similarly, you could add an insert mode keybind with
" inoremap <C-B> <Cmd>call llama#doLlamaGen()<CR>
"
" g:llama_api_url and g:llama_overrides can be configured in your .vimrc
" let g:llama_api_url = "192.168.1.10:8080"
" llama_overrides can also be set through buffer/window scopes. For instance
" autocmd filetype python let b:llama_overrides = {"temp": 0.2}
" Could be added to your .vimrc to automatically set a lower temperature when
" editing a python script
" Additionally, an override dict can be stored at the top of a file
" !*{"stop": ["User:"]}
" Could be added to the start of your chatlog.txt to set the stopping token
" These parameter dicts are merged together from lowest to highest priority:
" server default -> g:llama_overrides -> w:llama_overrides ->
" b:llama_overrides -> in file (!*) overrides
"
" Sublists (like logit_bias and stop) are overridden, not merged
" Example override:
" !*{"logit_bias": [[13, -5], [2, false]], "temperature": 1, "top_k": 5, "top_p": 0.5, "n_predict": 256, "repeat_last_n": 256, "repeat_penalty": 1.17647}
if !exists("g:llama_api_url")
let g:llama_api_url= "127.0.0.1:8080"
endif
if !exists("g:llama_overrides")
let g:llama_overrides = {}
endif
const s:querydata = {"n_predict": 256, "stop": [ "\n" ], "stream": v:true }
const s:curlcommand = ['curl','--data-raw', "{\"prompt\":\"### System:\"}", '--silent', '--no-buffer', '--request', 'POST', '--url', g:llama_api_url .. '/completion', '--header', "Content-Type: application/json"]
let s:linedict = {}
func s:callbackHandler(bufn, channel, msg)
if len(a:msg) < 3
return
elseif a:msg[0] == "d"
let l:msg = a:msg[6:-1]
else
let l:msg = a:msg
endif
let l:decoded_msg = json_decode(l:msg)
let l:newtext = split(l:decoded_msg['content'], "\n", 1)
if len(l:newtext) > 0
call setbufline(a:bufn, s:linedict[a:bufn], getbufline(a:bufn, s:linedict[a:bufn])[0] .. newtext[0])
else
echo "nothing genned"
endif
if len(newtext) > 1
let l:failed = appendbufline(a:bufn, s:linedict[a:bufn], newtext[1:-1])
let s:linedict[a:bufn] = s:linedict[a:bufn] + len(newtext)-1
endif
if has_key(l:decoded_msg, "stop") && l:decoded_msg.stop
echo "Finished generation"
endif
endfunction
func llama#doLlamaGen()
if exists("b:job")
if job_status(b:job) == "run"
call job_stop(b:job)
return
endif
endif
let l:cbuffer = bufnr("%")
let s:linedict[l:cbuffer] = line('$')
let l:buflines = getbufline(l:cbuffer, 1, 1000)
let l:querydata = copy(s:querydata)
call extend(l:querydata, g:llama_overrides)
if exists("w:llama_overrides")
call extend(l:querydata, w:llama_overrides)
endif
if exists("b:llama_overrides")
call extend(l:querydata, b:llama_overrides)
endif
if l:buflines[0][0:1] == '!*'
let l:userdata = json_decode(l:buflines[0][2:-1])
call extend(l:querydata, l:userdata)
let l:buflines = l:buflines[1:-1]
endif
let l:querydata.prompt = join(l:buflines, "\n")
let l:curlcommand = copy(s:curlcommand)
let l:curlcommand[2] = json_encode(l:querydata)
let b:job = job_start(l:curlcommand, {"callback": function("s:callbackHandler", [l:cbuffer])})
endfunction
" Echos the tokkenization of the provided string , or cursor to end of word
" Onus is placed on the user to include the preceding space
func llama#tokenizeWord(...)
if (a:0 > 0)
let l:input = a:1
else
exe "normal \"*ye"
let l:input = @*
endif
let l:querydata = {"content": l:input}
let l:curlcommand = copy(s:curlcommand)
let l:curlcommand[2] = json_encode(l:querydata)
let l:curlcommand[8] = g:llama_api_url .. "/tokenize"
let s:token_job = job_start(l:curlcommand, {"callback": function("s:tokenizeWordCallback", [l:input])})
endfunction
func s:tokenizeWordCallback(plaintext, channel, msg)
echo '"' .. a:plaintext ..'" - ' .. string(json_decode(a:msg).tokens)
endfunction
" Echos the token count of the entire buffer (or provided string)
" Example usage :echo llama#tokenCount()
func llama#tokenCount(...)
if (a:0 > 0)
let l:buflines = a:1
else
let l:buflines = getline(1,1000)
if l:buflines[0][0:1] == '!*'
let l:buflines = l:buflines[1:-1]
endif
let l:buflines = join(l:buflines, "\n")
endif
let l:querydata = {"content": l:buflines}
let l:curlcommand = copy(s:curlcommand)
let l:curlcommand[2] = json_encode(l:querydata)
let l:curlcommand[8] = g:llama_api_url .. "/tokenize"
let s:token_job = job_start(l:curlcommand, {"callback": "s:tokenCountCallback"})
endfunction
func s:tokenCountCallback(channel, msg)
let resp = json_decode(a:msg)
echo len(resp.tokens)
endfunction

View File

@@ -1,18 +0,0 @@
#!/bin/bash
#
# Temporary script - will be removed in the future
#
cd `dirname $0`
cd ..
./main -m models/available/Llama2/13B/llama-2-13b.ggmlv3.q4_0.bin \
--color \
--ctx_size 2048 \
-n -1 \
-ins -b 256 \
--top_k 10000 \
--temp 0.2 \
--repeat_penalty 1.1 \
-t 8

View File

@@ -1,18 +0,0 @@
#!/bin/bash
#
# Temporary script - will be removed in the future
#
cd `dirname $0`
cd ..
./main -m models/available/Llama2/7B/llama-2-7b.ggmlv3.q4_0.bin \
--color \
--ctx_size 2048 \
-n -1 \
-ins -b 256 \
--top_k 10000 \
--temp 0.2 \
--repeat_penalty 1.1 \
-t 8

View File

@@ -1,28 +0,0 @@
" Basic plugin example
function! Llm()
let url = "http://127.0.0.1:8080/completion"
" Get the content of the current buffer
let buffer_content = join(getline(1, '$'), "\n")
" Create the JSON payload
let json_payload = {"temp":0.72,"top_k":100,"top_p":0.73,"repeat_penalty":1.100000023841858,"n_predict":256,"stop": ["\n\n\n"],"stream": v:false}
let json_payload.prompt = buffer_content
" Define the curl command
let curl_command = 'curl -k -s -X POST -H "Content-Type: application/json" -d @- ' . url
let response = system(curl_command, json_encode(json_payload))
" Extract the content field from the response
let content = json_decode(response).content
let split_newlines = split(content, '\n', 1)
" Insert the content at the cursor position
call setline(line('.'), [ getline('.') . split_newlines[0] ] + split_newlines[1:])
endfunction
command! Llm call Llm()
noremap <F2> :Llm<CR>

View File

@@ -1,51 +0,0 @@
# Prerequisites
*.d
# Compiled Object files
*.slo
*.lo
*.o
*.obj
# Precompiled Headers
*.gch
*.pch
# Compiled Dynamic libraries
*.so
*.dylib
*.dll
# Fortran module files
*.mod
*.smod
# Compiled Static libraries
*.lai
*.la
*.a
*.lib
# Executables
*.exe
*.out
*.app
*.gguf
*.log
.DS_Store
.build/
.cache/
.direnv/
.envrc
.swiftpm
.venv
.clang-tidy
.vs/
.vscode/
build*/
out/
tmp/

View File

@@ -1,36 +0,0 @@
cmake_minimum_required(VERSION 3.12)
project("main-cmake-pkg" C CXX)
set(TARGET main-cmake-pkg)
find_package(Llama 0.0.1 REQUIRED)
# Bake common functionality in with target. Because applications
# using the relocatable Llama package should be outside of the
# source tree, main-cmake-pkg pretends the dependencies are built-in.
set(_common_path "${CMAKE_CURRENT_LIST_DIR}/../../common")
add_library(common OBJECT
${_common_path}/common.h
${_common_path}/common.cpp
${_common_path}/console.h
${_common_path}/console.cpp
${_common_path}/grammar-parser.h
${_common_path}/grammar-parser.cpp
)
# WARNING: because build-info.h is auto-generated, it will only
# be available after the user has built the llama.cpp sources.
#
configure_file(${_common_path}/../build-info.h
${CMAKE_CURRENT_BINARY_DIR}/build-info.h
COPYONLY)
target_include_directories(common PUBLIC ${LLAMA_INCLUDE_DIR}
${CMAKE_CURRENT_BINARY_DIR})
add_executable(${TARGET} ${CMAKE_CURRENT_LIST_DIR}/../main/main.cpp)
target_include_directories(${TARGET} PRIVATE ${_common_path})
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View File

@@ -1,37 +0,0 @@
# llama.cpp/example/main-cmake-pkg
This program builds the [main](../main) application using a relocatable CMake package. It serves as an example of using the `find_package()` CMake command to conveniently include [llama.cpp](https://github.com/ggerganov/llama.cpp) in projects which live outside of the source tree.
## Building
Because this example is "outside of the source tree", it is important to first build/install llama.cpp using CMake. An example is provided here, but please see the [llama.cpp build instructions](../..) for more detailed build instructions.
### Considerations
When hardware acceleration libraries are used (e.g. CUBlas, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_.
### Build llama.cpp and install to C:\LlamaCPP directory
In this case, CLBlast was already installed so the CMake package is referenced in `CMAKE_PREFIX_PATH`.
```cmd
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
mkdir build
cd build
cmake .. -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=C:/CLBlast/lib/cmake/CLBlast -G "Visual Studio 17 2022" -A x64
cmake --build . --config Release
cmake --install . --prefix C:/LlamaCPP
```
### Build main-cmake-pkg
```cmd
cd ..\examples\main-cmake-pkg
mkdir build
cd build
cmake .. -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/CLBlast/lib/cmake/CLBlast;C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64
cmake --build . --config Release
cmake --install . --prefix C:/MyLlamaApp
```

Some files were not shown because too many files have changed in this diff Show More