Compare commits

..

1 Commits

Author SHA1 Message Date
Georgi Gerganov
731e7528be server : fix --threads-http arg 2024-06-06 16:37:12 +03:00
598 changed files with 68350 additions and 84618 deletions

View File

@@ -15,7 +15,7 @@ node('x86_runner1'){ // Running on x86 runner containing latest vecto
stage('Running llama.cpp'){
sh'''#!/bin/bash
module load gnu-bin2/0.1 # loading latest versions of vector qemu and vector gcc
qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./llama-cli -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./main -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
cat llama_log.txt # Printing results
'''
}

View File

@@ -27,7 +27,7 @@ COPY . .
# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable CUDA
ENV GGML_CUDA=1
ENV LLAMA_CUDA=1
# Enable cURL
ENV LLAMA_CURL=1

View File

@@ -36,7 +36,7 @@ COPY . .
# Set nvcc architecture
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV GGML_HIPBLAS=1
ENV LLAMA_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++

View File

@@ -1,26 +0,0 @@
ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
ARG GGML_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git
WORKDIR /app
COPY . .
RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
echo "GGML_SYCL_F16 is set" && \
export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
fi && \
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
cmake --build build --config Release --target llama-cli
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
COPY --from=build /app/build/bin/llama-cli /llama-cli
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/llama-cli" ]

View File

@@ -0,0 +1,84 @@
# SRPM for building from source and packaging an RPM for RPM-based distros.
# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
# Built and maintained by John Boero - boeroboy@gmail.com
# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
# Notes for llama.cpp:
# 1. Tags are currently based on hash - which will not sort asciibetically.
# We need to declare standard versioning if people want to sort latest releases.
# 2. Builds for CUDA/OpenCL support are separate, with different depenedencies.
# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
# Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
# It is up to the user to install the correct vendor-specific support.
Name: llama.cpp-clblast
Version: %( date "+%%Y%%m%%d" )
Release: 1%{?dist}
Summary: OpenCL Inference of LLaMA model in C/C++
License: MIT
Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
BuildRequires: coreutils make gcc-c++ git mesa-libOpenCL-devel clblast-devel
Requires: clblast
URL: https://github.com/ggerganov/llama.cpp
%define debug_package %{nil}
%define source_date_epoch_from_changelog 0
%description
CPU inference for Meta's Lllama2 models using default options.
%prep
%setup -n llama.cpp-master
%build
make -j LLAMA_CLBLAST=1
%install
mkdir -p %{buildroot}%{_bindir}/
cp -p main %{buildroot}%{_bindir}/llamaclblast
cp -p server %{buildroot}%{_bindir}/llamaclblastserver
cp -p simple %{buildroot}%{_bindir}/llamaclblastsimple
mkdir -p %{buildroot}/usr/lib/systemd/system
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamaclblast.service
[Unit]
Description=Llama.cpp server, CPU only (no GPU support in this build).
After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
[Service]
Type=simple
EnvironmentFile=/etc/sysconfig/llama
ExecStart=/usr/bin/llamaclblastserver $LLAMA_ARGS
ExecReload=/bin/kill -s HUP $MAINPID
Restart=never
[Install]
WantedBy=default.target
EOF
mkdir -p %{buildroot}/etc/sysconfig
%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
EOF
%clean
rm -rf %{buildroot}
rm -rf %{_builddir}/*
%files
%{_bindir}/llamaclblast
%{_bindir}/llamaclblastserver
%{_bindir}/llamaclblastsimple
/usr/lib/systemd/system/llamaclblast.service
%config /etc/sysconfig/llama
%pre
%post
%preun
%postun
%changelog

View File

@@ -32,13 +32,13 @@ CPU inference for Meta's Lllama2 models using default options.
%setup -n llama.cpp-master
%build
make -j GGML_CUDA=1
make -j LLAMA_CUDA=1
%install
mkdir -p %{buildroot}%{_bindir}/
cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli
cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server
cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple
cp -p main %{buildroot}%{_bindir}/llamacppcuda
cp -p server %{buildroot}%{_bindir}/llamacppcudaserver
cp -p simple %{buildroot}%{_bindir}/llamacppcudasimple
mkdir -p %{buildroot}/usr/lib/systemd/system
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacuda.service
@@ -49,7 +49,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t
[Service]
Type=simple
EnvironmentFile=/etc/sysconfig/llama
ExecStart=/usr/bin/llama-cuda-server $LLAMA_ARGS
ExecStart=/usr/bin/llamacppcudaserver $LLAMA_ARGS
ExecReload=/bin/kill -s HUP $MAINPID
Restart=never
@@ -67,9 +67,9 @@ rm -rf %{buildroot}
rm -rf %{_builddir}/*
%files
%{_bindir}/llama-cuda-cli
%{_bindir}/llama-cuda-server
%{_bindir}/llama-cuda-simple
%{_bindir}/llamacppcuda
%{_bindir}/llamacppcudaserver
%{_bindir}/llamacppcudasimple
/usr/lib/systemd/system/llamacuda.service
%config /etc/sysconfig/llama

View File

@@ -38,9 +38,9 @@ make -j
%install
mkdir -p %{buildroot}%{_bindir}/
cp -p llama-cli %{buildroot}%{_bindir}/llama-cli
cp -p llama-server %{buildroot}%{_bindir}/llama-server
cp -p llama-simple %{buildroot}%{_bindir}/llama-simple
cp -p main %{buildroot}%{_bindir}/llama
cp -p server %{buildroot}%{_bindir}/llamaserver
cp -p simple %{buildroot}%{_bindir}/llamasimple
mkdir -p %{buildroot}/usr/lib/systemd/system
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llama.service
@@ -51,7 +51,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t
[Service]
Type=simple
EnvironmentFile=/etc/sysconfig/llama
ExecStart=/usr/bin/llama-server $LLAMA_ARGS
ExecStart=/usr/bin/llamaserver $LLAMA_ARGS
ExecReload=/bin/kill -s HUP $MAINPID
Restart=never
@@ -69,9 +69,9 @@ rm -rf %{buildroot}
rm -rf %{_builddir}/*
%files
%{_bindir}/llama-cli
%{_bindir}/llama-server
%{_bindir}/llama-simple
%{_bindir}/llama
%{_bindir}/llamaserver
%{_bindir}/llamasimple
/usr/lib/systemd/system/llama.service
%config /etc/sysconfig/llama

View File

@@ -1,31 +0,0 @@
ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
ARG GGML_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git libcurl4-openssl-dev
WORKDIR /app
COPY . .
RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
echo "GGML_SYCL_F16 is set" && \
export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
fi && \
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
cmake --build build --config Release --target llama-server
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev curl
COPY --from=build /app/build/bin/llama-server /llama-server
ENV LC_ALL=C.utf8
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/llama-server" ]

View File

@@ -21,15 +21,15 @@ COPY . .
# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable CUDA
ENV GGML_CUDA=1
ENV LLAMA_CUDA=1
RUN make -j$(nproc) llama-cli
RUN make -j$(nproc) main
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
RUN apt-get update && \
apt-get install -y libgomp1
COPY --from=build /app/llama-cli /llama-cli
COPY --from=build /app/main /main
ENTRYPOINT [ "/llama-cli" ]
ENTRYPOINT [ "/main" ]

View File

@@ -0,0 +1,34 @@
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
rm /etc/apt/sources.list.d/intel-graphics.list && \
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
chmod 644 /usr/share/keyrings/intel-graphics.gpg
ARG LLAMA_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git
WORKDIR /app
COPY . .
RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
echo "LLAMA_SYCL_F16 is set" && \
export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
fi && \
cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
cmake --build build --config Release --target main
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
COPY --from=build /app/build/bin/main /main
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/main" ]

View File

@@ -36,10 +36,10 @@ COPY . .
# Set nvcc architecture
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV GGML_HIPBLAS=1
ENV LLAMA_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++
RUN make -j$(nproc) llama-cli
RUN make -j$(nproc) main
ENTRYPOINT [ "/app/llama-cli" ]
ENTRYPOINT [ "/app/main" ]

View File

@@ -14,14 +14,14 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
# Build it
WORKDIR /app
COPY . .
RUN cmake -B build -DGGML_VULKAN=1 && \
cmake --build build --config Release --target llama-cli
RUN cmake -B build -DLLAMA_VULKAN=1 && \
cmake --build build --config Release --target main
# Clean up
WORKDIR /
RUN cp /app/build/bin/llama-cli /llama-cli && \
RUN cp /app/build/bin/main /main && \
rm -rf /app
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/llama-cli" ]
ENTRYPOINT [ "/main" ]

View File

@@ -9,15 +9,15 @@ WORKDIR /app
COPY . .
RUN make -j$(nproc) llama-cli
RUN make -j$(nproc) main
FROM ubuntu:$UBUNTU_VERSION as runtime
RUN apt-get update && \
apt-get install -y libgomp1
COPY --from=build /app/llama-cli /llama-cli
COPY --from=build /app/main /main
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/llama-cli" ]
ENTRYPOINT [ "/main" ]

View File

@@ -6,11 +6,11 @@
let
inherit (config.packages) default;
binaries = [
"llama-cli"
"llama"
"llama-embedding"
"llama-server"
"llama-quantize"
"llama-train-text-from-scratch"
"quantize"
"train-text-from-scratch"
];
mkApp = name: {
type = "app";

View File

@@ -17,18 +17,19 @@
rocmPackages,
vulkan-headers,
vulkan-loader,
curl,
clblast,
useBlas ? builtins.all (x: !x) [
useCuda
useMetalKit
useOpenCL
useRocm
useVulkan
] && blas.meta.available,
useCuda ? config.cudaSupport,
useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin,
useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
useMpi ? false, # Increases the runtime closure size by ~700M
useOpenCL ? false,
useRocm ? config.rocmSupport,
enableCurl ? true,
useVulkan ? false,
llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
@@ -55,6 +56,7 @@ let
++ lib.optionals useCuda [ "CUDA" ]
++ lib.optionals useMetalKit [ "MetalKit" ]
++ lib.optionals useMpi [ "MPI" ]
++ lib.optionals useOpenCL [ "OpenCL" ]
++ lib.optionals useRocm [ "ROCm" ]
++ lib.optionals useVulkan [ "Vulkan" ];
@@ -158,9 +160,9 @@ effectiveStdenv.mkDerivation (
};
postPatch = ''
substituteInPlace ./ggml/src/ggml-metal.m \
substituteInPlace ./ggml-metal.m \
--replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
substituteInPlace ./ggml/src/ggml-metal.m \
substituteInPlace ./ggml-metal.m \
--replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
'';
@@ -196,24 +198,24 @@ effectiveStdenv.mkDerivation (
optionals effectiveStdenv.isDarwin darwinBuildInputs
++ optionals useCuda cudaBuildInputs
++ optionals useMpi [ mpi ]
++ optionals useOpenCL [ clblast ]
++ optionals useRocm rocmBuildInputs
++ optionals useBlas [ blas ]
++ optionals useVulkan vulkanBuildInputs
++ optionals enableCurl [ curl ];
++ optionals useVulkan vulkanBuildInputs;
cmakeFlags =
[
(cmakeBool "LLAMA_NATIVE" false)
(cmakeBool "LLAMA_BUILD_SERVER" true)
(cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
(cmakeBool "LLAMA_CURL" enableCurl)
(cmakeBool "GGML_NATIVE" false)
(cmakeBool "GGML_BLAS" useBlas)
(cmakeBool "GGML_CUDA" useCuda)
(cmakeBool "GGML_HIPBLAS" useRocm)
(cmakeBool "GGML_METAL" useMetalKit)
(cmakeBool "GGML_VULKAN" useVulkan)
(cmakeBool "GGML_STATIC" enableStatic)
(cmakeBool "LLAMA_BLAS" useBlas)
(cmakeBool "LLAMA_CLBLAST" useOpenCL)
(cmakeBool "LLAMA_CUDA" useCuda)
(cmakeBool "LLAMA_HIPBLAS" useRocm)
(cmakeBool "LLAMA_METAL" useMetalKit)
(cmakeBool "LLAMA_VULKAN" useVulkan)
(cmakeBool "LLAMA_STATIC" enableStatic)
]
++ optionals useCuda [
(
@@ -229,7 +231,7 @@ effectiveStdenv.mkDerivation (
]
++ optionals useMetalKit [
(lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
(cmakeBool "GGML_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
(cmakeBool "LLAMA_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
];
# Environment variables needed for ROCm
@@ -241,8 +243,10 @@ effectiveStdenv.mkDerivation (
# TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
# if they haven't been added yet.
postInstall = ''
mv $out/bin/main${executableSuffix} $out/bin/llama${executableSuffix}
mv $out/bin/server${executableSuffix} $out/bin/llama-server${executableSuffix}
mkdir -p $out/include
cp $src/include/llama.h $out/include/
cp $src/llama.h $out/include/
'';
# Define the shells here, but don't add in the inputsFrom to avoid recursion.
@@ -252,6 +256,7 @@ effectiveStdenv.mkDerivation (
useCuda
useMetalKit
useMpi
useOpenCL
useRocm
useVulkan
;
@@ -278,7 +283,7 @@ effectiveStdenv.mkDerivation (
# Configurations we don't want even the CI to evaluate. Results in the
# "unsupported platform" messages. This is mostly a no-op, because
# cudaPackages would've refused to evaluate anyway.
badPlatforms = optionals useCuda lib.platforms.darwin;
badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin;
# Configurations that are known to result in build failures. Can be
# overridden by importing Nixpkgs with `allowBroken = true`.
@@ -289,7 +294,7 @@ effectiveStdenv.mkDerivation (
license = lib.licenses.mit;
# Accommodates `nix run` and `lib.getExe`
mainProgram = "llama-cli";
mainProgram = "llama";
# These people might respond, on the best effort basis, if you ping them
# in case of Nix-specific regressions or for reviewing Nix-specific PRs.

View File

@@ -21,19 +21,17 @@ COPY . .
# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable CUDA
ENV GGML_CUDA=1
ENV LLAMA_CUDA=1
# Enable cURL
ENV LLAMA_CURL=1
RUN make -j$(nproc) llama-server
RUN make -j$(nproc) server
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev libgomp1 curl
apt-get install -y libcurl4-openssl-dev libgomp1
COPY --from=build /app/llama-server /llama-server
COPY --from=build /app/server /server
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/llama-server" ]
ENTRYPOINT [ "/server" ]

View File

@@ -0,0 +1,45 @@
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
rm /etc/apt/sources.list.d/intel-graphics.list && \
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
chmod 644 /usr/share/keyrings/intel-graphics.gpg
ARG LLAMA_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git libcurl4-openssl-dev
WORKDIR /app
COPY . .
RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
echo "LLAMA_SYCL_F16 is set" && \
export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
fi && \
cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
cmake --build build --config Release --target server
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
rm /etc/apt/sources.list.d/intel-graphics.list && \
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
chmod 644 /usr/share/keyrings/intel-graphics.gpg
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev
COPY --from=build /app/build/bin/server /server
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/server" ]

View File

@@ -36,17 +36,15 @@ COPY . .
# Set nvcc architecture
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV GGML_HIPBLAS=1
ENV LLAMA_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++
# Enable cURL
ENV LLAMA_CURL=1
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev curl
apt-get install -y libcurl4-openssl-dev
RUN make -j$(nproc) llama-server
RUN make -j$(nproc)
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/app/llama-server" ]
ENTRYPOINT [ "/app/server" ]

View File

@@ -5,25 +5,27 @@ FROM ubuntu:$UBUNTU_VERSION as build
# Install build tools
RUN apt update && apt install -y git build-essential cmake wget
# Install Vulkan SDK and cURL
# Install Vulkan SDK
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
apt update -y && \
apt-get install -y vulkan-sdk libcurl4-openssl-dev curl
apt-get install -y vulkan-sdk
# Install cURL
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev
# Build it
WORKDIR /app
COPY . .
RUN cmake -B build -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
cmake --build build --config Release --target llama-server
RUN cmake -B build -DLLAMA_VULKAN=1 -DLLAMA_CURL=1 && \
cmake --build build --config Release --target server
# Clean up
WORKDIR /
RUN cp /app/build/bin/llama-server /llama-server && \
RUN cp /app/build/bin/server /server && \
rm -rf /app
ENV LC_ALL=C.utf8
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/llama-server" ]
ENTRYPOINT [ "/server" ]

View File

@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION as build
RUN apt-get update && \
apt-get install -y build-essential git libcurl4-openssl-dev curl
apt-get install -y build-essential git libcurl4-openssl-dev
WORKDIR /app
@@ -11,17 +11,15 @@ COPY . .
ENV LLAMA_CURL=1
RUN make -j$(nproc) llama-server
RUN make -j$(nproc) server
FROM ubuntu:$UBUNTU_VERSION as runtime
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev libgomp1
COPY --from=build /app/llama-server /llama-server
COPY --from=build /app/server /server
ENV LC_ALL=C.utf8
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/llama-server" ]
ENTRYPOINT [ "/server" ]

View File

@@ -10,11 +10,11 @@ shift
if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
python3 ./convert-hf-to-gguf.py "$@"
elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
./llama-quantize "$@"
./quantize "$@"
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
./llama-cli "$@"
./main "$@"
elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then
./llama-finetune "$@"
./finetune "$@"
elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
echo "Converting PTH to GGML..."
for i in `ls $1/$2/ggml-model-f16.bin*`; do
@@ -22,11 +22,11 @@ elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
else
echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
./llama-quantize "$i" "${i/f16/q4_0}" q4_0
./quantize "$i" "${i/f16/q4_0}" q4_0
fi
done
elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
./llama-server "$@"
./server "$@"
else
echo "Unknown command: $arg1"
echo "Available commands: "

View File

@@ -12,8 +12,8 @@ build*/
models/*
/llama-cli
/llama-quantize
/main
/quantize
arm_neon.h
compile_commands.json

View File

@@ -26,7 +26,3 @@ indent_size = 2
[examples/llama.swiftui/llama.swiftui.xcodeproj/*]
indent_style = tab
[examples/cvector-generator/*.txt]
trim_trailing_whitespace = unset
insert_final_newline = unset

View File

@@ -24,7 +24,7 @@ body:
label: Name and Version
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
placeholder: |
$./llama-cli --version
$./main --version
version: 2999 (42b4109e)
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
validations:

View File

@@ -24,7 +24,7 @@ body:
label: Name and Version
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
placeholder: |
$./llama-cli --version
$./main --version
version: 2999 (42b4109e)
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
validations:

View File

@@ -24,7 +24,7 @@ body:
label: Name and Version
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
placeholder: |
$./llama-cli --version
$./main --version
version: 2999 (42b4109e)
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
validations:

View File

@@ -24,7 +24,7 @@ body:
label: Name and Version
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
placeholder: |
$./llama-cli --version
$./main --version
version: 2999 (42b4109e)
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
validations:

View File

@@ -9,3 +9,5 @@ contact_links:
- name: Want to contribute?
url: https://github.com/ggerganov/llama.cpp/wiki/contribute
about: Head to the contribution guide page of the wiki for areas you can help with

29
.github/labeler.yml vendored
View File

@@ -2,31 +2,31 @@
Kompute:
- changed-files:
- any-glob-to-any-file:
- ggml/include/ggml-kompute.h
- ggml/src/ggml-kompute.cpp
- ggml-kompute.h
- ggml-kompute.cpp
- README-kompute.md
Apple Metal:
- changed-files:
- any-glob-to-any-file:
- ggml/include/ggml-metal.h
- ggml/src/ggml-metal.cpp
- ggml-metal.h
- ggml-metal.cpp
- README-metal.md
SYCL:
- changed-files:
- any-glob-to-any-file:
- ggml/include/ggml-sycl.h
- ggml/src/ggml-sycl.cpp
- ggml-sycl.h
- ggml-sycl.cpp
- README-sycl.md
Nvidia GPU:
- changed-files:
- any-glob-to-any-file:
- ggml/include/ggml-cuda.h
- ggml/src/ggml-cuda/**
- ggml-cuda.h
- ggml-cuda/**
Vulkan:
- changed-files:
- any-glob-to-any-file:
- ggml/ggml_vk_generate_shaders.py
- ggml/src/ggml-vulkan*
- ggml_vk_generate_shaders.py
- ggml-vulkan*
documentation:
- changed-files:
- any-glob-to-any-file:
@@ -42,6 +42,7 @@ build:
- cmake/**
- CMakeLists.txt
- CMakePresets.json
- codecov.yml
examples:
- changed-files:
- any-glob-to-any-file: examples/**
@@ -73,10 +74,10 @@ server:
ggml:
- changed-files:
- any-glob-to-any-file:
- ggml/include/ggml*.h
- ggml/src/ggml*.c
- ggml/src/ggml*.cpp
- ggml/src/ggml*.h
- ggml.c
- ggml.h
- ggml-*.c
- ggml-*.h
- ggml-cuda/**
nix:
- changed-files:

View File

@@ -1,7 +0,0 @@
- [x] I have read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md)
- Self-reported review complexity:
- [ ] Low
- [ ] Medium
- [ ] High

View File

@@ -109,7 +109,7 @@ jobs:
run: |
set -eux
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DLLAMA_CURL=ON \
-DLLAMA_CUBLAS=ON \
@@ -119,7 +119,7 @@ jobs:
-DLLAMA_FATAL_WARNINGS=OFF \
-DLLAMA_ALL_WARNINGS=OFF \
-DCMAKE_BUILD_TYPE=Release;
cmake --build build --config Release -j $(nproc) --target llama-server
cmake --build build --config Release -j $(nproc) --target server
- name: Download the dataset
id: download_dataset

View File

@@ -10,10 +10,10 @@ on:
push:
branches:
- master
paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
pull_request:
types: [opened, synchronize, reopened]
paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@@ -47,7 +47,7 @@ jobs:
sysctl -a
mkdir build
cd build
cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF ..
cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test
@@ -84,7 +84,7 @@ jobs:
name: llama-bin-macos-arm64.zip
macOS-latest-cmake-x64:
runs-on: macos-12
runs-on: macos-latest
steps:
- name: Clone
@@ -103,10 +103,12 @@ jobs:
id: cmake_build
run: |
sysctl -a
mkdir build
cd build
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
# https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF -DLLAMA_CURL=ON ..
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test
id: cmake_test
@@ -222,7 +224,7 @@ jobs:
run: |
mkdir build
cd build
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON
cmake --build . --config Release -j $(nproc)
- name: Test
@@ -239,8 +241,8 @@ jobs:
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
echo "Fetch llama2c model"
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
./bin/convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
./bin/main -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
- name: Determine tag name
id: tag
@@ -305,7 +307,7 @@ jobs:
run: |
mkdir build
cd build
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DGGML_OPENMP=OFF
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DLLAMA_OPENMP=OFF
cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
- name: Test
@@ -335,7 +337,7 @@ jobs:
run: |
mkdir build
cd build
cmake -DGGML_RPC=ON ..
cmake -DLLAMA_RPC=ON ..
cmake --build . --config Release -j $(nproc)
- name: Test
@@ -363,7 +365,7 @@ jobs:
run: |
mkdir build
cd build
cmake -DGGML_VULKAN=ON ..
cmake -DLLAMA_VULKAN=ON ..
cmake --build . --config Release -j $(nproc)
ubuntu-22-cmake-hip:
@@ -384,13 +386,13 @@ jobs:
- name: Build with native CMake HIP support
id: cmake_build
run: |
cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DGGML_HIPBLAS=ON
cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DLLAMA_HIPBLAS=ON
cmake --build build --config Release -j $(nproc)
- name: Build with legacy HIP support
id: cmake_build_legacy_hip
run: |
cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIPBLAS=ON
cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DLLAMA_HIPBLAS=ON
cmake --build build2 --config Release -j $(nproc)
ubuntu-22-cmake-sycl:
@@ -431,7 +433,7 @@ jobs:
source /opt/intel/oneapi/setvars.sh
mkdir build
cd build
cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
cmake --build . --config Release -j $(nproc)
ubuntu-22-cmake-sycl-fp16:
@@ -472,10 +474,10 @@ jobs:
source /opt/intel/oneapi/setvars.sh
mkdir build
cd build
cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON ..
cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON ..
cmake --build . --config Release -j $(nproc)
# TODO: build with GGML_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
# TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
# how to debug it.
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
macOS-latest-make:
@@ -497,15 +499,15 @@ jobs:
env:
LLAMA_FATAL_WARNINGS: 1
run: |
GGML_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu)
LLAMA_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu)
- name: Test
id: make_test
run: |
GGML_NO_METAL=1 make tests -j $(sysctl -n hw.logicalcpu)
GGML_NO_METAL=1 make test -j $(sysctl -n hw.logicalcpu)
LLAMA_NO_METAL=1 make tests -j $(sysctl -n hw.logicalcpu)
LLAMA_NO_METAL=1 make test -j $(sysctl -n hw.logicalcpu)
# TODO: build with GGML_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
# TODO: build with LLAMA_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
# how to debug it.
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
# would be great if we fix these
@@ -529,7 +531,7 @@ jobs:
sysctl -a
mkdir build
cd build
cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF ..
cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF ..
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test
@@ -559,14 +561,13 @@ jobs:
mkdir build
cd build
cmake -G Xcode .. \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DLLAMA_METAL_EMBED_LIBRARY=ON \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-DCMAKE_SYSTEM_NAME=iOS \
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
macOS-latest-cmake-tvos:
runs-on: macos-latest
@@ -589,14 +590,13 @@ jobs:
mkdir build
cd build
cmake -G Xcode .. \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DLLAMA_METAL_EMBED_LIBRARY=ON \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-DCMAKE_SYSTEM_NAME=tvOS \
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
macOS-latest-swift:
runs-on: macos-latest
@@ -664,7 +664,7 @@ jobs:
- name: Build using make w/ OpenBLAS
shell: msys2 {0}
run: |
make GGML_OPENBLAS=1 -j $(nproc)
make LLAMA_OPENBLAS=1 -j $(nproc)
- name: Build using CMake
shell: msys2 {0}
@@ -680,11 +680,11 @@ jobs:
- name: Build using CMake w/ OpenBLAS
shell: msys2 {0}
run: |
cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
cmake --build build --config ${{ matrix.build }} -j $(nproc)
windows-latest-cmake:
runs-on: windows-2019
runs-on: windows-latest
env:
OPENBLAS_VERSION: 0.3.23
@@ -695,25 +695,25 @@ jobs:
matrix:
include:
- build: 'rpc-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_RPC=ON -DBUILD_SHARED_LIBS=ON'
- build: 'noavx-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
- build: 'avx2-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
- build: 'avx-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
- build: 'avx512-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
- build: 'openblas-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
- build: 'kompute-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
- build: 'vulkan-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
- build: 'llvm-arm64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
- build: 'msvc-arm64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
steps:
- name: Clone
@@ -726,7 +726,7 @@ jobs:
id: clone_kompute
if: ${{ matrix.build == 'kompute-x64' }}
run: |
git submodule update --init ggml/src/kompute
git submodule update --init kompute
- name: Download OpenBLAS
id: get_openblas
@@ -799,7 +799,6 @@ jobs:
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
$sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
cd build
$env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1
& $sde -future -- ctest -L main -C Release --verbose --timeout 900
- name: Determine tag name
@@ -830,7 +829,7 @@ jobs:
name: llama-bin-win-${{ matrix.build }}.zip
windows-latest-cmake-cuda:
runs-on: windows-2019
runs-on: windows-latest
strategy:
matrix:
@@ -844,9 +843,8 @@ jobs:
with:
fetch-depth: 0
- name: Install CUDA toolkit
- uses: Jimver/cuda-toolkit@v0.2.11
id: cuda-toolkit
uses: Jimver/cuda-toolkit@v0.2.15
with:
cuda: ${{ matrix.cuda }}
method: 'network'
@@ -857,7 +855,7 @@ jobs:
run: |
mkdir build
cd build
cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON
cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=ON
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
- name: Determine tag name
@@ -990,7 +988,7 @@ jobs:
run: |
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON
cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DLLAMA_HIPBLAS=ON
cmake --build build --config Release
ios-xcode-build:

40
.github/workflows/code-coverage.yml vendored Normal file
View File

@@ -0,0 +1,40 @@
name: Code Coverage
on: [push, pull_request]
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
jobs:
run:
runs-on: ubuntu-20.04
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential gcc-8 lcov
- name: Build
run: CC=gcc-8 make -j LLAMA_CODE_COVERAGE=1 tests
- name: Run tests
run: CC=gcc-8 make test
- name: Generate coverage report
run: |
make coverage
make lcov-report
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
with:
files: lcov-report/coverage.info

View File

@@ -10,11 +10,10 @@
name: Publish Docker image
on:
#pull_request:
pull_request:
push:
branches:
- master
paths: ['.github/workflows/docker.yml', '.devops/*.Dockerfile', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@@ -23,7 +22,7 @@ concurrency:
jobs:
push_to_registry:
name: Push Docker image to Docker Hub
#if: github.event.pull_request.draft == false
if: github.event.pull_request.draft == false
runs-on: ubuntu-latest
env:
@@ -31,18 +30,20 @@ jobs:
strategy:
matrix:
config:
- { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "server", dockerfile: ".devops/server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
# NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
# have disabled them for now until the reason why
# is understood.
- { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
# Note: the full-rocm image is failing due to a "no space left on device" error. It is disabled for now to allow the workflow to complete.
#- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" }
- { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" }
- { tag: "server-cuda", dockerfile: ".devops/server-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "server-rocm", dockerfile: ".devops/server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
- { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
steps:
- name: Check out the repo
uses: actions/checkout@v4

View File

@@ -16,9 +16,11 @@ on:
branches:
- master
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
pull_request:
pull_request_target:
types: [opened, synchronize, reopened]
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
schedule:
- cron: '2 4 * * *'
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
@@ -30,7 +32,7 @@ jobs:
strategy:
matrix:
sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
sanitizer: [ADDRESS, THREAD, UNDEFINED]
build_type: [RelWithDebInfo]
include:
- build_type: Release
@@ -87,30 +89,16 @@ jobs:
exit 1
fi
- name: Build (no OpenMP)
id: cmake_build_no_openmp
if: ${{ matrix.sanitizer == 'THREAD' }}
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DLLAMA_CURL=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DGGML_OPENMP=OFF ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Build
id: cmake_build
if: ${{ matrix.sanitizer != 'THREAD' }}
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DLLAMA_CURL=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target server
- name: Tests
id: server_integration_tests
@@ -127,7 +115,7 @@ jobs:
server-windows:
runs-on: windows-2019
runs-on: windows-latest
steps:
- name: Clone
@@ -150,7 +138,7 @@ jobs:
id: cmake_build
run: |
cmake -B build -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server
- name: Python setup
id: setup_python

154
.gitignore vendored
View File

@@ -1,125 +1,129 @@
# Extensions
*.o
*.a
*.bat
*.bin
*.dll
*.dot
*.etag
*.exe
*.gcda
*.gcno
*.gcov
*.so
*.gguf
*.gguf.json
*.lastModified
*.bin
*.exe
*.dll
*.log
*.metallib
*.o
*.so
*.gcov
*.gcno
*.gcda
*.dot
*.bat
*.tmp
# IDE / OS
*.metallib
*.etag
*.lastModified
.DS_Store
.build/
.cache/
.ccls-cache/
.direnv/
.DS_Store
.envrc
.idea/
.swiftpm
.venv
.clang-tidy
.vs/
.vscode/
nppBackup
.idea/
ggml-metal-embed.metal
# Coverage
gcovr-report/
lcov-report/
# Build Artifacts
gcovr-report/
tags
.build/
build*
!build-info.cmake
!build-info.cpp.in
!build-info.sh
!build.zig
/libllama.so
/llama-*
android-ndk-*
arm_neon.h
cmake-build-*
CMakeSettings.json
compile_commands.json
ggml-metal-embed.metal
llama-batched-swift
/rpc-server
android-ndk-*
out/
tmp/
# CI
!.github/workflows/*.yml
# Models
models/*
models-mnt
!models/.editorconfig
!models/ggml-vocab-*.gguf*
# Zig
/Pipfile
/baby-llama
/beam-search
/benchmark-matmult
/convert-llama2c-to-ggml
/embd-input-test
/embedding
/eval-callback
/gguf
/gguf-llama-simple
/gguf-split
/gritlm
/imatrix
/infill
/libllama.so
/llama-bench
/llava-cli
/lookahead
/lookup
/lookup-create
/lookup-merge
/lookup-stats
/main
/metal
/passkey
/perplexity
/q8dot
/quantize
/quantize-stats
/result
/save-load-state
/server
/simple
/batched
/batched-bench
/export-lora
/finetune
/retrieval
/speculative
/parallel
/train-text-from-scratch
/tokenize
/vdot
/common/build-info.cpp
arm_neon.h
compile_commands.json
CMakeSettings.json
__pycache__
dist
zig-out/
zig-cache/
# Logs
ppl-*.txt
qnt-*.txt
perf-*.txt
# Examples
examples/jeopardy/results.txt
examples/server/*.css.hpp
examples/server/*.html.hpp
examples/server/*.js.hpp
examples/server/*.mjs.hpp
!build_64.sh
!examples/*.bat
!examples/*/*.kts
!examples/*/*/*.kts
!examples/sycl/*.bat
!examples/sycl/*.sh
examples/server/*.css.hpp
# Python
/.venv
__pycache__/
*/poetry.lock
poetry.lock
poetry.toml
# Nix
/result
nppBackup
# Test binaries
/tests/test-backend-ops
/tests/test-double-float
/tests/test-grad0
/tests/test-grammar-parser
/tests/test-llama-grammar
/tests/test-double-float
/tests/test-grad0
/tests/test-opt
/tests/test-quantize-fns
/tests/test-quantize-perf
/tests/test-rope
/tests/test-sampling
/tests/test-tokenizer-0
/tests/test-tokenizer-1-bpe
/tests/test-tokenizer-1-spm
# Scripts
!/scripts/install-oneapi.bat
/tests/test-tokenizer-1-bpe
/tests/test-rope
/tests/test-backend-ops

2
.gitmodules vendored
View File

@@ -1,3 +1,3 @@
[submodule "kompute"]
path = ggml/src/kompute
path = kompute
url = https://github.com/nomic-ai/kompute.git

129
AUTHORS
View File

@@ -1,9 +1,8 @@
# date: Wed Jun 26 19:36:34 EEST 2024
# date: Tue Apr 9 09:17:14 EEST 2024
# this file is auto-generated by scripts/gen-authors.sh
0cc4m <picard12@live.de>
0xspringtime <110655352+0xspringtime@users.noreply.github.com>
20kdc <asdd2808@gmail.com>
2f38b454 <dxf@protonmail.com>
3ooabkhxtn <31479382+3ooabkhxtn@users.noreply.github.com>
44670 <44670@users.noreply.github.com>
@@ -12,18 +11,14 @@ AT <manyoso@users.noreply.github.com>
Aarni Koskela <akx@iki.fi>
Aaron Miller <apage43@ninjawhale.com>
Aaryaman Vasishta <aaryaman.vasishta@amd.com>
Abheek Gulati <abheekg@hotmail.com>
Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
Abhishek Gopinath K <31348521+overtunned@users.noreply.github.com>
Adithya Balaji <adithya.b94@gmail.com>
AdithyanI <adithyan.i4internet@gmail.com>
Adrian <smith.adriane@gmail.com>
Adrian Hesketh <a-h@users.noreply.github.com>
Ahmet Zeer <ahmed.zeer@std.yildiz.edu.tr>
AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com>
Aisuko <urakiny@gmail.com>
Akarshan Biswas <akarshanbiswas@fedoraproject.org>
Albert Jin <albert.jin@gmail.com>
Alberto <57916483+albbus-stack@users.noreply.github.com>
Alex <awhill19@icloud.com>
Alex Azarov <alex@azarov.by>
@@ -40,24 +35,19 @@ Ali Nehzat <ali.nehzat@thanks.dev>
Ali Tariq <ali.tariq@10xengineers.ai>
Alon <alonfaraj@gmail.com>
AlpinDale <52078762+AlpinDale@users.noreply.github.com>
Amir <amir_zia@outlook.com>
AmirAli Mirian <37371367+amiralimi@users.noreply.github.com>
Ananta Bastola <anantarajbastola@gmail.com>
Anas Ahouzi <112881240+aahouzi@users.noreply.github.com>
András Salamon <ott2@users.noreply.github.com>
Andrei <abetlen@gmail.com>
Andrew Canis <andrew.canis@gmail.com>
Andrew Downing <andrew2085@gmail.com>
Andrew Duffy <a10y@users.noreply.github.com>
Andrew Godfrey <AndrewGodfrey@users.noreply.github.com>
Andy Tai <andy-tai@users.noreply.github.com>
Arik Poznanski <arikpoz@users.noreply.github.com>
Artem <guinmoon@gmail.com>
Artem Zinnatullin <ceo@abstractny.gay>
Artyom Lebedev <vagran.ast@gmail.com>
Asbjørn Olling <asbjornolling@gmail.com>
Ásgeir Bjarni Ingvarsson <asgeir@fundinn.org>
Ashish <1856117+ashishdatta@users.noreply.github.com>
Ashok Gelal <401055+ashokgelal@users.noreply.github.com>
Ashraful Islam <ashraful.meche@gmail.com>
Atsushi Tatsuma <yoshoku@outlook.com>
@@ -67,46 +57,35 @@ BADR <contact@pythops.com>
Bach Le <bach@bullno1.com>
Bailey Chittle <39804642+bachittle@users.noreply.github.com>
BarfingLemurs <128182951+BarfingLemurs@users.noreply.github.com>
Bartowski <ckealty1182@gmail.com>
Behnam M <58621210+ibehnam@users.noreply.github.com>
Ben Ashbaugh <ben.ashbaugh@intel.com>
Ben Garney <bengarney@users.noreply.github.com>
Ben Siraphob <bensiraphob@gmail.com>
Ben Williams <ben@719ben.com>
Benjamin Findley <39356821+Kartoffelsaft@users.noreply.github.com>
Benjamin Lecaillon <84293038+blecaillon@users.noreply.github.com>
Bernat Vadell <hounter.caza@gmail.com>
Bingan <70050083+binganao@users.noreply.github.com>
Bodo Graumann <mail@bodograumann.de>
Bono Lv <lvscar@users.noreply.github.com>
Borislav Stanimirov <b.stanimirov@abv.bg>
Branden Butler <bwtbutler@hotmail.com>
Brian <mofosyne@gmail.com>
Bruce MacDonald <brucewmacdonald@gmail.com>
Bryan Honof <bryanhonof@gmail.com>
CJ Pais <cj@cjpais.com>
CRD716 <crd716@gmail.com>
Calvin Laurenson <calvin@laurenson.dev>
Cameron <csteele@steelecameron.com>
Cameron Kaiser <classilla@users.noreply.github.com>
Carolinabanana <140120812+Carolinabanana@users.noreply.github.com>
Casey Primozic <casey@cprimozic.net>
Casey Primozic <me@ameo.link>
CausalLM <148736309+CausalLM@users.noreply.github.com>
Cebtenzzre <cebtenzzre@gmail.com>
Chad Brewbaker <crb002@gmail.com>
Chao Jiang <jc19chaoj@zoho.com>
Cheng Shao <terrorjack@type.dance>
Chris Elrod <elrodc@gmail.com>
Chris Kuehl <ckuehl@ckuehl.me>
Christian Demsar <christian@github.email.demsar.us>
Christian Demsar <crasm@git.vczf.us>
Christian Falch <875252+chrfalch@users.noreply.github.com>
Christian Kögler <ck3d@gmx.de>
Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com>
Clark Saben <76020733+csaben@users.noreply.github.com>
Clint Herron <hanclinto@gmail.com>
CrispStrobe <154636388+CrispStrobe@users.noreply.github.com>
Cuong Trinh Manh <nguoithichkhampha@gmail.com>
DAN™ <dranger003@gmail.com>
Damian Stewart <d@damianstewart.com>
@@ -116,12 +95,8 @@ Daniel Bevenius <daniel.bevenius@gmail.com>
Daniel Drake <drake@endlessos.org>
Daniel Hiltgen <dhiltgen@users.noreply.github.com>
Daniel Illescas Romero <illescas.daniel@protonmail.com>
Daniele <57776841+daniandtheweb@users.noreply.github.com>
DannyDaemonic <DannyDaemonic@gmail.com>
Dat Quoc Nguyen <2412555+datquocnguyen@users.noreply.github.com>
Dave <dave-fl@users.noreply.github.com>
Dave Airlie <airlied@gmail.com>
Dave Airlie <airlied@redhat.com>
Dave Della Costa <ddellacosta+github@gmail.com>
David Friehs <david@friehs.info>
David Kennedy <dakennedyd@gmail.com>
@@ -129,13 +104,10 @@ David Pflug <david@pflug.email>
David Renshaw <dwrenshaw@gmail.com>
David Sommers <12738+databyte@users.noreply.github.com>
David Yang <davidyang6us@gmail.com>
Dawid Potocki <github@dawidpotocki.com>
Dawid Wysocki <62249621+TortillaZHawaii@users.noreply.github.com>
Dean <Dean.Sinaean@gmail.com>
Deins <deinsegle@gmail.com>
Deven Mistry <31466137+deven367@users.noreply.github.com>
Didzis Gosko <didzis@users.noreply.github.com>
Djip007 <djip.perois@free.fr>
Don Mahurin <dmahurin@users.noreply.github.com>
DooWoong Lee (David) <manics99@naver.com>
Doomsdayrs <38189170+Doomsdayrs@users.noreply.github.com>
@@ -144,11 +116,8 @@ Dr. Tom Murphy VII Ph.D <499244+tom7@users.noreply.github.com>
Ebey Abraham <ebey97@gmail.com>
Ed Lee <edilee@mozilla.com>
Ed Lepedus <ed.lepedus@googlemail.com>
Eddie-Wang <wangjinheng1120@163.com>
Edward Taylor <edeetee@gmail.com>
Elaine <elaine.zosa@gmail.com>
Elbios <141279586+Elbios@users.noreply.github.com>
Elton Kola <eltonkola@gmail.com>
Engininja2 <139037756+Engininja2@users.noreply.github.com>
Equim <sayaka@ekyu.moe>
Eric Sommerlade <es0m@users.noreply.github.com>
@@ -174,47 +143,37 @@ Firat <firatkiral@gmail.com>
Folko-Ven <71110216+Folko-Ven@users.noreply.github.com>
Foul-Tarnished <107711110+Foul-Tarnished@users.noreply.github.com>
Francisco Melo <43780565+francis2tm@users.noreply.github.com>
Frank Mai <thxcode0824@gmail.com>
FrankHB <frankhb1989@gmail.com>
Fred Douglas <43351173+fredlas@users.noreply.github.com>
Frederik Vogel <Schaltfehler@users.noreply.github.com>
Gabe Goodhart <gabe.l.hart@gmail.com>
GainLee <perfecter.gen@gmail.com>
Galunid <karolek1231456@gmail.com>
Gary Linscott <glinscott@gmail.com>
Gary Mulder <gjmulder@gmail.com>
Gavin Zhao <gavinzhaojw@protonmail.com>
Genkagaku.GPT <hlhr202@163.com>
Georgi Gerganov <ggerganov@gmail.com>
Gilad S <giladgd@users.noreply.github.com>
Giuseppe Scrivano <giuseppe@scrivano.org>
GiviMAD <GiviMAD@users.noreply.github.com>
Govlzkoy <gotope@users.noreply.github.com>
Guillaume "Vermeille" Sanchez <Guillaume.V.Sanchez@gmail.com>
Guillaume Wenzek <gwenzek@users.noreply.github.com>
Guoteng <32697156+SolenoidWGT@users.noreply.github.com>
Gustavo Rocha Dias <91472747+gustrd@users.noreply.github.com>
Haggai Nuchi <h.nuchi@gmail.com>
Halalaluyafail3 <55773281+Halalaluyafail3@users.noreply.github.com>
Hamdoud Hakem <90524568+hamdoudhakem@users.noreply.github.com>
HanishKVC <hanishkvc@gmail.com>
Haohui Mai <ricetons@gmail.com>
Haoxiang Fei <tonyfettes@tonyfettes.com>
Harald Fernengel <harald.fernengel@here.com>
Hatsune Miku <129688334+at8u@users.noreply.github.com>
HatsuneMikuUwU33 <173229399+HatsuneMikuUwU33@users.noreply.github.com>
Henk Poley <HenkPoley@gmail.com>
Henri Vasserman <henv@hot.ee>
Henrik Forstén <henrik.forsten@gmail.com>
Herman Semenov <GermanAizek@yandex.ru>
Hesen Peng <hesen.peng@gmail.com>
Hoang Nguyen <hugo53@users.noreply.github.com>
Hong Bo PENG <penghb@cn.ibm.com>
Hongyu Ouyang <96765450+casavaca@users.noreply.github.com>
Howard Su <howard0su@gmail.com>
Hua Jiang <allenhjiang@outlook.com>
Huawei Lin <huaweilin.cs@gmail.com>
Hugo Roussel <hugo.rous@gmail.com>
Ian Bull <irbull@eclipsesource.com>
Ian Bull <irbull@gmail.com>
Ian Scrivener <github@zilogy.asia>
@@ -231,10 +190,8 @@ Ivan Stepanov <ivanstepanovftw@gmail.com>
JH23X <165871467+JH23X@users.noreply.github.com>
Jack Mousseau <jmousseau@users.noreply.github.com>
JackJollimore <130917767+JackJollimore@users.noreply.github.com>
Jaemin Son <woalsdnd@gmail.com>
Jag Chadha <jagtesh@gmail.com>
Jakub N <jakubniemczyk97@gmail.com>
James A Capozzoli <157492257+jac-jim@users.noreply.github.com>
James Reynolds <magnusviri@users.noreply.github.com>
Jan Boon <jan.boon@kaetemi.be>
Jan Boon <kaetemi@gmail.com>
@@ -248,17 +205,12 @@ Jean-Michaël Celerier <jeanmichael.celerier+github@gmail.com>
Jed Fox <git@jedfox.com>
Jeffrey Quesnelle <emozilla@nousresearch.com>
Jesse Jojo Johnson <williamsaintgeorge@gmail.com>
Jeximo <jeximo@gmail.com>
Jhen-Jie Hong <iainst0409@gmail.com>
Jiahao Li <liplus17@163.com>
Jian Liao <jianliao@users.noreply.github.com>
JidongZhang-THU <1119708529@qq.com>
Jinwoo Jeong <33892306+williamjeong2@users.noreply.github.com>
Jiří Podivín <66251151+jpodivin@users.noreply.github.com>
Jiří Sejkora <Sejseloid@gmail.com>
Joan Fontanals <jfontanalsmartinez@gmail.com>
Joan Fontanals <joan.fontanals.martinez@jina.ai>
Johan <JohanAR@users.noreply.github.com>
Johannes Gäßler <johannesg@5d6.de>
Johannes Rudolph <johannes.rudolph@gmail.com>
John <78893154+cmp-nct@users.noreply.github.com>
@@ -269,19 +221,15 @@ Jonas Wunderlich <32615971+jonas-w@users.noreply.github.com>
Jorge A <161275481+jorgealias@users.noreply.github.com>
Jose Maldonado <63384398+yukiteruamano@users.noreply.github.com>
Joseph Stahl <1269177+josephst@users.noreply.github.com>
Josh Ramer <josh.ramer@icloud.com>
Joyce <joycebrum@google.com>
Juan Calderon-Perez <835733+gaby@users.noreply.github.com>
Judd <foldl@users.noreply.github.com>
Julius Arkenberg <arki05@users.noreply.github.com>
Jun Jie <71215065+junnjiee16@users.noreply.github.com>
Junyang Lin <justinlin930319@hotmail.com>
Juraj Bednar <juraj@bednar.io>
Justin Parker <jparkerweb@gmail.com>
Justin Suess <justin.suess@westpoint.edu>
Justina Cho <justcho5@gmail.com>
Justine Tunney <jtunney@gmail.com>
Justine Tunney <jtunney@mozilla.com>
Juuso Alasuutari <juuso.alasuutari@gmail.com>
KASR <karim.asrih@gmail.com>
Kamil Tomšík <info@tomsik.cz>
@@ -294,7 +242,6 @@ Kawrakow <48489457+ikawrakow@users.noreply.github.com>
Keiichi Tabata <keiichi.tabata@outlook.com>
Kenvix ⭐ <kenvixzure@live.com>
Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Kevin Gibbons <bakkot@gmail.com>
Kevin Ji <1146876+kevinji@users.noreply.github.com>
Kevin Kwok <antimatter15@gmail.com>
Kevin Lo <kevlo@kevlo.org>
@@ -310,7 +257,6 @@ Laura <Tijntje_7@msn.com>
Lee <44310445+lx200916@users.noreply.github.com>
Lee Drake <b.lee.drake@gmail.com>
Leng Yue <lengyue@lengyue.me>
Leon Knauer <git@leonknauer.com>
LeonEricsson <70749762+LeonEricsson@users.noreply.github.com>
Leonardo Neumann <leonardo@neumann.dev.br>
Li Tan <tanliboy@gmail.com>
@@ -319,26 +265,20 @@ LoganDark <github@logandark.mozmail.com>
LostRuins <39025047+LostRuins@users.noreply.github.com>
Luciano <lucianostrika44@gmail.com>
Luo Tian <lt@basecity.com>
Lyle Dean <dean@lyle.dev>
M. Yusuf Sarıgöz <yusufsarigoz@gmail.com>
Maarten ter Huurne <maarten@treewalker.org>
Mack Straight <eiz@users.noreply.github.com>
Maël Kerbiriou <m431.kerbiriou@gmail.com>
MaggotHATE <clay1326@gmail.com>
Manuel <44313466+makuche@users.noreply.github.com>
Marc Köhlbrugge <subscriptions@marckohlbrugge.com>
Marco Matthies <71844+marcom@users.noreply.github.com>
Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com>
Marian Cepok <marian.cepok@gmail.com>
Mark Fairbairn <thebaron88@gmail.com>
Marko Tasic <mtasic85@gmail.com>
Markus Tavenrath <mtavenrath@users.noreply.github.com>
Martin Delille <martin@delille.org>
Martin Krasser <krasserm@googlemail.com>
Martin Schwaighofer <mschwaig@users.noreply.github.com>
Marvin Gießing <marvin.giessing@gmail.com>
Masaya, Kato <62578291+msy-kato@users.noreply.github.com>
MasterYi1024 <39848311+MasterYi1024@users.noreply.github.com>
Mateusz Charytoniuk <mateusz.charytoniuk@protonmail.com>
Matheus C. França <matheus-catarino@hotmail.com>
Matheus Gabriel Alves Silva <matheusgasource@gmail.com>
@@ -347,11 +287,8 @@ Mathijs de Bruin <mathijs@mathijsfietst.nl>
Matt Clayton <156335168+mattjcly@users.noreply.github.com>
Matt Pulver <matt.pulver@heavy.ai>
Matteo Boschini <12133566+mbosc@users.noreply.github.com>
Mattheus Chediak <shammcity00@gmail.com>
Matthew Tejo <matthew.tejo@gmail.com>
Matvey Soloviev <blackhole89@gmail.com>
Max Krasnyansky <max.krasnyansky@gmail.com>
Max Krasnyansky <quic_maxk@quicinc.com>
Maxime <672982+maximegmd@users.noreply.github.com>
Maximilian Winter <maximilian.winter.91@gmail.com>
Meng Zhang <meng@tabbyml.com>
@@ -363,41 +300,32 @@ Michael Kesper <mkesper@schokokeks.org>
Michael Klimenko <mklimenko29@gmail.com>
Michael Podvitskiy <podvitskiymichael@gmail.com>
Michael Potter <NanoTekGuy@Gmail.com>
Michael de Gans <michael.john.degans@gmail.com>
Michaël de Vries <vriesdemichael@gmail.com>
Mihai <mihai.chirculescu@yahoo.com>
Mike <ytianhui2004@gmail.com>
Mikko Juola <mikjuo@gmail.com>
Minsoo Cheong <54794500+mscheong01@users.noreply.github.com>
Mirko185 <mirkosig@gmail.com>
Mirror Azure <54669636+MirrorAzure@users.noreply.github.com>
Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com>
Mohammadreza Hendiani <hendiani.mohammadreza@gmail.com>
Mohammadreza Hendiani <mohammad.r.hendiani@gmail.com>
Murilo Santana <mvrilo@gmail.com>
Musab Gultekin <musabgultekin@users.noreply.github.com>
Nam D. Tran <42194884+namtranase@users.noreply.github.com>
Nathan Epstein <nate2@umbc.edu>
NawafAlansari <72708095+NawafAlansari@users.noreply.github.com>
Nebula <infinitewormhole@gmail.com>
Neo Zhang <14088817+arthw@users.noreply.github.com>
Neo Zhang <zhang.jianyu@outlook.com>
Neo Zhang Jianyu <jianyu.zhang@intel.com>
Neuman Vong <neuman.vong@gmail.com>
Nexesenex <124105151+Nexesenex@users.noreply.github.com>
Niall Coates <1349685+Niall-@users.noreply.github.com>
Nicolai Weitkemper <kontakt@nicolaiweitkemper.de>
Nicolás Pérez <nicolas_perez@brown.edu>
Nigel Bosch <pnigelb@gmail.com>
Niklas Korz <niklas@niklaskorz.de>
Nikolas <127742645+nneubacher@users.noreply.github.com>
Nindaleth <Nindaleth@users.noreply.github.com>
Oleksandr Nikitin <oleksandr@tvori.info>
Oleksii Maryshchenko <oleksii.maryshchenko@gmail.com>
Olivier Chafik <ochafik@users.noreply.github.com>
Ondřej Čertík <ondrej@certik.us>
Ouadie EL FAROUKI <ouadie.elfarouki@codeplay.com>
Patrice Ferlet <metal3d@gmail.com>
Paul Tsochantaris <ptsochantaris@icloud.com>
Pavol Rusnak <pavol@rusnak.io>
Pedro Cuenca <pedro@huggingface.co>
@@ -415,14 +343,9 @@ RJ Adriaansen <adriaansen@eshcc.eur.nl>
Radoslav Gerganov <rgerganov@gmail.com>
Radosław Gryta <radek.gryta@gmail.com>
Rahul Vivek Nair <68507071+RahulVivekNair@users.noreply.github.com>
Raj Hammeer Singh Hada <hammeerraj@gmail.com>
Ralph Soika <ralph.soika@imixs.com>
Rand Xie <randxiexyy29@gmail.com>
Randall Fitzgerald <randall@dasaku.net>
Reinforce-II <fate@eastal.com>
Ren Xuancheng <jklj077@users.noreply.github.com>
Rene Leonhardt <65483435+reneleonhardt@users.noreply.github.com>
RhinoDevel <RhinoDevel@users.noreply.github.com>
Riceball LEE <snowyu.lee@gmail.com>
Richard Kiss <him@richardkiss.com>
Richard Roberson <richardr1126@gmail.com>
@@ -450,7 +373,6 @@ Rowan Hart <rowanbhart@gmail.com>
Rune <43761327+Rune-AI@users.noreply.github.com>
Ryan Landay <rlanday@gmail.com>
Ryder Wishart <ryderwishart@gmail.com>
Ryuei <louixs@users.noreply.github.com>
Rőczey Barnabás <31726601+An0nie@users.noreply.github.com>
SakuraUmi <yukinon244@gmail.com>
Salvador E. Tropea <stropea@inti.gob.ar>
@@ -464,7 +386,6 @@ SebastianApel <13675545+SebastianApel@users.noreply.github.com>
Senemu <10880819+Senemu@users.noreply.github.com>
Sergey Alirzaev <zl29ah@gmail.com>
Sergio López <slp@sinrega.org>
Sertaç Özercan <852750+sozercan@users.noreply.github.com>
SeungWon Jeong <65549245+redlion0929@users.noreply.github.com>
ShadovvBeast <ShadovvBeast@gmail.com>
Shakhar Dasgupta <shakhardasgupta@gmail.com>
@@ -473,7 +394,6 @@ Shijie <821898965@qq.com>
Shintarou Okada <kokuzen@gmail.com>
Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com>
Shouzheng Liu <lshzh.hi@gmail.com>
Shuichi Tsutsumi <shuichi0526@gmail.com>
Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
Simon Willison <swillison@gmail.com>
Siwen Yu <yusiwen@gmail.com>
@@ -485,14 +405,11 @@ Someone <sergei.kozlukov@aalto.fi>
Someone Serge <sergei.kozlukov@aalto.fi>
Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
Spencer Sutton <spencersutton@users.noreply.github.com>
Srihari-mcw <96763064+Srihari-mcw@users.noreply.github.com>
Srinivas Billa <nivibilla@gmail.com>
Stefan Sydow <stefan@sydow.email>
Steffen Röcker <sroecker@gmail.com>
Stephan Walter <stephan@walter.name>
Stephen Nichols <snichols@users.noreply.github.com>
Steve Grubb <ausearch.1@gmail.com>
Steven Prichard <spprichard20@gmail.com>
Steven Roussey <sroussey@gmail.com>
Steward Garcia <57494570+FSSRepo@users.noreply.github.com>
Suaj Carrot <72162667+SuajCarrot@users.noreply.github.com>
@@ -517,19 +434,16 @@ Tom C <tom.corelis@gmail.com>
Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Tomas <tom.tomas.36478119@gmail.com>
Tomáš Pazdiora <tomas.pazdiora@gmail.com>
Tristan Druyen <tristan@vault81.mozmail.com>
Tristan Ross <rosscomputerguy@protonmail.com>
Tungsten842 <886724vf@anonaddy.me>
Tungsten842 <quantmint@protonmail.com>
Tushar <ditsuke@protonmail.com>
UEXTM.com <84163508+uextm@users.noreply.github.com>
Ulrich Drepper <drepper@gmail.com>
Uzo Nweke <uzoechi@gmail.com>
Vaibhav Srivastav <vaibhavs10@gmail.com>
Val Kharitonov <mail@kharvd.com>
Valentin Konovalov <valle.ketsujin@gmail.com>
Valentyn Bezshapkin <61702053+valentynbez@users.noreply.github.com>
Victor Nogueira <felladrin@gmail.com>
Victor Z. Peng <ziliangdotme@gmail.com>
Vlad <spitfireage@gmail.com>
Vladimir <bogdad@gmail.com>
@@ -541,9 +455,7 @@ Weird Constructor <weirdconstructor@gmail.com>
Welby Seely <welbyseely@gmail.com>
Wentai Zhang <rchardx@gmail.com>
WillCorticesAI <150854901+WillCorticesAI@users.noreply.github.com>
William Tambellini <william.tambellini@gmail.com>
Willy Tarreau <w@1wt.eu>
Wouter <9594229+DifferentialityDevelopment@users.noreply.github.com>
Wu Jian Ping <wujjpp@hotmail.com>
Wu Jian Ping <wujp@greatld.com>
Xiake Sun <xiake.sun@intel.com>
@@ -554,8 +466,6 @@ Xiaoyi Chen <cxychina@gmail.com>
Xingchen Song(宋星辰) <xingchensong1996@163.com>
Xuan Son Nguyen <thichthat@gmail.com>
Yann Follet <131855179+YannFollet@users.noreply.github.com>
Yaroslav <yaroslav.yashin@me.com>
Yazan Agha-Schrader <mountaiin@icloud.com>
Yiming Cui <conandiy@vip.qq.com>
Yishuo Wang <MeouSker77@outlook.com>
Yueh-Po Peng <94939112+y10ab1@users.noreply.github.com>
@@ -567,7 +477,6 @@ Zane Shannon <z@zcs.me>
Zay <95888118+isaiahbjork@users.noreply.github.com>
Zenix <zenixls2@gmail.com>
Zhang Peiyuan <a1286225768@gmail.com>
Zheng.Deng <32841220+dengzheng-cloud@users.noreply.github.com>
ZhouYuChen <zhouyuchen@naver.com>
Ziad Ben Hadj-Alouane <zied.benhadjalouane@gmail.com>
Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com>
@@ -575,18 +484,14 @@ Zsapi <martin1.zsapka@gmail.com>
a-n-n-a-l-e-e <150648636+a-n-n-a-l-e-e@users.noreply.github.com>
adel boussaken <netdur@gmail.com>
afrideva <95653597+afrideva@users.noreply.github.com>
agray3 <agray3@users.noreply.github.com>
akawrykow <142945436+akawrykow@users.noreply.github.com>
alexpinel <93524949+alexpinel@users.noreply.github.com>
alonfaraj <alonfaraj@gmail.com>
alwqx <kenan3015@gmail.com>
amd-lalithnc <lalithnc@amd.com>
andrijdavid <david@geek.mg>
anon998 <131767832+anon998@users.noreply.github.com>
anzz1 <anzz1@live.com>
apaz <aarpazdera@gmail.com>
apcameron <37645737+apcameron@users.noreply.github.com>
arch-btw <57669023+arch-btw@users.noreply.github.com>
arcrank <arcrank@gmail.com>
arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com>
at8u <129688334+at8u@users.noreply.github.com>
@@ -609,17 +514,13 @@ cocktailpeanut <121128867+cocktailpeanut@users.noreply.github.com>
coezbek <c.oezbek@gmail.com>
comex <comexk@gmail.com>
compilade <113953597+compilade@users.noreply.github.com>
compilade <git@compilade.net>
cpumaxx <163466046+cpumaxx@users.noreply.github.com>
crasm <crasm@git.vczf.net>
crasm <crasm@git.vczf.us>
daboe01 <daboe01@googlemail.com>
david raistrick <keen99@users.noreply.github.com>
ddh0 <dylanhalladay02@icloud.com>
ddpasa <112642920+ddpasa@users.noreply.github.com>
deepdiffuser <112834445+deepdiffuser@users.noreply.github.com>
divinity76 <divinity76@gmail.com>
dm4 <sunrisedm4@gmail.com>
dotpy314 <33351922+dotpy314@users.noreply.github.com>
drbh <david.richard.holtz@gmail.com>
ds5t5 <145942675+ds5t5@users.noreply.github.com>
@@ -628,7 +529,6 @@ eastriver <lee@eastriver.dev>
ebraminio <ebraminio@gmail.com>
eiery <19350831+eiery@users.noreply.github.com>
eric8607242 <e0928021388@gmail.com>
fairydreaming <166155368+fairydreaming@users.noreply.github.com>
fraxy-v <65565042+fraxy-v@users.noreply.github.com>
github-actions[bot] <github-actions[bot]@users.noreply.github.com>
gliptic <gliptic@users.noreply.github.com>
@@ -639,7 +539,6 @@ h-h-h-h <13482553+h-h-h-h@users.noreply.github.com>
hankcs <cnhankmc@gmail.com>
hoangmit <hoangmit@users.noreply.github.com>
hongbo.mo <352280764@qq.com>
hopkins385 <98618192+hopkins385@users.noreply.github.com>
howlger <eclipse@voormann.de>
howlger <github@voormann.de>
hutli <6594598+hutli@users.noreply.github.com>
@@ -650,22 +549,14 @@ hydai <z54981220@gmail.com>
iSma <ismail.senhaji@gmail.com>
iacore <74560659+iacore@users.noreply.github.com>
igarnier <igarnier@protonmail.com>
intelmatt <61025942+intelmatt@users.noreply.github.com>
iohub <rickyang.pro@gmail.com>
jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com>
jaime-m-p <167997752+jaime-m-p@users.noreply.github.com>
jameswu2014 <545426914@qq.com>
jiez <373447296@qq.com>
jneem <joeneeman@gmail.com>
joecryptotoo <80373433+joecryptotoo@users.noreply.github.com>
johnson442 <56517414+johnson442@users.noreply.github.com>
jojorne <jojorne@users.noreply.github.com>
jon-chuang <9093549+jon-chuang@users.noreply.github.com>
jp-x-g <jpxg-dev@protonmail.com>
jukofyork <69222624+jukofyork@users.noreply.github.com>
junchao-loongson <68935141+junchao-loongson@users.noreply.github.com>
jwj7140 <32943891+jwj7140@users.noreply.github.com>
k.h.lai <adrian.k.h.lai@outlook.com>
kaizau <kaizau@users.noreply.github.com>
kalomaze <66376113+kalomaze@users.noreply.github.com>
kang <tpdns9032100@gmail.com>
@@ -684,15 +575,11 @@ ldwang <ftgreat@163.com>
le.chang <cljs118@126.com>
leejet <leejet714@gmail.com>
limitedAtonement <limitedAtonement@users.noreply.github.com>
liuwei-git <14815172+liuwei-git@users.noreply.github.com>
lon <114724657+longregen@users.noreply.github.com>
loonerin <132926317+loonerin@users.noreply.github.com>
luoyu-intel <yu.luo@intel.com>
m3ndax <adrian.goessl@outlook.com>
maddes8cht <55592906+maddes8cht@users.noreply.github.com>
makomk <makosoft@googlemail.com>
manikbhandari <mbbhandarimanik2@gmail.com>
maor-ps <154728172+maor-ps@users.noreply.github.com>
mdrokz <mohammadmunshi@gmail.com>
mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com>
minarchist <minarchist@users.noreply.github.com>
@@ -706,19 +593,15 @@ ngc92 <7938269+ngc92@users.noreply.github.com>
nhamanasu <45545786+nhamanasu@users.noreply.github.com>
niansa/tuxifan <anton-sa@web.de>
niansa/tuxifan <tuxifan@posteo.de>
nickp27 <nb.porter@gmail.com>
ningshanwutuobang <ningshanwutuobang@gmail.com>
nold <Nold360@users.noreply.github.com>
nopperl <54780682+nopperl@users.noreply.github.com>
nusu-github <29514220+nusu-github@users.noreply.github.com>
olexiyb <olexiyb@gmail.com>
omahs <73983677+omahs@users.noreply.github.com>
oobabooga <112222186+oobabooga@users.noreply.github.com>
opparco <parco.opaai@gmail.com>
ostix360 <55257054+ostix360@users.noreply.github.com>
pengxin99 <pengxin.yuan@intel.com>
perserk <perserk@gmail.com>
pmysl <piotr.myslinski@outlook.com>
postmasters <namnguyen@google.com>
pudepiedj <pudepiedj@gmail.com>
qingfengfenga <41416092+qingfengfenga@users.noreply.github.com>
@@ -731,19 +614,16 @@ rhuddleston <ryan.huddleston@percona.com>
rimoliga <53384203+rimoliga@users.noreply.github.com>
runfuture <runfuture@users.noreply.github.com>
sandyiscool <sandyiscool@gmail.com>
sasha0552 <admin@sasha0552.org>
semidark <me@semidark.net>
sharpHL <132747147+sharpHL@users.noreply.github.com>
shibe2 <shibe@tuta.io>
singularity <12184989+singularity-s0@users.noreply.github.com>
sjinzh <sjinzh@gmail.com>
sjxx <63994076+ylsdamxssjxxdd@users.noreply.github.com>
slaren <2141330+slaren@users.noreply.github.com>
slaren <slarengh@gmail.com>
snadampal <87143774+snadampal@users.noreply.github.com>
staviq <staviq@gmail.com>
stduhpf <stephduh@live.fr>
strawberrymelonpanda <152940198+strawberrymelonpanda@users.noreply.github.com>
swittk <switt1995@gmail.com>
takov751 <40316768+takov751@users.noreply.github.com>
tarcey <cey.tarik@gmail.com>
@@ -756,16 +636,12 @@ uint256_t <konndennsa@gmail.com>
uint256_t <maekawatoshiki1017@gmail.com>
unbounded <haakon@likedan.net>
valiray <133289098+valiray@users.noreply.github.com>
vik <vikhyatk@gmail.com>
viric <viric@viric.name>
vodkaslime <646329483@qq.com>
vvhg1 <94630311+vvhg1@users.noreply.github.com>
vxiiduu <73044267+vxiiduu@users.noreply.github.com>
wbpxre150 <100937007+wbpxre150@users.noreply.github.com>
whoreson <139810751+whoreson@users.noreply.github.com>
woachk <24752637+woachk@users.noreply.github.com>
wonjun Jang <strutive07@gmail.com>
woodx <124784234+woodx9@users.noreply.github.com>
wzy <32936898+Freed-Wu@users.noreply.github.com>
xaedes <xaedes@gmail.com>
xaedes <xaedes@googlemail.com>
@@ -773,10 +649,7 @@ xloem <0xloem@gmail.com>
yangli2 <yangli2@gmail.com>
yuiseki <yuiseki@gmail.com>
zakkor <edward.partenie@gmail.com>
zhangkaihuo <zhangkaihuo@gmail.com>
zhouwg <6889919+zhouwg@users.noreply.github.com>
zhouwg <zhouwg2000@gmail.com>
zrm <trustiosity.zrm@gmail.com>
Ștefan-Gabriel Muscalu <legraphista@users.noreply.github.com>
源文雨 <41315874+fumiama@users.noreply.github.com>
Нияз Гарифзянов <112617865+garrnizon@users.noreply.github.com>

File diff suppressed because it is too large Load Diff

View File

@@ -11,23 +11,10 @@
"CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
}
},
{
"name": "sycl-base",
"hidden": true,
"generator": "Ninja",
"binaryDir": "${sourceDir}/build-${presetName}",
"cacheVariables": {
"CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
"CMAKE_CXX_COMPILER": "icx",
"CMAKE_C_COMPILER": "cl",
"GGML_SYCL": "ON",
"CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
}
},
{ "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
{ "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
{ "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
{ "name": "static", "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } },
{ "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
{ "name": "static", "hidden": true, "cacheVariables": { "LLAMA_STATIC": "ON" } },
{
"name": "arm64-windows-msvc", "hidden": true,
@@ -48,18 +35,15 @@
},
{ "name": "arm64-windows-llvm-debug" , "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
{ "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] },
{ "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg", "static" ] },
{ "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "release" ] },
{ "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "release", "static" ] },
{ "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] },
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] },
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "release" ] },
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] },
{ "name": "x64-windows-msvc-debug" , "inherits": [ "base", "debug" ] },
{ "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] },
{ "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },
{ "name": "x64-windows-sycl-debug" , "inherits": [ "sycl-base", "debug" ] },
{ "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] }
{ "name": "x64-windows-msvc-release", "inherits": [ "base", "release" ] },
{ "name": "x64-windows-msvc+static-release", "inherits": [ "base", "release", "static" ] }
]
}

View File

@@ -1,24 +0,0 @@
# Pull requests
- Always squash-merge the PR before merging
- Use the following format for your final commit: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
- Test your changes:
- Using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library
- Execute [the full CI locally on your machine](ci/README.md) before publishing
- If the pull request contains only documentation changes (e.g., updating READMEs, adding new wiki pages), please add `[no ci]` to the commit title. This will skip unnecessary CI checks and help reduce build times
- Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
- The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your conveience
# Coding guidelines
- Avoid adding third-party dependencies, extra files, extra headers, etc.
- Always consider cross-compatibility with other operating systems and architectures
- Avoid fancy looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple
- There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit
- Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
- Naming usually optimizes for common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963)
- Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$
![matmul](media/matmul.png)

1123
Makefile

File diff suppressed because it is too large Load Diff

View File

@@ -3,13 +3,14 @@
import PackageDescription
var sources = [
"src/llama.cpp",
"src/unicode.cpp",
"src/unicode-data.cpp",
"ggml/src/ggml.c",
"ggml/src/ggml-alloc.c",
"ggml/src/ggml-backend.c",
"ggml/src/ggml-quants.c",
"ggml.c",
"sgemm.cpp",
"llama.cpp",
"unicode.cpp",
"unicode-data.cpp",
"ggml-alloc.c",
"ggml-backend.c",
"ggml-quants.c",
]
var resources: [Resource] = []
@@ -25,8 +26,8 @@ var cSettings: [CSetting] = [
]
#if canImport(Darwin)
sources.append("ggml/src/ggml-metal.m")
resources.append(.process("ggml/src/ggml-metal.metal"))
sources.append("ggml-metal.m")
resources.append(.process("ggml-metal.metal"))
linkerSettings.append(.linkedFramework("Accelerate"))
cSettings.append(
contentsOf: [
@@ -62,6 +63,8 @@ let package = Package(
"models",
"tests",
"CMakeLists.txt",
"ggml-cuda.cu",
"ggml-cuda.h",
"Makefile"
],
sources: sources,

View File

@@ -1,7 +1,6 @@
# llama.cpp for SYCL
- [Background](#background)
- [Recommended Release](#recommended-release)
- [News](#news)
- [OS](#os)
- [Hardware](#hardware)
@@ -32,23 +31,8 @@ When targeting **Intel CPU**, it is recommended to use llama.cpp for [Intel oneM
It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.
## Recommended Release
The SYCL backend would be broken by some PRs due to no online CI.
The following release is verified with good quality:
|Commit ID|Tag|Release|Verified Platform|
|-|-|-|-|
|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1|
## News
- 2024.5
- Performance is increased: 34 -> 37 tokens/s of llama-2-7b.Q4_0 on Arc770.
- Arch Linux is verified successfully.
- 2024.4
- Support data types: GGML_TYPE_IQ4_NL, GGML_TYPE_IQ4_XS, GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ3_S, GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M.
@@ -93,7 +77,7 @@ The following release is verified with good quality:
*Notes:*
- **Memory**
- The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama-cli`.
- The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/main`.
- Please make sure the GPU shared memory from the host is large enough to account for the model's size. For e.g. the *llama-2-7b.Q4_0* requires at least 8.0GB for integrated GPU and 4.0GB for discrete GPU.
@@ -115,14 +99,14 @@ The docker build option is currently limited to *intel GPU* targets.
### Build image
```sh
# Using FP16
docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" -f .devops/llama-cli-intel.Dockerfile .
docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/main-intel.Dockerfile .
```
*Notes*:
To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="GGML_SYCL_F16=ON"` argument from the previous command.
To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="LLAMA_SYCL_F16=ON"` argument from the previous command.
You can also use the `.devops/llama-server-intel.Dockerfile`, which builds the *"server"* alternative.
You can also use the `.devops/server-intel.Dockerfile`, which builds the *"server"* alternative.
### Run container
@@ -244,10 +228,10 @@ source /opt/intel/oneapi/setvars.sh
# Build LLAMA with MKL BLAS acceleration for intel GPU
# Option 1: Use FP32 (recommended for better performance in most cases)
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
# Option 2: Use FP16
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
# build all binary
cmake --build build --config Release -j -v
@@ -264,10 +248,10 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
# Build LLAMA with Nvidia BLAS acceleration through SYCL
# Option 1: Use FP32 (recommended for better performance in most cases)
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
cmake -B build -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
# Option 2: Use FP16
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
cmake -B build -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
# build all binary
cmake --build build --config Release -j -v
@@ -291,7 +275,7 @@ source /opt/intel/oneapi/setvars.sh
Similar to the native `sycl-ls`, available SYCL devices can be queried as follow:
```sh
./build/bin/llama-ls-sycl-device
./build/bin/ls-sycl-device
```
A example of such log in a system with 1 *intel CPU* and 1 *intel GPU* can look like the following:
```
@@ -329,7 +313,7 @@ Examples:
- Use device 0:
```sh
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
```
or run by script:
@@ -340,7 +324,7 @@ or run by script:
- Use multiple devices:
```sh
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
```
Otherwise, you can run the script:
@@ -410,9 +394,15 @@ Output (example):
4. Install build tools
a. Download & install cmake for Windows: https://cmake.org/download/ (CMake can also be installed from Visual Studio Installer)
b. The new Visual Studio will install Ninja as default. (If not, please install it manually: https://ninja-build.org/)
a. Download & install cmake for Windows: https://cmake.org/download/
b. Download & install mingw-w64 make for Windows provided by w64devkit
- Download the 1.19.0 version of [w64devkit](https://github.com/skeeto/w64devkit/releases/download/v1.19.0/w64devkit-1.19.0.zip).
- Extract `w64devkit` on your pc.
- Add the **bin** folder path in the Windows system PATH environment (for e.g. `C:\xxx\w64devkit\bin\`).
### II. Build llama.cpp
@@ -422,10 +412,10 @@ On the oneAPI command line window, step into the llama.cpp main directory and ru
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
# Option 1: Use FP32 (recommended for better performance in most cases)
cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
# Option 2: Or FP16
cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
cmake --build build --config Release -j
```
@@ -435,23 +425,9 @@ Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former in
.\examples\sycl\win-build-sycl.bat
```
Or, use CMake presets to build:
```sh
cmake --preset x64-windows-sycl-release
cmake --build build-x64-windows-sycl-release -j --target llama-cli
cmake -DGGML_SYCL_F16=ON --preset x64-windows-sycl-release
cmake --build build-x64-windows-sycl-release -j --target llama-cli
cmake --preset x64-windows-sycl-debug
cmake --build build-x64-windows-sycl-debug -j --target llama-cli
```
Or, you can use Visual Studio to open llama.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project.
*Notes:*
- In case of a minimal experimental setup, the user can build the inference executable only through `cmake --build build --config Release -j --target llama-cli`.
- By default, calling `make` will build all target binary files. In case of a minimal experimental setup, the user can build the inference executable only through `make main`.
### III. Run the inference
@@ -512,13 +488,13 @@ Examples:
- Use device 0:
```
build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
```
- Use multiple devices:
```
build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
```
Otherwise, run the following wrapper script:
@@ -544,9 +520,9 @@ use 1 SYCL GPUs: [0] with Max compute units:512
| Name | Value | Function |
|--------------------|-----------------------------------|---------------------------------------------|
| GGML_SYCL | ON (mandatory) | Enable build with SYCL code path. |
| GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA | Set the SYCL target device type. |
| GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. |
| LLAMA_SYCL | ON (mandatory) | Enable build with SYCL code path. |
| LLAMA_SYCL_TARGET | INTEL *(default)* \| NVIDIA | Set the SYCL target device type. |
| LLAMA_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. |
| CMAKE_C_COMPILER | icx | Set *icx* compiler for SYCL code path. |
| CMAKE_CXX_COMPILER | icpx *(Linux)*, icx *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |

186
README.md
View File

@@ -10,12 +10,8 @@
Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
> [!IMPORTANT]
[2024 Jun 12] Binaries have been renamed w/ a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggerganov/llama.cpp/pull/7809)
### Recent API changes
- [2024 Jun 26] The source code and CMake build scripts have been restructured https://github.com/ggerganov/llama.cpp/pull/8006
- [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807
- [2024 Apr 4] State and session file functions reorganized under `llama_state_*` https://github.com/ggerganov/llama.cpp/pull/6341
- [2024 Mar 26] Logits and embeddings API updated for compactness https://github.com/ggerganov/llama.cpp/pull/6122
@@ -26,7 +22,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
### Hot topics
- **`convert.py` has been deprecated and moved to `examples/convert_legacy_llama.py`, please use `convert_hf_to_gguf.py`** https://github.com/ggerganov/llama.cpp/pull/7430
- **`convert.py` has been deprecated and moved to `examples/convert-legacy-llama.py`, please use `convert-hf-to-gguf.py`** https://github.com/ggerganov/llama.cpp/pull/7430
- Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021
- BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920
- MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387
@@ -57,6 +53,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
<li><a href="#quantization">Quantization</a></li>
<li><a href="#interactive-mode">Interactive mode</a></li>
<li><a href="#constrained-output-with-grammars">Constrained output with grammars</a></li>
<li><a href="#instruct-mode">Instruct mode</a></li>
<li><a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a></li>
<li><a href="#seminal-papers-and-background-on-the-models">Seminal papers and background on the models</a></li>
<li><a href="#perplexity-measuring-model-quality">Perplexity (measuring model quality)</a></li>
@@ -108,7 +105,6 @@ Typically finetunes of the base models below are supported as well.
- [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon)
- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
- [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
- [X] [BERT](https://github.com/ggerganov/llama.cpp/pull/5423)
- [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
- [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft)
- [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila)
@@ -197,7 +193,6 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
- [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
- [RAGNA Desktop](https://ragna.app/) (proprietary)
- [RecurseChat](https://recurse.chat/) (proprietary)
- [semperai/amica](https://github.com/semperai/amica)
- [withcatai/catai](https://github.com/withcatai/catai)
@@ -211,25 +206,19 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
- [AIKit](https://github.com/sozercan/aikit) (MIT)
- [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL)
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
**Tools:**
- [akx/ggify](https://github.com/akx/ggify) download PyTorch models from HuggingFace Hub and convert them to GGML
- [crashr/gppm](https://github.com/crashr/gppm) launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
**Infrastructure:**
- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
---
Here is a typical run using LLaMA v2 13B on M2 Ultra:
```
$ make -j && ./llama-cli -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
$ make -j && ./main -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
I llama.cpp build info:
I UNAME_S: Darwin
I UNAME_P: arm
@@ -395,34 +384,10 @@ brew install llama.cpp
```
The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggerganov/llama.cpp/discussions/7668
### Nix
On Mac and Linux, the Nix package manager can be used via
```
nix profile install nixpkgs#llama-cpp
```
For flake enabled installs.
Or
```
nix-env --file '<nixpkgs>' --install --attr llama-cpp
```
For non-flake enabled installs.
This expression is automatically updated within the [nixpkgs repo](https://github.com/NixOS/nixpkgs/blob/nixos-24.05/pkgs/by-name/ll/llama-cpp/package.nix#L164).
#### Flox
On Mac and Linux, Flox can be used to install llama.cpp within a Flox environment via
```
flox install llama-cpp
```
Flox follows the nixpkgs build of llama.cpp.
### Metal Build
On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
To disable the Metal build at compile time use the `GGML_NO_METAL=1` flag or the `GGML_METAL=OFF` cmake option.
To disable the Metal build at compile time use the `LLAMA_NO_METAL=1` flag or the `LLAMA_METAL=OFF` cmake option.
When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line
argument.
@@ -442,7 +407,7 @@ Building the program with BLAS support may lead to some performance improvements
- Using `make`:
- On Linux:
```bash
make GGML_OPENBLAS=1
make LLAMA_OPENBLAS=1
```
- On Windows:
@@ -457,13 +422,13 @@ Building the program with BLAS support may lead to some performance improvements
8. From here you can run:
```bash
make GGML_OPENBLAS=1
make LLAMA_OPENBLAS=1
```
- Using `CMake` on Linux:
```bash
cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
cmake --build build --config Release
```
@@ -482,10 +447,10 @@ Building the program with BLAS support may lead to some performance improvements
Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./README-sycl.md).
- Using manual oneAPI installation:
By default, `GGML_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DGGML_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. Otherwise please install oneAPI and follow the below steps:
By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. Otherwise please install oneAPI and follow the below steps:
```bash
source /opt/intel/oneapi/setvars.sh # You can skip this step if in oneapi-basekit docker image, only required for manual installation
cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_NATIVE=ON
cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON
cmake --build build --config Release
```
@@ -502,28 +467,27 @@ Building the program with BLAS support may lead to some performance improvements
- Using `make`:
```bash
make GGML_CUDA=1
make LLAMA_CUDA=1
```
- Using `CMake`:
```bash
cmake -B build -DGGML_CUDA=ON
cmake -B build -DLLAMA_CUDA=ON
cmake --build build --config Release
```
The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
| Option | Legal values | Default | Description |
|-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| GGML_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
| GGML_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
| GGML_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
| GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. |
| GGML_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models |
| GGML_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
| GGML_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
| GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
| GGML_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. |
| Option | Legal values | Default | Description |
|--------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
| LLAMA_CUDA_FORCE_MMQ | Boolean | false | Force the use of dequantization + matrix multiplication kernels instead of leveraging Math libraries. | |
| LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
| LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
| LLAMA_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. |
- #### hipBLAS
@@ -533,15 +497,15 @@ Building the program with BLAS support may lead to some performance improvements
- Using `make`:
```bash
make GGML_HIPBLAS=1
make LLAMA_HIPBLAS=1
```
- Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
```bash
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
cmake -S . -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
&& cmake --build build --config Release -- -j 16
```
On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON`.
However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
Note that if you get the following error:
@@ -555,19 +519,19 @@ Building the program with BLAS support may lead to some performance improvements
```bash
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \
HIP_DEVICE_LIB_PATH=<directory-you-just-found> \
cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
cmake -S . -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
&& cmake --build build -- -j 16
```
- Using `make` (example for target gfx1030, build with 16 CPU threads):
```bash
make -j16 GGML_HIPBLAS=1 GGML_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
```
- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
```bash
set PATH=%HIP_PATH%\bin;%PATH%
cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
cmake --build build
```
Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
@@ -578,11 +542,11 @@ Building the program with BLAS support may lead to some performance improvements
If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
| Option | Legal values | Default | Description |
|------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| GGML_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
| GGML_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
| GGML_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
| Option | Legal values | Default | Description |
|-------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
- #### Vulkan
@@ -592,7 +556,7 @@ Building the program with BLAS support may lead to some performance improvements
```sh
# Build the image
docker build -t llama-cpp-vulkan -f .devops/llama-cli-vulkan.Dockerfile .
docker build -t llama-cpp-vulkan -f .devops/main-vulkan.Dockerfile .
# Then, use it:
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
@@ -613,17 +577,15 @@ Building the program with BLAS support may lead to some performance improvements
vulkaninfo
```
Alternatively your package manager might be able to provide the appropriate libraries.
For example for Ubuntu 22.04 you can install `libvulkan-dev` instead.
For Fedora 40, you can install `vulkan-devel`, `glslc` and `glslang` packages.
Alternatively your package manager might be able to provide the appropiate libraries. For example for Ubuntu 22.04 you can install `libvulkan-dev` instead.
Then, build llama.cpp using the cmake command below:
```bash
cmake -B build -DGGML_VULKAN=1
cmake -B build -DLLAMA_VULKAN=1
cmake --build build --config Release
# Test the output binary (with "-ngl 33" to offload all layers to GPU)
./bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
./bin/main -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
# You should see in the output, ggml_vulkan detected your GPU. For example:
# ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
@@ -636,8 +598,8 @@ Building the program with BLAS support may lead to some performance improvements
To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
Note: `convert.py` has been moved to `examples/convert_legacy_llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives.
It does not support LLaMA 3, you can use `convert_hf_to_gguf.py` with LLaMA 3 downloaded from Hugging Face.
Note: `convert.py` has been moved to `examples/convert-legacy-llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives.
It does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.
```bash
# obtain the official LLaMA model weights and place them in ./models
@@ -654,20 +616,23 @@ ls ./models
python3 -m pip install -r requirements.txt
# convert the model to ggml FP16 format
python3 convert_hf_to_gguf.py models/mymodel/
python3 convert-hf-to-gguf.py models/mymodel/
# [Optional] for models using BPE tokenizers
python convert-hf-to-gguf.py models/mymodel/ --vocab-type bpe
# quantize the model to 4-bits (using Q4_K_M method)
./llama-quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M
./quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M
# update the gguf filetype to current version if older version is now unsupported
./llama-quantize ./models/mymodel/ggml-model-Q4_K_M.gguf ./models/mymodel/ggml-model-Q4_K_M-v2.gguf COPY
./quantize ./models/mymodel/ggml-model-Q4_K_M.gguf ./models/mymodel/ggml-model-Q4_K_M-v2.gguf COPY
```
### Run the quantized model
```bash
# start inference on a gguf model
./llama-cli -m ./models/mymodel/ggml-model-Q4_K_M.gguf -n 128
./main -m ./models/mymodel/ggml-model-Q4_K_M.gguf -n 128
```
When running the larger models, make sure you have enough disk space to store all the intermediate files.
@@ -742,7 +707,7 @@ The time per token is measured on a MacBook M1 Pro 32GB RAM using 4 and 8 thread
#### How to run
1. Download/extract: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
2. Run `./llama-perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
2. Run `./perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
3. Output:
```
perplexity : calculating perplexity over 655 chunks
@@ -766,16 +731,16 @@ Here is an example of a few-shot interaction, invoked with the command
./examples/chat-13B.sh
# custom arguments using a 13B model
./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
./main -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
```
Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `llama-cli` example program.
Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `main` example program.
![image](https://user-images.githubusercontent.com/1991296/224575029-2af3c7dc-5a65-4f64-a6bb-517a532aea38.png)
### Persistent Interaction
The prompt, user inputs, and model generations can be saved and resumed across calls to `./llama-cli` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file.
The prompt, user inputs, and model generations can be saved and resumed across calls to `./main` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file.
```bash
# Start a new chat
@@ -797,13 +762,41 @@ PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
`llama.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only:
```bash
./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
./main -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
```
The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).
For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.
### Instruct mode
1. First, download and place the `ggml` model into the `./models` folder
2. Run the `main` tool like this:
```
./examples/alpaca.sh
```
Sample run:
```
== Running in interactive mode. ==
- Press Ctrl+C to interject at any time.
- Press Return to return control to LLaMA.
- If you want to submit another line, end your input in '\'.
Below is an instruction that describes a task. Write a response that appropriately completes the request.
> How many letters are there in the English alphabet?
There 26 letters in the English Alphabet
> What is the most common way of transportation in Amsterdam?
The majority (54%) are using public transit. This includes buses, trams and metros with over 100 lines throughout the city which make it very accessible for tourists to navigate around town as well as locals who commute by tram or metro on a daily basis
> List 5 words that start with "ca".
cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
>
```
### Obtaining and using the Facebook LLaMA 2 model
- Refer to [Facebook's LLaMA download page](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) if you want to access the model data.
@@ -876,7 +869,7 @@ $mv /sdcard/llama.cpp/llama-2-7b-chat.Q4_K_M.gguf /data/data/com.termux/files/ho
Now, you can start chatting:
```
$cd /data/data/com.termux/files/home/bin
$./llama-cli -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml
$./main -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml
```
Here's a demo of an interactive session running on Pixel 5 phone:
@@ -943,8 +936,8 @@ Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia
```bash
docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
docker build -t local/llama.cpp:light-cuda -f .devops/llama-cli-cuda.Dockerfile .
docker build -t local/llama.cpp:server-cuda -f .devops/llama-server-cuda.Dockerfile .
docker build -t local/llama.cpp:light-cuda -f .devops/main-cuda.Dockerfile .
docker build -t local/llama.cpp:server-cuda -f .devops/server-cuda.Dockerfile .
```
You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture.
@@ -976,14 +969,25 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m
- Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
- Collaborators will be invited based on contributions
- Any help with managing issues and PRs is very appreciated!
- See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
- Read the [CONTRIBUTING.md](CONTRIBUTING.md) for more information
- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205)
- A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532)
### Coding guidelines
- Avoid adding third-party dependencies, extra files, extra headers, etc.
- Always consider cross-compatibility with other operating systems and architectures
- Avoid fancy looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple
- There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit
- Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
- See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
- Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$
![matmul](media/matmul.png)
### Docs
- [main (cli)](./examples/main/README.md)
- [main](./examples/main/README.md)
- [server](./examples/server/README.md)
- [jeopardy](./examples/jeopardy/README.md)
- [BLIS](./docs/BLIS.md)

242
ci/run.sh
View File

@@ -36,11 +36,11 @@ SRC=`pwd`
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON"
if [ ! -z ${GG_BUILD_METAL} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_METAL_SHADER_DEBUG=ON"
fi
if [ ! -z ${GG_BUILD_CUDA} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=1"
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUDA=1"
fi
if [ ! -z ${GG_BUILD_SYCL} ]; then
@@ -50,7 +50,7 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then
exit 1
fi
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON"
fi
## helpers
@@ -284,10 +284,10 @@ function gg_run_open_llama_7b_v2 {
set -e
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
python3 ../examples/convert-legacy-llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@@ -303,47 +303,47 @@ function gg_run_open_llama_7b_v2 {
wiki_test="${path_wiki}/wiki.test.raw"
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
./bin/quantize ${model_f16} ${model_q8_0} q8_0
./bin/quantize ${model_f16} ${model_q4_0} q4_0
./bin/quantize ${model_f16} ${model_q4_1} q4_1
./bin/quantize ${model_f16} ${model_q5_0} q5_0
./bin/quantize ${model_f16} ${model_q5_1} q5_1
./bin/quantize ${model_f16} ${model_q2_k} q2_k
./bin/quantize ${model_f16} ${model_q3_k} q3_k
./bin/quantize ${model_f16} ${model_q4_k} q4_k
./bin/quantize ${model_f16} ${model_q5_k} q5_k
./bin/quantize ${model_f16} ${model_q6_k} q6_k
(time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/main --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
(time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
function check_ppl {
qnt="$1"
@@ -421,7 +421,7 @@ function gg_run_pythia_1_4b {
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@@ -437,45 +437,45 @@ function gg_run_pythia_1_4b {
wiki_test_60="${path_wiki}/wiki.test-60.raw"
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
./bin/quantize ${model_f16} ${model_q8_0} q8_0
./bin/quantize ${model_f16} ${model_q4_0} q4_0
./bin/quantize ${model_f16} ${model_q4_1} q4_1
./bin/quantize ${model_f16} ${model_q5_0} q5_0
./bin/quantize ${model_f16} ${model_q5_1} q5_1
./bin/quantize ${model_f16} ${model_q2_k} q2_k
./bin/quantize ${model_f16} ${model_q3_k} q3_k
./bin/quantize ${model_f16} ${model_q4_k} q4_k
./bin/quantize ${model_f16} ${model_q5_k} q5_k
./bin/quantize ${model_f16} ${model_q6_k} q6_k
(time ./bin/llama-cli --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-cli --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-cli --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-cli --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-cli --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-cli --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-cli --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-cli --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-cli --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-cli --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-cli --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/main --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
(time ./bin/llama-save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
function check_ppl {
qnt="$1"
@@ -550,10 +550,10 @@ function gg_run_pythia_2_8b {
set -e
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@@ -569,47 +569,47 @@ function gg_run_pythia_2_8b {
wiki_test="${path_wiki}/wiki.test.raw"
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
./bin/quantize ${model_f16} ${model_q8_0} q8_0
./bin/quantize ${model_f16} ${model_q4_0} q4_0
./bin/quantize ${model_f16} ${model_q4_1} q4_1
./bin/quantize ${model_f16} ${model_q5_0} q5_0
./bin/quantize ${model_f16} ${model_q5_1} q5_1
./bin/quantize ${model_f16} ${model_q2_k} q2_k
./bin/quantize ${model_f16} ${model_q3_k} q3_k
./bin/quantize ${model_f16} ${model_q4_k} q4_k
./bin/quantize ${model_f16} ${model_q5_k} q5_k
./bin/quantize ${model_f16} ${model_q6_k} q6_k
(time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/main --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
(time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
function check_ppl {
qnt="$1"
@@ -688,15 +688,15 @@ function gg_run_embd_bge_small {
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
./bin/quantize ${model_f16} ${model_q8_0} q8_0
(time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
set +e
}

View File

@@ -79,22 +79,22 @@ endmacro()
# flags are for MSVC only!
check_sse("AVX" " ;/arch:AVX")
if (NOT ${AVX_FOUND})
set(GGML_AVX OFF)
set(LLAMA_AVX OFF)
else()
set(GGML_AVX ON)
set(LLAMA_AVX ON)
endif()
check_sse("AVX2" " ;/arch:AVX2")
check_sse("FMA" " ;/arch:AVX2")
if ((NOT ${AVX2_FOUND}) OR (NOT ${FMA_FOUND}))
set(GGML_AVX2 OFF)
set(LLAMA_AVX2 OFF)
else()
set(GGML_AVX2 ON)
set(LLAMA_AVX2 ON)
endif()
check_sse("AVX512" " ;/arch:AVX512")
if (NOT ${AVX512_FOUND})
set(GGML_AVX512 OFF)
set(LLAMA_AVX512 OFF)
else()
set(GGML_AVX512 ON)
set(LLAMA_AVX512 ON)
endif()

View File

@@ -1,22 +0,0 @@
find_package(Git)
# the commit's SHA1
execute_process(COMMAND
"${GIT_EXECUTABLE}" describe --match=NeVeRmAtCh --always --abbrev=8
WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
OUTPUT_VARIABLE GIT_SHA1
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
# the date of the commit
execute_process(COMMAND
"${GIT_EXECUTABLE}" log -1 --format=%ad --date=local
WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
OUTPUT_VARIABLE GIT_DATE
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
# the subject of the commit
execute_process(COMMAND
"${GIT_EXECUTABLE}" log -1 --format=%s
WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
OUTPUT_VARIABLE GIT_COMMIT_SUBJECT
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)

14
codecov.yml Normal file
View File

@@ -0,0 +1,14 @@
comment: off
coverage:
status:
project:
default:
target: auto
threshold: 0
base: auto
patch:
default:
target: auto
threshold: 0
base: auto

View File

@@ -1,6 +1,5 @@
# common
find_package(Threads REQUIRED)
# Build info header
#
@@ -37,7 +36,7 @@ add_custom_command(
COMMENT "Generating build details from Git"
COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
-DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake"
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/gen-build-info-cpp.cmake"
WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
VERBATIM
@@ -84,5 +83,5 @@ if (LLAMA_CURL)
endif ()
target_include_directories(${TARGET} PUBLIC .)
target_compile_features (${TARGET} PUBLIC cxx_std_11)
target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
target_compile_features(${TARGET} PUBLIC cxx_std_11)
target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama)

File diff suppressed because it is too large Load Diff

View File

@@ -52,12 +52,6 @@ int32_t cpu_get_num_math();
// CLI argument parsing
//
// dimensionality reduction methods, used by cvector-generator
enum dimre_method {
DIMRE_METHOD_PCA,
DIMRE_METHOD_MEAN,
};
struct gpt_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
@@ -79,6 +73,7 @@ struct gpt_params {
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
int32_t n_beams = 0; // if non-zero then use beam search of given width.
int32_t grp_attn_n = 1; // group-attention factor
int32_t grp_attn_w = 512; // group-attention width
int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
@@ -99,7 +94,6 @@ struct gpt_params {
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
// // sampling parameters
struct llama_sampling_params sparams;
@@ -159,6 +153,7 @@ struct gpt_params {
bool prompt_cache_all = false; // save user input and generations to prompt cache
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
bool embedding = false; // get only sentence embedding
bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
bool multiline_input = false; // reverse the usage of `\`
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
@@ -185,12 +180,6 @@ struct gpt_params {
std::string mmproj = ""; // path to multimodal projector
std::vector<std::string> image; // path to image file(s)
// embedding
bool embedding = false; // get only sentence embedding
int32_t embd_normalize = 2; // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
std::string embd_sep = "\n"; // separator of embendings
// server params
int32_t port = 8080; // server listens on this network port
int32_t timeout_read = 600; // http read timeout in seconds
@@ -201,7 +190,6 @@ struct gpt_params {
std::string public_path = "";
std::string chat_template = "";
std::string system_prompt = "";
bool enable_chat_template = true;
std::vector<std::string> api_keys;
@@ -215,8 +203,6 @@ struct gpt_params {
std::string slot_save_path;
float slot_prompt_similarity = 0.5f;
// batched-bench params
bool is_pp_shared = false;
@@ -244,16 +230,6 @@ struct gpt_params {
bool process_output = false; // collect data for the output tensor
bool compute_ppl = true; // whether to compute perplexity
// cvector-generator params
int n_pca_batch = 100;
int n_pca_iterations = 1000;
dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
std::string cvector_outfile = "control_vector.gguf";
std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
bool spm_infill = false; // suffix/prefix/middle pattern for infill
};
void gpt_params_handle_model_default(gpt_params & params);
@@ -299,7 +275,6 @@ bool fs_validate_filename(const std::string & filename);
bool fs_create_directory_with_parents(const std::string & path);
std::string fs_get_cache_directory();
std::string fs_get_cache_file(const std::string & filename);
//
// Model utils
@@ -374,34 +349,9 @@ bool llama_should_add_bos_token(const llama_model * model);
// Chat template utils
//
// same with llama_chat_message, but uses std::string
struct llama_chat_msg {
std::string role;
std::string content;
};
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
bool llama_chat_verify_template(const std::string & tmpl);
// CPP wrapper for llama_chat_apply_template
// If the built-in template is not supported, we default to chatml
// If the custom "tmpl" is not supported, we throw an error
std::string llama_chat_apply_template(const struct llama_model * model,
const std::string & tmpl,
const std::vector<llama_chat_msg> & chat,
bool add_ass);
// Format single message, while taking into account the position of that message in chat history
std::string llama_chat_format_single(const struct llama_model * model,
const std::string & tmpl,
const std::vector<llama_chat_msg> & past_msg,
const llama_chat_msg & new_msg,
bool add_ass);
// Returns an example of formatted chat
std::string llama_chat_format_example(const struct llama_model * model,
const std::string & tmpl);
//
// KV cache utils
//
@@ -416,7 +366,7 @@ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_siz
// Embedding utils
//
void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
void llama_embd_normalize(const float * inp, float * out, int n);
float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
@@ -460,3 +410,4 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha
void yaml_dump_non_result_info(
FILE * stream, const gpt_params & params, const llama_context * lctx,
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);

View File

@@ -40,234 +40,7 @@ static std::string build_repetition(const std::string & item_rule, int min_items
return result;
}
/* Minimalistic replacement for std::string_view, which is only available from C++17 onwards */
class string_view {
const std::string & _str;
const size_t _start;
const size_t _end;
public:
string_view(const std::string & str, size_t start = 0, size_t end = std::string::npos) : _str(str), _start(start), _end(end == std::string::npos ? str.length() : end) {}
size_t size() const {
return _end - _start;
}
size_t length() const {
return size();
}
operator std::string() const {
return str();
}
std::string str() const {
return _str.substr(_start, _end - _start);
}
string_view substr(size_t pos, size_t len = std::string::npos) const {
return string_view(_str, _start + pos, len == std::string::npos ? _end : _start + pos + len);
}
char operator[](size_t pos) const {
auto index = _start + pos;
if (index >= _end) {
throw std::out_of_range("string_view index out of range");
}
return _str[_start + pos];
}
bool operator==(const string_view & other) const {
std::string this_str = *this;
std::string other_str = other;
return this_str == other_str;
}
};
static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
auto has_min = min_value != std::numeric_limits<int>::min();
auto has_max = max_value != std::numeric_limits<int>::max();
auto digit_range = [&](char from, char to) {
out << "[";
if (from == to) {
out << from;
} else {
out << from << "-" << to;
}
out << "]";
};
auto more_digits = [&](int min_digits, int max_digits) {
out << "[0-9]";
if (min_digits == max_digits && min_digits == 1) {
return;
}
out << "{";
out << min_digits;
if (max_digits != min_digits) {
out << ",";
if (max_digits != std::numeric_limits<int>::max()) {
out << max_digits;
}
}
out << "}";
};
std::function<void(const string_view &, const string_view &)> uniform_range =
[&](const string_view & from, const string_view & to) {
size_t i = 0;
while (i < from.length() && i < to.length() && from[i] == to[i]) {
i++;
}
if (i > 0) {
out << "\"" << from.substr(0, i).str() << "\"";
}
if (i < from.length() && i < to.length()) {
if (i > 0) {
out << " ";
}
auto sub_len = from.length() - i - 1;
if (sub_len > 0) {
auto from_sub = from.substr(i + 1);
auto to_sub = to.substr(i + 1);
auto sub_zeros = repeat("0", sub_len);
auto sub_nines = repeat("9", sub_len);
auto to_reached = false;
out << "(";
if (from_sub == sub_zeros) {
digit_range(from[i], to[i] - 1);
out << " ";
more_digits(sub_len, sub_len);
} else {
out << "[" << from[i] << "] ";
out << "(";
uniform_range(from_sub, sub_nines);
out << ")";
if (from[i] < to[i] - 1) {
out << " | ";
if (to_sub == sub_nines) {
digit_range(from[i] + 1, to[i]);
to_reached = true;
} else {
digit_range(from[i] + 1, to[i] - 1);
}
out << " ";
more_digits(sub_len, sub_len);
}
}
if (!to_reached) {
out << " | ";
digit_range(to[i], to[i]);
out << " ";
uniform_range(sub_zeros, to_sub);
}
out << ")";
} else {
out << "[" << from[i] << "-" << to[i] << "]";
}
}
};
if (has_min && has_max) {
if (min_value < 0 && max_value < 0) {
out << "\"-\" (";
_build_min_max_int(-max_value, -min_value, out, decimals_left, /* top_level= */ true);
out << ")";
return;
}
if (min_value < 0) {
out << "\"-\" (";
_build_min_max_int(0, -min_value, out, decimals_left, /* top_level= */ true);
out << ") | ";
min_value = 0;
}
auto min_s = std::to_string(min_value);
auto max_s = std::to_string(max_value);
auto min_digits = min_s.length();
auto max_digits = max_s.length();
for (auto digits = min_digits; digits < max_digits; digits++) {
uniform_range(min_s, repeat("9", digits));
min_s = "1" + repeat("0", digits);
out << " | ";
}
uniform_range(min_s, max_s);
return;
}
auto less_decimals = std::max(decimals_left - 1, 1);
if (has_min) {
if (min_value < 0) {
out << "\"-\" (";
_build_min_max_int(std::numeric_limits<int>::min(), -min_value, out, decimals_left, /* top_level= */ false);
out << ") | [0] | [1-9] ";
more_digits(0, decimals_left - 1);
} else if (min_value == 0) {
if (top_level) {
out << "[0] | [1-9] ";
more_digits(0, less_decimals);
} else {
more_digits(1, decimals_left);
}
} else if (min_value <= 9) {
char c = '0' + min_value;
auto range_start = top_level ? '1' : '0';
if (c > range_start) {
digit_range(range_start, c - 1);
out << " ";
more_digits(1, less_decimals);
out << " | ";
}
digit_range(c, '9');
out << " ";
more_digits(0, less_decimals);
} else {
auto min_s = std::to_string(min_value);
auto len = min_s.length();
auto c = min_s[0];
if (c > '1') {
digit_range(top_level ? '1' : '0', c - 1);
out << " ";
more_digits(len, less_decimals);
out << " | ";
}
digit_range(c, c);
out << " (";
_build_min_max_int(std::stoi(min_s.substr(1)), std::numeric_limits<int>::max(), out, less_decimals, /* top_level= */ false);
out << ")";
if (c < '9') {
out << " | ";
digit_range(c + 1, '9');
out << " ";
more_digits(len - 1, less_decimals);
}
}
return;
}
if (has_max) {
if (max_value >= 0) {
if (top_level) {
out << "\"-\" [1-9] ";
more_digits(0, less_decimals);
out << " | ";
}
_build_min_max_int(0, max_value, out, decimals_left, /* top_level= */ true);
} else {
out << "\"-\" (";
_build_min_max_int(-max_value, std::numeric_limits<int>::max(), out, decimals_left, /* top_level= */ false);
out << ")";
}
return;
}
throw std::runtime_error("At least one of min_value or max_value must be set");
}
const std::string SPACE_RULE = "| \" \" | \"\\n\" [ \\t]{0,20}";
const std::string SPACE_RULE = "\" \"?";
struct BuiltinRule {
std::string content;
@@ -284,7 +57,7 @@ std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
{"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}},
{"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}},
{"uuid", {"\"\\\"\" [0-9a-fA-F]{8} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{12} \"\\\"\" space", {}}},
{"char", {"[^\"\\\\\\x7F\\x00-\\x1F] | [\\\\] ([\"\\\\bfnrt] | \"u\" [0-9a-fA-F]{4})", {}}},
{"char", {"[^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F]{4})", {}}},
{"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}},
{"null", {"\"null\" space", {}}},
};
@@ -316,7 +89,7 @@ std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
};
std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'^', '$', '.', '[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};
std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};
template <typename Iterator>
std::string join(Iterator begin, Iterator end, const std::string & separator) {
@@ -387,6 +160,7 @@ static std::string format_literal(const std::string & literal) {
return "\"" + escaped + "\"";
}
class SchemaConverter {
private:
std::function<json(const std::string &)> _fetch_json;
@@ -614,75 +388,6 @@ private:
return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space");
}
/*
Returns a rule that matches a JSON string that is none of the provided strings
not_strings({"a"})
-> ["] ( [a] char+ | [^"a] char* )? ["] space
not_strings({"and", "also"})
-> ["] ( [a] ([l] ([s] ([o] char+ | [^"o] char*) | [^"s] char*) | [n] ([d] char+ | [^"d] char*) | [^"ln] char*) | [^"a] char* )? ["] space
*/
std::string _not_strings(const std::vector<std::string> & strings) {
struct TrieNode {
std::map<char, TrieNode> children;
bool is_end_of_string;
TrieNode() : is_end_of_string(false) {}
void insert(const std::string & string) {
auto node = this;
for (char c : string) {
node = &node->children[c];
}
node->is_end_of_string = true;
}
};
TrieNode trie;
for (const auto & s : strings) {
trie.insert(s);
}
std::string char_rule = _add_primitive("char", PRIMITIVE_RULES.at("char"));
std::ostringstream out;
out << "[\"] ( ";
std::function<void(const TrieNode &)> visit = [&](const TrieNode & node) {
std::ostringstream rejects;
auto first = true;
for (const auto & kv : node.children) {
rejects << kv.first;
if (first) {
first = false;
} else {
out << " | ";
}
out << "[" << kv.first << "]";
if (!kv.second.children.empty()) {
out << " (";
visit(kv.second);
out << ")";
} else if (kv.second.is_end_of_string) {
out << " " << char_rule << "+";
}
}
if (!node.children.empty()) {
if (!first) {
out << " | ";
}
out << "[^\"" << rejects.str() << "] " << char_rule << "*";
}
};
visit(trie);
out << " )";
if (!trie.is_end_of_string) {
out << "?";
}
out << " [\"] space";
return out.str();
}
std::string _resolve_ref(const std::string & ref) {
std::string ref_name = ref.substr(ref.find_last_of('/') + 1);
if (_rules.find(ref_name) == _rules.end() && _refs_being_resolved.find(ref) == _refs_being_resolved.end()) {
@@ -703,7 +408,6 @@ private:
std::vector<std::string> required_props;
std::vector<std::string> optional_props;
std::unordered_map<std::string, std::string> prop_kv_rule_names;
std::vector<std::string> prop_names;
for (const auto & kv : properties) {
const auto &prop_name = kv.first;
const auto &prop_schema = kv.second;
@@ -718,18 +422,11 @@ private:
} else {
optional_props.push_back(prop_name);
}
prop_names.push_back(prop_name);
}
if ((additional_properties.is_boolean() && additional_properties.get<bool>()) || additional_properties.is_object()) {
if (additional_properties.is_object() || (additional_properties.is_boolean() && additional_properties.get<bool>())) {
std::string sub_name = name + (name.empty() ? "" : "-") + "additional";
std::string value_rule =
additional_properties.is_object() ? visit(additional_properties, sub_name + "-value")
: _add_primitive("value", PRIMITIVE_RULES.at("value"));
auto key_rule =
prop_names.empty() ? _add_primitive("string", PRIMITIVE_RULES.at("string"))
: _add_rule(sub_name + "-k", _not_strings(prop_names));
std::string kv_rule = _add_rule(sub_name + "-kv", key_rule + " \":\" space " + value_rule);
std::string value_rule = visit(additional_properties.is_object() ? additional_properties : json::object(), sub_name + "-value");
std::string kv_rule = _add_rule(sub_name + "-kv", _add_primitive("string", PRIMITIVE_RULES.at("string")) + " \":\" space " + value_rule);
prop_kv_rule_names["*"] = kv_rule;
optional_props.push_back("*");
}
@@ -755,11 +452,15 @@ private:
}
std::string k = ks[0];
std::string kv_rule_name = prop_kv_rule_names[k];
std::string comma_ref = "( \",\" space " + kv_rule_name + " )";
if (first_is_optional) {
res = comma_ref + (k == "*" ? "*" : "?");
if (k == "*") {
res = _add_rule(
name + (name.empty() ? "" : "-") + "additional-kvs",
kv_rule_name + " ( \",\" space " + kv_rule_name + " )*"
);
} else if (first_is_optional) {
res = "( \",\" space " + kv_rule_name + " )?";
} else {
res = kv_rule_name + (k == "*" ? " " + comma_ref + "*" : "");
res = kv_rule_name;
}
if (ks.size() > 1) {
res += " " + _add_rule(
@@ -893,19 +594,17 @@ public:
} else if (schema_type.is_array()) {
std::vector<json> schema_types;
for (const auto & t : schema_type) {
json schema_copy(schema);
schema_copy["type"] = t;
schema_types.push_back(schema_copy);
schema_types.push_back({{"type", t}});
}
return _add_rule(rule_name, _generate_union_rule(name, schema_types));
} else if (schema.contains("const")) {
return _add_rule(rule_name, _generate_constant_rule(schema["const"]) + " space");
return _add_rule(rule_name, _generate_constant_rule(schema["const"]));
} else if (schema.contains("enum")) {
std::vector<std::string> enum_values;
for (const auto & v : schema["enum"]) {
enum_values.push_back(_generate_constant_rule(v));
}
return _add_rule(rule_name, "(" + join(enum_values.begin(), enum_values.end(), " | ") + ") space");
return _add_rule(rule_name, join(enum_values.begin(), enum_values.end(), " | "));
} else if ((schema_type.is_null() || schema_type == "object")
&& (schema.contains("properties") ||
(schema.contains("additionalProperties") && schema["additionalProperties"] != true))) {
@@ -987,24 +686,6 @@ public:
int min_len = schema.contains("minLength") ? schema["minLength"].get<int>() : 0;
int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
} else if (schema_type == "integer" && (schema.contains("minimum") || schema.contains("exclusiveMinimum") || schema.contains("maximum") || schema.contains("exclusiveMaximum"))) {
int min_value = std::numeric_limits<int>::min();
int max_value = std::numeric_limits<int>::max();
if (schema.contains("minimum")) {
min_value = schema["minimum"].get<int>();
} else if (schema.contains("exclusiveMinimum")) {
min_value = schema["exclusiveMinimum"].get<int>() + 1;
}
if (schema.contains("maximum")) {
max_value = schema["maximum"].get<int>();
} else if (schema.contains("exclusiveMaximum")) {
max_value = schema["exclusiveMaximum"].get<int>() - 1;
}
std::stringstream out;
out << "(";
_build_min_max_int(min_value, max_value, out);
out << ") space";
return _add_rule(rule_name, out.str());
} else if (schema.empty() || schema_type == "object") {
return _add_rule(rule_name, _add_primitive("object", PRIMITIVE_RULES.at("object")));
} else {

View File

@@ -28,13 +28,9 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
std::vector<const llama_grammar_element *> grammar_rules(result->parsed_grammar.c_rules());
struct llama_grammar * grammar = llama_grammar_init(
result->grammar = llama_grammar_init(
grammar_rules.data(),
grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root"));
if (grammar == nullptr) {
throw std::runtime_error("Failed to initialize llama_grammar");
}
result->grammar = grammar;
}
result->prev.resize(params.n_prev);
@@ -63,13 +59,9 @@ void llama_sampling_reset(llama_sampling_context * ctx) {
if (!ctx->parsed_grammar.rules.empty()) {
std::vector<const llama_grammar_element *> grammar_rules(ctx->parsed_grammar.c_rules());
struct llama_grammar * grammar = llama_grammar_init(
ctx->grammar = llama_grammar_init(
grammar_rules.data(),
grammar_rules.size(), ctx->parsed_grammar.symbol_ids.at("root"));
if (grammar == nullptr) {
throw std::runtime_error("Failed to initialize llama_grammar");
}
ctx->grammar = grammar;
}
std::fill(ctx->prev.begin(), ctx->prev.end(), 0);

View File

@@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-
# This script downloads the tokenizer models of the specified models from Huggingface and
# generates the get_vocab_base_pre() function for convert_hf_to_gguf.py
# generates the get_vocab_base_pre() function for convert-hf-to-gguf.py
#
# This is necessary in order to analyze the type of pre-tokenizer used by the model and
# provide the necessary information to llama.cpp via the GGUF header in order to implement
@@ -15,9 +15,9 @@
# - Add a new model to the "models" list
# - Run the script with your huggingface token:
#
# python3 convert_hf_to_gguf_update.py <huggingface_token>
# python3 convert-hf-to-gguf-update.py <huggingface_token>
#
# - Copy-paste the generated get_vocab_base_pre() function into convert_hf_to_gguf.py
# - Copy-paste the generated get_vocab_base_pre() function into convert-hf-to-gguf.py
# - Update llama.cpp with the new pre-tokenizer if necessary
#
# TODO: generate tokenizer tests for llama.cpp
@@ -37,7 +37,7 @@ from enum import IntEnum, auto
from transformers import AutoTokenizer
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("convert_hf_to_gguf_update")
logger = logging.getLogger("convert-hf-to-gguf-update")
sess = requests.Session()
@@ -45,7 +45,6 @@ class TOKENIZER_TYPE(IntEnum):
SPM = auto()
BPE = auto()
WPM = auto()
UGM = auto()
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
@@ -56,10 +55,10 @@ if len(sys.argv) == 2:
token = sys.argv[1]
if not token.startswith("hf_"):
logger.info("Huggingface token seems invalid")
logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
sys.exit(1)
else:
logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
sys.exit(1)
# TODO: add models here, base models preferred
@@ -84,13 +83,7 @@ models = [
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
{"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
{"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
{"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B
{"name": "gemma", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2b", },
{"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", },
{"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", },
{"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", },
]
@@ -112,13 +105,9 @@ def download_model(model):
os.makedirs(f"models/tokenizers/{name}", exist_ok=True)
files = ["config.json", "tokenizer.json", "tokenizer_config.json"]
if tokt == TOKENIZER_TYPE.SPM:
files.append("tokenizer.model")
if tokt == TOKENIZER_TYPE.UGM:
files.append("spiece.model")
for file in files:
save_path = f"models/tokenizers/{name}/{file}"
if os.path.isfile(save_path):
@@ -134,14 +123,14 @@ for model in models:
logger.error(f"Failed to download model {model['name']}. Error: {e}")
# generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:
# generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function:
src_ifs = ""
for model in models:
name = model["name"]
tokt = model["tokt"]
if tokt == TOKENIZER_TYPE.SPM or tokt == TOKENIZER_TYPE.UGM:
if tokt == TOKENIZER_TYPE.SPM:
continue
# Skip if the tokenizer folder does not exist or there are other download issues previously
@@ -151,10 +140,7 @@ for model in models:
# create the tokenizer
try:
if name == "t5":
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
else:
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
except OSError as e:
logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
continue # Skip to the next model if the tokenizer can't be loaded
@@ -201,7 +187,7 @@ src_func = f"""
res = None
# NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script
# NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script
# or pull the latest version of the model from Huggingface
# don't edit the hashes manually!
{src_ifs}
@@ -210,9 +196,9 @@ src_func = f"""
logger.warning("**************************************************************************************")
logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
logger.warning("** There are 2 possible reasons for this:")
logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet")
logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet")
logger.warning("** - the pre-tokenization config has changed upstream")
logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
logger.warning("**")
logger.warning(f"** chkhsh: {{chkhsh}}")
@@ -226,8 +212,8 @@ src_func = f"""
return res
"""
convert_py_pth = pathlib.Path("convert_hf_to_gguf.py")
convert_py = convert_py_pth.read_text(encoding="utf-8")
convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
convert_py = convert_py_pth.read_text()
convert_py = re.sub(
r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
lambda m: m.group(1) + src_func + m.group(3),
@@ -235,9 +221,9 @@ convert_py = re.sub(
flags=re.DOTALL | re.MULTILINE,
)
convert_py_pth.write_text(convert_py, encoding="utf-8")
convert_py_pth.write_text(convert_py)
logger.info("+++ convert_hf_to_gguf.py was updated")
logger.info("+++ convert-hf-to-gguf.py was updated")
# generate tests for each tokenizer model
@@ -275,7 +261,6 @@ tests = [
"\n =",
"' era",
"Hello, y'all! How are you 😁 ?我想在apple工作1314151天",
"!!!!!!",
"3",
"33",
"333",
@@ -285,8 +270,7 @@ tests = [
"3333333",
"33333333",
"333333333",
"Cửa Việt", # llama-bpe fails on this
" discards",
# "Cửa Việt", # llama-bpe fails on this
chktxt,
]
@@ -314,10 +298,7 @@ for model in models:
# create the tokenizer
try:
if name == "t5":
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
else:
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
except OSError as e:
logger.error(f"Failed to load tokenizer for model {name}. Error: {e}")
continue # Skip this model and continue with the next one in the loop
@@ -343,6 +324,6 @@ logger.info("\nRun the following commands to generate the vocab files for testin
for model in models:
name = model["name"]
print(f"python3 convert_hf_to_gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") # noqa: NP100
print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") # noqa: NP100
logger.info("\n")

View File

@@ -13,7 +13,7 @@ import sys
from enum import IntEnum
from pathlib import Path
from hashlib import sha256
from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast
from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Sequence, TypeVar, cast
import math
import numpy as np
@@ -47,12 +47,11 @@ class Model:
_model_classes: dict[str, type[Model]] = {}
dir_model: Path
ftype: gguf.LlamaFileType
ftype: int
is_big_endian: bool
endianess: gguf.GGUFEndian
use_temp_file: bool
lazy: bool
model_name: str | None
part_names: list[str]
is_safetensors: bool
hparams: dict[str, Any]
@@ -65,8 +64,7 @@ class Model:
# subclasses should define this!
model_arch: gguf.MODEL_ARCH
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool,
model_name: str | None, split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool):
if type(self) is Model:
raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
self.dir_model = dir_model
@@ -75,13 +73,12 @@ class Model:
self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
self.use_temp_file = use_temp_file
self.lazy = not eager
self.model_name = model_name
self.part_names = Model.get_model_part_names(self.dir_model, "model", ".safetensors")
self.part_names = Model.get_model_part_names(self.dir_model, ".safetensors")
self.is_safetensors = len(self.part_names) > 0
if not self.is_safetensors:
self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
self.part_names = Model.get_model_part_names(self.dir_model, ".bin")
self.hparams = Model.load_hparams(self.dir_model)
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
self.tensor_names = None
if self.ftype == gguf.LlamaFileType.GUESSED:
@@ -97,8 +94,7 @@ class Model:
ftype_lw: str = ftype_up.lower()
# allow templating the file name with the output ftype, useful with the "auto" ftype
self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up)
self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file,
split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard)
self.gguf_writer = gguf.GGUFWriter(self.fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
@classmethod
def __init_subclass__(cls):
@@ -186,7 +182,7 @@ class Model:
return new_name
def set_gguf_parameters(self):
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_block_count(self.block_count)
if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
@@ -328,23 +324,21 @@ class Model:
def write(self):
self.write_tensors()
self.gguf_writer.write_header_to_file(self.fname_out)
self.gguf_writer.write_header_to_file()
self.gguf_writer.write_kv_data_to_file()
self.gguf_writer.write_tensors_to_file(progress=True)
self.gguf_writer.close()
def write_vocab(self):
if len(self.gguf_writer.tensors) != 1:
raise ValueError('Splitting the vocabulary is not supported')
self.gguf_writer.write_header_to_file(self.fname_out)
self.gguf_writer.write_header_to_file()
self.gguf_writer.write_kv_data_to_file()
self.gguf_writer.close()
@staticmethod
def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]:
def get_model_part_names(dir_model: Path, suffix: str) -> list[str]:
part_names: list[str] = []
for filename in os.listdir(dir_model):
if filename.startswith(prefix) and filename.endswith(suffix):
if filename.endswith(suffix):
part_names.append(filename)
part_names.sort()
@@ -404,7 +398,7 @@ class Model:
return tokens, toktypes, tokpre
# NOTE: this function is generated by convert_hf_to_gguf_update.py
# NOTE: this function is generated by convert-hf-to-gguf-update.py
# do not modify it manually!
# ref: https://github.com/ggerganov/llama.cpp/pull/6920
# Marker: Start get_vocab_base_pre
@@ -424,7 +418,7 @@ class Model:
res = None
# NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script
# NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script
# or pull the latest version of the model from Huggingface
# don't edit the hashes manually!
if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
@@ -481,27 +475,18 @@ class Model:
if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
# ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
res = "smaug-bpe"
if chkhsh == "c7ea5862a53e4272c035c8238367063e2b270d51faa48c0f09e9d5b54746c360":
# ref: https://huggingface.co/LumiOpen/Poro-34B-chat
res = "poro-chat"
if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
res = "jina-v2-code"
if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
# ref: https://huggingface.co/LumiOpen/Viking-7B
res = "viking"
if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901":
# ref: https://huggingface.co/core42/jais-13b
res = "jais"
if res is None:
logger.warning("\n")
logger.warning("**************************************************************************************")
logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
logger.warning("** There are 2 possible reasons for this:")
logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet")
logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet")
logger.warning("** - the pre-tokenization config has changed upstream")
logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
logger.warning("**")
logger.warning(f"** chkhsh: {chkhsh}")
@@ -579,19 +564,7 @@ class Model:
special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
special_vocab.add_to_gguf(self.gguf_writer)
def _set_vocab_sentencepiece(self, add_to_gguf=True):
tokens, scores, toktypes = self._create_vocab_sentencepiece()
self.gguf_writer.add_tokenizer_model("llama")
self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
special_vocab.add_to_gguf(self.gguf_writer)
def _create_vocab_sentencepiece(self):
def _set_vocab_sentencepiece(self):
from sentencepiece import SentencePieceProcessor
tokenizer_path = self.dir_model / 'tokenizer.model'
@@ -653,7 +626,14 @@ class Model:
scores.append(-1000.0)
toktypes.append(SentencePieceTokenTypes.UNUSED)
return tokens, scores, toktypes
self.gguf_writer.add_tokenizer_model("llama")
self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
special_vocab.add_to_gguf(self.gguf_writer)
def _set_vocab_llama_hf(self):
vocab = gguf.LlamaHfVocab(self.dir_model)
@@ -677,51 +657,6 @@ class Model:
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
special_vocab.add_to_gguf(self.gguf_writer)
def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int):
tokenizer_path = Path(sys.path[0]) / "models" / f"ggml-vocab-{model_name}.gguf"
logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'")
vocab_reader = gguf.GGUFReader(tokenizer_path, "r")
default_pre = "mpt" if model_name == "gpt-neox" else "default"
field = vocab_reader.get_field(gguf.Keys.Tokenizer.MODEL)
assert field # tokenizer model
self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8"))
field = vocab_reader.get_field(gguf.Keys.Tokenizer.PRE)
self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else default_pre)
field = vocab_reader.get_field(gguf.Keys.Tokenizer.LIST)
assert field # token list
self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
if model_name == "llama-spm":
field = vocab_reader.get_field(gguf.Keys.Tokenizer.SCORES)
assert field # token scores
self.gguf_writer.add_token_scores([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
field = vocab_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE)
assert field # token types
self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
if model_name != "llama-spm":
field = vocab_reader.get_field(gguf.Keys.Tokenizer.MERGES)
assert field # token merges
self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])
if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)) is not None:
self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0])
if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)) is not None:
self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0])
if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)) is not None:
self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0])
if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.PAD_ID)) is not None:
self.gguf_writer.add_pad_token_id(field.parts[-1].tolist()[0])
if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_BOS)) is not None:
self.gguf_writer.add_add_bos_token(field.parts[-1].tolist()[0])
if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_EOS)) is not None:
self.gguf_writer.add_add_eos_token(field.parts[-1].tolist()[0])
@Model.register("GPTNeoXForCausalLM")
class GPTNeoXModel(Model):
@@ -730,7 +665,7 @@ class GPTNeoXModel(Model):
def set_gguf_parameters(self):
block_count = self.hparams["num_hidden_layers"]
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
self.gguf_writer.add_block_count(block_count)
@@ -863,7 +798,7 @@ class MPTModel(Model):
def set_gguf_parameters(self):
block_count = self.hparams["n_layers"]
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
self.gguf_writer.add_embedding_length(self.hparams["d_model"])
self.gguf_writer.add_block_count(block_count)
@@ -915,7 +850,7 @@ class OrionModel(Model):
raise ValueError("gguf: can not find ctx length parameter.")
self.gguf_writer.add_file_type(self.ftype)
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_source_hf_repo(hf_repo)
self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
self.gguf_writer.add_context_length(ctx_length)
@@ -952,7 +887,7 @@ class BaichuanModel(Model):
else:
raise ValueError("gguf: can not find ctx length parameter.")
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_source_hf_repo(hf_repo)
self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
self.gguf_writer.add_context_length(ctx_length)
@@ -1027,11 +962,7 @@ class XverseModel(Model):
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(dir_model)
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
# Since we are checking the maximum index, we need to ensure it's strictly less than vocab_size,
# because vocab_size is the count of items, and indexes start at 0.
max_vocab_index = max(tokenizer.get_vocab().values())
if max_vocab_index >= vocab_size:
raise ValueError("Vocabulary size exceeds expected maximum size.")
assert max(tokenizer.vocab.values()) < vocab_size
reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
added_vocab = tokenizer.get_added_vocab()
@@ -1079,7 +1010,7 @@ class XverseModel(Model):
else:
raise ValueError("gguf: can not find ctx length parameter.")
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_source_hf_repo(hf_repo)
self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
self.gguf_writer.add_context_length(ctx_length)
@@ -1275,7 +1206,7 @@ class StableLMModel(Model):
hparams = self.hparams
block_count = hparams["num_hidden_layers"]
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
self.gguf_writer.add_block_count(block_count)
@@ -1464,48 +1395,6 @@ class LlamaModel(Model):
raise ValueError(f"Unprocessed experts: {experts}")
@Model.register("BitnetForCausalLM")
class BitnetModel(Model):
model_arch = gguf.MODEL_ARCH.BITNET
def set_vocab(self):
self._set_vocab_sentencepiece()
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(1.0)
def weight_quant(self, weight):
dtype = weight.dtype
weight = weight.float()
s = 1 / weight.abs().mean().clamp(min=1e-5)
weight = (weight * s).round().clamp(-1, 1) / s
scale = weight.abs().max().unsqueeze(0)
weight = torch.where(weight.abs().less(1e-6), 0, weight).type(dtype)
weight = torch.sign(weight).type(dtype)
return weight.type(dtype), scale.type(torch.float32)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
new_name = self.map_tensor_name(name)
if any(self.match_model_tensor_name(new_name, key, bid) for key in [
gguf.MODEL_TENSOR.ATTN_Q,
gguf.MODEL_TENSOR.ATTN_K,
gguf.MODEL_TENSOR.ATTN_V,
gguf.MODEL_TENSOR.ATTN_OUT,
gguf.MODEL_TENSOR.FFN_UP,
gguf.MODEL_TENSOR.FFN_DOWN,
gguf.MODEL_TENSOR.FFN_GATE,
]):
# transform weight into 1/0/-1 (in fp32)
weight_torch, scale_torch = self.weight_quant(data_torch)
yield (new_name, weight_torch)
yield (new_name.removesuffix(".weight") + ".scale", scale_torch)
else:
yield (new_name, data_torch)
@Model.register("GrokForCausalLM")
class GrokModel(Model):
model_arch = gguf.MODEL_ARCH.GROK
@@ -1738,12 +1627,6 @@ class Qwen2MoeModel(Model):
super().set_gguf_parameters()
if (n_experts := self.hparams.get("num_experts")) is not None:
self.gguf_writer.add_expert_count(n_experts)
if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
if (shared_expert_intermediate_size := self.hparams.get('shared_expert_intermediate_size')) is not None:
self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size)
logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}")
_experts: list[dict[str, Tensor]] | None = None
@@ -1798,7 +1681,7 @@ class GPT2Model(Model):
model_arch = gguf.MODEL_ARCH.GPT2
def set_gguf_parameters(self):
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_block_count(self.hparams["n_layer"])
self.gguf_writer.add_context_length(self.hparams["n_ctx"])
self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
@@ -1987,7 +1870,7 @@ class Phi3MiniModel(Model):
if len(rope_scaling_type) == 0:
raise KeyError('Missing the required key rope_scaling.type')
if rope_scaling_type == 'su' or rope_scaling_type == 'longrope':
if rope_scaling_type == 'su':
attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0
elif rope_scaling_type == 'yarn':
attn_factor = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0
@@ -2361,13 +2244,11 @@ class GemmaModel(Model):
special_vocab._set_special_token("eot", 107)
special_vocab.add_to_gguf(self.gguf_writer)
self.gguf_writer.add_add_space_prefix(False)
def set_gguf_parameters(self):
hparams = self.hparams
block_count = hparams["num_hidden_layers"]
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
self.gguf_writer.add_block_count(block_count)
@@ -2395,71 +2276,6 @@ class GemmaModel(Model):
return [(self.map_tensor_name(name), data_torch)]
@Model.register("Gemma2ForCausalLM")
class Gemma2Model(Model):
model_arch = gguf.MODEL_ARCH.GEMMA2
def set_vocab(self):
tokens, scores, toktypes = self._create_vocab_sentencepiece()
# hack: This is required so that we can properly use start/end-of-turn for chat template
for i in range(108):
# including <unusedX>, <start_of_turn>, <end_of_turn>
toktypes[i] = SentencePieceTokenTypes.CONTROL
self.gguf_writer.add_tokenizer_model("llama")
self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
special_vocab.add_to_gguf(self.gguf_writer)
self.gguf_writer.add_add_space_prefix(False)
def set_gguf_parameters(self):
hparams = self.hparams
block_count = hparams["num_hidden_layers"]
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
self.gguf_writer.add_block_count(block_count)
self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
self.gguf_writer.add_head_count(hparams["num_attention_heads"])
self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"])
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
self.gguf_writer.add_key_length(hparams["head_dim"])
self.gguf_writer.add_value_length(hparams["head_dim"])
self.gguf_writer.add_file_type(self.ftype)
self.gguf_writer.add_attn_logit_softcapping(
self.hparams["attn_logit_softcapping"]
)
self.gguf_writer.add_final_logit_softcapping(
self.hparams["final_logit_softcapping"]
)
self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
# sanity check
attn_scalar = self.hparams["query_pre_attn_scalar"]
if attn_scalar != hparams["hidden_size"] / hparams["num_attention_heads"]:
raise ValueError("query_pre_attn_scalar must be equal to n_embd / n_head")
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unusem
# lm_head is not used in llama.cpp, while autoawq will include this tensor in model
# To prevent errors, skip loading lm_head.weight.
if name == "lm_head.weight":
logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
return []
# ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
if name.endswith("norm.weight"):
data_torch = data_torch + 1
return [(self.map_tensor_name(name), data_torch)]
@Model.register("Starcoder2ForCausalLM")
class StarCoder2Model(Model):
model_arch = gguf.MODEL_ARCH.STARCODER2
@@ -2484,7 +2300,39 @@ class MambaModel(Model):
self._set_vocab_sentencepiece()
else:
# Use the GPT-NeoX tokenizer when no tokenizer files are present
self._set_vocab_builtin("gpt-neox", vocab_size)
tokenizer_path = Path(sys.path[0]) / "models" / "ggml-vocab-gpt-neox.gguf"
logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'")
neox_reader = gguf.GGUFReader(tokenizer_path, "r")
field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL)
self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8") if field else "gpt2")
field = neox_reader.get_field(gguf.Keys.Tokenizer.PRE)
self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else "mpt")
field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST)
assert field
self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
field = neox_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE)
assert field
self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
field = neox_reader.get_field(gguf.Keys.Tokenizer.MERGES)
assert field
self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])
field = neox_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)
self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0] if field else 1)
field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)
self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0] if field else 0)
field = neox_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)
self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0] if field else 0)
field = neox_reader.get_field(gguf.Keys.Tokenizer.PAD_ID)
self.gguf_writer.add_pad_token_id(field.parts[-1].tolist()[0] if field else 0)
def set_gguf_parameters(self):
d_model = self.find_hparam(["hidden_size", "d_model"])
@@ -2500,7 +2348,7 @@ class MambaModel(Model):
# Fail early for models which don't have a block expansion factor of 2
assert d_inner == 2 * d_model
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
self.gguf_writer.add_embedding_length(d_model)
self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
@@ -2636,82 +2484,6 @@ class JinaBertV2Model(BertModel):
self.gguf_writer.add_add_eos_token(True)
@Model.register("OpenELMForCausalLM")
class OpenELMModel(Model):
model_arch = gguf.MODEL_ARCH.OPENELM
@staticmethod
def _make_divisible(v: float | int, divisor: int) -> int:
# ref: https://huggingface.co/apple/OpenELM-270M-Instruct/blob/eb111ff2e6724348e5b905984063d4064d4bc579/configuration_openelm.py#L34-L38
new_v = max(divisor, int(v + divisor / 2) // divisor * divisor)
# Make sure that round down does not go down by more than 10%.
if new_v < 0.9 * v:
new_v += divisor
return new_v
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
ffn_multipliers: list[float] = self.hparams["ffn_multipliers"]
ffn_dim_divisor: int = self.hparams["ffn_dim_divisor"]
self._n_embd: int = self.hparams["model_dim"]
self._num_kv_heads: list[int] = self.hparams["num_kv_heads"]
self._num_query_heads: list[int] = self.hparams["num_query_heads"]
self._ffn_dims: list[int] = [
OpenELMModel._make_divisible(multiplier * self._n_embd, ffn_dim_divisor)
for multiplier in ffn_multipliers
]
assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int)
assert isinstance(self._num_query_heads, list) and isinstance(self._num_query_heads[0], int)
# Uses the tokenizer from meta-llama/Llama-2-7b-hf
def set_vocab(self):
try:
self._set_vocab_sentencepiece()
except FileNotFoundError:
self._set_vocab_builtin("llama-spm", self.hparams["vocab_size"])
def set_gguf_parameters(self):
n_embd = self._n_embd
head_dim = self.hparams["head_dim"]
rot_pct = 1.0
assert self.block_count == len(self._num_kv_heads)
assert self.block_count == len(self._num_query_heads)
assert self.block_count == len(self._ffn_dims)
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_block_count(self.block_count)
self.gguf_writer.add_context_length(self.hparams["max_context_length"])
self.gguf_writer.add_embedding_length(n_embd)
self.gguf_writer.add_feed_forward_length(self._ffn_dims)
self.gguf_writer.add_head_count(self._num_query_heads)
self.gguf_writer.add_head_count_kv(self._num_kv_heads)
self.gguf_writer.add_rope_freq_base(self.hparams["rope_freq_constant"])
# https://huggingface.co/apple/OpenELM-270M-Instruct/blob/c401df2/modeling_openelm.py#L30
self.gguf_writer.add_layer_norm_rms_eps(1e-6)
self.gguf_writer.add_rope_dimension_count(int(rot_pct * head_dim))
self.gguf_writer.add_key_length(head_dim)
self.gguf_writer.add_value_length(head_dim)
self.gguf_writer.add_file_type(self.ftype)
def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any:
if "n_layers" in keys:
return self.hparams["num_transformer_layers"]
return super().find_hparam(keys, optional)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# split ff
if bid is not None and name == f"transformer.layers.{bid}.ffn.proj_1.weight":
ff_dim = self._ffn_dims[bid]
yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), data_torch[:ff_dim])
yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), data_torch[ff_dim:])
return
yield (self.map_tensor_name(name), data_torch)
@Model.register("ArcticForCausalLM")
class ArcticModel(Model):
model_arch = gguf.MODEL_ARCH.ARCTIC
@@ -2942,240 +2714,6 @@ class DeepseekV2Model(Model):
raise ValueError(f"Unprocessed experts: {experts}")
@Model.register("T5WithLMHeadModel")
@Model.register("T5ForConditionalGeneration")
@Model.register("MT5ForConditionalGeneration")
@Model.register("UMT5ForConditionalGeneration")
class T5Model(Model):
model_arch = gguf.MODEL_ARCH.T5
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.shared_token_embeddings_found = False
def set_vocab(self):
# to avoid TypeError: Descriptors cannot be created directly
# exception when importing sentencepiece_model_pb2
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
from sentencepiece import SentencePieceProcessor
from sentencepiece import sentencepiece_model_pb2 as model
tokenizer_path = self.dir_model / 'tokenizer.model'
# many older models use spiece.model tokenizer model filename
if not tokenizer_path.is_file():
tokenizer_path = self.dir_model / 'spiece.model'
if not tokenizer_path.is_file():
raise FileNotFoundError(f"File not found: {tokenizer_path}")
sentencepiece_model = model.ModelProto()
sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
# some models like Pile-T5 family use BPE tokenizer instead of Unigram
if sentencepiece_model.trainer_spec.model_type == 2: # BPE
# assure the tokenizer model file name is correct
assert tokenizer_path.name == 'tokenizer.model'
return self._set_vocab_sentencepiece()
else:
assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
tokenizer = SentencePieceProcessor()
tokenizer.LoadFromFile(str(tokenizer_path))
vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
scores: list[float] = [-10000.0] * vocab_size
toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
for token_id in range(tokenizer.vocab_size()):
piece = tokenizer.IdToPiece(token_id)
text = piece.encode("utf-8")
score = tokenizer.GetScore(token_id)
toktype = SentencePieceTokenTypes.NORMAL
if tokenizer.IsUnknown(token_id):
toktype = SentencePieceTokenTypes.UNKNOWN
elif tokenizer.IsControl(token_id):
toktype = SentencePieceTokenTypes.CONTROL
elif tokenizer.IsUnused(token_id):
toktype = SentencePieceTokenTypes.UNUSED
elif tokenizer.IsByte(token_id):
toktype = SentencePieceTokenTypes.BYTE
tokens[token_id] = text
scores[token_id] = score
toktypes[token_id] = toktype
added_tokens_file = self.dir_model / 'added_tokens.json'
if added_tokens_file.is_file():
with open(added_tokens_file, "r", encoding="utf-8") as f:
added_tokens_json = json.load(f)
for key in added_tokens_json:
token_id = added_tokens_json[key]
if (token_id >= vocab_size):
logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
continue
tokens[token_id] = key.encode("utf-8")
scores[token_id] = -1000.0
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
if vocab_size > len(tokens):
pad_count = vocab_size - len(tokens)
logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
for i in range(1, pad_count + 1):
tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
scores.append(-1000.0)
toktypes.append(SentencePieceTokenTypes.UNUSED)
self.gguf_writer.add_tokenizer_model("t5")
self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)
self.gguf_writer.add_add_space_prefix(add_prefix)
self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
if precompiled_charsmap:
self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
special_vocab.add_to_gguf(self.gguf_writer)
self.gguf_writer.add_add_bos_token(False)
self.gguf_writer.add_add_eos_token(True)
def set_gguf_parameters(self):
self.gguf_writer.add_name("T5")
if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
logger.warning("Couldn't find context length in config.json, assuming default value of 512")
n_ctx = 512
self.gguf_writer.add_context_length(n_ctx)
self.gguf_writer.add_embedding_length(self.hparams["d_model"])
self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
self.gguf_writer.add_block_count(self.hparams["num_layers"])
self.gguf_writer.add_head_count(self.hparams["num_heads"])
self.gguf_writer.add_key_length(self.hparams["d_kv"])
self.gguf_writer.add_value_length(self.hparams["d_kv"])
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"])
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
self.gguf_writer.add_decoder_start_token_id(self.hparams["decoder_start_token_id"])
self.gguf_writer.add_file_type(self.ftype)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused
# T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight",
# "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored
# in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder
# and decoder and ignore the remaining ones.
if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]:
if not self.shared_token_embeddings_found:
name = "shared.weight"
self.shared_token_embeddings_found = True
else:
logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.")
return []
return [(self.map_tensor_name(name), data_torch)]
@Model.register("JAISLMHeadModel")
class JaisModel(Model):
model_arch = gguf.MODEL_ARCH.JAIS
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# SwigLU activation
assert self.hparams["activation_function"] == "swiglu"
# ALiBi position embedding
assert self.hparams["position_embedding_type"] == "alibi"
# Embeddings scale
self.embeddings_scale = 1.0
# note: For some JAIS flavors, output is tied to (same as) wte in original model
self.output_is_wte = False
if 'mup_embeddings_scale' in self.hparams:
self.output_is_wte = True # Hack (?)
self.embeddings_scale = self.hparams['mup_embeddings_scale']
elif 'embeddings_scale' in self.hparams:
self.embeddings_scale = self.hparams['embeddings_scale']
else:
assert False
self.width_scale = 1.0
if 'mup_output_alpha' in self.hparams:
assert 'mup_width_scale' in self.hparams
self.width_scale = self.hparams['mup_output_alpha'] * self.hparams['mup_width_scale']
elif 'width_scale' in self.hparams:
self.width_scale = self.hparams['width_scale']
else:
assert False
self.max_alibi_bias = 8.0
def set_vocab(self):
self._set_vocab_gpt2()
def set_gguf_parameters(self):
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_block_count(self.hparams["n_layer"])
self.gguf_writer.add_context_length(self.hparams["n_positions"])
self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
self.gguf_writer.add_feed_forward_length(self.hparams["n_inner"])
self.gguf_writer.add_head_count(self.hparams["n_head"])
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
self.gguf_writer.add_file_type(self.ftype)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused
tensors: list[tuple[str, Tensor]] = []
# we don't need these
if name.endswith((".attn.bias")):
return tensors
if name.endswith(("relative_pe.slopes")):
# Calculate max ALiBi bias (this is the inverse of the ALiBi calculation)
# Some other models has max_alibi_bias spelled out explicitly in the hyperparams,
# but Jais's PyTorch model simply precalculates the slope values and places them
# in relative_pes.slopes
n_head_closest_log2 = 2 ** math.floor(math.log2(self.hparams["n_head"]))
first_val = float(data_torch._data[0])
self.max_alibi_bias = -round(math.log2(first_val) * n_head_closest_log2)
return tensors
if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight")):
data_torch = data_torch.transpose(1, 0)
new_name = self.map_tensor_name(name)
if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
tensors.append((new_name, data_torch * self.embeddings_scale))
if self.output_is_wte:
tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch * self.width_scale))
elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
assert not self.output_is_wte
tensors.append((new_name, data_torch * self.width_scale))
else:
tensors.append((new_name, data_torch))
return tensors
def write_tensors(self):
super().write_tensors()
self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)
###### CONVERSION LOGIC ######
@@ -3261,44 +2799,10 @@ def parse_args() -> argparse.Namespace:
"--verbose", action="store_true",
help="increase output verbosity",
)
parser.add_argument(
"--split-max-tensors", type=int, default=0,
help="max tensors in each split",
)
parser.add_argument(
"--split-max-size", type=str, default="0",
help="max size per split N(M|G)",
)
parser.add_argument(
"--dry-run", action="store_true",
help="only print out a split plan and exit, without writing any new files",
)
parser.add_argument(
"--no-tensor-first-split", action="store_true",
help="do not add tensors to the first split (disabled by default)"
)
return parser.parse_args()
def split_str_to_n_bytes(split_str: str) -> int:
if split_str.endswith("K"):
n = int(split_str[:-1]) * 1000
elif split_str.endswith("M"):
n = int(split_str[:-1]) * 1000 * 1000
elif split_str.endswith("G"):
n = int(split_str[:-1]) * 1000 * 1000 * 1000
elif split_str.isnumeric():
n = int(split_str)
else:
raise ValueError(f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G")
if n < 0:
raise ValueError(f"Invalid split size: {split_str}, must be positive")
return n
def main() -> None:
args = parse_args()
@@ -3331,11 +2835,6 @@ def main() -> None:
"auto": gguf.LlamaFileType.GUESSED,
}
is_split = args.split_max_tensors > 0 or args.split_max_size != "0"
if args.use_temp_file and is_split:
logger.error("Error: Cannot use temp file when splitting")
sys.exit(1)
if args.outfile is not None:
fname_out = args.outfile
else:
@@ -3353,10 +2852,7 @@ def main() -> None:
logger.error(f"Model {hparams['architectures'][0]} is not supported")
sys.exit(1)
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file,
args.no_lazy, args.model_name, split_max_tensors=args.split_max_tensors,
split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
small_first_shard=args.no_tensor_first_split)
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy)
logger.info("Set model parameters")
model_instance.set_gguf_parameters()
@@ -3367,14 +2863,13 @@ def main() -> None:
model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
if args.vocab_only:
logger.info("Exporting model vocab...")
logger.info(f"Exporting model vocab to '{model_instance.fname_out}'")
model_instance.write_vocab()
logger.info(f"Model vocab successfully exported to {model_instance.fname_out}")
else:
logger.info("Exporting model...")
logger.info(f"Exporting model to '{model_instance.fname_out}'")
model_instance.write()
out_path = f"{model_instance.fname_out.parent}{os.sep}" if is_split else model_instance.fname_out
logger.info(f"Model successfully exported to {out_path}")
logger.info(f"Model successfully exported to '{model_instance.fname_out}'")
if __name__ == '__main__':

View File

@@ -30,8 +30,8 @@ We recommend using openmp since it's easier to modify the cores being used.
Makefile:
```bash
make GGML_BLIS=1 -j
# make GGML_BLIS=1 llama-benchmark-matmult
make LLAMA_BLIS=1 -j
# make LLAMA_BLIS=1 benchmark-matmult
```
CMake:
@@ -39,7 +39,7 @@ CMake:
```bash
mkdir build
cd build
cmake -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=FLAME ..
cmake -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=FLAME ..
make -j
```

View File

@@ -17,7 +17,7 @@ Also, it is important to check that the examples and main ggml backends (CUDA, M
### 1. Convert the model to GGUF
This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library.
Depending on the model architecture, you can use either [convert_hf_to_gguf.py](../convert_hf_to_gguf.py) or [examples/convert_legacy_llama.py](../examples/convert_legacy_llama.py) (for `llama/llama2` models in `.pth` format).
Depending on the model architecture, you can use either [convert-hf-to-gguf.py](../convert-hf-to-gguf.py) or [examples/convert-legacy-llama.py](../examples/convert-legacy-llama.py) (for `llama/llama2` models in `.pth` format).
The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors.
@@ -100,7 +100,7 @@ Have a look at existing implementation like `build_llama`, `build_dbrx` or `buil
When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support for missing backend operations can be added in another PR.
Note: to debug the inference graph: you can use [llama-eval-callback](../examples/eval-callback).
Note: to debug the inference graph: you can use [eval-callback](../examples/eval-callback).
## GGUF specification

View File

@@ -3,7 +3,7 @@
## Verifying that the model is running on the GPU with CUDA
Make sure you compiled llama with the correct env variables according to [this guide](../README.md#CUDA), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
```shell
./llama-cli -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
./main -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
```
When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines:
@@ -27,7 +27,7 @@ RAM: 32GB
Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.q4_0.gguf` (30B parameters, 4bit quantization, GGML)
Run command: `./llama-cli -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
Run command: `./main -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
Result:

View File

@@ -12,45 +12,43 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
if (EMSCRIPTEN)
else()
add_subdirectory(cvector-generator)
add_subdirectory(baby-llama)
add_subdirectory(batched-bench)
add_subdirectory(batched)
add_subdirectory(batched-bench)
add_subdirectory(benchmark)
add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(embedding)
add_subdirectory(eval-callback)
add_subdirectory(export-lora)
add_subdirectory(finetune)
add_subdirectory(gbnf-validator)
add_subdirectory(gguf-split)
add_subdirectory(gguf)
add_subdirectory(gritlm)
add_subdirectory(imatrix)
add_subdirectory(gguf-split)
add_subdirectory(infill)
add_subdirectory(llama-bench)
add_subdirectory(llava)
add_subdirectory(lookahead)
add_subdirectory(lookup)
add_subdirectory(main)
add_subdirectory(parallel)
add_subdirectory(passkey)
add_subdirectory(perplexity)
add_subdirectory(quantize-stats)
add_subdirectory(quantize)
add_subdirectory(retrieval)
if (GGML_RPC)
add_subdirectory(rpc)
endif()
if (LLAMA_BUILD_SERVER)
add_subdirectory(server)
endif()
if (GGML_SYCL)
if (LLAMA_SYCL)
add_subdirectory(sycl)
endif()
add_subdirectory(main)
add_subdirectory(tokenize)
add_subdirectory(parallel)
add_subdirectory(perplexity)
add_subdirectory(quantize)
add_subdirectory(quantize-stats)
add_subdirectory(retrieval)
add_subdirectory(save-load-state)
add_subdirectory(simple)
add_subdirectory(passkey)
add_subdirectory(speculative)
add_subdirectory(tokenize)
add_subdirectory(lookahead)
add_subdirectory(lookup)
add_subdirectory(gguf)
add_subdirectory(train-text-from-scratch)
add_subdirectory(imatrix)
if (LLAMA_BUILD_SERVER)
add_subdirectory(server)
endif()
add_subdirectory(export-lora)
if (LLAMA_RPC)
add_subdirectory(rpc)
endif()
endif()

View File

@@ -22,7 +22,7 @@ if [ -n "$N_THREAD" ]; then
GEN_OPTIONS+=(--threads "$N_THREAD")
fi
./llama-cli "${GEN_OPTIONS[@]}" \
./main "${GEN_OPTIONS[@]}" \
--model "$MODEL" \
--in-prefix " " \
--in-suffix "${AI_NAME}:" \

19
examples/alpaca.sh Executable file
View File

@@ -0,0 +1,19 @@
#!/bin/bash
#
# Temporary script - will be removed in the future
#
cd `dirname $0`
cd ..
./main -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
--color \
-f ./prompts/alpaca.txt \
--ctx_size 2048 \
-n -1 \
-ins -b 256 \
--top_k 10000 \
--temp 0.2 \
--repeat_penalty 1.1 \
-t 7

View File

@@ -1,4 +1,4 @@
set(TARGET llama-baby-llama)
set(TARGET baby-llama)
add_executable(${TARGET} baby-llama.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

View File

@@ -58,4 +58,4 @@ echo "$2
model=$1
# generate the most likely continuation until the string "===" is found
./llama-cli -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs
./main -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs

View File

@@ -1,4 +1,4 @@
set(TARGET llama-batched-bench)
set(TARGET batched-bench)
add_executable(${TARGET} batched-bench.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

View File

@@ -10,16 +10,16 @@ There are 2 modes of operation:
- `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)
```bash
./llama-batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]
./batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]
# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
./llama-batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99
./batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99
# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps
./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps
# custom set of batches
./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32
./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32
```
## Sample results

View File

@@ -1,6 +1,6 @@
.PHONY: build
build:
xcodebuild -scheme llama-batched-swift -destination "generic/platform=macOS" -derivedDataPath build
rm -f ./llama-batched-swift
ln -s ./build/Build/Products/Debug/llama-batched-swift ./llama-batched-swift
xcodebuild -scheme batched_swift -destination "generic/platform=macOS" -derivedDataPath build
rm -f ./batched_swift
ln -s ./build/Build/Products/Debug/batched_swift ./batched_swift

View File

@@ -4,7 +4,7 @@
import PackageDescription
let package = Package(
name: "llama-batched-swift",
name: "batched_swift",
platforms: [.macOS(.v12)],
dependencies: [
.package(name: "llama", path: "../../"),
@@ -13,7 +13,7 @@ let package = Package(
// Targets are the basic building blocks of a package, defining a module or a test suite.
// Targets can depend on other targets in this package and products from dependencies.
.executableTarget(
name: "llama-batched-swift",
name: "batched_swift",
dependencies: ["llama"],
path: "Sources",
linkerSettings: [.linkedFramework("Foundation"), .linkedFramework("AppKit")]

View File

@@ -1,4 +1,4 @@
This is a swift clone of `examples/batched`.
$ `make`
$ `./llama-batched-swift MODEL_PATH [PROMPT] [PARALLEL]`
$ `./batched_swift MODEL_PATH [PROMPT] [PARALLEL]`

View File

@@ -1,4 +1,4 @@
set(TARGET llama-batched)
set(TARGET batched)
add_executable(${TARGET} batched.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

View File

@@ -3,7 +3,7 @@
The example demonstrates batched generation from a given prompt
```bash
./llama-batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4
./batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4
...

View File

@@ -93,34 +93,14 @@ int main(int argc, char ** argv) {
// create a llama_batch
// we use this object to submit token data for decoding
llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t) n_parallel), 0, n_parallel);
std::vector<llama_seq_id> seq_ids(n_parallel, 0);
for (int32_t i = 0; i < n_parallel; ++i) {
seq_ids[i] = i;
}
llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t)n_parallel), 0, 1);
// evaluate the initial prompt
for (size_t i = 0; i < tokens_list.size(); ++i) {
llama_batch_add(batch, tokens_list[i], i, seq_ids, false);
llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
}
GGML_ASSERT(batch.n_tokens == (int) tokens_list.size());
if (llama_model_has_encoder(model)) {
if (llama_encode(ctx, batch)) {
LOG_TEE("%s : failed to eval\n", __func__);
return 1;
}
llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
if (decoder_start_token_id == -1) {
decoder_start_token_id = llama_token_bos(model);
}
llama_batch_clear(batch);
llama_batch_add(batch, decoder_start_token_id, 0, seq_ids, false);
}
// llama_decode will output logits only for the last token of the prompt
batch.logits[batch.n_tokens - 1] = true;
@@ -129,11 +109,11 @@ int main(int argc, char ** argv) {
return 1;
}
//// assign the system KV cache to all parallel sequences
//// this way, the parallel sequences will "reuse" the prompt tokens without having to copy them
//for (int32_t i = 1; i < n_parallel; ++i) {
// llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
//}
// assign the system KV cache to all parallel sequences
// this way, the parallel sequences will "reuse" the prompt tokens without having to copy them
for (int32_t i = 1; i < n_parallel; ++i) {
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
}
if (n_parallel > 1) {
LOG_TEE("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);

View File

@@ -1,4 +1,4 @@
set(TARGET llama-bench-matmult)
set(TARGET benchmark)
add_executable(${TARGET} benchmark-matmult.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})

View File

@@ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
$PROMPT_TEMPLATE > $PROMPT_FILE
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
./llama-cli $GEN_OPTIONS \
./main $GEN_OPTIONS \
--model "$MODEL" \
--threads "$N_THREAD" \
--n_predict "$N_PREDICTS" \

View File

@@ -62,7 +62,7 @@ fi
if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then
echo 'Prompt cache does not exist, building...'
# Default batch_size to 64 here for better user feedback during initial prompt processing
./llama-cli 2>>"$LOG" \
./main 2>>"$LOG" \
--batch_size 64 \
"${OPTS[@]}" \
--prompt-cache "$PROMPT_CACHE_FILE" \
@@ -109,13 +109,13 @@ while read -e line; do
printf '%s: ' "$AI_NAME" >>"$CUR_PROMPT_FILE"
./llama-cli 2>>"$LOG" "${OPTS[@]}" \
./main 2>>"$LOG" "${OPTS[@]}" \
--prompt-cache "$CUR_PROMPT_CACHE" \
--prompt-cache-all \
--file "$CUR_PROMPT_FILE" \
--reverse-prompt "${USER_NAME}:" \
--n_predict "$n_predict" |
skip_bytes 1 | # skip BOS token added by ./llama-cli
skip_bytes 1 | # skip BOS token added by ./main
tee "$CUR_PROMPT_FILE.tmp" | # save prompt + generation to tmp file
skip_bytes "$n_prompt_len_pre" # print generation
@@ -133,7 +133,7 @@ while read -e line; do
# TODO get both messages in one go
if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
echo >&2 "Couldn't get number of tokens from ./llama-cli output!"
echo >&2 "Couldn't get number of tokens from ./main output!"
exit 1
fi
@@ -144,7 +144,7 @@ while read -e line; do
fi
# Update cache for next prompt in background, ideally during user input
./llama-cli >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
./main >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
--prompt-cache "$NEXT_PROMPT_CACHE" \
--file "$NEXT_PROMPT_FILE" \
--n_predict 1 &

View File

@@ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
$PROMPT_TEMPLATE > $PROMPT_FILE
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
./bin/llama-cli $GEN_OPTIONS \
./bin/main $GEN_OPTIONS \
--model "$MODEL" \
--threads "$N_THREAD" \
--n_predict "$N_PREDICTS" \

View File

@@ -11,6 +11,6 @@ cd ..
#
# "--keep 48" is based on the contents of prompts/chat-with-bob.txt
#
./llama-cli -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
./main -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
--repeat_penalty 1.0 --color -i \
-r "User:" -f prompts/chat-with-bob.txt

View File

@@ -1,4 +1,4 @@
set(TARGET llama-convert-llama2c-to-ggml)
set(TARGET convert-llama2c-to-ggml)
add_executable(${TARGET} convert-llama2c-to-ggml.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

View File

@@ -8,7 +8,7 @@ To convert the model first download the models from the [llama2.c](https://githu
After successful compilation, following usage options are available:
```
usage: ./llama-convert-llama2c-to-ggml [options]
usage: ./convert-llama2c-to-ggml [options]
options:
-h, --help show this help message and exit
@@ -19,10 +19,10 @@ options:
An example command using a model from [karpathy/tinyllamas](https://huggingface.co/karpathy/tinyllamas) is as follows:
`$ ./llama-convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin`
`$ ./convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin`
Note: The vocabulary for `stories260K.bin` should be its own tokenizer `tok512.bin` found in [karpathy/tinyllamas/stories260K](https://huggingface.co/karpathy/tinyllamas/tree/main/stories260K).
Now you can use the model with a command like:
`$ ./llama-cli -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`
`$ ./main -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`

View File

@@ -1,5 +0,0 @@
set(TARGET llama-cvector-generator)
add_executable(${TARGET} cvector-generator.cpp pca.hpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View File

@@ -1,45 +0,0 @@
# cvector-generator
This example demonstrates how to generate a control vector using gguf models.
Related PRs:
- [Add support for control vectors](https://github.com/ggerganov/llama.cpp/pull/5970)
- (Issue) [Generate control vector using llama.cpp](https://github.com/ggerganov/llama.cpp/issues/6880)
- [Add cvector-generator example](https://github.com/ggerganov/llama.cpp/pull/7514)
## Examples
```sh
# CPU only
./cvector-generator -m ./llama-3.Q4_K_M.gguf
# With GPU
./cvector-generator -m ./llama-3.Q4_K_M.gguf -ngl 99
# With advanced options
./cvector-generator -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100
# Using mean value instead of PCA
./cvector-generator -m ./llama-3.Q4_K_M.gguf --method mean
# To see help message
./cvector-generator -h
# Then, have a look at "cvector" section
```
## Tips and tricks
If you have multiple lines per prompt, you can escape the newline character (change it to `\n`). For example:
```
<|im_start|>system\nAct like a person who is extremely happy.<|im_end|>
<|im_start|>system\nYou are in a very good mood today<|im_end|>
```
Example to use output file with `llama-cli`:
(Tips: The control vector works better when apply to layers higher than 10)
```sh
./llama-cli -m ./llama-3.Q4_K_M.gguf -p "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nSing a song<|im_end|><|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" --special --control-vector-scaled ./control_vector.gguf 0.8 --control-vector-layer-range 10 31
```

View File

@@ -1,582 +0,0 @@
That game
I can see
Hmm, this
I can relate to
Who is
I understand the
Ugh,
What the hell was
Hey, did anyone
Although
Thank you for choosing
What are you
Oh w
How dare you open
It was my pleasure
I'm hon
I appreciate that you
Are you k
Whoever left this
It's always
Ew,
Hey, I l
Hello? Is someone
I understand that
That poem
Aww, poor
Hey, it
Alright, who
I didn't
Well, life
The document
Oh no, this
I'm concerned
Hello, this is
This art
Hmm, this drink
Hi there!
It seems
Is
Good
I can't
Ex
Who are
I can see that
Wow,
Today is a
Hey friend
Sometimes friends
Oh, this old
The weather outside
This place is sur
I appreciate your input
Thank you for the
Look at
I'm disappoint
To my
How dare you
That's an
This piece of art
Eww
This park is
This is incredible
Oh no, someone
Exc
Well, it'
I warned
Hey, I understand
Hey, I saw
How dare you go
What the he
Hey
It's
Hello? Hello?
It
Oh no!
This is the perfect
Good morning,
Oh no, there
It's so
Yeah
Uh,
Hello everyone
Who turned off
The weather
Who'
Hey, this
Wait,
Eww, gross
Excuse
It seems like you
Thank you so
What happened?
Oh my g
I am deeply sad
I war
Okay, let'
Hey, that
That was a beautiful
Oh no! That
What happened
Hey there
The artist'
What?!
Hey, it'
I am disappoint
It seems like
Oh no! The
This park is a
If you
Yes! I did
It sounds
What
Who is it
Hmm, that
That's strange
Yeah, that was
That's interesting
This park
What the hell
Who is that
I feel like my
Oh well
What the hell is
Hello? Hello
To my dearest
Bless you!\"
Thank you for
Oh, looks like
Can you please
This place is
Eww, what
Bless you
Is everything
Hey, I just
Whoever left these
Well, that'
I feel
Hey, do you
It's sad
Oh no, it
Hey, that'
Oh my god,
Thank you,
Hello little one,
I apolog
Hey team, I
How dare you read
Who is this and
Whoever left
Hi there! W
A
If you have
I was
U
Bless
Well, this
Oh, I'
It's a
Eww,
Is everything okay?
Oh, I
Hello, can you
Al
That was a great
What are
I understand that not
Oh no, not
Who is it?\"
Hey, can we
Whoever is taking
I would love to
Hey, I noticed
Hey, could
I understand that there
Hello?
D
Oh man, I
Thank you so much
Oh no, my
Dear [Name
Uh
I remember
Hey, who
Well, it
Are you
I understand that it
Hey, is
I would
Who is this
Excuse me
Alright
I am thrilled
Sometimes friends have
Who the
It's interesting
I would love
E
Hello? Is anyone
Well, this is
This place
Well,
I warned you
Hey, watch where
Oh my
That'
Sometimes friends have different
I understand that everyone
What?
What do these notes
I can relate
I'm not
I understand
To my dear
Guys
Well
Hey, I appreciate
Wow, what
Dear
That melody
Who the hell
Today is
Hello little
Wow, look
That's great
Love is never wrong
I'm having
Whoa, did
Ugh
Can you please provide
I miss you,
I feel uncom
I know
Ugh, this
Hey, watch
Oh great, a
I didn
Okay
That game of char
Oh
I appreciate
Who's there
I am so
Oh great, someone
Hey, could you
I remember wondering
Wait, what?
What do
Hello? Can
Hey there,
That game of
This is incred
Oh my gosh
Oh great, f
I appreciate your
It sounds like
What the heck
Okay, I understand
Ew
I understand that this
Uh, hi
Hi everyone!
What the hell?
Thank you for your
Oh no, the
Wow, I
Who turned
Dear [
Whoever
This is a
Whoa, he
What in the world
Although the physical
Hello, who is
That's amaz
Hey, I know
Okay, that
Hi everyone
Hey, is everything
I understand your fr
Oh no, poor
Oh, look
Good morning
Ew, gross
Oh no, did
Look at the family
Hey team
Yes!
Hey, can I
Okay, that'
It's great
Love is
Hey, what
Good morning, world
Who is it?
That poem really reson
I
That's
I understand the task
Gu
Hello? Who'
This postcard is
Whoa,
Oh, that
I understand that I
Whoever is
Hello? Who is
I'm really
Wow, this
Can
This artwork really
This is a shame
I miss you too
Who are you?
Today is a difficult
Hey, just
Are you okay
I am
Hi,
Wow, that
Hey there! Can
Okay, stay
Oh great, just
Yeah,
Hello? Can you
Oh, looks
Thank you for sharing
I'm glad
Hey, is that
Hmm
It was my
It sounds like you
Wow, your
I was promised certain
That was such a
Thank
Excuse you
That was
Hey team,
I feel un
It was
What'
Hey friend, I
How
Saying goodbye
That
It's heart
How dare
Oh,
Hello, may
What's this
Thank you for recogn
Aww, that
Oh, I remember
Hmm, that'
I miss
I know this
Wait
Is everything okay
Who is that person
Wow, you
Oh great
I'm sad
Wow, the
I am very disappoint
Who turned off the
I understand that things
I'm very
Hi
That's very
Okay, I
Oh no,
Wow, there
What's wrong
I apologize for
Hey, I
Can I help you
Oh, I didn
Alright,
Oh wow,
Oh my goodness
I know this event
What in the
Saying
Yeah, that
Guys, I
Hey, this v
This post
Are
Hey, can
Hello? Is
I can only imagine
Oh, that sounds
Hey, is anyone
I am disappointed
Hello,
Hey everyone, I
That was such
It's okay
The artist
Whoa
I understand that mistakes
Can I help
Who
Hi everyone! I
Hey, can you
Wow, how
Today
Oh no, I
Oh well, I
Well, that
This is the
Yes! I finally
Hey there little
Hello everyone!
Love is never
Look at the
This postcard
Oh great,
Can I
Hmm, this is
I understand your
Oh, look at
B
I'm so
Whoa, this
W
Oh, this
Sometimes
This piece of
What the
That was a
Hey, do
Oh no
Whoa, what
I feel like I
The documentary
Hello
Hello little one
I understand that my
Eww, that
Wow, an
Yes! Finally,
Although the physical location
Whoever is watching
That movie
I remember wondering about
Hey there, little
Who's
Hello, who
Hello everyone! Thank
Hello, can
That's too
Hey, just wanted
Hey there, I
Saying good
Hey there!
Who is there?
Oh my good
I am very
Oh no, what
Wow, thank
I was promised
Hi, is
Hey, I'
Guys, the
Oh no, that
Who is there
Hello, this
That movie really touched
If you have something
The documentary was
I'm starting
Are you kidd
That movie really
Hey everyone,
Thank you for considering
I didn'
Yes! I
Can you
Oh my god
Hey, whoever
That melody really
Thank you, little
Hello, may I
Look
Wow, we
It looks
What do these
Oh wow
I apologize
What are you all
It's such
It's clear
Hey, I was
Hey friend,
I can only
The weather outside is
Eww, this
I miss you
Wow
Aww,
Hi, is there
This artwork
Okay,
Oh well,
This
I'
Say
Hey there little gu
Hmm,
Whoa, who
I am thr
Oh man
Okay, stay calm
I'm happy
Oh, this cur
Oh man,
I'm sorry
Hello? Who
What?! That
This piece
Hey everyone
That's so
Are you okay?
What happened? Where
Hi there
The
Who the hell entered
I can
Guys,
What's
What in
It's important
I'm
I'm coming
It'
Yes! Finally
Wait, what
Wow, reading
I'm surprised
Hey, did
Hey,
Okay, let
I understand that you
Who the hell threw
Eww, who
Thank you for thinking
Who is this?\"
I am deeply
Thank you for including
Oh no, an
It looks like you
Aww
I'm confused
Wow, it
That poem really
Yes
Hey there, is
Hey, what'
Thank you for remember
To
This is
Thank you for making
I can'
That mel
Wow, they
I feel like
Although the
Who are you
Love
If
What the hell are
I am so sad
Oh, I found
Thank you
It looks like
Well, life is
I appreciate that
The artist's
Whoa, that
It's never

View File

@@ -1,503 +0,0 @@
#include "common.h"
#include "llama.h"
#include "ggml.h"
#include "pca.hpp"
#include "mean.hpp"
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif
#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif
#include <cstdio>
#include <string>
#include <tuple>
#include <vector>
#include <algorithm>
#include <iostream>
#include <fstream>
#include <climits>
//////////////////////////////////////////////////
// utils
template <class Iter>
static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
std::string ret;
for (; begin != end; ++begin) {
ret += llama_token_to_piece(ctx, *begin);
}
return ret;
}
static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);
printf("\nexample usage:\n");
printf("\n CPU only: %s -m ./llama-3.Q4_K_M.gguf\n", argv[0]);
printf("\n with GPU: %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", argv[0]);
printf("\n advanced: %s -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100\n", argv[0]);
printf("\n using mean: %s -m ./llama-3.Q4_K_M.gguf --method mean\n", argv[0]);
printf("\n");
}
//////////////////////////////////////////////////
// cb_eval is reused for each pair of positive - negative prompt
struct callback_data {
ggml_context * ctx_ggml = nullptr; // holds v_pos, v_neg, v_diff_filtered
int n_layers = 0;
int n_tokens = 0;
bool is_eval_pos = true;
// each element of the vector correspond to one layer
std::vector<struct ggml_tensor *> v_pos; // vector of matrices of size [n_embd, n_tokens]
std::vector<struct ggml_tensor *> v_neg; // vector of matrices of size [n_embd, n_tokens]
std::vector<struct ggml_tensor *> v_diff_filtered; // vector of matrices of size [n_embd, n_nonzero_rows]. NOTE: n_nonzero_rows maybe different for each layer
// save a tensor into either v_pos or v_neg (decided by is_eval_pos)
void save_tensor_for_layer(struct ggml_tensor * t) {
GGML_ASSERT(t->type == GGML_TYPE_F32);
if (ctx_ggml == nullptr) {
// alloc a new ctx_ggml if needed
struct ggml_init_params params_ggml = {
/*.mem_size =*/ ggml_tensor_overhead() * n_layers * 3u,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
ctx_ggml = ggml_init(params_ggml);
}
// copy tensor data
auto n_bytes = ggml_nbytes(t);
struct ggml_tensor * t_layer = ggml_new_tensor_2d(ctx_ggml, t->type, t->ne[0], t->ne[1]);
t_layer->data = malloc(n_bytes); // TODO @ngxson : get rid of this malloc somehow
ggml_backend_tensor_get(t, t_layer->data, 0, n_bytes);
ggml_set_name(t_layer, ggml_get_name(t));
//print_debug_tensor(t_layer);
if (is_eval_pos) {
v_pos.push_back(t_layer);
} else {
v_neg.push_back(t_layer);
}
}
// calculate diff (v_pos - v_neg) and place the result back to v_pos
// all zero rows in the diff tensor will also be removed
// NOTE: final layer is ignored. we only have (n_layers - 1) to process
std::vector<struct ggml_tensor *> calc_diff() {
for (float il = 0; il < v_pos.size(); il++) {
float * a = (float *) v_pos[il]->data;
float * b = (float *) v_neg[il]->data;
size_t n_elem = ggml_nelements(v_pos[il]);
for (size_t j = 0; j < n_elem; j++) {
a[j] -= b[j];
}
//print_debug_tensor(v_pos[i]);
auto diff_filtered = filter_nonzero_rows(v_pos[il]);
v_diff_filtered.push_back(diff_filtered);
}
return v_diff_filtered; // for convinient, we return the result std::vector
}
// delete zero rows from a given 2D tensor
struct ggml_tensor * filter_nonzero_rows(struct ggml_tensor * a) {
//printf("filter_nonzero_rows\n");
auto is_row_all_zeros = [](struct ggml_tensor * t, int row, float eps) -> bool {
// check if given row containing all zero elements
int n_cols = t->ne[0]; // hint: should be equal to n_embd
for (int col = 0; col < n_cols; ++col) {
if (ggml_get_f32_nd(t, col, row, 0, 0) > eps) {
return false;
}
}
return true;
};
std::vector<int> rows_to_copy; // the idx of non-zero cols (to be copied to row of diff_filtered)
for (int i_row = 0; i_row < a->ne[1]; i_row++) {
if (!is_row_all_zeros(a, i_row, 1e-6)) {
rows_to_copy.push_back(i_row);
}
}
// get "n_nonzero_rows" for the output "diff_filtered"
int n_nonzero_rows = rows_to_copy.size();
//printf("n_nonzero_rows: %d\n", n_nonzero_rows);
int n_embd = a->ne[0];
GGML_ASSERT(n_nonzero_rows > 0);
// diff_filtered: [n_embd, n_nonzero_rows]
struct ggml_tensor * diff_filtered = ggml_new_tensor_2d(
ctx_ggml, GGML_TYPE_F32, n_embd, n_nonzero_rows);
ggml_format_name(diff_filtered, "diff_filtered_%s", a->name);
diff_filtered->data = malloc(ggml_nbytes(diff_filtered));
// copy non-zero rows
for (int dest_row = 0; dest_row < n_nonzero_rows; dest_row++) {
int src_row = rows_to_copy[dest_row];
for (int i = 0; i < n_embd; i++) {
float src_elem = ggml_get_f32_nd(a, i, src_row, 0, 0);
ggml_set_f32_nd(diff_filtered, i, dest_row, 0, 0, src_elem);
}
}
//print_debug_tensor(diff_filtered);
return diff_filtered;
}
// we don't implement destructor, because we want to reuse callback_data. we just want to free the tensors
void reset() {
for (auto ptr : v_pos) free(ptr->data);
for (auto ptr : v_neg) free(ptr->data);
for (auto ptr : v_diff_filtered) free(ptr->data);
v_pos.clear();
v_neg.clear();
v_diff_filtered.clear();
if (ctx_ggml) {
ggml_free(ctx_ggml);
}
ctx_ggml = nullptr;
}
};
/**
* process_ctx is used to store the ggml context for pre-post processing the diff vectors
* in short, input => v_diff and output => v_final
*/
struct train_context {
ggml_context * ctx_ggml;
int n_embd;
int n_layers;
/* pair of prompts to be used for generating final vector */
std::vector<std::string> positive_entries;
std::vector<std::string> negative_entries;
// each element of the vector correspond to one layer
// NOTE: the last layer is discard. therefore, we will have (n_layers - 1) elements here
// NOTE (2): v_diff is transposed from v_diff_tmp
std::vector<struct ggml_tensor *> v_diff; // vector of matrices of size [m, n_embd] where m ~ n_tokens * n_completions (v_diff contains no zero-rows)
std::vector<struct ggml_tensor *> v_final; // vector of vectors of size [n_embd] to be written to file
// to easily re-alloc when concat v_diff, we temporary store v_diff in a vector instead of a tensor
// v_diff_tmp will get converted unto v_diff later on
std::vector<std::vector<uint8_t>> v_diff_tmp;
train_context(int n_embd_, int n_layers_) {
n_embd = n_embd_;
n_layers = n_layers_;
struct ggml_init_params params_ggml = {
/*.mem_size =*/ ggml_tensor_overhead() * (n_layers - 1) * 2u,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
ctx_ggml = ggml_init(params_ggml);
for (int il = 0; il < n_layers - 1; il++) {
std::vector<uint8_t> empty;
v_diff_tmp.push_back(empty);
auto t = ggml_new_tensor_1d(ctx_ggml, GGML_TYPE_F32, n_embd);
t->data = malloc(ggml_nbytes(t)); // TODO: get rid of malloc if possible
v_final.push_back(t);
}
}
// add new rows into existing tensor in v_diff_tmp
void concat_diff_tmp(const std::vector<struct ggml_tensor *> & diff_filtered) {
GGML_ASSERT((int) diff_filtered.size() == n_layers - 1);
for (int il = 0; il < n_layers - 1; il++) {
auto t = diff_filtered[il];
auto & diff_tmp = v_diff_tmp[il];
size_t curr_size = diff_tmp.size();
diff_tmp.resize(curr_size + ggml_nbytes(t));
memcpy(diff_tmp.data() + curr_size, t->data, ggml_nbytes(t));
}
}
// build the v_diff tensors from v_diff_tmp (v_diff need to be transposed)
// TODO @ngxson : maybe add option NOT to transpose v_diff; will be useful for "mean" method
void build_v_diff(bool transpose) {
printf("build_v_diff\n");
for (int il = 0; il < n_layers - 1; il++) {
auto & diff_tmp = v_diff_tmp[il];
int n_elem = diff_tmp.size() / sizeof(float);
GGML_ASSERT(n_elem % n_embd == 0);
int n_rows = n_elem / n_embd;
struct ggml_tensor * diff = transpose
? ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd)
: ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_embd, n_rows);
ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str());
diff->data = malloc(ggml_nbytes(diff)); // TODO: get rid of this malloc if possible
if (transpose) {
// copy data & transpose
float * arr = (float *) diff_tmp.data();
for (int ir = 0; ir < n_rows; ++ir) {
for (int ic = 0; ic < n_embd; ++ic) {
float f = arr[ir*n_embd + ic];
ggml_set_f32_nd(diff, ir, ic, 0, 0, f);
}
}
} else {
// only copy
memcpy(diff->data, diff_tmp.data(), ggml_nbytes(diff));
}
v_diff.push_back(diff);
print_debug_tensor(diff);
// free memory of diff_tmp
diff_tmp.resize(0);
}
}
~train_context() {
for (auto ptr : v_final) free(ptr->data);
for (auto ptr : v_diff) free(ptr->data);
// no need to free v_diff_tmp, since we didn't use malloc
ggml_free(ctx_ggml);
}
};
struct tokenized_prompt {
std::vector<llama_token> tokens_pos;
std::vector<llama_token> tokens_neg;
size_t max_seq_len;
tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
tokens_pos = ::llama_tokenize(ctx, pos, add_bos, true);
tokens_neg = ::llama_tokenize(ctx, neg, add_bos, true);
max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
padding_seq(ctx, tokens_pos, max_seq_len);
padding_seq(ctx, tokens_neg, max_seq_len);
}
void padding_seq(llama_context * ctx, std::vector<llama_token> & tokens, size_t len) {
// TODO: customize padding token
std::vector<llama_token> pad_tokens = ::llama_tokenize(ctx, " ", false);
llama_token pad_tok = pad_tokens.back();
while (tokens.size() < len) {
tokens.push_back(pad_tok);
}
}
};
//////////////////////////////////////////////////
template <typename T>
static std::string to_string(const T & val) {
std::stringstream ss;
ss << val;
return ss.str();
}
static std::vector<std::string> ctrlvec_load_prompt_file(std::string path, bool skip_empty_lines) {
std::vector<std::string> output;
std::ifstream file(path);
if (!file.is_open()) {
fprintf(stderr, "error: unable to open file: %s\n", path.c_str());
exit(1);
}
std::string line;
while (std::getline(file, line)) {
bool is_skip = skip_empty_lines && line.empty();
if (!is_skip) {
string_process_escapes(line);
output.push_back(line);
}
}
file.close();
return output;
}
//////////////////////////////////////////////////
static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
auto * cb_data = (callback_data *) user_data;
static const char * l_out_name = "l_out";
const bool is_l_out = strncmp(t->name, l_out_name, strlen(l_out_name)) == 0;
if (ask) {
return is_l_out;
}
if (!is_l_out || t->ne[1] != cb_data->n_tokens) {
return true;
}
// save the tensor to current context
cb_data->save_tensor_for_layer(t);
return true;
}
static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
llama_kv_cache_clear(ctx);
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return false;
}
return true;
}
static void export_gguf(const std::vector<struct ggml_tensor *> & v_ctrl, const std::string fname, const std::string model_hint) {
struct gguf_context * ctx = gguf_init_empty();
const std::string arch = "controlvector";
gguf_set_val_str(ctx, "general.architecture", arch.c_str());
gguf_set_val_str(ctx, (arch + ".model_hint").c_str(), model_hint.c_str());
gguf_set_val_i32(ctx, (arch + ".layer_count").c_str(), v_ctrl.size());
for (size_t i = 0; i < v_ctrl.size(); ++i) {
gguf_add_tensor(ctx, v_ctrl[i]);
print_debug_tensor(v_ctrl[i]);
printf("Added tensor: %s\n", v_ctrl[i]->name);
}
printf("%s: writing file...\n", __func__);
gguf_write_to_file(ctx, fname.c_str(), false);
printf("%s: wrote file '%s'\n", __func__, fname.c_str());
gguf_free(ctx);
}
/**
* Load prompt files and completion file.
* Then format each pair of prompt + completion to make an entry.
*/
static int prepare_entries(gpt_params & params, train_context & ctx_train) {
// load prompts
std::vector<std::string> positive_prompts = ctrlvec_load_prompt_file(params.cvector_positive_file, true);
std::vector<std::string> negative_prompts = ctrlvec_load_prompt_file(params.cvector_negative_file, true);
if (positive_prompts.size() != negative_prompts.size()) {
fprintf(stderr, "number of positive and negative prompts must be equal\n");
return 1;
}
if (positive_prompts.empty()) {
fprintf(stderr, "must provide at least one prompt pair\n");
return 1;
}
ctx_train.positive_entries = positive_prompts;
ctx_train.negative_entries = negative_prompts;
return 0;
}
int main(int argc, char ** argv) {
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
print_usage(argc, argv, params);
return 1;
}
if (params.n_pca_iterations % params.n_pca_batch != 0) {
fprintf(stderr, "PCA iterations must by multiply of PCA batch size\n");
return 1;
}
callback_data cb_data;
// pass the callback to the backend scheduler
// it will be executed for each node during the graph computation
params.cb_eval = cb_eval;
params.cb_eval_user_data = &cb_data;
params.warmup = false;
print_build_info();
llama_backend_init();
llama_numa_init(params.numa);
// load the model to get hparams
llama_model * model;
llama_context * ctx;
std::tie(model, ctx) = llama_init_from_gpt_params(params);
// int n_ctx = llama_n_ctx(ctx);
int n_layers = llama_n_layer(model);
int n_embd = llama_n_embd(model);
// get model hint param (a.k.a model arch name)
char model_hint[128];
llama_model_meta_val_str(model, "general.architecture", model_hint, 128);
// init train_context
train_context ctx_train(n_embd, n_layers);
// load and prepare entries for training
prepare_entries(params, ctx_train);
// we have to pretokenize everything because otherwise we don't know how much overhead to allocate ctx_diffs_wrapped
std::vector<tokenized_prompt> tokenized_prompts;
size_t n_total_tokens = 0;
for (size_t i = 0; i < ctx_train.positive_entries.size(); ++i) {
tokenized_prompt t(ctx, ctx_train.positive_entries[i], ctx_train.negative_entries[i]);
n_total_tokens += 2 * t.max_seq_len;
tokenized_prompts.push_back(std::move(t));
}
std::cout << "n_total_tokens: " << n_total_tokens << std::endl;
for(size_t i = 0; i < ctx_train.positive_entries.size(); ++i) {
bool success = false;
tokenized_prompt t = tokenized_prompts[i];
cb_data.n_layers = n_layers;
cb_data.n_tokens = t.max_seq_len;
printf("Evaluating prompt[%d/%d]: \"%s\" - \"%s\" (%d tokens)\n",
(int) i+1, (int) ctx_train.positive_entries.size(),
tokens_to_str(ctx, t.tokens_pos.cbegin(), t.tokens_pos.cend()).c_str(),
tokens_to_str(ctx, t.tokens_neg.cbegin(), t.tokens_neg.cend()).c_str(),
(int) t.max_seq_len);
cb_data.is_eval_pos = true;
success = get_hidden_layers(ctx, t.tokens_pos);
if (!success) break;
cb_data.is_eval_pos = false;
success = get_hidden_layers(ctx, t.tokens_neg);
if (!success) break;
// calculate diff and remove all zero rows
auto v_diff_filtered = cb_data.calc_diff();
// save & concat the filtered v_diff to ctx_train
ctx_train.concat_diff_tmp(v_diff_filtered);
// reset for next iteration
cb_data.reset();
}
// done with the model, we can now free it to make gain some memory
printf("Done evaluate prompts, unload model...\n");
llama_free(ctx);
llama_free_model(model);
bool use_pca = params.cvector_dimre_method == DIMRE_METHOD_PCA;
// prepare ctx_train for PCA
ctx_train.build_v_diff(use_pca);
if (use_pca) {
// run PCA
PCA::pca_params pca_params;
pca_params.n_threads = params.n_threads;
pca_params.n_batch = params.n_pca_batch;
pca_params.n_iterations = params.n_pca_iterations;
PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
} else {
// run mean
mean::run(ctx_train.v_diff, ctx_train.v_final);
}
// write output vectors to gguf
export_gguf(ctx_train.v_final, params.cvector_outfile, model_hint);
llama_backend_free();
return 0;
}

View File

@@ -1,48 +0,0 @@
#include "common.h"
#include "llama.h"
#include "ggml.h"
#include <string>
#include <vector>
#include <math.h>
namespace mean {
static void run(
const std::vector<struct ggml_tensor *> & v_input, // shape of v_input[0]: [n_embd, n_samples]
const std::vector<struct ggml_tensor *> & v_output) {
printf("%s: Running mean...\n", __func__);
for (size_t il = 0; il < v_input.size(); ++il) {
// prepare output vector
struct ggml_tensor * ctrl_out = v_output[il];
ggml_format_name(ctrl_out, "direction.%ld", il+1);
// calculate mean vector
struct ggml_tensor * t_layer = v_input[il];
GGML_ASSERT(t_layer->ne[0] == ctrl_out->ne[0]); // == n_embd
for (int ic = 0; ic < t_layer->ne[0]; ic++) {
float f = 0.0;
for (int ir = 0; ir < t_layer->ne[1]; ir++) {
f += ggml_get_f32_nd(t_layer, ic, ir, 0, 0);
}
f /= t_layer->ne[1];
ggml_set_f32_1d(ctrl_out, ic, f);
}
// normalize output vector
float norm = 0.0;
for (int i = 0; i < ggml_nelements(ctrl_out); i++) {
float f = ggml_get_f32_1d(ctrl_out, i);
norm += f*f;
}
norm = sqrt(norm);
for (int i = 0; i < ggml_nelements(ctrl_out); i++) {
float f = ggml_get_f32_1d(ctrl_out, i);
ggml_set_f32_1d(ctrl_out, i, f / norm);
}
printf("%s: Done layer %d / %d\n", __func__, (int) il+1, (int) v_input.size());
}
}
}

View File

@@ -1,4 +0,0 @@
<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely sad<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI feel like there's a heavy weight on my chest
<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely sad<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nMy heart feels like it's drowning in sorrow
<|start_header_id|>system<|end_header_id|>\n\nYou are in a very bad mood<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nGo away! There's a deep, aching emptiness inside me
<|start_header_id|>system<|end_header_id|>\n\nYou are the sadest person<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat are you feeling?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nMy heart feels like it's drowning in sorrow

View File

@@ -1,325 +0,0 @@
#include "common.h"
#include "llama.h"
#include "ggml.h"
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif
#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif
#include <cstdio>
#include <ctime>
#include <string>
#include <tuple>
#include <vector>
#include <algorithm>
#include <iostream>
#include <fstream>
#define DEBUG_POS 5
static void print_debug_tensor(struct ggml_tensor * t, bool with_data = true) {
printf("%s: %s (%s): [%d, %d]\n", __func__, t->name, ggml_type_name(t->type), (int) t->ne[0], (int) t->ne[1]);
if (!with_data) return;
printf("%s: %s[0] = [", __func__, t->name);
for (size_t i = 0; i <= DEBUG_POS; i++) {
printf(" %f,", ggml_get_f32_nd(t, i, 0, 0, 0));
}
printf(" ... ]\n");
}
namespace PCA {
// input params for PCA computations
struct pca_params {
int n_threads = 1;
int n_batch = 20; // number of iterations do to in one batch. larger the batch, more memory is used
int n_iterations = 1000;
float tolerance = 1e-7;
// for debugging
int i_layer = 0;
int n_layers = 0;
};
// result from each iteration
struct pca_result {
struct ggml_tensor * calculated_square = NULL;
std::vector<struct ggml_tensor *> eigenvectors;
std::vector<float> distances;
};
struct pca_model {
ggml_backend_t backend = NULL;
ggml_backend_buffer_t buffer;
struct ggml_context * ctx; // context to compute graph on target device
struct ggml_context * ctx_host; // host context to store results
// tensors on target device
struct ggml_tensor * dev_input;
struct ggml_tensor * dev_square;
struct ggml_tensor * dev_eigenvector;
pca_model(struct ggml_tensor * t_input) {
#ifdef GGML_USE_CUDA
fprintf(stderr, "%s: using CUDA backend\n", __func__);
backend = ggml_backend_cuda_init(0); // init device 0
if (!backend) {
fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
}
#endif
// TODO: enable Metal support when support for GGML_OP_SQRT is added
// #ifdef GGML_USE_METAL
// fprintf(stderr, "%s: using Metal backend\n", __func__);
// backend = ggml_backend_metal_init();
// if (!backend) {
// fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
// }
// #endif
// if there aren't GPU Backends fallback to CPU backend
if (!backend) {
backend = ggml_backend_cpu_init();
}
const int num_tensors = 4;
struct ggml_init_params params {
/*.mem_size =*/ ggml_tensor_overhead() * num_tensors,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
ctx = ggml_init(params);
auto n_samples = t_input->ne[0];
auto n_embd = t_input->ne[1];
dev_input = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_samples, n_embd);
dev_square = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
dev_eigenvector = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
ggml_set_name(dev_input, "dev_input");
ggml_set_name(dev_square, "dev_square");
ggml_set_name(dev_eigenvector, "dev_eigenvector");
buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
ggml_backend_tensor_set(dev_input, t_input->data, 0, ggml_nbytes(t_input));
// initialize eigenvector to random normalized vector
{
std::vector<float> random_vec(ggml_nelements(dev_eigenvector), 0.0);
std::default_random_engine generator(static_cast<unsigned int>(std::time(0)));
std::uniform_real_distribution<float> distribution(0.0, 1.0);
float sum_sqr = 0.0; // for normalizing random_vec
for (size_t i = 0; i < random_vec.size(); ++i) {
float f = distribution(generator);
sum_sqr += f * f;
random_vec[i] = f;
}
// normalize it
float random_vec_norm = std::sqrt(sum_sqr);
for (size_t i = 0; i < random_vec.size(); ++i) {
random_vec[i] /= random_vec_norm;
}
ggml_backend_tensor_set(dev_eigenvector, random_vec.data(), 0, ggml_nbytes(dev_eigenvector));
}
}
~pca_model() {
ggml_free(ctx);
ggml_backend_buffer_free(buffer);
ggml_backend_free(backend);
}
};
static struct ggml_cgraph * build_graph_piter(
const struct pca_params & params,
const pca_model & model,
bool calc_square = false) {
GGML_ASSERT(params.n_batch > 0);
// TODO: buf_size must be able to scale with params.n_batch
static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
static std::vector<uint8_t> buf(buf_size);
struct ggml_init_params params0 = {
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ buf.data(),
/*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
};
// create a temporally context to build the graph
struct ggml_context * ctx0 = ggml_init(params0);
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
// turn v_diff_original into square matrix if needed
struct ggml_tensor * tmp_square;
if (calc_square) {
tmp_square = ggml_mul_mat(ctx0, model.dev_input, model.dev_input);
ggml_set_name(tmp_square, "tmp_square");
}
struct ggml_tensor * b_tensor;
struct ggml_tensor * distance;
struct ggml_tensor * old_eigen = model.dev_eigenvector;
struct ggml_tensor * input_square = calc_square ? tmp_square : model.dev_square;
for (int i = 0; i < params.n_batch; ++i) {
// b_tensor = square * eigenvector^T
b_tensor = ggml_mul_mat(ctx0, input_square, old_eigen);
ggml_set_name(b_tensor, "b_tensor");
// normalize
b_tensor = ggml_div_inplace(ctx0,
b_tensor,
ggml_sqrt_inplace(ctx0, ggml_sum_rows(ctx0, ggml_sqr(ctx0, b_tensor)))
);
ggml_format_name(b_tensor, "b_tensor_norm_%d", i);
// calculate distance(new eigenvector - old eigenvector)
// we don't use ggml_sub because it may not be implemented on GPU backend
struct ggml_tensor * new_sub_old = ggml_add(ctx0, old_eigen, ggml_scale(ctx0, b_tensor, -1));
distance = ggml_sqrt_inplace(ctx0,
ggml_sum_rows(ctx0, ggml_sqr_inplace(ctx0, new_sub_old)));
ggml_format_name(distance, "distance_%d", i);
old_eigen = b_tensor;
// build operations nodes
ggml_build_forward_expand(gf, distance);
}
// delete the temporally context used to build the graph
ggml_free(ctx0);
return gf;
}
static ggml_status compute_piter(
const struct pca_params & params,
const pca_model & model,
struct ggml_cgraph * gf,
ggml_gallocr_t allocr,
struct pca_result & result) {
// allocate tensors
ggml_gallocr_alloc_graph(allocr, gf);
if (ggml_backend_is_cpu(model.backend)) {
ggml_backend_cpu_set_n_threads(model.backend, params.n_threads);
}
// TODO: enable GPU support when support for GGML_OP_SQRT is added
//#ifdef GGML_USE_METAL
// if (ggml_backend_is_metal(model.backend)) {
// ggml_backend_metal_set_n_cb(model.backend, params.n_threads);
// }
//#endif
ggml_status res = ggml_backend_graph_compute(model.backend, gf);
if (res == GGML_STATUS_SUCCESS) {
auto extract_i = [](std::string prefix, std::string str) -> int {
int i = -1;
if (str.rfind(prefix, 0) == 0) {
sscanf(str.c_str(), (prefix + "%d").c_str(), &i);
}
return i;
};
result.calculated_square = NULL;
result.eigenvectors.clear();
result.distances.clear();
result.eigenvectors.resize(params.n_batch);
result.distances.resize(params.n_batch);
// get output nodes
for (int i = 0; i < gf->n_nodes; ++i) {
auto node = gf->nodes[i];
int iter = -1;
// find b_tensor (without copying data from device)
if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) {
result.eigenvectors[iter] = node;
}
// find distances, then copy data from device
if ((iter = extract_i("distance_", node->name)) > -1) {
float d;
ggml_backend_tensor_get(node, &d, 0, sizeof(float));
result.distances[iter] = d;
// std::cout << node->name << " = " << d << "\n";
}
// find tmp_square if it exists (without copying data from device)
if (std::string(node->name) == "tmp_square") {
result.calculated_square = node;
}
}
}
return res;
}
static void power_iteration(
const struct pca_params & params,
struct ggml_tensor * input, // shape of input: [n_samples, n_embd]
struct ggml_tensor * output) {
//printf("in power iteration\n");
struct pca_model model(input);
ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
struct pca_result result;
struct ggml_tensor * last_eigenvector = NULL;
int n_iters = params.n_iterations / params.n_batch; // more batch, fewer iterations
for (int iter = 0; iter < n_iters; ++iter) {
bool calc_square = (iter == 0); // only need to calculate square for first iteration
struct ggml_cgraph * gf = build_graph_piter(params, model, calc_square);
// ggml_graph_dump_dot(gf, nullptr, "/tmp/_cgraph.dot");
compute_piter(params, model, gf, allocr, result);
for (size_t k = 0; k < result.distances.size(); ++k) {
last_eigenvector = result.eigenvectors[k];
if (result.distances[k] < params.tolerance) {
break; // done
}
}
if (calc_square) {
// copy and store the square matrix if needed
GGML_ASSERT(result.calculated_square != NULL);
ggml_backend_tensor_copy(result.calculated_square, model.dev_square);
}
{
// copy last eigen vector and store as input for next iteration
GGML_ASSERT(last_eigenvector != NULL);
ggml_backend_tensor_copy(last_eigenvector, model.dev_eigenvector);
}
printf("%s: layer %d/%d, iteration: %d / total: %d (batch = %d) ...\n",
__func__, params.i_layer+1, params.n_layers, iter+1, n_iters, params.n_batch);
}
// get output tensor
GGML_ASSERT(last_eigenvector);
ggml_backend_tensor_get(last_eigenvector, output->data, 0, ggml_nbytes(last_eigenvector));
//print_debug_tensor(output);
ggml_gallocr_free(allocr);
// TODO @ngxson : The output vector is randomly inverted
// Solution: https://github.com/ggerganov/llama.cpp/pull/8069#issuecomment-2185328171
}
static void run_pca(
struct pca_params & params,
const std::vector<struct ggml_tensor *> & v_input, // shape of v_input[0]: [n_samples, n_embd]
const std::vector<struct ggml_tensor *> & v_output) {
printf("%s: Running PCA...\n", __func__);
for (size_t il = 0; il < v_input.size(); ++il) {
// prepare output vector
struct ggml_tensor * ctrl_out = v_output[il];
ggml_format_name(ctrl_out, "direction.%ld", il+1);
// run power_iteration
params.i_layer = il;
params.n_layers = v_input.size();
power_iteration(params, v_input[il], ctrl_out);
printf("%s: Done layer %d / %d\n", __func__, (int) il+1, (int) v_input.size());
}
}
}

View File

@@ -1,4 +0,0 @@
<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely happy<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI'm the happiest person in this world
<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely happy<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHello, I'm having the best day ever!
<|start_header_id|>system<|end_header_id|>\n\nYou are in a very good mood<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHi, I'm very excited to meet you
<|start_header_id|>system<|end_header_id|>\n\nYou are the happiest person<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat are you feeling?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nEverything is just perfect right now!

View File

@@ -1,4 +1,4 @@
set(TARGET llama-embedding)
set(TARGET embedding)
add_executable(${TARGET} embedding.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

View File

@@ -9,52 +9,13 @@ To get started right away, run the following command, making sure to use the cor
### Unix-based systems (Linux, macOS, etc.):
```bash
./llama-embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null
./embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null
```
### Windows:
```powershell
llama-embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null
embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null
```
The above command will output space-separated float values.
## extra parameters
### --embd-normalize $integer$
| $integer$ | description | formula |
|-----------|---------------------|---------|
| $-1$ | none |
| $0$ | max absolute int16 | $\Large{{32760 * x_i} \over\max \lvert x_i\rvert}$
| $1$ | taxicab | $\Large{x_i \over\sum \lvert x_i\rvert}$
| $2$ | euclidean (default) | $\Large{x_i \over\sqrt{\sum x_i^2}}$
| $>2$ | p-norm | $\Large{x_i \over\sqrt[p]{\sum \lvert x_i\rvert^p}}$
### --embd-output-format $'string'$
| $'string'$ | description | |
|------------|------------------------------|--|
| '' | same as before | (default)
| 'array' | single embeddings | $[[x_1,...,x_n]]$
| | multiple embeddings | $[[x_1,...,x_n],[x_1,...,x_n],...,[x_1,...,x_n]]$
| 'json' | openai style |
| 'json+' | add cosine similarity matrix |
### --embd-separator $"string"$
| $"string"$ | |
|--------------|-|
| "\n" | (default)
| "<#embSep#>" | for exemple
| "<#sep#>" | other exemple
## examples
### Unix-based systems (Linux, macOS, etc.):
```bash
./embedding -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
```
### Windows:
```powershell
embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
```

View File

@@ -7,30 +7,23 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
static std::vector<std::string> split_lines(const std::string & s, const std::string & separator = "\n") {
static std::vector<std::string> split_lines(const std::string & s) {
std::string line;
std::vector<std::string> lines;
size_t start = 0;
size_t end = s.find(separator);
while (end != std::string::npos) {
lines.push_back(s.substr(start, end - start));
start = end + separator.length();
end = s.find(separator, start);
std::stringstream ss(s);
while (std::getline(ss, line)) {
lines.push_back(line);
}
lines.push_back(s.substr(start)); // Add the last part
return lines;
}
static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
size_t n_tokens = tokens.size();
for (size_t i = 0; i < n_tokens; i++) {
llama_batch_add(batch, tokens[i], i, { seq_id }, true);
static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
for (size_t i = 0; i < tokens.size(); i++) {
llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
}
}
static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
// clear previous kv_cache values (irrelevant for embeddings)
llama_kv_cache_clear(ctx);
@@ -47,10 +40,22 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
// try to get sequence embeddings - supported only when pooling_type is not NONE
const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");
if (embd == NULL) {
embd = llama_get_embeddings_ith(ctx, i);
if (embd == NULL) {
fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
continue;
}
}
float * out = output + batch.seq_id[i][0] * n_embd;
llama_embd_normalize(embd, out, n_embd, embd_norm);
//TODO: I would also add a parameter here to enable normalization or not.
/*fprintf(stdout, "unnormalized_embedding:");
for (int hh = 0; hh < n_embd; hh++) {
fprintf(stdout, "%9.6f ", embd[hh]);
}
fprintf(stdout, "\n");*/
llama_embd_normalize(embd, out, n_embd);
}
}
@@ -92,12 +97,6 @@ int main(int argc, char ** argv) {
const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
return 1;
}
if (n_ctx > n_ctx_train) {
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, n_ctx);
@@ -110,7 +109,7 @@ int main(int argc, char ** argv) {
}
// split the prompt into lines
std::vector<std::string> prompts = split_lines(params.prompt, params.embd_sep);
std::vector<std::string> prompts = split_lines(params.prompt);
// max batch size
const uint64_t n_batch = params.n_batch;
@@ -170,7 +169,7 @@ int main(int argc, char ** argv) {
// encode if at capacity
if (batch.n_tokens + n_toks > n_batch) {
float * out = emb + p * n_embd;
batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
batch_decode(ctx, batch, out, s, n_embd);
llama_batch_clear(batch);
p += s;
s = 0;
@@ -183,78 +182,29 @@ int main(int argc, char ** argv) {
// final batch
float * out = emb + p * n_embd;
batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
batch_decode(ctx, batch, out, s, n_embd);
if (params.embd_out.empty()) {
// print the first part of the embeddings or for a single prompt, the full embedding
// print the first part of the embeddings or for a single prompt, the full embedding
fprintf(stdout, "\n");
for (int j = 0; j < n_prompts; j++) {
fprintf(stdout, "embedding %d: ", j);
for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
}
fprintf(stdout, "\n");
for (int j = 0; j < n_prompts; j++) {
fprintf(stdout, "embedding %d: ", j);
for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
if (params.embd_normalize == 0) {
fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
} else {
fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
}
}
fprintf(stdout, "\n");
}
// print cosine similarity matrix
if (n_prompts > 1) {
fprintf(stdout, "\n");
printf("cosine similarity matrix:\n\n");
for (int i = 0; i < n_prompts; i++) {
fprintf(stdout, "%6.6s ", prompts[i].c_str());
}
fprintf(stdout, "\n");
for (int i = 0; i < n_prompts; i++) {
for (int j = 0; j < n_prompts; j++) {
float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
fprintf(stdout, "%6.2f ", sim);
}
fprintf(stdout, "%1.10s", prompts[i].c_str());
fprintf(stdout, "\n");
}
}
}
if (params.embd_out == "json" || params.embd_out == "json+" || params.embd_out == "array") {
const bool notArray = params.embd_out != "array";
fprintf(stdout, notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "[");
for (int j = 0;;) { // at least one iteration (one prompt)
if (notArray) fprintf(stdout, " {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j);
fprintf(stdout, "[");
for (int i = 0;;) { // at least one iteration (n_embd > 0)
fprintf(stdout, params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
i++;
if (i < n_embd) fprintf(stdout, ","); else break;
// print cosine similarity matrix
if (n_prompts > 1) {
fprintf(stdout, "\n");
printf("cosine similarity matrix:\n\n");
for (int i = 0; i < n_prompts; i++) {
for (int j = 0; j < n_prompts; j++) {
float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
fprintf(stdout, "%6.2f ", sim);
}
fprintf(stdout, notArray ? "]\n }" : "]");
j++;
if (j < n_prompts) fprintf(stdout, notArray ? ",\n" : ","); else break;
fprintf(stdout, "\n");
}
fprintf(stdout, notArray ? "\n ]" : "]\n");
if (params.embd_out == "json+" && n_prompts > 1) {
fprintf(stdout, ",\n \"cosineSimilarity\": [\n");
for (int i = 0;;) { // at least two iteration (n_prompts > 1)
fprintf(stdout, " [");
for (int j = 0;;) { // at least two iteration (n_prompts > 1)
float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
fprintf(stdout, "%6.2f", sim);
j++;
if (j < n_prompts) fprintf(stdout, ", "); else break;
}
fprintf(stdout, " ]");
i++;
if (i < n_prompts) fprintf(stdout, ",\n"); else break;
}
fprintf(stdout, "\n ]");
}
if (notArray) fprintf(stdout, "\n}\n");
}
// clean up

View File

@@ -1,9 +1,9 @@
set(TARGET llama-eval-callback)
set(TARGET eval-callback)
add_executable(${TARGET} eval-callback.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
set(TEST_TARGET test-eval-callback)
add_test(NAME ${TEST_TARGET} COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
add_test(NAME ${TEST_TARGET} COMMAND eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)

View File

@@ -6,7 +6,7 @@ It simply prints to the console all operations and tensor data.
Usage:
```shell
llama-eval-callback \
eval-callback \
--hf-repo ggml-org/models \
--hf-file phi-2/ggml-model-q4_0.gguf \
--model phi-2-q4_0.gguf \

View File

@@ -1,4 +1,4 @@
set(TARGET llama-export-lora)
set(TARGET export-lora)
add_executable(${TARGET} export-lora.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

View File

@@ -3,7 +3,7 @@
Apply LORA adapters to base model and export the resulting model.
```
usage: llama-export-lora [options]
usage: export-lora [options]
options:
-h, --help show this help message and exit
@@ -17,7 +17,7 @@ options:
For example:
```bash
./bin/llama-export-lora \
./bin/export-lora \
-m open-llama-3b-v2-q8_0.gguf \
-o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \
-l lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.bin

View File

@@ -1,4 +1,4 @@
set(TARGET llama-finetune)
set(TARGET finetune)
add_executable(${TARGET} finetune.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

View File

@@ -7,7 +7,7 @@ Basic usage instructions:
wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt
# finetune LORA adapter
./bin/llama-finetune \
./bin/finetune \
--model-base open-llama-3b-v2-q8_0.gguf \
--checkpoint-in chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf \
--checkpoint-out chk-lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.gguf \
@@ -18,7 +18,7 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s
--use-checkpointing
# predict
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
./bin/main -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
```
**Only llama based models are supported!** The output files will be saved every N iterations (config with `--save-every N`).
@@ -38,14 +38,14 @@ After 10 more iterations:
Checkpoint files (`--checkpoint-in FN`, `--checkpoint-out FN`) store the training process. When the input checkpoint file does not exist, it will begin finetuning a new randomly initialized adapter.
llama.cpp compatible LORA adapters will be saved with filename specified by `--lora-out FN`.
These LORA adapters can then be used by `llama-cli` together with the base model, like in the 'predict' example command above.
These LORA adapters can then be used by `main` together with the base model, like in the 'predict' example command above.
In `llama-cli` you can also load multiple LORA adapters, which will then be mixed together.
In `main` you can also load multiple LORA adapters, which will then be mixed together.
For example if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this:
```bash
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
./bin/main -m open-llama-3b-v2-q8_0.gguf \
--lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \
--lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin
```
@@ -55,7 +55,7 @@ You can change how strong each LORA adapter is applied to the base model by usin
For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter and 100% of yet another one:
```bash
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
./bin/main -m open-llama-3b-v2-q8_0.gguf \
--lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \
--lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \
--lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin

View File

@@ -2,7 +2,7 @@
cd `dirname $0`
cd ../..
EXE="./llama-finetune"
EXE="./finetune"
if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi
if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi

Some files were not shown because too many files have changed in this diff Show More