Compare commits

..

3 Commits

Author SHA1 Message Date
Georgi Gerganov
072e0a4d3b scipts : add LICENSE and gen-authors.sh to sync 2024-04-09 09:19:33 +03:00
Georgi Gerganov
0e0d4e821f authors : update 2024-04-09 09:17:25 +03:00
Georgi Gerganov
805d705032 license : add AUTHORS 2024-03-31 10:37:39 +03:00
463 changed files with 61326 additions and 102813 deletions

View File

@@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build
ARG CUDA_DOCKER_ARCH=all
RUN apt-get update && \
apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev
apt-get install -y build-essential python3 python3-pip git
COPY requirements.txt requirements.txt
COPY requirements requirements
@@ -28,9 +28,7 @@ COPY . .
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable CUDA
ENV LLAMA_CUDA=1
# Enable cURL
ENV LLAMA_CURL=1
RUN make -j$(nproc)
RUN make
ENTRYPOINT ["/app/.devops/tools.sh"]

View File

@@ -40,11 +40,6 @@ ENV LLAMA_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++
# Enable cURL
ENV LLAMA_CURL=1
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev
RUN make -j$(nproc)
RUN make
ENTRYPOINT ["/app/.devops/tools.sh"]

View File

@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION as build
RUN apt-get update && \
apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev
apt-get install -y build-essential python3 python3-pip git
COPY requirements.txt requirements.txt
COPY requirements requirements
@@ -15,10 +15,7 @@ WORKDIR /app
COPY . .
ENV LLAMA_CURL=1
RUN make -j$(nproc)
RUN make
ENV LC_ALL=C.utf8

View File

@@ -23,7 +23,7 @@ ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable CUDA
ENV LLAMA_CUDA=1
RUN make -j$(nproc)
RUN make
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime

View File

@@ -2,14 +2,6 @@ ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
rm /etc/apt/sources.list.d/intel-graphics.list && \
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
chmod 644 /usr/share/keyrings/intel-graphics.gpg
ARG LLAMA_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git
@@ -18,12 +10,14 @@ WORKDIR /app
COPY . .
RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
RUN mkdir build && \
cd build && \
if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
echo "LLAMA_SYCL_F16 is set" && \
export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
fi && \
cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
cmake --build build --config Release --target main
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
cmake --build . --config Release --target main
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime

View File

@@ -40,6 +40,6 @@ ENV LLAMA_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++
RUN make -j$(nproc)
RUN make
ENTRYPOINT [ "/app/main" ]

View File

@@ -14,8 +14,10 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
# Build it
WORKDIR /app
COPY . .
RUN cmake -B build -DLLAMA_VULKAN=1 && \
cmake --build build --config Release --target main
RUN mkdir build && \
cd build && \
cmake .. -DLLAMA_VULKAN=1 && \
cmake --build . --config Release --target main
# Clean up
WORKDIR /

View File

@@ -9,7 +9,7 @@ WORKDIR /app
COPY . .
RUN make -j$(nproc)
RUN make
FROM ubuntu:$UBUNTU_VERSION as runtime

View File

@@ -214,6 +214,7 @@ effectiveStdenv.mkDerivation (
(cmakeBool "LLAMA_CUDA" useCuda)
(cmakeBool "LLAMA_HIPBLAS" useRocm)
(cmakeBool "LLAMA_METAL" useMetalKit)
(cmakeBool "LLAMA_MPI" useMpi)
(cmakeBool "LLAMA_VULKAN" useVulkan)
(cmakeBool "LLAMA_STATIC" enableStatic)
]
@@ -226,20 +227,20 @@ effectiveStdenv.mkDerivation (
)
]
++ optionals useRocm [
(cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
(cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets))
(cmakeFeature "CMAKE_C_COMPILER" "hipcc")
(cmakeFeature "CMAKE_CXX_COMPILER" "hipcc")
# Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM
# in https://github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt
# and select the line that matches the current nixpkgs version of rocBLAS.
# Should likely use `rocmPackages.clr.gpuTargets`.
"-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
]
++ optionals useMetalKit [
(lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
(cmakeBool "LLAMA_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
];
# Environment variables needed for ROCm
env = optionals useRocm {
ROCM_PATH = "${rocmPackages.clr}";
HIP_DEVICE_LIB_PATH = "${rocmPackages.rocm-device-libs}/amdgcn/bitcode";
};
# TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
# if they haven't been added yet.
postInstall = ''

View File

@@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build
ARG CUDA_DOCKER_ARCH=all
RUN apt-get update && \
apt-get install -y build-essential git libcurl4-openssl-dev
apt-get install -y build-essential git
WORKDIR /app
@@ -22,16 +22,11 @@ COPY . .
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable CUDA
ENV LLAMA_CUDA=1
# Enable cURL
ENV LLAMA_CURL=1
RUN make -j$(nproc)
RUN make
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev
COPY --from=build /app/server /server
ENTRYPOINT [ "/server" ]

View File

@@ -2,42 +2,25 @@ ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
rm /etc/apt/sources.list.d/intel-graphics.list && \
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
chmod 644 /usr/share/keyrings/intel-graphics.gpg
ARG LLAMA_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git libcurl4-openssl-dev
apt-get install -y git
WORKDIR /app
COPY . .
RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
RUN mkdir build && \
cd build && \
if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
echo "LLAMA_SYCL_F16 is set" && \
export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
fi && \
cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
cmake --build build --config Release --target server
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
cmake --build . --config Release --target server
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
rm /etc/apt/sources.list.d/intel-graphics.list && \
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
chmod 644 /usr/share/keyrings/intel-graphics.gpg
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev
COPY --from=build /app/build/bin/server /server
ENV LC_ALL=C.utf8

View File

@@ -40,11 +40,6 @@ ENV LLAMA_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++
# Enable cURL
ENV LLAMA_CURL=1
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev
RUN make -j$(nproc)
RUN make
ENTRYPOINT [ "/app/server" ]

View File

@@ -11,15 +11,13 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
apt update -y && \
apt-get install -y vulkan-sdk
# Install cURL
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev
# Build it
WORKDIR /app
COPY . .
RUN cmake -B build -DLLAMA_VULKAN=1 -DLLAMA_CURL=1 && \
cmake --build build --config Release --target server
RUN mkdir build && \
cd build && \
cmake .. -DLLAMA_VULKAN=1 && \
cmake --build . --config Release --target server
# Clean up
WORKDIR /

View File

@@ -3,21 +3,16 @@ ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION as build
RUN apt-get update && \
apt-get install -y build-essential git libcurl4-openssl-dev
apt-get install -y build-essential git
WORKDIR /app
COPY . .
ENV LLAMA_CURL=1
RUN make -j$(nproc)
RUN make
FROM ubuntu:$UBUNTU_VERSION as runtime
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev
COPY --from=build /app/server /server
ENV LC_ALL=C.utf8

View File

@@ -8,7 +8,7 @@ arg1="$1"
shift
if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
python3 ./convert-hf-to-gguf.py "$@"
python3 ./convert.py "$@"
elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
./quantize "$@"
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then

16
.flake8
View File

@@ -1,17 +1,3 @@
[flake8]
max-line-length = 125
ignore = E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503
exclude =
# Do not traverse examples
examples,
# Do not include package initializers
__init__.py,
# No need to traverse our git directory
.git,
# There's no value in checking cache directories
__pycache__,
# No need to include the build path
build,
# This contains builds that we don't want to check
dist # This is generated with `python build .` for package releases
# max-complexity = 10
ignore = W503

View File

@@ -1,50 +0,0 @@
name: Low Severity Bugs
description: Used to report low severity bugs in llama.cpp (e.g. cosmetic issues, non critical UI glitches)
title: "Bug: "
labels: ["bug-unconfirmed", "low severity"]
body:
- type: markdown
attributes:
value: |
Thanks for taking the time to fill out this bug report!
Please include information about your system, the steps to reproduce the bug,
and the version of llama.cpp that you are using.
If possible, please provide a minimal code example that reproduces the bug.
- type: textarea
id: what-happened
attributes:
label: What happened?
description: Also tell us, what did you expect to happen?
placeholder: Tell us what you see!
validations:
required: true
- type: textarea
id: version
attributes:
label: Name and Version
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
placeholder: |
$./main --version
version: 2999 (42b4109e)
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
validations:
required: true
- type: dropdown
id: operating-system
attributes:
label: What operating system are you seeing the problem on?
multiple: true
options:
- Linux
- Mac
- Windows
- BSD
- Other? (Please let us know in description)
validations:
required: false
- type: textarea
id: logs
attributes:
label: Relevant log output
description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
render: shell

View File

@@ -1,50 +0,0 @@
name: Medium Severity Bug
description: Used to report medium severity bugs in llama.cpp (e.g. Malfunctioning Features but generally still useable)
title: "Bug: "
labels: ["bug-unconfirmed", "medium severity"]
body:
- type: markdown
attributes:
value: |
Thanks for taking the time to fill out this bug report!
Please include information about your system, the steps to reproduce the bug,
and the version of llama.cpp that you are using.
If possible, please provide a minimal code example that reproduces the bug.
- type: textarea
id: what-happened
attributes:
label: What happened?
description: Also tell us, what did you expect to happen?
placeholder: Tell us what you see!
validations:
required: true
- type: textarea
id: version
attributes:
label: Name and Version
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
placeholder: |
$./main --version
version: 2999 (42b4109e)
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
validations:
required: true
- type: dropdown
id: operating-system
attributes:
label: What operating system are you seeing the problem on?
multiple: true
options:
- Linux
- Mac
- Windows
- BSD
- Other? (Please let us know in description)
validations:
required: false
- type: textarea
id: logs
attributes:
label: Relevant log output
description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
render: shell

View File

@@ -1,50 +0,0 @@
name: High Severity Bug
description: Used to report high severity bugs in llama.cpp (e.g. Malfunctioning features hindering important common workflow)
title: "Bug: "
labels: ["bug-unconfirmed", "high severity"]
body:
- type: markdown
attributes:
value: |
Thanks for taking the time to fill out this bug report!
Please include information about your system, the steps to reproduce the bug,
and the version of llama.cpp that you are using.
If possible, please provide a minimal code example that reproduces the bug.
- type: textarea
id: what-happened
attributes:
label: What happened?
description: Also tell us, what did you expect to happen?
placeholder: Tell us what you see!
validations:
required: true
- type: textarea
id: version
attributes:
label: Name and Version
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
placeholder: |
$./main --version
version: 2999 (42b4109e)
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
validations:
required: true
- type: dropdown
id: operating-system
attributes:
label: What operating system are you seeing the problem on?
multiple: true
options:
- Linux
- Mac
- Windows
- BSD
- Other? (Please let us know in description)
validations:
required: false
- type: textarea
id: logs
attributes:
label: Relevant log output
description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
render: shell

View File

@@ -1,50 +0,0 @@
name: Critical Severity Bug
description: Used to report critical severity bugs in llama.cpp (e.g. Crashing, Corrupted, Dataloss)
title: "Bug: "
labels: ["bug-unconfirmed", "critical severity"]
body:
- type: markdown
attributes:
value: |
Thanks for taking the time to fill out this bug report!
Please include information about your system, the steps to reproduce the bug,
and the version of llama.cpp that you are using.
If possible, please provide a minimal code example that reproduces the bug.
- type: textarea
id: what-happened
attributes:
label: What happened?
description: Also tell us, what did you expect to happen?
placeholder: Tell us what you see!
validations:
required: true
- type: textarea
id: version
attributes:
label: Name and Version
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
placeholder: |
$./main --version
version: 2999 (42b4109e)
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
validations:
required: true
- type: dropdown
id: operating-system
attributes:
label: What operating system are you seeing the problem on?
multiple: true
options:
- Linux
- Mac
- Windows
- BSD
- Other? (Please let us know in description)
validations:
required: false
- type: textarea
id: logs
attributes:
label: Relevant log output
description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
render: shell

View File

@@ -1,51 +0,0 @@
name: Enhancement
description: Used to request enhancements for llama.cpp
title: "Feature Request: "
labels: ["enhancement"]
body:
- type: markdown
attributes:
value: |
[Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggerganov/llama.cpp/discussions/categories/ideas)
- type: checkboxes
id: prerequisites
attributes:
label: Prerequisites
description: Please confirm the following before submitting your enhancement request.
options:
- label: I am running the latest code. Mention the version if possible as well.
required: true
- label: I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
required: true
- label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed).
required: true
- label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new and useful enhancement to share.
required: true
- type: textarea
id: feature-description
attributes:
label: Feature Description
description: Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
placeholder: Detailed description of the enhancement
validations:
required: true
- type: textarea
id: motivation
attributes:
label: Motivation
description: Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
placeholder: Explanation of why this feature is needed and its benefits
validations:
required: true
- type: textarea
id: possible-implementation
attributes:
label: Possible Implementation
description: If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
placeholder: Detailed description of potential implementation
validations:
required: false

View File

@@ -1,52 +0,0 @@
name: Research
description: Track new technical research area
title: "Research: "
labels: ["research 🔬"]
body:
- type: markdown
attributes:
value: |
Don't forget to check for any [duplicate research issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)
- type: checkboxes
id: research-stage
attributes:
label: Research Stage
description: Track general state of this research ticket
options:
- label: Background Research (Let's try to avoid reinventing the wheel)
- label: Hypothesis Formed (How do you think this will work and it's effect?)
- label: Strategy / Implementation Forming
- label: Analysis of results
- label: Debrief / Documentation (So people in the future can learn from us)
- type: textarea
id: background
attributes:
label: Previous existing literature and research
description: Whats the current state of the art and whats the motivation for this research?
- type: textarea
id: hypothesis
attributes:
label: Hypothesis
description: How do you think this will work and it's effect?
- type: textarea
id: implementation
attributes:
label: Implementation
description: Got an approach? e.g. a PR ready to go?
- type: textarea
id: analysis
attributes:
label: Analysis
description: How does the proposed implementation behave?
- type: textarea
id: logs
attributes:
label: Relevant log output
description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
render: shell

View File

@@ -1,28 +0,0 @@
name: Refactor (Maintainers)
description: Used to track refactoring opportunities
title: "Refactor: "
labels: ["refactor"]
body:
- type: markdown
attributes:
value: |
Don't forget to [check for existing refactor issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
Also you may want to check [Pull request refactor label as well](https://github.com/ggerganov/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too.
- type: textarea
id: background-description
attributes:
label: Background Description
description: Please provide a detailed written description of the pain points you are trying to solve.
placeholder: Detailed description behind your motivation to request refactor
validations:
required: true
- type: textarea
id: possible-approaches
attributes:
label: Possible Refactor Approaches
description: If you have some idea of possible approaches to solve this problem. You may want to make it a todo list.
placeholder: Your idea of possible refactoring opportunity/approaches
validations:
required: false

11
.github/ISSUE_TEMPLATE/bug.md vendored Normal file
View File

@@ -0,0 +1,11 @@
---
name: Bug template
about: Used to report bugs in llama.cpp
labels: ["bug-unconfirmed"]
assignees: ''
---
Please include information about your system, the steps to reproduce the bug, and the version of llama.cpp that you are using. If possible, please provide a minimal code example that reproduces the bug.
If the bug concerns the server, please try to reproduce it first using the [server test scenario framework](https://github.com/ggerganov/llama.cpp/tree/master/examples/server/tests).

View File

@@ -1,13 +0,0 @@
blank_issues_enabled: true
contact_links:
- name: Got an idea?
url: https://github.com/ggerganov/llama.cpp/discussions/categories/ideas
about: Pop it there. It may then become an enhancement ticket.
- name: Got a question?
url: https://github.com/ggerganov/llama.cpp/discussions/categories/q-a
about: Ask a question there!
- name: Want to contribute?
url: https://github.com/ggerganov/llama.cpp/wiki/contribute
about: Head to the contribution guide page of the wiki for areas you can help with

28
.github/ISSUE_TEMPLATE/enhancement.md vendored Normal file
View File

@@ -0,0 +1,28 @@
---
name: Enhancement template
about: Used to request enhancements for llama.cpp
labels: ["enhancement"]
assignees: ''
---
# Prerequisites
Please answer the following questions for yourself before submitting an issue.
- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now.
- [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed).
- [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new bug or useful enhancement to share.
# Feature Description
Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
# Motivation
Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
# Possible Implementation
If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.

90
.github/labeler.yml vendored
View File

@@ -1,90 +0,0 @@
# https://github.com/actions/labeler
Kompute:
- changed-files:
- any-glob-to-any-file:
- ggml-kompute.h
- ggml-kompute.cpp
- README-kompute.md
Apple Metal:
- changed-files:
- any-glob-to-any-file:
- ggml-metal.h
- ggml-metal.cpp
- README-metal.md
SYCL:
- changed-files:
- any-glob-to-any-file:
- ggml-sycl.h
- ggml-sycl.cpp
- README-sycl.md
Nvidia GPU:
- changed-files:
- any-glob-to-any-file:
- ggml-cuda.h
- ggml-cuda/**
Vulkan:
- changed-files:
- any-glob-to-any-file:
- ggml_vk_generate_shaders.py
- ggml-vulkan*
documentation:
- changed-files:
- any-glob-to-any-file:
- docs/**
- media/**
testing:
- changed-files:
- any-glob-to-any-file:
- tests/**
build:
- changed-files:
- any-glob-to-any-file:
- cmake/**
- CMakeLists.txt
- CMakePresets.json
- codecov.yml
examples:
- changed-files:
- any-glob-to-any-file: examples/**
devops:
- changed-files:
- any-glob-to-any-file:
- .devops/**
- .github/**
- ci/**
python:
- changed-files:
- any-glob-to-any-file:
- "**/*.py"
- requirements/**
- gguf-py/**
- .flake8
script:
- changed-files:
- any-glob-to-any-file:
- scripts/**
android:
- changed-files:
- any-glob-to-any-file:
- examples/llama.android/**
server:
- changed-files:
- any-glob-to-any-file:
- examples/server/**
ggml:
- changed-files:
- any-glob-to-any-file:
- ggml.c
- ggml.h
- ggml-*.c
- ggml-*.h
- ggml-cuda/**
nix:
- changed-files:
- any-glob-to-any-file:
- "**/*.nix"
- .github/workflows/nix-*.yml
- .devops/nix/nixpkgs-instances.nix
embedding:
- changed-files:
- any-glob-to-any-file: examples/embedding/

View File

@@ -24,15 +24,15 @@ on:
push:
branches:
- master
paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
pull_request_target:
types: [opened, synchronize, reopened]
paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
schedule:
- cron: '04 2 * * *'
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }}
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
@@ -42,33 +42,11 @@ jobs:
RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it
N_USERS: 8
DURATION: 10m
strategy:
matrix:
model: [phi-2]
ftype: [q4_0, q8_0, f16]
include:
- model: phi-2
ftype: q4_0
pr_comment_enabled: "true"
if: |
inputs.gpu-series == 'Standard_NC4as_T4_v3'
|| (
github.event_name == 'schedule'
&& github.ref_name == 'master'
&& github.repository_owner == 'ggerganov'
)
|| github.event_name == 'pull_request_target'
|| (
github.event_name == 'push'
&& github.event.ref == 'refs/heads/master'
&& github.repository_owner == 'ggerganov'
)
if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.head_ref == 'master' || github.ref_name == 'master' || github.event.push.ref == 'refs/heads/master' }}
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
@@ -91,24 +69,20 @@ jobs:
sleep 0.1
done
- name: Set up Go
uses: actions/setup-go@v5
with:
go-version: '1.21'
- name: Install k6 and xk6-sse
- name: Install k6
id: k6_installation
run: |
cd examples/server/bench
go install go.k6.io/xk6/cmd/xk6@latest
xk6 build master \
--with github.com/phymbert/xk6-sse
wget --quiet https://github.com/grafana/k6/releases/download/v0.49.0/k6-v0.49.0-linux-amd64.tar.gz
tar xzf k6*.tar.gz --strip-components=1
- name: Build
id: cmake_build
run: |
set -eux
cmake -B build \
mkdir build
cd build
cmake .. \
-DLLAMA_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DLLAMA_CURL=ON \
@@ -119,7 +93,7 @@ jobs:
-DLLAMA_FATAL_WARNINGS=OFF \
-DLLAMA_ALL_WARNINGS=OFF \
-DCMAKE_BUILD_TYPE=Release;
cmake --build build --config Release -j $(nproc) --target server
cmake --build . --config Release -j $(nproc) --target server
- name: Download the dataset
id: download_dataset
@@ -134,7 +108,7 @@ jobs:
cd examples/server/bench
source venv/bin/activate
python bench.py \
BENCH_K6_BIN_PATH=./k6 python bench.py \
--runner-label ${{ env.RUNNER_LABEL }} \
--name ${{ github.job }} \
--branch ${{ github.head_ref || github.ref_name }} \
@@ -142,7 +116,7 @@ jobs:
--scenario script.js \
--duration ${{ github.event.inputs.duration || env.DURATION }} \
--hf-repo ggml-org/models \
--hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \
--hf-file phi-2/ggml-model-q4_0.gguf \
--model-path-prefix /models \
--parallel ${{ env.N_USERS }} \
-ngl 33 \
@@ -160,7 +134,7 @@ jobs:
- uses: actions/upload-artifact@v4
with:
name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
name: benchmark-results
compression-level: 9
path: |
examples/server/bench/*.jpg
@@ -172,7 +146,7 @@ jobs:
with:
authToken: ${{secrets.GITHUB_TOKEN}}
sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
context: bench-server-baseline
description: |
${{ env.BENCH_RESULTS }}
state: 'success'
@@ -229,26 +203,21 @@ jobs:
- name: Comment PR
uses: mshick/add-pr-comment@v2
id: comment_pr
if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }}
if: ${{ github.event.pull_request != '' }}
with:
message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
message-id: bench-${{ github.job }}-${{ env.RUNNER_LABEL }}
message: |
<p align="center">
📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
</p>
- Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
- HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(90)=${{ env.HTTP_REQ_DURATION_P_90_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
- Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_TOKENS_AVG }}tk/s p(90)=${{ env.LLAMACPP_PROMPT_TOKENS_P_90_ }}tk/s **total=${{ env.LLAMACPP_PROMPT_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
- Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(90)=${{ env.LLAMACPP_TOKENS_SECOND_P_90_ }}tk/s **total=${{ env.LLAMACPP_COMPLETION_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
- ${{ env.BENCH_GRAPH_XLABEL }}
<details>
<summary>Expand details for performance related PR only</summary>
- Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
- HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
- Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
- Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
- ${{ env.BENCH_GRAPH_XLABEL }}
<summary>Time series</summary>
<p align="center">

View File

@@ -16,7 +16,7 @@ on:
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
env:
@@ -31,9 +31,7 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
uses: actions/checkout@v3
- name: Dependencies
id: depends
@@ -54,7 +52,7 @@ jobs:
id: cmake_test
run: |
cd build
ctest -L 'main|curl' --verbose --timeout 900
ctest -L main --verbose --timeout 900
- name: Determine tag name
id: tag
@@ -78,10 +76,10 @@ jobs:
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
name: llama-bin-macos-arm64.zip
path: |
llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
macOS-latest-cmake-x64:
runs-on: macos-latest
@@ -89,9 +87,7 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
uses: actions/checkout@v3
- name: Dependencies
id: depends
@@ -105,9 +101,7 @@ jobs:
sysctl -a
mkdir build
cd build
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
# https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF -DLLAMA_CURL=ON ..
cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test
@@ -138,10 +132,10 @@ jobs:
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
name: llama-bin-macos-x64.zip
path: |
llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
ubuntu-focal-make:
runs-on: ubuntu-20.04
@@ -152,7 +146,7 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
- name: Dependencies
id: depends
@@ -164,7 +158,7 @@ jobs:
with:
node-version: "20"
- uses: actions/setup-python@v5
- uses: actions/setup-python@v4
with:
python-version: "3.11"
@@ -187,7 +181,7 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
- name: Dependencies
id: depends
@@ -209,29 +203,27 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
uses: actions/checkout@v3
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential libcurl4-openssl-dev
sudo apt-get install build-essential
- name: Build
id: cmake_build
run: |
mkdir build
cd build
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON
cmake .. -DLLAMA_FATAL_WARNINGS=ON
cmake --build . --config Release -j $(nproc)
- name: Test
id: cmake_test
run: |
cd build
ctest -L 'main|curl' --verbose --timeout 900
ctest -L main --verbose --timeout 900
- name: Test llama2c conversion
id: llama2c_test
@@ -244,90 +236,67 @@ jobs:
./bin/convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
./bin/main -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
- name: Determine tag name
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
else
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi
# ubuntu-latest-cmake-sanitizer:
# runs-on: ubuntu-latest
#
# continue-on-error: true
#
# strategy:
# matrix:
# sanitizer: [ADDRESS, THREAD, UNDEFINED]
# build_type: [Debug, Release]
#
# steps:
# - name: Clone
# id: checkout
# uses: actions/checkout@v3
#
# - name: Dependencies
# id: depends
# run: |
# sudo apt-get update
# sudo apt-get install build-essential
#
# - name: Build
# id: cmake_build
# run: |
# mkdir build
# cd build
# cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
# cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
#
# - name: Test
# id: cmake_test
# run: |
# cd build
# ctest -L main --verbose --timeout 900
- name: Pack artifacts
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp LICENSE ./build/bin/
zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/*
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip
name: llama-bin-ubuntu-x64.zip
ubuntu-latest-cmake-sanitizer:
ubuntu-latest-cmake-mpi:
runs-on: ubuntu-latest
continue-on-error: true
strategy:
matrix:
sanitizer: [ADDRESS, THREAD, UNDEFINED]
build_type: [Debug, Release]
mpi_library: [mpich, libopenmpi-dev]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential
sudo apt-get install build-essential ${{ matrix.mpi_library }}
- name: Build
id: cmake_build
run: |
mkdir build
cd build
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
- name: Test
id: cmake_test
run: |
cd build
ctest -L main --verbose --timeout 900
ubuntu-latest-cmake-rpc:
runs-on: ubuntu-latest
continue-on-error: true
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential
- name: Build
id: cmake_build
run: |
mkdir build
cd build
cmake -DLLAMA_RPC=ON ..
cmake -DLLAMA_MPI=ON ..
cmake --build . --config Release -j $(nproc)
- name: Test
@@ -342,7 +311,7 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
- name: Dependencies
id: depends
@@ -358,33 +327,6 @@ jobs:
cmake -DLLAMA_VULKAN=ON ..
cmake --build . --config Release -j $(nproc)
ubuntu-22-cmake-hip:
runs-on: ubuntu-22.04
container: rocm/dev-ubuntu-22.04:6.0.2
steps:
- name: Clone
id: checkout
uses: actions/checkout@v3
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev
- name: Build with native CMake HIP support
id: cmake_build
run: |
cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DLLAMA_HIPBLAS=ON
cmake --build build --config Release -j $(nproc)
- name: Build with legacy HIP support
id: cmake_build_legacy_hip
run: |
cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DLLAMA_HIPBLAS=ON
cmake --build build2 --config Release -j $(nproc)
ubuntu-22-cmake-sycl:
runs-on: ubuntu-22.04
@@ -415,7 +357,7 @@ jobs:
- name: Clone
id: checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
- name: Build
id: cmake_build
@@ -456,7 +398,7 @@ jobs:
- name: Clone
id: checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
- name: Build
id: cmake_build
@@ -476,7 +418,7 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
- name: Dependencies
id: depends
@@ -507,7 +449,7 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
- name: Dependencies
id: depends
@@ -616,63 +558,6 @@ jobs:
run: |
make swift
windows-msys2:
runs-on: windows-latest
strategy:
fail-fast: false
matrix:
include:
- { sys: UCRT64, env: ucrt-x86_64, build: Release }
- { sys: CLANG64, env: clang-x86_64, build: Release }
steps:
- name: Clone
uses: actions/checkout@v4
- name: Setup ${{ matrix.sys }}
uses: msys2/setup-msys2@v2
with:
update: true
msystem: ${{matrix.sys}}
install: >-
base-devel
mingw-w64-${{matrix.env}}-toolchain
mingw-w64-${{matrix.env}}-cmake
mingw-w64-${{matrix.env}}-openblas
- name: Build using make
shell: msys2 {0}
run: |
make -j $(nproc)
- name: Clean after building using make
shell: msys2 {0}
run: |
make clean
- name: Build using make w/ OpenBLAS
shell: msys2 {0}
run: |
make LLAMA_OPENBLAS=1 -j $(nproc)
- name: Build using CMake
shell: msys2 {0}
run: |
cmake -B build
cmake --build build --config ${{ matrix.build }} -j $(nproc)
- name: Clean after building using CMake
shell: msys2 {0}
run: |
rm -rf build
- name: Build using CMake w/ OpenBLAS
shell: msys2 {0}
run: |
cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
cmake --build build --config ${{ matrix.build }} -j $(nproc)
windows-latest-cmake:
runs-on: windows-latest
@@ -686,45 +571,41 @@ jobs:
strategy:
matrix:
include:
- build: 'rpc-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_RPC=ON -DBUILD_SHARED_LIBS=ON'
- build: 'noavx-x64'
- build: 'noavx'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
- build: 'avx2-x64'
- build: 'avx2'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
- build: 'avx-x64'
- build: 'avx'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
- build: 'avx512-x64'
- build: 'avx512'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
- build: 'clblast-x64'
- build: 'clblast'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
- build: 'openblas-x64'
- build: 'openblas'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
- build: 'kompute-x64'
- build: 'kompute'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
- build: 'vulkan-x64'
- build: 'vulkan'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
- build: 'llvm-arm64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
- build: 'msvc-arm64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
- build: 'arm64'
defines: '-A ARM64 -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
with:
fetch-depth: 0
- name: Clone Kompute submodule
id: clone_kompute
if: ${{ matrix.build == 'kompute-x64' }}
if: ${{ matrix.build == 'kompute' }}
run: |
git submodule update --init kompute
- name: Download OpenCL SDK
id: get_opencl
if: ${{ matrix.build == 'clblast-x64' }}
if: ${{ matrix.build == 'clblast' }}
run: |
curl.exe -o $env:RUNNER_TEMP/opencl.zip -L "https://github.com/KhronosGroup/OpenCL-SDK/releases/download/v${env:OPENCL_VERSION}/OpenCL-SDK-v${env:OPENCL_VERSION}-Win-x64.zip"
mkdir $env:RUNNER_TEMP/opencl
@@ -732,7 +613,7 @@ jobs:
- name: Download CLBlast
id: get_clblast
if: ${{ matrix.build == 'clblast-x64' }}
if: ${{ matrix.build == 'clblast' }}
run: |
curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
@@ -745,7 +626,7 @@ jobs:
- name: Download OpenBLAS
id: get_openblas
if: ${{ matrix.build == 'openblas-x64' }}
if: ${{ matrix.build == 'openblas' }}
run: |
curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"
@@ -758,41 +639,38 @@ jobs:
- name: Install Vulkan SDK
id: get_vulkan
if: ${{ matrix.build == 'kompute-x64' || matrix.build == 'vulkan-x64' }}
if: ${{ matrix.build == 'kompute' || matrix.build == 'vulkan' }}
run: |
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
- name: Install Ninja
id: install_ninja
run: |
choco install ninja
- name: Build
id: cmake_build
run: |
cmake -S . -B build ${{ matrix.defines }}
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
mkdir build
cd build
cmake .. ${{ matrix.defines }}
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
- name: Add clblast.dll
id: add_clblast_dll
if: ${{ matrix.build == 'clblast-x64' }}
if: ${{ matrix.build == 'clblast' }}
run: |
cp $env:RUNNER_TEMP/clblast/lib/clblast.dll ./build/bin/Release
cp $env:RUNNER_TEMP/CLBlast.LICENSE.txt ./build/bin/Release/CLBlast-${env:CLBLAST_VERSION}.txt
- name: Add libopenblas.dll
id: add_libopenblas_dll
if: ${{ matrix.build == 'openblas-x64' }}
if: ${{ matrix.build == 'openblas' }}
run: |
cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt
- name: Check AVX512F support
id: check_avx512f
if: ${{ matrix.build == 'avx512-x64' }}
if: ${{ matrix.build == 'avx512' }}
continue-on-error: true
run: |
cd build
@@ -806,14 +684,14 @@ jobs:
- name: Test
id: cmake_test
# not all machines have native AVX-512
if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'clblast-x64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
if: ${{ matrix.build != 'arm64' && matrix.build != 'clblast' && matrix.build != 'kompute' && matrix.build != 'vulkan' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }}
run: |
cd build
ctest -L main -C Release --verbose --timeout 900
- name: Test (Intel SDE)
id: cmake_test_sde
if: ${{ matrix.build == 'avx512-x64' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
if: ${{ matrix.build == 'avx512' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
run: |
curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz"
# for some weird reason windows tar doesn't like sde tar.xz
@@ -841,14 +719,14 @@ jobs:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
name: llama-bin-win-${{ matrix.build }}.zip
path: |
llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip
windows-latest-cmake-cuda:
runs-on: windows-latest
@@ -861,7 +739,7 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
with:
fetch-depth: 0
@@ -901,10 +779,10 @@ jobs:
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
name: llama-bin-win-cu${{ matrix.cuda }}-x64.zip
path: |
llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
- name: Copy and pack Cuda runtime
run: |
@@ -915,10 +793,10 @@ jobs:
- name: Upload Cuda runtime
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
path: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
name: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
path: |
cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
windows-latest-cmake-sycl:
runs-on: windows-latest
@@ -928,13 +806,13 @@ jobs:
shell: bash
env:
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7dff44ba-e3af-4448-841c-0d616c8da6e7/w_BaseKit_p_2024.1.0.595_offline.exe
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/62641e01-1e8d-4ace-91d6-ae03f7f8a71f/w_BaseKit_p_2024.0.0.49563_offline.exe
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
with:
fetch-depth: 0
@@ -962,63 +840,21 @@ jobs:
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.4.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_win_proxy_loader.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_level_zero.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl7.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
echo "cp oneAPI running time dll files to ./build/bin done"
7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
name: llama-bin-win-sycl-x64.zip
windows-latest-cmake-hip:
runs-on: windows-latest
steps:
- name: Clone
id: checkout
uses: actions/checkout@v3
- name: Install
id: depends
run: |
$ErrorActionPreference = "Stop"
write-host "Downloading AMD HIP SDK Installer"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-23.Q4-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
write-host "Installing AMD HIP SDK"
Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
write-host "Completed AMD HIP SDK installation"
- name: Verify ROCm
id: verify
run: |
& 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
- name: Build
id: cmake_build
run: |
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DLLAMA_HIPBLAS=ON
cmake --build build --config Release
path: |
llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
ios-xcode-build:
runs-on: macos-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
uses: actions/checkout@v3
- name: Build Xcode project
run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
@@ -1028,7 +864,7 @@ jobs:
steps:
- name: Clone
uses: actions/checkout@v4
uses: actions/checkout@v3
- name: Set up JDK
uses: actions/setup-java@v3
@@ -1051,7 +887,7 @@ jobs:
# runs-on: macos-12
# steps:
# - name: Clone
# uses: actions/checkout@v4
# uses: actions/checkout@v3
#
# - name: Build
# uses: cross-platform-actions/action@v0.19.0
@@ -1082,7 +918,7 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
with:
fetch-depth: 0
@@ -1101,13 +937,7 @@ jobs:
- name: Download artifacts
id: download-artifact
uses: actions/download-artifact@v4
with:
path: ./artifact
- name: Move artifacts
id: move_artifacts
run: mkdir -p ./artifact/release && mv ./artifact/*/*.zip ./artifact/release
uses: actions/download-artifact@v3
- name: Create release
id: create_release
@@ -1126,7 +956,7 @@ jobs:
const path = require('path');
const fs = require('fs');
const release_id = '${{ steps.create_release.outputs.id }}';
for (let file of await fs.readdirSync('./artifact/release')) {
for (let file of await fs.readdirSync('./artifact')) {
if (path.extname(file) === '.zip') {
console.log('uploadReleaseAsset', file);
await github.repos.uploadReleaseAsset({
@@ -1134,7 +964,7 @@ jobs:
repo: context.repo.repo,
release_id: release_id,
name: file,
data: await fs.readFileSync(`./artifact/release/${file}`)
data: await fs.readFileSync(`./artifact/${file}`)
});
}
}
@@ -1148,7 +978,7 @@ jobs:
#
# steps:
# - name: Clone
# uses: actions/checkout@v4
# uses: actions/checkout@v3
#
# - name: Dependencies
# run: |
@@ -1172,7 +1002,7 @@ jobs:
#
# steps:
# - name: Clone
# uses: actions/checkout@v4
# uses: actions/checkout@v3
#
# - name: Dependencies
# run: |
@@ -1196,7 +1026,7 @@ jobs:
#
# steps:
# - name: Clone
# uses: actions/checkout@v4
# uses: actions/checkout@v3
#
# - name: Dependencies
# run: |
@@ -1226,7 +1056,7 @@ jobs:
#
# steps:
# - name: Clone
# uses: actions/checkout@v4
# uses: actions/checkout@v3
#
# - name: Add msbuild to PATH
# uses: microsoft/setup-msbuild@v1
@@ -1242,7 +1072,7 @@ jobs:
# msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
#
# - name: Upload binaries
# uses: actions/upload-artifact@v4
# uses: actions/upload-artifact@v1
# with:
# name: llama-bin-${{ matrix.arch }}
# path: build/bin/${{ matrix.build }}
@@ -1265,7 +1095,7 @@ jobs:
#
# steps:
# - name: Clone
# uses: actions/checkout@v4
# uses: actions/checkout@v3
#
# - name: Add msbuild to PATH
# uses: microsoft/setup-msbuild@v1
@@ -1297,7 +1127,7 @@ jobs:
#
# - name: Upload binaries
# if: matrix.blas == 'ON'
# uses: actions/upload-artifact@v4
# uses: actions/upload-artifact@v1
# with:
# name: llama-blas-bin-${{ matrix.arch }}
# path: build/bin/${{ matrix.build }}
@@ -1311,7 +1141,7 @@ jobs:
#
# steps:
# - name: Clone
# uses: actions/checkout@v4
# uses: actions/checkout@v3
#
# - name: Dependencies
# run: |

View File

@@ -12,7 +12,7 @@ jobs:
steps:
- uses: actions/stale@v5
with:
exempt-issue-labels: "refactor,help wanted,good first issue,research,bug"
exempt-issue-labels: "refactor,help wanted,good first issue,research"
days-before-issue-stale: 30
days-before-issue-close: 14
stale-issue-label: "stale"

View File

@@ -6,7 +6,7 @@ env:
GGML_N_THREADS: 1
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
@@ -14,7 +14,7 @@ jobs:
runs-on: ubuntu-20.04
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
- name: Dependencies
run: |

View File

@@ -16,7 +16,7 @@ on:
- master
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
@@ -46,7 +46,7 @@ jobs:
- { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
steps:
- name: Check out the repo
uses: actions/checkout@v4
uses: actions/checkout@v3
- name: Set up QEMU
uses: docker/setup-qemu-action@v2
@@ -91,12 +91,6 @@ jobs:
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi
- name: Downcase github.repository_owner
run: |
echo "repository_owner_lowercase=${GITHUB_REPOSITORY_OWNER@L}" >> $GITHUB_ENV
env:
GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
- name: Build and push Docker image (versioned)
if: github.event_name == 'push'
uses: docker/build-push-action@v4
@@ -104,7 +98,7 @@ jobs:
context: .
push: true
platforms: ${{ matrix.config.platforms }}
tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
file: ${{ matrix.config.dockerfile }}
- name: Build and push Docker image (tagged)
@@ -113,5 +107,5 @@ jobs:
context: .
push: ${{ github.event_name == 'push' }}
platforms: ${{ matrix.config.platforms }}
tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
file: ${{ matrix.config.dockerfile }}

View File

@@ -15,13 +15,13 @@ on:
- master
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
editorconfig:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v3
- uses: editorconfig-checker/action-editorconfig-checker@main
- run: editorconfig-checker

View File

@@ -24,9 +24,9 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v5
uses: actions/setup-python@v2
with:
python-version: '3.9.x'
- name: Install dependencies

View File

@@ -1,17 +0,0 @@
name: "Pull Request Labeler"
on:
- pull_request_target
jobs:
labeler:
permissions:
contents: read
pull-requests: write
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
repository: "ggerganov/llama.cpp"
- uses: actions/labeler@v5
with:
configuration-path: '.github/labeler.yml'

View File

@@ -18,7 +18,7 @@ on:
paths: ['**/*.nix', 'flake.lock']
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:

View File

@@ -9,7 +9,7 @@ on:
types: [opened, synchronize, reopened]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:

View File

@@ -17,7 +17,7 @@ on:
- 'requirements/*.txt'
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
@@ -26,9 +26,9 @@ jobs:
name: check-requirements
steps:
- name: Check out source repository
uses: actions/checkout@v4
uses: actions/checkout@v3
- name: Set up Python environment
uses: actions/setup-python@v5
uses: actions/setup-python@v4
with:
python-version: "3.11"
- name: Run check-requirements.sh script

View File

@@ -3,7 +3,7 @@ name: flake8 Lint
on: [push, pull_request]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
@@ -12,12 +12,13 @@ jobs:
name: Lint
steps:
- name: Check out source repository
uses: actions/checkout@v4
uses: actions/checkout@v3
- name: Set up Python environment
uses: actions/setup-python@v5
uses: actions/setup-python@v4
with:
python-version: "3.11"
- name: flake8 Lint
uses: py-actions/flake8@v2
with:
plugins: "flake8-no-print"
ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503"
exclude: "examples/*,examples/*/**,*/**/__init__.py"

View File

@@ -4,10 +4,6 @@ name: Server
on:
workflow_dispatch: # allows manual triggering
inputs:
sha:
description: 'Commit SHA1 to build'
required: false
type: string
slow_tests:
description: 'Run slow tests'
required: true
@@ -15,15 +11,15 @@ on:
push:
branches:
- master
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
pull_request_target:
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*']
pull_request:
types: [opened, synchronize, reopened]
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*']
schedule:
- cron: '2 4 * * *'
- cron: '0 0 * * *'
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
@@ -32,76 +28,62 @@ jobs:
strategy:
matrix:
sanitizer: [ADDRESS, THREAD, UNDEFINED]
build_type: [RelWithDebInfo]
# TODO: temporary disabled due to linux kernel issues
#sanitizer: [ADDRESS, THREAD, UNDEFINED]
sanitizer: [UNDEFINED]
build_type: [Debug]
include:
- build_type: Release
sanitizer: ""
fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
container:
image: ubuntu:latest
ports:
- 8888
options: --cpus 4
steps:
- name: Clone
id: checkout
uses: actions/checkout@v3
with:
fetch-depth: 0
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get -y install \
apt-get update
apt-get -y install \
build-essential \
xxd \
git \
cmake \
curl \
python3-pip \
wget \
language-pack-en \
libcurl4-openssl-dev
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Python setup
id: setup_python
uses: actions/setup-python@v5
with:
python-version: '3.11'
- name: Build
id: cmake_build
run: |
mkdir build
cd build
cmake .. \
-DLLAMA_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DLLAMA_CURL=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
cmake --build . --config ${{ matrix.build_type }} -j $(nproc) --target server
- name: Tests dependencies
id: test_dependencies
run: |
pip install -r examples/server/tests/requirements.txt
- name: Verify server deps
id: verify_server_deps
run: |
git config --global --add safe.directory $(realpath .)
cd examples/server
git ls-files --others --modified
git status
./deps.sh
git status
not_ignored_files="$(git ls-files --others --modified)"
echo "Modified files: ${not_ignored_files}"
if [ -n "${not_ignored_files}" ]; then
echo "Repository is dirty or server deps are not built as expected"
echo "${not_ignored_files}"
exit 1
fi
- name: Build
id: cmake_build
run: |
cmake -B build \
-DLLAMA_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DLLAMA_CURL=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target server
- name: Tests
id: server_integration_tests
if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
run: |
cd examples/server/tests
PORT=8888 ./tests.sh
@@ -120,10 +102,9 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: libCURL
id: get_libcurl
@@ -137,8 +118,10 @@ jobs:
- name: Build
id: cmake_build
run: |
cmake -B build -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server
mkdir build
cd build
cmake .. -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server
- name: Python setup
id: setup_python

29
.github/workflows/zig-build.yml vendored Normal file
View File

@@ -0,0 +1,29 @@
name: Zig CI
on:
pull_request:
push:
branches:
- master
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
build:
strategy:
fail-fast: false
matrix:
runs-on: [ubuntu-latest, macos-latest, windows-latest]
runs-on: ${{ matrix.runs-on }}
steps:
- uses: actions/checkout@v3
with:
submodules: recursive
fetch-depth: 0
- uses: goto-bus-stop/setup-zig@v2
with:
version: 0.11.0
- name: Build Summary
run: zig build --summary all -freference-trace

21
.gitignore vendored
View File

@@ -2,7 +2,6 @@
*.a
*.so
*.gguf
*.gguf.json
*.bin
*.exe
*.dll
@@ -35,7 +34,6 @@ lcov-report/
gcovr-report/
build*
!build.zig
cmake-build-*
out/
tmp/
@@ -50,7 +48,6 @@ models-mnt
/convert-llama2c-to-ggml
/embd-input-test
/embedding
/eval-callback
/gguf
/gguf-llama-simple
/gguf-split
@@ -102,25 +99,7 @@ qnt-*.txt
perf-*.txt
examples/jeopardy/results.txt
examples/server/*.html.hpp
examples/server/*.js.hpp
examples/server/*.mjs.hpp
poetry.lock
poetry.toml
nppBackup
# Test binaries
/tests/test-grammar-parser
/tests/test-llama-grammar
/tests/test-double-float
/tests/test-grad0
/tests/test-opt
/tests/test-quantize-fns
/tests/test-quantize-perf
/tests/test-sampling
/tests/test-tokenizer-0
/tests/test-tokenizer-1-spm
/tests/test-tokenizer-1-bpe
/tests/test-rope
/tests/test-backend-ops

View File

@@ -3,14 +3,13 @@
exclude: prompts/.*.txt
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.6.0
rev: v3.2.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-yaml
- id: check-added-large-files
- repo: https://github.com/PyCQA/flake8
rev: 7.0.0
rev: 6.0.0
hooks:
- id: flake8
additional_dependencies: [flake8-no-print]

View File

@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
project("llama.cpp" C CXX)
include(CheckIncludeFileCXX)
@@ -43,8 +43,6 @@ else()
set(LLAMA_METAL_DEFAULT OFF)
endif()
set(LLAMA_LLAMAFILE_DEFAULT ON)
# general
option(BUILD_SHARED_LIBS "build shared libraries" OFF)
option(LLAMA_STATIC "llama: static link libraries" OFF)
@@ -72,13 +70,11 @@ else()
set(INS_ENB ON)
endif()
option(LLAMA_SVE "llama: enable SVE" OFF)
option(LLAMA_AVX "llama: enable AVX" ${INS_ENB})
option(LLAMA_AVX2 "llama: enable AVX2" ${INS_ENB})
option(LLAMA_AVX512 "llama: enable AVX512" OFF)
option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF)
option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF)
option(LLAMA_AVX512_BF16 "llama: enable AVX512-BF16" OFF)
option(LLAMA_FMA "llama: enable FMA" ${INS_ENB})
# in MSVC F16C is implied with AVX2/AVX512
if (NOT MSVC)
@@ -92,7 +88,6 @@ endif()
# 3rd party libs
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
option(LLAMA_BLAS "llama: use BLAS" OFF)
option(LLAMA_LLAMAFILE "llama: use llamafile SGEMM" ${LLAMA_LLAMAFILE_DEFAULT})
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
option(LLAMA_CUDA "llama: use CUDA" OFF)
option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF)
@@ -105,9 +100,6 @@ set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for
set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
"llama: max. batch size for using peer access")
option(LLAMA_CUDA_NO_PEER_COPY "llama: do not use peer to peer copies" OFF)
option(LLAMA_CUDA_NO_VMM "llama: do not try to use CUDA VMM" OFF)
option(LLAMA_CUDA_FA_ALL_QUANTS "llama: compile all quants for FlashAttention" OFF)
option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
@@ -125,7 +117,8 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
"llama: metal minimum macOS version")
set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)")
option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
option(LLAMA_RPC "llama: use RPC" OFF)
option(LLAMA_MPI "llama: use MPI" OFF)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
option(LLAMA_SYCL "llama: use SYCL" OFF)
option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
set(LLAMA_SYCL_TARGET "INTEL" CACHE STRING "llama: sycl target device")
@@ -135,8 +128,6 @@ set(LLAMA_SCHED_MAX_COPIES "4" CACHE STRING "llama: max input copies for pipeli
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_SERVER "llama: build server example" ON)
option(LLAMA_LASX "llama: enable lasx" ON)
option(LLAMA_LSX "llama: enable lsx" ON)
# add perf arguments
option(LLAMA_PERF "llama: enable perf" OFF)
@@ -295,12 +286,11 @@ if (LLAMA_METAL)
${METALKIT_FRAMEWORK}
)
endif()
if (LLAMA_BLAS)
if (LLAMA_STATIC)
set(BLA_STATIC ON)
endif()
if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22)
if ($(CMAKE_VERSION) VERSION_GREATER_EQUAL 3.22)
set(BLA_SIZEOF_INTEGER 8)
endif()
@@ -378,11 +368,8 @@ if (LLAMA_BLAS)
endif()
endif()
if (LLAMA_LLAMAFILE)
add_compile_definitions(GGML_USE_LLAMAFILE)
set(GGML_HEADERS_LLAMAFILE sgemm.h)
set(GGML_SOURCES_LLAMAFILE sgemm.cpp)
if (LLAMA_QKK_64)
add_compile_definitions(GGML_QKK_64)
endif()
if (LLAMA_CUBLAS)
@@ -403,20 +390,14 @@ if (LLAMA_CUDA)
file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu")
list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")
file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
add_compile_definitions(GGML_USE_CUDA)
add_compile_definitions(GGML_CUDA_USE_GRAPHS)
if (LLAMA_CUDA_FORCE_DMMV)
add_compile_definitions(GGML_CUDA_FORCE_DMMV)
endif()
if (LLAMA_CUDA_FORCE_MMQ)
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
endif()
if (LLAMA_CUDA_NO_VMM)
add_compile_definitions(GGML_CUDA_NO_VMM)
endif()
add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
if (DEFINED LLAMA_CUDA_DMMV_Y)
@@ -430,22 +411,10 @@ if (LLAMA_CUDA)
if (LLAMA_CUDA_NO_PEER_COPY)
add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
endif()
if (LLAMA_CUDA_FA_ALL_QUANTS)
file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
else()
file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
endif()
if (LLAMA_STATIC)
if (WIN32)
# As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
# As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
else ()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
@@ -454,11 +423,7 @@ if (LLAMA_CUDA)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
endif()
if (LLAMA_CUDA_NO_VMM)
# No VMM requested, no need to link directly with the cuda driver lib (libcuda.so)
else()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
endif()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver)
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
# 52 == lowest CUDA 12 standard
@@ -479,15 +444,33 @@ if (LLAMA_CUDA)
endif()
endif()
if (LLAMA_RPC)
add_compile_definitions(GGML_USE_RPC)
if (LLAMA_MPI)
cmake_minimum_required(VERSION 3.10)
find_package(MPI)
if (MPI_C_FOUND)
message(STATUS "MPI found")
if (WIN32)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ws2_32)
set(GGML_HEADERS_MPI ggml-mpi.h)
set(GGML_SOURCES_MPI ggml-mpi.c)
add_compile_definitions(GGML_USE_MPI)
add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
if (NOT MSVC)
add_compile_options(-Wno-cast-qual)
endif()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_C_LIBRARIES})
set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${MPI_C_INCLUDE_DIRS})
# Even if you're only using the C header, C++ programs may bring in MPI
# C++ functions, so more linkage is needed
if (MPI_CXX_FOUND)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_CXX_LIBRARIES})
endif()
else()
message(WARNING "MPI not found")
endif()
set(GGML_HEADERS_RPC ggml-rpc.h)
set(GGML_SOURCES_RPC ggml-rpc.cpp)
endif()
if (LLAMA_CLBLAST)
@@ -516,12 +499,6 @@ if (LLAMA_VULKAN)
add_compile_definitions(GGML_USE_VULKAN)
# Workaround to the "can't dereference invalidated vector iterator" bug in clang-cl debug build
# Posssibly relevant: https://stackoverflow.com/questions/74748276/visual-studio-no-displays-the-correct-length-of-stdvector
if (MSVC AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
add_compile_definitions(_ITERATOR_DEBUG_LEVEL=0)
endif()
if (LLAMA_VULKAN_CHECK_RESULTS)
add_compile_definitions(GGML_VULKAN_CHECK_RESULTS)
endif()
@@ -545,37 +522,16 @@ if (LLAMA_VULKAN)
endif()
if (LLAMA_HIPBLAS)
if ($ENV{ROCM_PATH})
set(ROCM_PATH $ENV{ROCM_PATH})
else()
set(ROCM_PATH /opt/rocm)
endif()
list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH})
list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
# CMake on Windows doesn't support the HIP language yet
if(WIN32)
set(CXX_IS_HIPCC TRUE)
else()
string(REGEX MATCH "hipcc(\.bat)?$" CXX_IS_HIPCC "${CMAKE_CXX_COMPILER}")
if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang")
endif()
if(CXX_IS_HIPCC)
if(LINUX)
if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
endif()
message(WARNING "Setting hipcc as the C++ compiler is legacy behavior."
" Prefer setting the HIP compiler directly. See README for details.")
endif()
else()
# Forward AMDGPU_TARGETS to CMAKE_HIP_ARCHITECTURES.
if(AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS})
endif()
cmake_minimum_required(VERSION 3.21)
enable_language(HIP)
if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
endif()
find_package(hip REQUIRED)
find_package(hipblas REQUIRED)
find_package(rocblas REQUIRED)
@@ -586,8 +542,6 @@ if (LLAMA_HIPBLAS)
file(GLOB GGML_SOURCES_ROCM "ggml-cuda/*.cu")
list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu")
file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu")
list(APPEND GGML_SOURCES_ROCM ${SRCS})
add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA)
@@ -607,35 +561,17 @@ if (LLAMA_HIPBLAS)
add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
endif()
if (LLAMA_CUDA_FA_ALL_QUANTS)
file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*.cu")
list(APPEND GGML_SOURCES_ROCM ${SRCS})
add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
else()
file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
list(APPEND GGML_SOURCES_ROCM ${SRCS})
file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
list(APPEND GGML_SOURCES_ROCM ${SRCS})
file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
list(APPEND GGML_SOURCES_ROCM ${SRCS})
endif()
add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
if (CXX_IS_HIPCC)
set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::device)
else()
set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE HIP)
endif()
set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
if (LLAMA_STATIC)
message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
endif()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} PUBLIC hip::host roc::rocblas roc::hipblas)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::device PUBLIC hip::host roc::rocblas roc::hipblas)
endif()
if (LLAMA_SYCL)
@@ -658,10 +594,6 @@ if (LLAMA_SYCL)
add_compile_definitions(GGML_SYCL_F16)
endif()
if (LLAMA_CUDA_FORCE_MMQ)
add_compile_definitions(GGML_SYCL_FORCE_MMQ)
endif()
add_compile_options(-I./) #include DPCT
add_compile_options(-I/${SYCL_INCLUDE_DIR})
@@ -1042,11 +974,6 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STR
if (GGML_COMPILER_SUPPORT_DOTPROD)
add_compile_definitions(__ARM_FEATURE_DOTPROD)
endif ()
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
endif ()
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
@@ -1075,9 +1002,6 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STR
# Raspberry Pi 3, 4, Zero 2 (32-bit)
list(APPEND ARCH_FLAGS -mno-unaligned-access)
endif()
if (LLAMA_SVE)
list(APPEND ARCH_FLAGS -march=armv8.6-a+sve)
endif()
endif()
elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
(NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
@@ -1102,10 +1026,6 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
endif()
if (LLAMA_AVX512_BF16)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
endif()
elseif (LLAMA_AVX2)
list(APPEND ARCH_FLAGS /arch:AVX2)
elseif (LLAMA_AVX)
@@ -1137,9 +1057,6 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
if (LLAMA_AVX512_VNNI)
list(APPEND ARCH_FLAGS -mavx512vnni)
endif()
if (LLAMA_AVX512_BF16)
list(APPEND ARCH_FLAGS -mavx512bf16)
endif()
endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
message(STATUS "PowerPC detected")
@@ -1149,17 +1066,6 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
#TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
message(STATUS "loongarch64 detected")
list(APPEND ARCH_FLAGS -march=loongarch64)
if (LLAMA_LASX)
list(APPEND ARCH_FLAGS -mlasx)
endif()
if (LLAMA_LSX)
list(APPEND ARCH_FLAGS -mlsx)
endif()
else()
message(STATUS "Unknown architecture")
endif()
@@ -1245,16 +1151,15 @@ add_library(ggml OBJECT
ggml-backend.h
ggml-quants.c
ggml-quants.h
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
${GGML_SOURCES_RPC} ${GGML_HEADERS_RPC}
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}
${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN}
${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}
${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN}
${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
)
target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
@@ -1335,7 +1240,7 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h"
"${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}"
"${GGML_HEADERS_METAL}" "${GGML_HEADERS_EXTRA}")
"${GGML_HEADERS_METAL}" "${GGML_HEADERS_MPI}" "${GGML_HEADERS_EXTRA}")
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
install(TARGETS ggml PUBLIC_HEADER)
@@ -1344,7 +1249,18 @@ set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}
install(TARGETS llama LIBRARY PUBLIC_HEADER)
install(
FILES convert-hf-to-gguf.py
FILES convert.py
PERMISSIONS
OWNER_READ
OWNER_WRITE
OWNER_EXECUTE
GROUP_READ
GROUP_EXECUTE
WORLD_READ
WORLD_EXECUTE
DESTINATION ${CMAKE_INSTALL_BINDIR})
install(
FILES convert-lora-to-ggml.py
PERMISSIONS
OWNER_READ
OWNER_WRITE

View File

@@ -1,49 +0,0 @@
{
"version": 4,
"configurePresets": [
{
"name": "base",
"hidden": true,
"generator": "Ninja",
"binaryDir": "${sourceDir}/build-${presetName}",
"cacheVariables": {
"CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
"CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
}
},
{ "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
{ "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
{ "name": "static", "hidden": true, "cacheVariables": { "LLAMA_STATIC": "ON" } },
{
"name": "arm64-windows-msvc", "hidden": true,
"architecture": { "value": "arm64", "strategy": "external" },
"toolset": { "value": "host=x86_64", "strategy": "external" },
"cacheVariables": {
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
}
},
{
"name": "arm64-windows-llvm", "hidden": true,
"architecture": { "value": "arm64", "strategy": "external" },
"toolset": { "value": "host=x86_64", "strategy": "external" },
"cacheVariables": {
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake"
}
},
{ "name": "arm64-windows-llvm-debug" , "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
{ "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "release" ] },
{ "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "release", "static" ] },
{ "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "release" ] },
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] },
{ "name": "x64-windows-msvc-debug" , "inherits": [ "base", "debug" ] },
{ "name": "x64-windows-msvc-release", "inherits": [ "base", "release" ] },
{ "name": "x64-windows-msvc+static-release", "inherits": [ "base", "release", "static" ] }
]
}

153
Makefile
View File

@@ -1,28 +1,16 @@
# Define the default target now so that it is always the first target
BUILD_TARGETS = \
main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search \
simple batched batched-bench save-load-state server gguf gguf-split llama-bench libllava.a llava-cli baby-llama beam-search \
retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
# Binaries only useful for tests
TEST_TARGETS = \
tests/test-autorelease \
tests/test-backend-ops \
tests/test-double-float \
tests/test-grad0 \
tests/test-grammar-integration \
tests/test-grammar-parser \
tests/test-json-schema-to-grammar \
tests/test-llama-grammar \
tests/test-model-load-cancel \
tests/test-opt \
tests/test-quantize-fns \
tests/test-quantize-perf \
tests/test-rope \
tests/test-sampling \
tests/test-tokenizer-0 \
tests/test-tokenizer-1-bpe \
tests/test-tokenizer-1-spm
tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease \
tests/test-json-schema-to-grammar
# Code coverage output files
COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -39,17 +27,6 @@ ifndef UNAME_M
UNAME_M := $(shell uname -m)
endif
# In GNU make default CXX is g++ instead of c++. Let's fix that so that users
# of non-gcc compilers don't have to provide g++ alias or wrapper.
DEFCC := cc
DEFCXX := c++
ifeq ($(origin CC),default)
CC := $(DEFCC)
endif
ifeq ($(origin CXX),default)
CXX := $(DEFCXX)
endif
# Mac OS + Arm can report x86_64
# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
ifeq ($(UNAME_S),Darwin)
@@ -72,16 +49,11 @@ default: $(BUILD_TARGETS)
test: $(TEST_TARGETS)
@failures=0; \
for test_target in $(TEST_TARGETS); do \
if [ "$$test_target" = "tests/test-tokenizer-0" ]; then \
./$$test_target $(CURDIR)/models/ggml-vocab-llama-spm.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
if [ "$$test_target" = "tests/test-tokenizer-0-llama" ]; then \
./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-refact.gguf; \
elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \
elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
continue; \
elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
continue; \
@@ -379,16 +351,15 @@ ifneq ($(filter ppc64le%,$(UNAME_M)),)
CUDA_POWER_ARCH = 1
endif
ifneq ($(filter loongarch64%,$(UNAME_M)),)
MK_CFLAGS += -mlasx
MK_CXXFLAGS += -mlasx
endif
else
MK_CFLAGS += -march=rv64gcv -mabi=lp64d
MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
endif
ifdef LLAMA_QKK_64
MK_CPPFLAGS += -DGGML_QKK_64
endif
ifndef LLAMA_NO_ACCELERATE
# Mac OS - include Accelerate framework.
# `-framework Accelerate` works both with Apple Silicon and Mac Intel
@@ -400,17 +371,19 @@ ifndef LLAMA_NO_ACCELERATE
endif
endif # LLAMA_NO_ACCELERATE
ifdef LLAMA_MPI
MK_CPPFLAGS += -DGGML_USE_MPI
MK_CFLAGS += -Wno-cast-qual
MK_CXXFLAGS += -Wno-cast-qual
OBJS += ggml-mpi.o
endif # LLAMA_MPI
ifdef LLAMA_OPENBLAS
MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
MK_LDFLAGS += $(shell pkg-config --libs openblas)
endif # LLAMA_OPENBLAS
ifndef LLAMA_NO_LLAMAFILE
MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
OBJS += sgemm.o
endif
ifdef LLAMA_BLIS
MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
MK_LDFLAGS += -lblis -L/usr/local/lib
@@ -421,26 +394,16 @@ ifdef LLAMA_CUBLAS
LLAMA_CUDA := 1
endif
OBJS_CUDA_TEMP_INST = $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-wmma*.cu))
ifdef LLAMA_CUDA_FA_ALL_QUANTS
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*.cu))
else
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu))
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu))
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*f16-f16.cu))
endif # LLAMA_CUDA_FA_ALL_QUANTS
ifdef LLAMA_CUDA
ifneq ('', '$(wildcard /opt/cuda)')
CUDA_PATH ?= /opt/cuda
else
CUDA_PATH ?= /usr/local/cuda
endif
MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
OBJS += ggml-cuda.o
OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
OBJS += $(OBJS_CUDA_TEMP_INST)
MK_NVCCFLAGS += -use_fast_math
ifdef LLAMA_FATAL_WARNINGS
MK_NVCCFLAGS += -Werror all-warnings
@@ -451,9 +414,6 @@ endif # JETSON_EOL_MODULE_DETECT
ifdef LLAMA_DEBUG
MK_NVCCFLAGS += -lineinfo
endif # LLAMA_DEBUG
ifdef LLAMA_CUDA_DEBUG
MK_NVCCFLAGS += --device-debug
endif # LLAMA_CUDA_DEBUG
ifdef LLAMA_CUDA_NVCC
NVCC = $(CCACHE) $(LLAMA_CUDA_NVCC)
else
@@ -503,10 +463,7 @@ ifdef LLAMA_CUDA_NO_PEER_COPY
endif # LLAMA_CUDA_NO_PEER_COPY
ifdef LLAMA_CUDA_CCBIN
MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
endif # LLAMA_CUDA_CCBIN
ifdef LLAMA_CUDA_FA_ALL_QUANTS
MK_NVCCFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
endif # LLAMA_CUDA_FA_ALL_QUANTS
endif
ifdef JETSON_EOL_MODULE_DETECT
define NVCC_COMPILE
@@ -518,14 +475,16 @@ define NVCC_COMPILE
endef # NVCC_COMPILE
endif # JETSON_EOL_MODULE_DETECT
ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh
ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
$(NVCC_COMPILE)
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
$(NVCC_COMPILE)
endif # LLAMA_CUDA
ifdef LLAMA_CLBLAST
MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
MK_CFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
@@ -570,10 +529,10 @@ endif # LLAMA_VULKAN
ifdef LLAMA_HIPBLAS
ifeq ($(wildcard /opt/rocm),)
ROCM_PATH ?= /usr
AMDGPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
GPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
else
ROCM_PATH ?= /opt/rocm
AMDGPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
endif
HIPCC ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc
LLAMA_CUDA_DMMV_X ?= 32
@@ -584,9 +543,8 @@ ifdef LLAMA_HIP_UMA
MK_CPPFLAGS += -DGGML_HIP_UMA
endif # LLAMA_HIP_UMA
MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
MK_LDFLAGS += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64
MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS))
HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
HIPFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
@@ -598,12 +556,11 @@ ifdef LLAMA_CUDA_NO_PEER_COPY
endif # LLAMA_CUDA_NO_PEER_COPY
OBJS += ggml-cuda.o
OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
OBJS += $(OBJS_CUDA_TEMP_INST)
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh
ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
endif # LLAMA_HIPBLAS
@@ -641,10 +598,10 @@ ggml-metal-embed.o: ggml-metal.metal ggml-common.h
endif
endif # LLAMA_METAL
ifndef LLAMA_NO_LLAMAFILE
sgemm.o: sgemm.cpp sgemm.h ggml.h
$(CXX) $(CXXFLAGS) -c $< -o $@
endif
ifdef LLAMA_MPI
ggml-mpi.o: ggml-mpi.c ggml-mpi.h
$(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_MPI
GF_CC := $(CC)
include scripts/get-flags.mk
@@ -689,7 +646,7 @@ CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])'
ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
ifndef CUDA_DOCKER_ARCH
ifndef CUDA_POWER_ARCH
$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via environment variable CUDA_DOCKER_ARCH, e.g. by running "export CUDA_DOCKER_ARCH=compute_XX" on Unix-like systems, where XX is the minimum compute capability that the code needs to run on. A list with compute capabilities can be found here: https://developer.nvidia.com/cuda-gpus )
$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via CUDA_DOCKER_ARCH)
endif # CUDA_POWER_ARCH
endif # CUDA_DOCKER_ARCH
endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
@@ -730,8 +687,8 @@ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@
COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o
common.o: common/common.cpp $(COMMON_H_DEPS)
$(CXX) $(CXXFLAGS) -c $< -o $@
@@ -763,7 +720,6 @@ libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS)
clean:
rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
rm -vrf ggml-cuda/*.o
rm -vrf ggml-cuda/template-instances/*.o
find examples pocs -type f -name "*.o" -delete
#
@@ -800,11 +756,11 @@ batched: examples/batched/batched.cpp ggml.o llama.o $(C
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
quantize: examples/quantize/quantize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
quantize: examples/quantize/quantize.cpp build-info.o ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -832,19 +788,10 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/colorthemes.css.hpp examples/server/style.css.hpp examples/server/theme-beeninorder.css.hpp examples/server/theme-ketivah.css.hpp examples/server/theme-mangotango.css.hpp examples/server/theme-playground.css.hpp examples/server/theme-polarnight.css.hpp examples/server/theme-snowstorm.css.hpp examples/server/index.html.hpp examples/server/index-new.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/system-prompts.js.hpp examples/server/prompt-formats.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp json-schema-to-grammar.o common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
# Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
examples/server/%.hpp: examples/server/public/% Makefile
@( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \
echo "unsigned char $${NAME}[] = {" && \
cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \
echo "};" && \
echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
) > $@
gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -853,10 +800,6 @@ gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(O
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -924,10 +867,6 @@ passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
ifeq ($(UNAME_S),Darwin)
swift: examples/batched.swift
(cd examples/batched.swift; make build)
@@ -975,10 +914,6 @@ tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar-
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-grammar-integration: tests/test-grammar-integration.cpp ggml.o llama.o grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -1007,7 +942,11 @@ tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-tokenizer-0: tests/test-tokenizer-0.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -1015,7 +954,7 @@ tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMM
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-tokenizer-1-spm: tests/test-tokenizer-1-spm.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

View File

@@ -2,45 +2,6 @@
import PackageDescription
var sources = [
"ggml.c",
"sgemm.cpp",
"llama.cpp",
"unicode.cpp",
"unicode-data.cpp",
"ggml-alloc.c",
"ggml-backend.c",
"ggml-quants.c",
]
var resources: [Resource] = []
var linkerSettings: [LinkerSetting] = []
var cSettings: [CSetting] = [
.unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
.unsafeFlags(["-fno-objc-arc"]),
// NOTE: NEW_LAPACK will required iOS version 16.4+
// We should consider add this in the future when we drop support for iOS 14
// (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
// .define("ACCELERATE_NEW_LAPACK"),
// .define("ACCELERATE_LAPACK_ILP64")
]
#if canImport(Darwin)
sources.append("ggml-metal.m")
resources.append(.process("ggml-metal.metal"))
linkerSettings.append(.linkedFramework("Accelerate"))
cSettings.append(
contentsOf: [
.define("GGML_USE_ACCELERATE"),
.define("GGML_USE_METAL")
]
)
#endif
#if os(Linux)
cSettings.append(.define("_GNU_SOURCE"))
#endif
let package = Package(
name: "llama",
platforms: [
@@ -67,11 +28,34 @@ let package = Package(
"ggml-cuda.h",
"Makefile"
],
sources: sources,
resources: resources,
sources: [
"ggml.c",
"llama.cpp",
"unicode.cpp",
"unicode-data.cpp",
"ggml-alloc.c",
"ggml-backend.c",
"ggml-quants.c",
"ggml-metal.m",
],
resources: [
.process("ggml-metal.metal")
],
publicHeadersPath: "spm-headers",
cSettings: cSettings,
linkerSettings: linkerSettings
cSettings: [
.unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
.define("GGML_USE_ACCELERATE"),
.unsafeFlags(["-fno-objc-arc"]),
.define("GGML_USE_METAL"),
// NOTE: NEW_LAPACK will required iOS version 16.4+
// We should consider add this in the future when we drop support for iOS 14
// (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
// .define("ACCELERATE_NEW_LAPACK"),
// .define("ACCELERATE_LAPACK_ILP64")
],
linkerSettings: [
.linkedFramework("Accelerate")
]
)
],
cxxLanguageStandard: .cxx11

View File

@@ -3,14 +3,14 @@
- [Background](#background)
- [News](#news)
- [OS](#os)
- [Hardware](#hardware)
- [Supported Devices](#supported-devices)
- [Docker](#docker)
- [Linux](#linux)
- [Windows](#windows)
- [Environment Variable](#environment-variable)
- [Known Issue](#known-issues)
- [Q&A](#qa)
- [TODO](#todo)
- [Known Issue](#known-issue)
- [Q&A](#q&a)
- [Todo](#todo)
## Background
@@ -24,20 +24,19 @@
- **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.
### Llama.cpp + SYCL
This SYCL "backend" follows the same design found in other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, CLBlast etc..*. The oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.
The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based on the cross-platform feature of SYCL, it could support other vendor GPUs: Nvidia GPU (*AMD GPU coming*).
The llama.cpp SYCL backend supports:
- Intel GPUs.
- Nvidia GPUs.
When targeting **Intel CPU**, it is recommended to use llama.cpp for [Intel oneMKL](README.md#intel-onemkl) backend.
*Upcoming support: AMD GPUs*.
It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, CLBlast etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.
When targetting **Intel CPUs**, it is recommended to use llama.cpp for [x86_64](README.md#intel-onemkl) approach.
## News
- 2024.4
- Support data types: GGML_TYPE_IQ4_NL, GGML_TYPE_IQ4_XS, GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ3_S, GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M.
- 2024.3
- Release binary files of Windows.
- A blog is published: **Run LLM on all Intel GPUs Using llama.cpp**: [intel.com](https://www.intel.com/content/www/us/en/developer/articles/technical/run-llm-on-all-gpus-using-llama-cpp-artical.html) or [medium.com](https://medium.com/@jianyu_neo/run-llm-on-all-intel-gpus-using-llama-cpp-fd2e2dcbd9bd).
- New base line is ready: [tag b2437](https://github.com/ggerganov/llama.cpp/tree/b2437).
- Support multiple cards: **--split-mode**: [none|layer]; not support [row], it's on developing.
@@ -54,48 +53,57 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
## OS
| OS | Status | Verified |
|---------|---------|------------------------------------------------|
| Linux | Support | Ubuntu 22.04, Fedora Silverblue 39, Arch Linux |
| Windows | Support | Windows 11 |
|OS|Status|Verified|
|-|-|-|
|Linux|Support|Ubuntu 22.04, Fedora Silverblue 39|
|Windows|Support|Windows 11|
## Hardware
## Supported devices
### Intel GPU
### Intel GPUs
**Verified devices**
The oneAPI Math Kernel Library, which the oneAPI base-toolkit includes, supports intel GPUs. In order to make it "visible", simply run the following:
```sh
source /opt/intel/oneapi/setvars.sh
```
| Intel GPU | Status | Verified Model |
|-------------------------------|---------|---------------------------------------|
| Intel Data Center Max Series | Support | Max 1550, 1100 |
| Intel Data Center Flex Series | Support | Flex 170 |
| Intel Arc Series | Support | Arc 770, 730M, Arc A750 |
| Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake |
| Intel iGPU | Support | iGPU in i5-1250P, i7-1260P, i7-1165G7 |
- **Tested devices**
|Intel GPU| Status | Verified Model|
|-|-|-|
|Intel Data Center Max Series| Support| Max 1550|
|Intel Data Center Flex Series| Support| Flex 170|
|Intel Arc Series| Support| Arc 770, 730M|
|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
|Intel iGPU| Support| iGPU in i5-1250P, i7-1260P, i7-1165G7|
*Notes:*
- **Memory**
- The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/main`.
- Device memory can be a limitation when running a large model on an intel GPU. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/main`.
- Please make sure the GPU shared memory from the host is large enough to account for the model's size. For e.g. the *llama-2-7b.Q4_0* requires at least 8.0GB for integrated GPU and 4.0GB for discrete GPU.
- Please make sure the GPU shared memory from the host is large enough to account for the model's size. For e.g. the *llama-2-7b.Q4_0* requires at least 8.0GB for integrated GPUs and 4.0GB for discrete GPUs.
- **Execution Unit (EU)**
- If the iGPU has less than 80 EUs, the inference speed will likely be too slow for practical use.
- If the iGPU has less than 80 EUs *(Execution Unit)*, the inference speed will likely be too slow for practical use.
### Other Vendor GPU
### Nvidia GPUs
The BLAS acceleration on Nvidia GPUs through oneAPI can be obtained using the Nvidia plugins for oneAPI and the cuBLAS backend of the upstream oneMKL library. Details and instructions on how to setup the runtime and library can be found in [this section](#i-setup-environment)
**Verified devices**
- **Tested devices**
|Nvidia GPU| Status | Verified Model|
|-|-|-|
|Ampere Series| Support| A100, A4000|
|Ampere Series *(Mobile)*| Support| RTX 40 Series|
*Notes:*
- Support for Nvidia targets through oneAPI is currently limited to Linux platforms.
- Please make sure the native oneAPI MKL *(dedicated to intel CPUs and GPUs)* is not "visible" at this stage to properly setup and use the built-from-source oneMKL with cuBLAS backend in llama.cpp for Nvidia GPUs.
| Nvidia GPU | Status | Verified Model |
|--------------------------|---------|----------------|
| Ampere Series | Support | A100, A4000 |
| Ampere Series *(Mobile)* | Support | RTX 40 Series |
## Docker
The docker build option is currently limited to *intel GPU* targets.
### Build image
```sh
# Using FP16
@@ -161,11 +169,30 @@ Platform #0: Intel(R) OpenCL HD Graphics
- **Nvidia GPU**
In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cuda)-* are installed.
In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cublas)-* are installed.
Installation can be verified by running the following:
```sh
nvidia-smi
```
Please make sure at least one CUDA device is available, which can be displayed like this *(here an A100-40GB Nvidia GPU)*:
```
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.54.03 Driver Version: 535.54.03 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA A100-PCIE-40GB On | 00000000:8D:00.0 Off | 0 |
| N/A 36C P0 57W / 250W | 4MiB / 40960MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+----------------------+----------------------+
```
2. **Install Intel® oneAPI Base toolkit**
- **For Intel GPU**
- **Base installation**
The base toolkit can be obtained from the official [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) page.
@@ -177,16 +204,17 @@ Upon a successful installation, SYCL is enabled for the available intel devices,
- **Adding support to Nvidia GPUs**
**oneAPI Plugin**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). User should also make sure the plugin version matches the installed base toolkit one *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup.
**oneAPI**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). User should also make sure the plugin version matches the installed base toolkit one *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup.
**oneMKL for cuBlas**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* do not contain the cuBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *cuBLAS* backend enabled is thus required to run it on Nvidia GPUs.
**oneMKL**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* do not contain the cuBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *cuBLAS* backend enabled is thus required to run it on Nvidia GPUs.
```sh
git clone https://github.com/oneapi-src/oneMKL
cd oneMKL
cmake -B buildWithCublas -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_CUBLAS_BACKEND=ON -DTARGET_DOMAINS=blas
cmake --build buildWithCublas --config Release
mkdir -p buildWithCublas && cd buildWithCublas
cmake ../ -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_CUBLAS_BACKEND=ON -DTARGET_DOMAINS=blas
make
```
@@ -211,7 +239,7 @@ When targeting an intel GPU, the user should expect one or more level-zero devic
- **Nvidia GPU**
Similarly, user targeting Nvidia GPUs should expect at least one SYCL-CUDA device [`ext_oneapi_cuda:gpu`] as bellow:
Similarly, user targetting Nvidia GPUs should expect at least one SYCL-CUDA device [`ext_oneapi_cuda:gpu`] as bellow:
```
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.12.0.12_195853.xmain-hotfix]
[opencl:cpu:1] Intel(R) OpenCL, Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz OpenCL 3.0 (Build 0) [2023.16.12.0.12_195853.xmain-hotfix]
@@ -226,15 +254,13 @@ Similarly, user targeting Nvidia GPUs should expect at least one SYCL-CUDA devic
source /opt/intel/oneapi/setvars.sh
# Build LLAMA with MKL BLAS acceleration for intel GPU
mkdir -p build && cd build
# Option 1: Use FP32 (recommended for better performance in most cases)
cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
# Option 1: Use FP16 for better performance in long-prompt inference
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
# Option 2: Use FP16
cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
# build all binary
cmake --build build --config Release -j -v
# Option 2: Use FP32 by default
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
```
#### Nvidia GPU
@@ -246,23 +272,20 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithCublas/include:$CPLUS_INCLUDE_
export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
# Build LLAMA with Nvidia BLAS acceleration through SYCL
mkdir -p build && cd build
# Option 1: Use FP32 (recommended for better performance in most cases)
cmake -B build -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
# Option 2: Use FP16
cmake -B build -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
# build all binary
cmake --build build --config Release -j -v
# Option 1: Use FP16 for better performance in long-prompt inference
cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
# Option 2: Use FP32 by default
cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
```
### III. Run the inference
1. Retrieve and prepare model
You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example.
You can refer to the general [*Prepare and Quantize*](README#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example.
2. Enable oneAPI running environment
@@ -291,10 +314,10 @@ found 6 SYCL devices:
| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616|
```
| Attribute | Note |
|------------------------|-------------------------------------------------------------|
| compute capability 1.3 | Level-zero driver/runtime, recommended |
| compute capability 3.0 | OpenCL driver/runtime, slower than level-zero in most cases |
|Attribute|Note|
|-|-|
|compute capability 1.3|Level-zero driver/runtime, recommended |
|compute capability 3.0|OpenCL driver/runtime, slower than level-zero in most cases|
4. Launch inference
@@ -303,10 +326,10 @@ There are two device selection modes:
- Single device: Use one device target specified by the user.
- Multiple devices: Automatically select the devices with the same largest Max compute-units.
| Device selection | Parameter |
|------------------|----------------------------------------|
| Single device | --split-mode none --main-gpu DEVICE_ID |
| Multiple devices | --split-mode layer (default) |
|Device selection|Parameter|
|-|-|
|Single device|--split-mode none --main-gpu DEVICE_ID |
|Multiple devices|--split-mode layer (default)|
Examples:
@@ -335,6 +358,7 @@ Otherwise, you can run the script:
*Notes:*
- By default, `mmap` is used to read the model file. In some cases, it causes runtime hang issues. Please disable it by passing `--no-mmap` to the `/bin/main` if faced with the issue.
- Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follow:
```sh
@@ -409,15 +433,13 @@ b. Download & install mingw-w64 make for Windows provided by w64devkit
On the oneAPI command line window, step into the llama.cpp main directory and run the following:
```
mkdir -p build
cd build
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
# Option 1: Use FP32 (recommended for better performance in most cases)
cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
# Option 2: Or FP16
cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
cmake --build build --config Release -j
make
```
Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former instructions:
@@ -465,10 +487,10 @@ found 6 SYCL devices:
```
| Attribute | Note |
|------------------------|-----------------------------------------------------------|
| compute capability 1.3 | Level-zero running time, recommended |
| compute capability 3.0 | OpenCL running time, slower than level-zero in most cases |
|Attribute|Note|
|-|-|
|compute capability 1.3|Level-zero running time, recommended |
|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
4. Launch inference
@@ -478,10 +500,10 @@ There are two device selection modes:
- Single device: Use one device assigned by user.
- Multiple devices: Automatically choose the devices with the same biggest Max compute units.
| Device selection | Parameter |
|------------------|----------------------------------------|
| Single device | --split-mode none --main-gpu DEVICE_ID |
| Multiple devices | --split-mode layer (default) |
|Device selection|Parameter|
|-|-|
|Single device|--split-mode none --main-gpu DEVICE_ID |
|Multiple devices|--split-mode layer (default)|
Examples:
@@ -504,6 +526,7 @@ Otherwise, run the following wrapper script:
Note:
- By default, `mmap` is used to read the model file. In some cases, it causes runtime hang issues. Please disable it by passing `--no-mmap` to the `main.exe` if faced with the issue.
- Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follow:
```sh
@@ -518,23 +541,29 @@ use 1 SYCL GPUs: [0] with Max compute units:512
#### Build
| Name | Value | Function |
|--------------------|-----------------------------------|---------------------------------------------|
| LLAMA_SYCL | ON (mandatory) | Enable build with SYCL code path. |
| LLAMA_SYCL_TARGET | INTEL *(default)* \| NVIDIA | Set the SYCL target device type. |
| LLAMA_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. |
| CMAKE_C_COMPILER | icx | Set *icx* compiler for SYCL code path. |
| CMAKE_CXX_COMPILER | icpx *(Linux)*, icx *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |
|Name|Value|Function|
|-|-|-|
|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path.|
|LLAMA_SYCL_TARGET | INTEL *(default)* \| NVIDIA|Set the SYCL target device type.|
|LLAMA_SYCL_F16|OFF *(default)* \|ON *(optional)*|Enable FP16 build with SYCL code path.|
|CMAKE_C_COMPILER|icx|Set *icx* compiler for SYCL code path.|
|CMAKE_CXX_COMPILER|icpx *(Linux)*, icx *(Windows)*|Set `icpx/icx` compiler for SYCL code path.|
#### Runtime
| Name | Value | Function |
|-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
| GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG |
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
|Name|Value|Function|
|-|-|-|
|GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
|ZES_ENABLE_SYSMAN| 0 (default) or 1|Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer|
## Known Issues
- Hanging during startup
llama.cpp uses *mmap* as the default mode for reading the model file and copying it to the GPU. In some systems, `memcpy` might behave abnormally and therefore hang.
- **Solution**: add `--no-mmap` or `--mmap 0` flag to the `main` executable.
- `Split-mode:[row]` is not supported.
## Q&A
@@ -546,7 +575,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
- General compiler error:
- Remove **build** folder or try a clean-build.
- Remove build folder or try a clean-build.
- I can **not** see `[ext_oneapi_level_zero:gpu]` afer installing the GPU driver on Linux.
@@ -563,6 +592,6 @@ use 1 SYCL GPUs: [0] with Max compute units:512
### **GitHub contribution**:
Please add the **[SYCL]** prefix/tag in issues/PRs titles to help the SYCL-team check/address them without delay.
## TODO
## Todo
- Support row layer split for multiple card runs.

364
README.md
View File

@@ -3,8 +3,6 @@
![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
[![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg?branch=master&event=schedule)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
[![Conan Center](https://shields.io/conan/v/llama-cpp)](https://conan.io/center/llama-cpp)
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
@@ -12,8 +10,6 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
### Recent API changes
- [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807
- [2024 Apr 4] State and session file functions reorganized under `llama_state_*` https://github.com/ggerganov/llama.cpp/pull/6341
- [2024 Mar 26] Logits and embeddings API updated for compactness https://github.com/ggerganov/llama.cpp/pull/6122
- [2024 Mar 13] Add `llama_synchronize()` + `llama_context_params.n_ubatch` https://github.com/ggerganov/llama.cpp/pull/6017
- [2024 Mar 8] `llama_kv_cache_seq_rm()` returns a `bool` instead of `void`, and new `llama_n_seq_max()` returns the upper limit of acceptable `seq_id` in batches (relevant when dealing with multiple sequences) https://github.com/ggerganov/llama.cpp/pull/5328
@@ -22,16 +18,12 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
### Hot topics
- **`convert.py` has been deprecated and moved to `examples/convert-legacy-llama.py`, please use `convert-hf-to-gguf.py`** https://github.com/ggerganov/llama.cpp/pull/7430
- Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021
- BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920
- MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387
- Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404
- Fix major bug in Metal batched inference https://github.com/ggerganov/llama.cpp/pull/6225
- Multi-GPU pipeline parallelism support https://github.com/ggerganov/llama.cpp/pull/6017
- Multi-GPU pipeline parallelizm support https://github.com/ggerganov/llama.cpp/pull/6017
- Looking for contributions to add Deepseek support: https://github.com/ggerganov/llama.cpp/issues/5981
- Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962
- Initial Mamba support has been added: https://github.com/ggerganov/llama.cpp/pull/5328
- Support loading sharded model, using `gguf-split` CLI https://github.com/ggerganov/llama.cpp/pull/6187
----
@@ -98,11 +90,9 @@ Typically finetunes of the base models below are supported as well.
- [X] LLaMA 🦙
- [x] LLaMA 2 🦙🦙
- [x] LLaMA 3 🦙🦙🦙
- [X] [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1)
- [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
- [x] [DBRX](https://huggingface.co/databricks/dbrx-instruct)
- [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon)
- [X] Falcon
- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
- [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
- [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
@@ -110,6 +100,7 @@ Typically finetunes of the base models below are supported as well.
- [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila)
- [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187)
- [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim)
- [X] [Persimmon 8B](https://github.com/ggerganov/llama.cpp/pull/3410)
- [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417)
- [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553)
- [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi)
@@ -124,15 +115,8 @@ Typically finetunes of the base models below are supported as well.
- [x] [CodeShell](https://github.com/WisdomShell/codeshell)
- [x] [Gemma](https://ai.google.dev/gemma)
- [x] [Mamba](https://github.com/state-spaces/mamba)
- [x] [Grok-1](https://huggingface.co/keyfan/grok-1-hf)
- [x] [Xverse](https://huggingface.co/models?search=xverse)
- [x] [Command-R models](https://huggingface.co/models?search=CohereForAI/c4ai-command-r)
- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
- [x] [OLMo](https://allenai.org/olmo)
- [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)
(instructions for supporting more models: [HOWTO-add-model.md](./docs/HOWTO-add-model.md))
- [x] [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-v01)
**Multimodal models:**
@@ -142,16 +126,11 @@ Typically finetunes of the base models below are supported as well.
- [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V)
- [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)
- [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
- [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
- [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
- [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)
**HTTP server**
[llama.cpp web server](./examples/server) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients.
[simplechat](./examples/server/public_simplechat) is a simple chat client, which can be used to chat with the model exposed using above web server (use --path to point to simplechat), from a local web browser.
**Bindings:**
- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
@@ -161,7 +140,6 @@ Typically finetunes of the base models below are supported as well.
- JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm)
- Typescript/Wasm (nicer API, available on npm): [ngxson/wllama](https://github.com/ngxson/wllama)
- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
- Rust (more features): [edgenai/llama_cpp-rs](https://github.com/edgenai/llama_cpp-rs)
- Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
- Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
- C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
@@ -182,7 +160,6 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [nat/openplayground](https://github.com/nat/openplayground)
- [Faraday](https://faraday.dev/) (proprietary)
- [LMStudio](https://lmstudio.ai/) (proprietary)
- [Layla](https://play.google.com/store/apps/details?id=com.laylalite) (proprietary)
- [LocalAI](https://github.com/mudler/LocalAI) (MIT)
- [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)
@@ -200,19 +177,9 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [Msty](https://msty.app) (proprietary)
- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file)(Apachev2.0 or later)
- [Dot](https://github.com/alexpinel/Dot) (GPL)
- [MindMac](https://mindmac.app) (proprietary)
- [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
- [AIKit](https://github.com/sozercan/aikit) (MIT)
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
**Tools:**
- [akx/ggify](https://github.com/akx/ggify) download PyTorch models from HuggingFace Hub and convert them to GGML
---
Here is a typical run using LLaMA v2 13B on M2 Ultra:
@@ -312,7 +279,7 @@ cd llama.cpp
### Build
In order to build llama.cpp you have four different options.
In order to build llama.cpp you have three different options.
- Using `make`:
- On Linux or MacOS:
@@ -332,37 +299,14 @@ In order to build llama.cpp you have four different options.
make
```
- Notes:
- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel.
- For faster repeated compilation, install [ccache](https://ccache.dev/).
- For debug builds, run `make LLAMA_DEBUG=1`
- Using `CMake`:
```bash
cmake -B build
cmake --build build --config Release
```
**Notes**:
- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
- For faster repeated compilation, install [ccache](https://ccache.dev/).
- For debug builds, there are two cases:
1. Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
```bash
cmake -B build -DCMAKE_BUILD_TYPE=Debug
cmake --build build
```
2. Multi-config generators (`-G` param set to Visual Studio, XCode...):
```bash
cmake -B build -G "Xcode"
cmake --build build --config Debug
```
```bash
mkdir build
cd build
cmake ..
cmake --build . --config Release
```
- Using `Zig` (version 0.11 or later):
@@ -392,14 +336,6 @@ In order to build llama.cpp you have four different options.
CLBLAST support for use OpenCL GPU acceleration in FreeBSD. Please read
the instructions for use and activate this options in this document below.
### Homebrew
On Mac and Linux, the homebrew package manager can be used via
```
brew install llama.cpp
```
The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggerganov/llama.cpp/discussions/7668
### Metal Build
On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
@@ -408,6 +344,45 @@ To disable the Metal build at compile time use the `LLAMA_NO_METAL=1` flag or th
When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line
argument.
### MPI Build
MPI lets you distribute the computation over a cluster of machines. Because of the serial nature of LLM prediction, this won't yield any end-to-end speed-ups, but it will let you run larger models than would otherwise fit into RAM on a single machine.
First you will need MPI libraries installed on your system. The two most popular (only?) options are [MPICH](https://www.mpich.org) and [OpenMPI](https://www.open-mpi.org). Either can be installed with a package manager (`apt`, Homebrew, MacPorts, etc).
Next you will need to build the project with `LLAMA_MPI` set to true on all machines; if you're building with `make`, you will also need to specify an MPI-capable compiler (when building with CMake, this is configured automatically):
- Using `make`:
```bash
make CC=mpicc CXX=mpicxx LLAMA_MPI=1
```
- Using `CMake`:
```bash
cmake -S . -B build -DLLAMA_MPI=ON
```
Once the programs are built, download/convert the weights on all of the machines in your cluster. The paths to the weights and programs should be identical on all machines.
Next, ensure password-less SSH access to each machine from the primary host, and create a `hostfile` with a list of the hostnames and their relative "weights" (slots). If you want to use localhost for computation, use its local subnet IP address rather than the loopback address or "localhost".
Here is an example hostfile:
```
192.168.0.1:2
malvolio.local:1
```
The above will distribute the computation across 2 processes on the first host and 1 process on the second host. Each process will use roughly an equal amount of RAM. Try to keep these numbers small, as inter-process (intra-host) communication is expensive.
Finally, you're ready to run a computation using `mpirun`:
```bash
mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
```
### BLAS Build
Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS and CLBlast. There are currently several different BLAS implementations available for build and use:
@@ -444,8 +419,10 @@ Building the program with BLAS support may lead to some performance improvements
- Using `CMake` on Linux:
```bash
cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
cmake --build build --config Release
mkdir build
cd build
cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
cmake --build . --config Release
```
- #### BLIS
@@ -465,9 +442,11 @@ Building the program with BLAS support may lead to some performance improvements
- Using manual oneAPI installation:
By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. Otherwise please install oneAPI and follow the below steps:
```bash
mkdir build
cd build
source /opt/intel/oneapi/setvars.sh # You can skip this step if in oneapi-basekit docker image, only required for manual installation
cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON
cmake --build build --config Release
cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON
cmake --build . --config Release
```
- Using oneAPI docker image:
@@ -488,28 +467,28 @@ Building the program with BLAS support may lead to some performance improvements
- Using `CMake`:
```bash
cmake -B build -DLLAMA_CUDA=ON
cmake --build build --config Release
mkdir build
cd build
cmake .. -DLLAMA_CUDA=ON
cmake --build . --config Release
```
The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
| Option | Legal values | Default | Description |
|--------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
| LLAMA_CUDA_FORCE_MMQ | Boolean | false | Force the use of dequantization + matrix multiplication kernels instead of leveraging Math libraries. | |
| LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
| LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
| LLAMA_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. |
| Option | Legal values | Default | Description |
|--------------------------------|------------------------|---------|-------------|
| LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
| LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
| LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
- #### hipBLAS
This provides BLAS acceleration on HIP-supported AMD GPUs.
Make sure to have ROCm installed.
You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html#rocm-install-quick).
You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html).
- Using `make`:
```bash
@@ -517,38 +496,25 @@ Building the program with BLAS support may lead to some performance improvements
```
- Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
```bash
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
cmake -S . -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
&& cmake --build build --config Release -- -j 16
```
On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON`.
However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
Note that if you get the following error:
```
clang: error: cannot find ROCm device library; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library
```
Try searching for a directory under `HIP_PATH` that contains the file
`oclc_abi_version_400.bc`. Then, add the following to the start of the
command: `HIP_DEVICE_LIB_PATH=<directory-you-just-found>`, so something
like:
```bash
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \
HIP_DEVICE_LIB_PATH=<directory-you-just-found> \
cmake -S . -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ \
cmake -H. -Bbuild -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
&& cmake --build build -- -j 16
```
On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON"`.
However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
- Using `make` (example for target gfx1030, build with 16 CPU threads):
```bash
make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gxf1030
```
- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
```bash
set PATH=%HIP_PATH%\bin;%PATH%
cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
cmake --build build
mkdir build
cd build
cmake -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ ..
cmake --build .
```
Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
Find your gpu version string by matching the most significant version information from `rocminfo | grep gfx | head -1 | awk '{print $2}'` with the list of processors, e.g. `gfx1035` maps to `gfx1030`.
@@ -558,18 +524,18 @@ Building the program with BLAS support may lead to some performance improvements
If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
| Option | Legal values | Default | Description |
|-------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
| Option | Legal values | Default | Description |
|-------------------------|------------------------|---------|-------------|
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
- #### CLBlast
OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU.
You will need the [OpenCL SDK](https://github.com/KhronosGroup/OpenCL-SDK).
- For Ubuntu, Debian, and Fedora the packages `opencl-headers`, `ocl-icd` may be needed.
- For Ubuntu or Debian, the packages `opencl-headers`, `ocl-icd` may be needed.
- For Windows, a pre-built SDK is available on the [OpenCL Releases](https://github.com/KhronosGroup/OpenCL-SDK/releases) page.
@@ -578,14 +544,15 @@ Building the program with BLAS support may lead to some performance improvements
```sh
git clone --recurse-submodules https://github.com/KhronosGroup/OpenCL-SDK.git
cd OpenCL-SDK
cmake -B build -DBUILD_DOCS=OFF \
mkdir OpenCL-SDK/build
cd OpenCL-SDK/build
cmake .. -DBUILD_DOCS=OFF \
-DBUILD_EXAMPLES=OFF \
-DBUILD_TESTING=OFF \
-DOPENCL_SDK_BUILD_SAMPLES=OFF \
-DOPENCL_SDK_TEST_SAMPLES=OFF
cmake --build build
cmake --install build --prefix /some/path
cmake --build . --config Release
cmake --install . --prefix /some/path
```
</details>
@@ -593,12 +560,6 @@ Building the program with BLAS support may lead to some performance improvements
Pre-built CLBlast binaries may be found on the [CLBlast Releases](https://github.com/CNugteren/CLBlast/releases) page. For Unix variants, it may also be found in your operating system's packages.
Linux packaging:
Fedora Linux:
```bash
sudo dnf install clblast
```
Alternatively, they may be built from source.
- <details>
@@ -607,23 +568,23 @@ Building the program with BLAS support may lead to some performance improvements
```cmd
set OPENCL_SDK_ROOT="C:/OpenCL-SDK-v2023.04.17-Win-x64"
git clone https://github.com/CNugteren/CLBlast.git
cd CLBlast
cmake -B build -DBUILD_SHARED_LIBS=OFF -DOVERRIDE_MSVC_FLAGS_TO_MT=OFF -DTUNERS=OFF -DOPENCL_ROOT=%OPENCL_SDK_ROOT% -G "Visual Studio 17 2022" -A x64
cmake --build build --config Release
cmake --install build --prefix C:/CLBlast
mkdir CLBlast\build
cd CLBlast\build
cmake .. -DBUILD_SHARED_LIBS=OFF -DOVERRIDE_MSVC_FLAGS_TO_MT=OFF -DTUNERS=OFF -DOPENCL_ROOT=%OPENCL_SDK_ROOT% -G "Visual Studio 17 2022" -A x64
cmake --build . --config Release
cmake --install . --prefix C:/CLBlast
```
(note: `--config Release` at build time is the default and only relevant for Visual Studio builds - or multi-config Ninja builds)
- <details>
<summary>Unix:</summary>
```sh
git clone https://github.com/CNugteren/CLBlast.git
cd CLBlast
cmake -B build -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF
cmake --build build --config Release
cmake --install build --prefix /some/path
mkdir CLBlast/build
cd CLBlast/build
cmake .. -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF
cmake --build . --config Release
cmake --install . --prefix /some/path
```
Where `/some/path` is where the built library will be installed (default is `/usr/local`).
@@ -637,17 +598,21 @@ Building the program with BLAS support may lead to some performance improvements
```
- CMake (Unix):
```sh
cmake -B build -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
cmake --build build --config Release
mkdir build
cd build
cmake .. -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
cmake --build . --config Release
```
- CMake (Windows):
```cmd
set CL_BLAST_CMAKE_PKG="C:/CLBlast/lib/cmake/CLBlast"
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
cmake -B build -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=%CL_BLAST_CMAKE_PKG% -G "Visual Studio 17 2022" -A x64
cmake --build build --config Release
cmake --install build --prefix C:/LlamaCPP
mkdir build
cd build
cmake .. -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=%CL_BLAST_CMAKE_PKG% -G "Visual Studio 17 2022" -A x64
cmake --build . --config Release
cmake --install . --prefix C:/LlamaCPP
```
##### Running Llama with CLBlast
@@ -703,8 +668,10 @@ Building the program with BLAS support may lead to some performance improvements
Then, build llama.cpp using the cmake command below:
```bash
cmake -B build -DLLAMA_VULKAN=1
cmake --build build --config Release
mkdir -p build
cd build
cmake .. -DLLAMA_VULKAN=1
cmake --build . --config Release
# Test the output binary (with "-ngl 33" to offload all layers to GPU)
./bin/main -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
@@ -714,14 +681,8 @@ Building the program with BLAS support may lead to some performance improvements
### Prepare and Quantize
> [!NOTE]
> You can use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to quantise your model weights without any setup too. It is synced from `llama.cpp` main every 6 hours.
To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
Note: `convert.py` has been moved to `examples/convert-legacy-llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derievatives.
It does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.
```bash
# obtain the official LLaMA model weights and place them in ./models
ls ./models
@@ -737,10 +698,10 @@ ls ./models
python3 -m pip install -r requirements.txt
# convert the model to ggml FP16 format
python3 convert-hf-to-gguf.py models/mymodel/
python3 convert.py models/mymodel/
# [Optional] for models using BPE tokenizers
python convert-hf-to-gguf.py models/mymodel/ --vocab-type bpe
python convert.py models/mymodel/ --vocab-type bpe
# quantize the model to 4-bits (using Q4_K_M method)
./quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M
@@ -775,11 +736,11 @@ From the unzipped folder, open a terminal/cmd window here and place a pre-conver
As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same.
| Model | Original size | Quantized size (Q4_0) |
|------:|--------------:|----------------------:|
| 7B | 13 GB | 3.9 GB |
| 13B | 24 GB | 7.8 GB |
| 30B | 60 GB | 19.5 GB |
| 65B | 120 GB | 38.5 GB |
|------:|--------------:|-----------------------:|
| 7B | 13 GB | 3.9 GB |
| 13B | 24 GB | 7.8 GB |
| 30B | 60 GB | 19.5 GB |
| 65B | 120 GB | 38.5 GB |
### Quantization
@@ -787,7 +748,7 @@ Several quantization methods are supported. They differ in the resulting model d
*(outdated)*
| Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 |
| Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 |
|------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|
| 7B | perplexity | 5.9066 | 6.1565 | 6.0912 | 5.9862 | 5.9481 | 5.9070 |
| 7B | file size | 13.0G | 3.5G | 3.9G | 4.3G | 4.7G | 6.7G |
@@ -943,25 +904,17 @@ If your issue is with model generation quality, then please at least scan the fo
### Android
#### Build on Android using Termux
[Termux](https://github.com/termux/termux-app#installation) is a method to execute `llama.cpp` on an Android device (no root required).
```
apt update && apt upgrade -y
apt install git make cmake
```
It's recommended to move your model inside the `~/` directory for best performance:
```
cd storage/downloads
mv model.gguf ~/
```
[Get the code](https://github.com/ggerganov/llama.cpp#get-the-code) & [follow the Linux build instructions](https://github.com/ggerganov/llama.cpp#build) to build `llama.cpp`.
#### Building the Project using Android NDK
Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
You can easily run `llama.cpp` on Android device with [termux](https://termux.dev/).
First, install the essential packages for termux:
```
pkg install clang wget git cmake
```
Second, obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake:
You can execute the following commands on your computer to avoid downloading the NDK to your mobile. Of course, you can also do this in Termux.
Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux:
```
$ mkdir build-android
$ cd build-android
@@ -969,9 +922,7 @@ $ export NDK=<your_ndk_directory>
$ cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
$ make
```
Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice).
Install [termux](https://termux.dev/) on your device and run `termux-setup-storage` to get access to your SD card.
Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission:
(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`)
@@ -993,10 +944,53 @@ $cd /data/data/com.termux/files/home/bin
$./main -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml
```
Here's a demo of an interactive session running on Pixel 5 phone:
Here is a demo of an interactive session running on Pixel 5 phone:
https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
#### Building the Project using Termux (F-Droid)
Termux from F-Droid offers an alternative route to execute the project on an Android device. This method empowers you to construct the project right from within the terminal, negating the requirement for a rooted device or SD Card.
Outlined below are the directives for installing the project using OpenBLAS and CLBlast. This combination is specifically designed to deliver peak performance on recent devices that feature a GPU.
If you opt to utilize OpenBLAS, you'll need to install the corresponding package.
```
apt install libopenblas
```
Subsequently, if you decide to incorporate CLBlast, you'll first need to install the requisite OpenCL packages:
```
apt install ocl-icd opencl-headers opencl-clhpp clinfo
```
In order to compile CLBlast, you'll need to first clone the respective Git repository, which can be found at this URL: https://github.com/CNugteren/CLBlast. Alongside this, clone this repository into your home directory. Once this is done, navigate to the CLBlast folder and execute the commands detailed below:
```
cmake .
make
cp libclblast.so* $PREFIX/lib
cp ./include/clblast.h ../llama.cpp
```
Following the previous steps, navigate to the LlamaCpp directory. To compile it with OpenBLAS and CLBlast, execute the command provided below:
```
cp /data/data/com.termux/files/usr/include/openblas/cblas.h .
cp /data/data/com.termux/files/usr/include/openblas/openblas_config.h .
make LLAMA_CLBLAST=1 //(sometimes you need to run this command twice)
```
Upon completion of the aforementioned steps, you will have successfully compiled the project. To run it using CLBlast, a slight adjustment is required: a command must be issued to direct the operations towards your device's physical GPU, rather than the virtual one. The necessary command is detailed below:
```
GGML_OPENCL_PLATFORM=0
GGML_OPENCL_DEVICE=0
export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH
```
(Note: some Android devices, like the Zenfone 8, need the following command instead - "export LD_LIBRARY_PATH=/system/vendor/lib64:$LD_LIBRARY_PATH". Source: https://www.reddit.com/r/termux/comments/kc3ynp/opencl_working_in_termux_more_in_comments/ )
For easy and swift re-execution, consider documenting this final part in a .sh script file. This will enable you to rerun the process with minimal hassle.
Place your desired model into the `~/llama.cpp/models/` directory and execute the `./main (...)` script.
### Docker
#### Prerequisites
@@ -1102,9 +1096,7 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m
- Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
- See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
- Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$
![matmul](media/matmul.png)
- Matrix multiplication is unconventional: [`z = ggml_mul_mat(ctx, x, y)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means `zT = x @ yT`
### Docs

View File

@@ -1,67 +0,0 @@
# Security Policy
- [**Using llama.cpp securely**](#using-llamacpp-securely)
- [Untrusted models](#untrusted-models)
- [Untrusted inputs](#untrusted-inputs)
- [Data privacy](#data-privacy)
- [Untrusted environments or networks](#untrusted-environments-or-networks)
- [Multi-Tenant environments](#multi-tenant-environments)
- [**Reporting a vulnerability**](#reporting-a-vulnerability)
## Using llama.cpp securely
### Untrusted models
Be careful when running untrusted models. This classification includes models created by unknown developers or utilizing data obtained from unknown sources.
*Always execute untrusted models within a secure, isolated environment such as a sandbox* (e.g., containers, virtual machines). This helps protect your system from potentially malicious code.
> [!NOTE]
> The trustworthiness of a model is not binary. You must always determine the proper level of caution depending on the specific model and how it matches your use case and risk tolerance.
### Untrusted inputs
Some models accept various input formats (text, images, audio, etc.). The libraries converting these inputs have varying security levels, so it's crucial to isolate the model and carefully pre-process inputs to mitigate script injection risks.
For maximum security when handling untrusted inputs, you may need to employ the following:
* Sandboxing: Isolate the environment where the inference happens.
* Pre-analysis: Check how the model performs by default when exposed to prompt injection (e.g. using [fuzzing for prompt injection](https://github.com/FonduAI/awesome-prompt-injection?tab=readme-ov-file#tools)). This will give you leads on how hard you will have to work on the next topics.
* Updates: Keep both LLaMA C++ and your libraries updated with the latest security patches.
* Input Sanitation: Before feeding data to the model, sanitize inputs rigorously. This involves techniques such as:
* Validation: Enforce strict rules on allowed characters and data types.
* Filtering: Remove potentially malicious scripts or code fragments.
* Encoding: Convert special characters into safe representations.
* Verification: Run tooling that identifies potential script injections (e.g. [models that detect prompt injection attempts](https://python.langchain.com/docs/guides/safety/hugging_face_prompt_injection)).
### Data privacy
To protect sensitive data from potential leaks or unauthorized access, it is crucial to sandbox the model execution. This means running the model in a secure, isolated environment, which helps mitigate many attack vectors.
### Untrusted environments or networks
If you can't run your models in a secure and isolated environment or if it must be exposed to an untrusted network, make sure to take the following security precautions:
* Confirm the hash of any downloaded artifact (e.g. pre-trained model weights) matches a known-good value
* Encrypt your data if sending it over the network.
### Multi-Tenant environments
If you intend to run multiple models in parallel with shared memory, it is your responsibility to ensure the models do not interact or access each other's data. The primary areas of concern are tenant isolation, resource allocation, model sharing and hardware attacks.
1. Tenant Isolation: Models should run separately with strong isolation methods to prevent unwanted data access. Separating networks is crucial for isolation, as it prevents unauthorized access to data or models and malicious users from sending graphs to execute under another tenant's identity.
2. Resource Allocation: A denial of service caused by one model can impact the overall system health. Implement safeguards like rate limits, access controls, and health monitoring.
3. Model Sharing: In a multitenant model sharing design, tenants and users must understand the security risks of running code provided by others. Since there are no reliable methods to detect malicious models, sandboxing the model execution is the recommended approach to mitigate the risk.
4. Hardware Attacks: GPUs or TPUs can also be attacked. [Researches](https://scholar.google.com/scholar?q=gpu+side+channel) has shown that side channel attacks on GPUs are possible, which can make data leak from other models or processes running on the same system at the same time.
## Reporting a vulnerability
Beware that none of the topics under [Using llama.cpp securely](#using-llamacpp-securely) are considered vulnerabilities of LLaMA C++.
<!-- normal version -->
However, If you have discovered a security vulnerability in this project, please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released.
Please disclose it as a private [security advisory](https://github.com/ggerganov/llama.cpp/security/advisories/new).
A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure.

142
build.zig Normal file
View File

@@ -0,0 +1,142 @@
// Compatible with Zig Version 0.11.0
const std = @import("std");
const ArrayList = std.ArrayList;
const Compile = std.Build.Step.Compile;
const ConfigHeader = std.Build.Step.ConfigHeader;
const Mode = std.builtin.Mode;
const CrossTarget = std.zig.CrossTarget;
const Maker = struct {
builder: *std.build.Builder,
target: CrossTarget,
optimize: Mode,
enable_lto: bool,
include_dirs: ArrayList([]const u8),
cflags: ArrayList([]const u8),
cxxflags: ArrayList([]const u8),
objs: ArrayList(*Compile),
fn addInclude(m: *Maker, dir: []const u8) !void {
try m.include_dirs.append(dir);
}
fn addProjectInclude(m: *Maker, path: []const []const u8) !void {
try m.addInclude(try m.builder.build_root.join(m.builder.allocator, path));
}
fn addCFlag(m: *Maker, flag: []const u8) !void {
try m.cflags.append(flag);
}
fn addCxxFlag(m: *Maker, flag: []const u8) !void {
try m.cxxflags.append(flag);
}
fn addFlag(m: *Maker, flag: []const u8) !void {
try m.addCFlag(flag);
try m.addCxxFlag(flag);
}
fn init(builder: *std.build.Builder) !Maker {
const target = builder.standardTargetOptions(.{});
const zig_version = @import("builtin").zig_version_string;
const commit_hash = try std.ChildProcess.exec(
.{ .allocator = builder.allocator, .argv = &.{ "git", "rev-parse", "HEAD" } },
);
try std.fs.cwd().writeFile("common/build-info.cpp", builder.fmt(
\\int LLAMA_BUILD_NUMBER = {};
\\char const *LLAMA_COMMIT = "{s}";
\\char const *LLAMA_COMPILER = "Zig {s}";
\\char const *LLAMA_BUILD_TARGET = "{s}";
\\
, .{ 0, commit_hash.stdout[0 .. commit_hash.stdout.len - 1], zig_version, try target.allocDescription(builder.allocator) }));
var m = Maker{
.builder = builder,
.target = target,
.optimize = builder.standardOptimizeOption(.{}),
.enable_lto = false,
.include_dirs = ArrayList([]const u8).init(builder.allocator),
.cflags = ArrayList([]const u8).init(builder.allocator),
.cxxflags = ArrayList([]const u8).init(builder.allocator),
.objs = ArrayList(*Compile).init(builder.allocator),
};
try m.addCFlag("-std=c11");
try m.addCxxFlag("-std=c++11");
try m.addProjectInclude(&.{});
try m.addProjectInclude(&.{"common"});
return m;
}
fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile {
const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
if (o.target.getAbi() != .msvc)
o.defineCMacro("_GNU_SOURCE", null);
if (std.mem.endsWith(u8, src, ".c")) {
o.addCSourceFiles(&.{src}, m.cflags.items);
o.linkLibC();
} else {
o.addCSourceFiles(&.{src}, m.cxxflags.items);
if (o.target.getAbi() == .msvc) {
o.linkLibC(); // need winsdk + crt
} else {
// linkLibCpp already add (libc++ + libunwind + libc)
o.linkLibCpp();
}
}
for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
o.want_lto = m.enable_lto;
return o;
}
fn exe(m: *const Maker, name: []const u8, src: []const u8, deps: []const *Compile) *Compile {
const e = m.builder.addExecutable(.{ .name = name, .target = m.target, .optimize = m.optimize });
e.addCSourceFiles(&.{src}, m.cxxflags.items);
for (deps) |d| e.addObject(d);
for (m.objs.items) |o| e.addObject(o);
for (m.include_dirs.items) |i| e.addIncludePath(.{ .path = i });
// https://github.com/ziglang/zig/issues/15448
if (e.target.getAbi() == .msvc) {
e.linkLibC(); // need winsdk + crt
} else {
// linkLibCpp already add (libc++ + libunwind + libc)
e.linkLibCpp();
}
m.builder.installArtifact(e);
e.want_lto = m.enable_lto;
return e;
}
};
pub fn build(b: *std.build.Builder) !void {
var make = try Maker.init(b);
make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;
const ggml = make.obj("ggml", "ggml.c");
const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
const unicode = make.obj("unicode", "unicode.cpp");
const unicode_data = make.obj("unicode-data", "unicode-data.cpp");
const llama = make.obj("llama", "llama.cpp");
const buildinfo = make.obj("common", "common/build-info.cpp");
const common = make.obj("common", "common/common.cpp");
const console = make.obj("console", "common/console.cpp");
const sampling = make.obj("sampling", "common/sampling.cpp");
const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
const json_schema_to_grammar = make.obj("json-schema-to-grammar", "common/json-schema-to-grammar.cpp");
const train = make.obj("train", "common/train.cpp");
const clip = make.obj("clip", "examples/llava/clip.cpp");
const llava = make.obj("llava", "examples/llava/llava.cpp");
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, sampling, console, grammar_parser });
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, train });
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, train });
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, sampling, grammar_parser, json_schema_to_grammar, clip, llava });
if (server.target.isWindows()) {
server.linkSystemLibrary("ws2_32");
}
}

371
ci/run.sh
View File

@@ -153,64 +153,13 @@ function gg_sum_ctest_release {
gg_printf '```\n'
}
# test_scripts_debug
function gg_run_test_scripts_debug {
cd ${SRC}
set -e
(cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
(cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
set +e
}
function gg_sum_test_scripts_debug {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'Runs test scripts in debug mode\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '```\n'
gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)"
gg_printf '```\n'
gg_printf '\n'
}
# test_scripts_release
function gg_run_test_scripts_release {
cd ${SRC}
set -e
(cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
(cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
set +e
}
function gg_sum_test_scripts_release {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'Runs test scripts in release mode\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '```\n'
gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)"
gg_printf '```\n'
gg_printf '\n'
}
function gg_get_model {
local gguf_0="$MNT/models/pythia/1.4B/ggml-model-f16.gguf"
local gguf_1="$MNT/models/pythia/2.8B/ggml-model-f16.gguf"
local gguf_2="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
if [[ -s $gguf_0 ]]; then
echo -n "$gguf_0"
elif [[ -s $gguf_1 ]]; then
echo -n "$gguf_1"
elif [[ -s $gguf_2 ]]; then
echo -n "$gguf_2"
local gguf_3b="$MNT/models/open-llama/3B-v2/ggml-model-f16.gguf"
local gguf_7b="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
if [[ -s $gguf_3b ]]; then
echo -n "$gguf_3b"
elif [[ -s $gguf_7b ]]; then
echo -n "$gguf_7b"
else
echo >&2 "No model found. Can't run gg_run_ctest_with_model."
exit 1
@@ -259,169 +208,33 @@ function gg_sum_ctest_with_model_release {
gg_printf '```\n'
}
# open_llama_7b_v2
# requires: GG_BUILD_CUDA
# open_llama_3b_v2
function gg_run_open_llama_7b_v2 {
function gg_run_open_llama_3b_v2 {
cd ${SRC}
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/config.json
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/tokenizer.model
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/tokenizer_config.json
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/special_tokens_map.json
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/pytorch_model.bin.index.json
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00001-of-00002.bin
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00002-of-00002.bin
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/generation_config.json
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
path_models="../models-mnt/open-llama/7B-v2"
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
set -e
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../examples/convert-legacy-llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
model_q4_0="${path_models}/ggml-model-q4_0.gguf"
model_q4_1="${path_models}/ggml-model-q4_1.gguf"
model_q5_0="${path_models}/ggml-model-q5_0.gguf"
model_q5_1="${path_models}/ggml-model-q5_1.gguf"
model_q2_k="${path_models}/ggml-model-q2_k.gguf"
model_q3_k="${path_models}/ggml-model-q3_k.gguf"
model_q4_k="${path_models}/ggml-model-q4_k.gguf"
model_q5_k="${path_models}/ggml-model-q5_k.gguf"
model_q6_k="${path_models}/ggml-model-q6_k.gguf"
wiki_test="${path_wiki}/wiki.test.raw"
./bin/quantize ${model_f16} ${model_q8_0} q8_0
./bin/quantize ${model_f16} ${model_q4_0} q4_0
./bin/quantize ${model_f16} ${model_q4_1} q4_1
./bin/quantize ${model_f16} ${model_q5_0} q5_0
./bin/quantize ${model_f16} ${model_q5_1} q5_1
./bin/quantize ${model_f16} ${model_q2_k} q2_k
./bin/quantize ${model_f16} ${model_q3_k} q3_k
./bin/quantize ${model_f16} ${model_q4_k} q4_k
./bin/quantize ${model_f16} ${model_q5_k} q5_k
./bin/quantize ${model_f16} ${model_q6_k} q6_k
(time ./bin/main --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
(time ./bin/save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
function check_ppl {
qnt="$1"
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
return 20
fi
printf ' - %s @ %s OK\n' "$qnt" "$ppl"
return 0
}
check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
set +e
}
function gg_sum_open_llama_7b_v2 {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'OpenLLaMA 7B-v2:\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
}
# pythia_1.4b
function gg_run_pythia_1_4b {
cd ${SRC}
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/config.json
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer.json
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer_config.json
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/special_tokens_map.json
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/resolve/main/pytorch_model.bin
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
path_models="../models-mnt/pythia/1.4B"
path_models="../models-mnt/open-llama/3B-v2"
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
set -e
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
python3 ../convert.py ${path_models}
model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@@ -474,8 +287,7 @@ function gg_run_pythia_1_4b {
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
function check_ppl {
qnt="$1"
@@ -496,7 +308,7 @@ function gg_run_pythia_1_4b {
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
#check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
@@ -504,16 +316,58 @@ function gg_run_pythia_1_4b {
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
# lora
function compare_ppl {
qnt="$1"
ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
printf ' - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2"
return 20
fi
printf ' - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
return 0
}
path_lora="../models-mnt/open-llama/3B-v2/lora"
path_shakespeare="../models-mnt/shakespeare"
shakespeare="${path_shakespeare}/shakespeare.txt"
lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_config.json
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_model.bin
gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/shakespeare.txt
python3 ../convert-lora-to-ggml.py ${path_lora}
# f16
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
# q8_0
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
# q8_0 + f16 lora-base
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
set +e
}
function gg_sum_pythia_1_4b {
function gg_sum_open_llama_3b_v2 {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'Pythia 1.4B:\n'
gg_printf 'OpenLLaMA 3B-v2:\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
@@ -526,24 +380,32 @@ function gg_sum_pythia_1_4b {
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
}
# pythia_2_8b
# open_llama_7b_v2
# requires: GG_BUILD_CUDA
function gg_run_pythia_2_8b {
function gg_run_open_llama_7b_v2 {
cd ${SRC}
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/config.json
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer.json
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer_config.json
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/special_tokens_map.json
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/resolve/main/pytorch_model.bin
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/config.json
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/tokenizer.model
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/tokenizer_config.json
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/special_tokens_map.json
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/pytorch_model.bin.index.json
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00001-of-00002.bin
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00002-of-00002.bin
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/generation_config.json
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
path_models="../models-mnt/pythia/2.8B"
path_models="../models-mnt/open-llama/7B-v2"
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
@@ -553,7 +415,7 @@ function gg_run_pythia_2_8b {
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
python3 ../convert.py ${path_models}
model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@@ -606,10 +468,7 @@ function gg_run_pythia_2_8b {
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
(time ./bin/save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
function check_ppl {
qnt="$1"
@@ -630,7 +489,7 @@ function gg_run_pythia_2_8b {
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
#check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
@@ -638,16 +497,59 @@ function gg_run_pythia_2_8b {
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
# lora
function compare_ppl {
qnt="$1"
ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
printf ' - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2"
return 20
fi
printf ' - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
return 0
}
path_lora="../models-mnt/open-llama/7B-v2/lora"
path_shakespeare="../models-mnt/shakespeare"
shakespeare="${path_shakespeare}/shakespeare.txt"
lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_config.json
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_model.bin
gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/shakespeare.txt
python3 ../convert-lora-to-ggml.py ${path_lora}
# f16
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
# currently not supported by the CUDA backend
# q8_0
#(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
#(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
#compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
# q8_0 + f16 lora-base
#(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
#compare_ppl "q8_0 / f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
set +e
}
function gg_sum_pythia_2_8b {
function gg_sum_open_llama_7b_v2 {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'Pythia 2.8B:\n'
gg_printf 'OpenLLaMA 7B-v2:\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
@@ -660,6 +562,11 @@ function gg_sum_pythia_2_8b {
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
#gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
#gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
#gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
}
# bge-small
@@ -688,7 +595,7 @@ function gg_run_embd_bge_small {
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
python3 ../convert-hf-to-gguf.py ${path_models}
model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@@ -735,17 +642,11 @@ test $ret -eq 0 && gg_run ctest_release
if [ -z ${GG_BUILD_LOW_PERF} ]; then
test $ret -eq 0 && gg_run embd_bge_small
if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
test $ret -eq 0 && gg_run test_scripts_debug
test $ret -eq 0 && gg_run test_scripts_release
fi
if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
if [ -z ${GG_BUILD_CUDA} ]; then
test $ret -eq 0 && gg_run pythia_1_4b
test $ret -eq 0 && gg_run open_llama_3b_v2
else
test $ret -eq 0 && gg_run pythia_2_8b
#test $ret -eq 0 && gg_run open_llama_7b_v2
test $ret -eq 0 && gg_run open_llama_7b_v2
fi
test $ret -eq 0 && gg_run ctest_with_model_debug
test $ret -eq 0 && gg_run ctest_with_model_release

View File

@@ -1,16 +0,0 @@
set( CMAKE_SYSTEM_NAME Windows )
set( CMAKE_SYSTEM_PROCESSOR arm64 )
set( target arm64-pc-windows-msvc )
set( CMAKE_C_COMPILER clang )
set( CMAKE_CXX_COMPILER clang++ )
set( CMAKE_C_COMPILER_TARGET ${target} )
set( CMAKE_CXX_COMPILER_TARGET ${target} )
set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast" )
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )
set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )

View File

@@ -1,6 +0,0 @@
set( CMAKE_SYSTEM_NAME Windows )
set( CMAKE_SYSTEM_PROCESSOR arm64 )
set( target arm64-pc-windows-msvc )
set( CMAKE_C_COMPILER_TARGET ${target} )
set( CMAKE_CXX_COMPILER_TARGET ${target} )

View File

@@ -47,6 +47,9 @@ if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
set(TARGET json-schema-to-grammar)
add_library(${TARGET} OBJECT json-schema-to-grammar.cpp json-schema-to-grammar.h)
set(TARGET common)
add_library(${TARGET} STATIC
@@ -60,7 +63,6 @@ add_library(${TARGET} STATIC
grammar-parser.h
grammar-parser.cpp
json.hpp
json-schema-to-grammar.cpp
train.h
train.cpp
ngram-cache.h

File diff suppressed because it is too large Load Diff

View File

@@ -27,26 +27,19 @@
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
#define print_build_info() do { \
fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
} while(0)
#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
// build info
extern int LLAMA_BUILD_NUMBER;
extern char const * LLAMA_COMMIT;
extern char const * LLAMA_COMPILER;
extern char const * LLAMA_BUILD_TARGET;
extern char const *LLAMA_COMMIT;
extern char const *LLAMA_COMPILER;
extern char const *LLAMA_BUILD_TARGET;
struct llama_control_vector_load_info;
//
// CPU utils
//
int32_t cpu_get_num_physical_cores();
int32_t cpu_get_num_math();
int32_t get_num_physical_cores();
//
// CLI argument parsing
@@ -55,7 +48,7 @@ int32_t cpu_get_num_math();
struct gpt_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
int32_t n_threads = cpu_get_num_math();
int32_t n_threads = get_num_physical_cores();
int32_t n_threads_draft = -1;
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
int32_t n_threads_batch_draft = -1;
@@ -86,20 +79,16 @@ struct gpt_params {
float yarn_beta_slow = 1.0f; // YaRN high correction dim
int32_t yarn_orig_ctx = 0; // YaRN original context length
float defrag_thold = -1.0f; // KV cache defragmentation threshold
std::string rpc_servers = ""; // comma separated list of RPC servers
ggml_backend_sched_eval_callback cb_eval = nullptr;
void * cb_eval_user_data = nullptr;
ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
// // sampling parameters
struct llama_sampling_params sparams;
std::string model = ""; // model path
std::string model = "models/7B/ggml-model-f16.gguf"; // model path
std::string model_draft = ""; // draft model for speculative decoding
std::string model_alias = "unknown"; // model alias
std::string model_url = ""; // model url to download
@@ -140,14 +129,11 @@ struct gpt_params {
bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
bool kl_divergence = false; // compute KL divergence
bool kl_divergence = false; // compute KL-divergence
bool random_prompt = false; // do not randomize prompt if none provided
bool use_color = false; // use color to distinguish generations and inputs
bool interactive = false; // interactive mode
bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
bool special = false; // enable special token output
bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
bool chatml = false; // chatml mode (used for models trained on chatml syntax)
bool prompt_cache_all = false; // save user input and generations to prompt cache
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
@@ -158,7 +144,6 @@ struct gpt_params {
bool multiline_input = false; // reverse the usage of `\`
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
bool cont_batching = true; // insert new sequences for decoding on-the-fly
bool flash_attn = false; // flash attention
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool ignore_eos = false; // ignore generated EOS tokens
@@ -171,47 +156,37 @@ struct gpt_params {
bool infill = false; // use infill mode
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
bool no_kv_offload = false; // disable KV offloading
bool warmup = true; // warmup run
bool check_tensors = false; // validate tensor data
std::string cache_type_k = "f16"; // KV cache data type for the K
std::string cache_type_v = "f16"; // KV cache data type for the V
// multimodal models (see examples/llava)
std::string mmproj = ""; // path to multimodal projector
std::vector<std::string> image; // path to image file(s)
std::string mmproj = ""; // path to multimodal projector
std::string image = ""; // path to an image file
};
void gpt_params_handle_model_default(gpt_params & params);
bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params);
bool gpt_params_parse (int argc, char ** argv, gpt_params & params);
bool gpt_params_find_arg (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
std::string gpt_params_get_system_info(const gpt_params & params);
void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
std::string get_system_info(const gpt_params & params);
std::string gpt_random_prompt(std::mt19937 & rng);
void process_escapes(std::string& input);
//
// String utils
//
std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string);
std::vector<std::string> string_split(std::string input, char separator);
std::string string_strip(const std::string & str);
std::string string_get_sortable_timestamp();
std::string string_random_prompt(std::mt19937 & rng);
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
void string_process_escapes(std::string & input);
//
// Filesystem utils
//
bool fs_validate_filename(const std::string & filename);
bool fs_create_directory_with_parents(const std::string & path);
std::string fs_get_cache_directory();
std::string sampler_type_to_name_string(llama_sampler_type sampler_type);
//
// Model utils
@@ -246,21 +221,20 @@ void llama_batch_add(
std::vector<llama_token> llama_tokenize(
const struct llama_context * ctx,
const std::string & text,
bool add_special,
bool parse_special = false);
bool add_bos,
bool special = false);
std::vector<llama_token> llama_tokenize(
const struct llama_model * model,
const std::string & text,
bool add_special,
bool parse_special = false);
bool add_bos,
bool special = false);
// tokenizes a token into a piece, optionally renders special/control tokens
// tokenizes a token into a piece
// should work similar to Python's `tokenizer.id_to_piece`
std::string llama_token_to_piece(
const struct llama_context * ctx,
llama_token token,
bool special = true);
llama_token token);
// TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
// that takes into account the tokenizer type and decides how to handle the leading space
@@ -282,15 +256,29 @@ std::string llama_detokenize_bpe(
// defaults to true when model type is SPM, otherwise false.
bool llama_should_add_bos_token(const llama_model * model);
//
// YAML utils
//
bool create_directory_with_parents(const std::string & path);
void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data);
void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data);
void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data);
std::string get_sortable_timestamp();
void dump_non_result_info_yaml(
FILE * stream, const gpt_params & params, const llama_context * lctx,
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
//
// KV cache utils
//
// Dump the KV cache view with the number of sequences per cell.
void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);
// Dump the KV cache view showing individual sequences in each cell (long output).
void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
//
// Embedding utils
@@ -324,20 +312,6 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
//
// Split utils
//
static const char * const LLM_KV_SPLIT_NO = "split.no";
static const char * const LLM_KV_SPLIT_COUNT = "split.count";
static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
//
// YAML utils
//
void yaml_dump_vector_float (FILE * stream, const char * prop_name, const std::vector<float> & data);
void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std::vector<int> & data);
void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
void yaml_dump_non_result_info(
FILE * stream, const gpt_params & params, const llama_context * lctx,
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);

View File

@@ -26,7 +26,7 @@ namespace grammar_parser {
static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
auto result = state.symbol_ids.emplace(std::string(src, len), next_id);
auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id));
return result.first->second;
}
@@ -142,9 +142,6 @@ namespace grammar_parser {
pos++;
last_sym_start = out_elements.size();
while (*pos != '"') {
if (!*pos) {
throw std::runtime_error("unexpected end of input");
}
auto char_pair = parse_char(pos);
pos = char_pair.second;
out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
@@ -159,9 +156,6 @@ namespace grammar_parser {
}
last_sym_start = out_elements.size();
while (*pos != ']') {
if (!*pos) {
throw std::runtime_error("unexpected end of input");
}
auto char_pair = parse_char(pos);
pos = char_pair.second;
enum llama_gretype type = last_sym_start < out_elements.size()
@@ -170,9 +164,6 @@ namespace grammar_parser {
out_elements.push_back({type, char_pair.first});
if (pos[0] == '-' && pos[1] != ']') {
if (!pos[1]) {
throw std::runtime_error("unexpected end of input");
}
auto endchar_pair = parse_char(pos + 1);
pos = endchar_pair.second;
out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});

View File

@@ -11,101 +11,35 @@
using json = nlohmann::ordered_json;
template <typename Iterator>
static std::string join(Iterator begin, Iterator end, const std::string & separator);
static std::string repeat(const std::string & str, size_t n);
static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "", bool item_rule_is_literal = false) {
if (separator_rule.empty()) {
if (min_items == 0 && max_items == 1) {
return item_rule + "?";
} else if (min_items == 1 && max_items == std::numeric_limits<int>::max()) {
return item_rule + "+";
}
}
std::string result;
if (min_items > 0) {
if (item_rule_is_literal && separator_rule.empty()) {
result = "\"" + repeat(std::string(item_rule.begin() + 1, item_rule.end() - 1), min_items) + "\"";
} else {
std::vector<std::string> items(min_items, item_rule);
result = join(items.begin(), items.end(), separator_rule.empty() ? " " : " " + separator_rule + " ");
}
}
std::function<std::string(int, bool)> opt_repetitions = [&](int up_to_n, bool prefix_with_sep) -> std::string {
auto content = prefix_with_sep && !separator_rule.empty() ? separator_rule + " " + item_rule : item_rule;
if (up_to_n == 0) {
return "";
} else if (up_to_n == 1) {
return "(" + content + ")?";
} else if (!separator_rule.empty() && !prefix_with_sep) {
return "(" + content + " " + opt_repetitions(up_to_n - 1, true) + ")?";
} else {
std::string res = repeat("(" + content + " ", up_to_n);
// strip trailing space
res = res.substr(0, res.length() - 1);
res += repeat(")?", up_to_n);
return res;
}
};
if (min_items > 0 && max_items != min_items) {
result += " ";
}
if (max_items != std::numeric_limits<int>::max()) {
result += opt_repetitions(max_items - min_items, min_items > 0);
} else {
std::string item_operator = "(" + (separator_rule.empty() ? "" : separator_rule + " ") + item_rule + ")";
if (min_items == 0 && !separator_rule.empty()) {
result = "(" + item_rule + " " + item_operator + "*)?";
} else {
result += item_operator + "*";
}
}
return result;
}
const std::string SPACE_RULE = "\" \"?";
struct BuiltinRule {
std::string content;
std::vector<std::string> deps;
std::unordered_map<std::string, std::string> PRIMITIVE_RULES = {
{"boolean", "(\"true\" | \"false\") space"},
{"number", "(\"-\"? ([0-9] | [1-9] [0-9]*)) (\".\" [0-9]+)? ([eE] [-+]? [0-9]+)? space"},
{"integer", "(\"-\"? ([0-9] | [1-9] [0-9]*)) space"},
{"value", "object | array | string | number | boolean"},
{"object", "\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space"},
{"array", "\"[\" space ( value (\",\" space value)* )? \"]\" space"},
{"uuid", "\"\\\"\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] \"\\\"\" space"},
{"string", " \"\\\"\" (\n"
" [^\"\\\\] |\n"
" \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])\n"
" )* \"\\\"\" space"},
{"null", "\"null\" space"}
};
std::vector<std::string> OBJECT_RULE_NAMES = {"object", "array", "string", "number", "boolean", "null", "value"};
const std::string _up_to_15_digits = build_repetition("[0-9]", 0, 15);
std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
{"boolean", {"(\"true\" | \"false\") space", {}}},
{"decimal-part", {"[0-9] " + _up_to_15_digits, {}}},
{"integral-part", {"[0-9] | [1-9] " + _up_to_15_digits, {}}},
{"number", {"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)? space", {"integral-part", "decimal-part"}}},
{"integer", {"(\"-\"? integral-part) space", {"integral-part"}}},
{"value", {"object | array | string | number | boolean | null", {"object", "array", "string", "number", "boolean", "null"}}},
{"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}},
{"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}},
{"uuid", {"\"\\\"\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] \"\\\"\" space", {}}},
{"char", {"[^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])", {}}},
{"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}},
{"null", {"\"null\" space", {}}},
};
std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
{"date", {"[0-9] [0-9] [0-9] [0-9] \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
{"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9] [0-9] [0-9] )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
{"date-time", {"date \"T\" time", {"date", "time"}}},
{"date-string", {"\"\\\"\" date \"\\\"\" space", {"date"}}},
{"time-string", {"\"\\\"\" time \"\\\"\" space", {"time"}}},
{"date-time-string", {"\"\\\"\" date-time \"\\\"\" space", {"date-time"}}}
std::unordered_map<std::string, std::string> DATE_RULES = {
{"date", "[0-9] [0-9] [0-9] [0-9] \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )"},
{"time", "([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9] [0-9] [0-9] )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )"},
{"date-time", "date \"T\" time"},
{"date-string", "\"\\\"\" date \"\\\"\" space"},
{"time-string", "\"\\\"\" time \"\\\"\" space"},
{"date-time-string", "\"\\\"\" date-time \"\\\"\" space"}
};
static bool is_reserved_name(const std::string & name) {
@@ -113,7 +47,7 @@ static bool is_reserved_name(const std::string & name) {
if (RESERVED_NAMES.empty()) {
RESERVED_NAMES.insert("root");
for (const auto &p : PRIMITIVE_RULES) RESERVED_NAMES.insert(p.first);
for (const auto &p : STRING_FORMAT_RULES) RESERVED_NAMES.insert(p.first);
for (const auto &p : DATE_RULES) RESERVED_NAMES.insert(p.first);
}
return RESERVED_NAMES.find(name) != RESERVED_NAMES.end();
}
@@ -258,7 +192,7 @@ private:
if (_dotall) {
rule = "[\\U00000000-\\U0010FFFF]";
} else {
rule = "[^\\x0A\\x0D]";
rule = "[\\U00000000-\\x09\\x0B\\x0C\\x0E-\\U0010FFFF]";
}
return _add_rule("dot", rule);
};
@@ -272,7 +206,7 @@ private:
if (literal.empty()) {
return false;
}
ret.emplace_back(literal, true);
ret.push_back(std::make_pair(literal, true));
literal.clear();
return true;
};
@@ -298,7 +232,7 @@ private:
while (i < length) {
char c = sub_pattern[i];
if (c == '.') {
seq.emplace_back(get_dot(), false);
seq.push_back(std::make_pair(get_dot(), false));
i++;
} else if (c == '(') {
i++;
@@ -307,7 +241,7 @@ private:
_warnings.push_back("Unsupported pattern syntax");
}
}
seq.emplace_back("(" + to_rule(transform()) + ")", false);
seq.push_back(std::make_pair("(" + to_rule(transform()) + ")", false));
} else if (c == ')') {
i++;
if (start > 0 && sub_pattern[start - 1] != '(') {
@@ -331,9 +265,9 @@ private:
}
square_brackets += ']';
i++;
seq.emplace_back(square_brackets, false);
seq.push_back(std::make_pair(square_brackets, false));
} else if (c == '|') {
seq.emplace_back("|", false);
seq.push_back(std::make_pair("|", false));
i++;
} else if (c == '*' || c == '+' || c == '?') {
seq.back() = std::make_pair(to_rule(seq.back()) + c, false);
@@ -374,21 +308,47 @@ private:
auto &sub = last.first;
auto sub_is_literal = last.second;
if (!sub_is_literal) {
std::string & sub_id = sub_rule_ids[sub];
if (sub_id.empty()) {
sub_id = _add_rule(name + "-" + std::to_string(sub_rule_ids.size()), sub);
if (min_times == 0 && max_times == std::numeric_limits<int>::max()) {
sub += "*";
} else if (min_times == 0 && max_times == 1) {
sub += "?";
} else if (min_times == 1 && max_times == std::numeric_limits<int>::max()) {
sub += "+";
} else {
if (!sub_is_literal) {
std::string & sub_id = sub_rule_ids[sub];
if (sub_id.empty()) {
sub_id = _add_rule(name + "-" + std::to_string(sub_rule_ids.size()), sub);
}
sub = sub_id;
}
sub = sub_id;
std::string result;
if (sub_is_literal && min_times > 0) {
result = "\"" + repeat(sub.substr(1, sub.length() - 2), min_times) + "\"";
} else {
for (int j = 0; j < min_times; j++) {
if (j > 0) {
result += " ";
}
result += sub;
}
}
if (min_times > 0 && min_times < max_times) {
result += " ";
}
if (max_times == std::numeric_limits<int>::max()) {
result += sub + "*";
} else {
for (int j = min_times; j < max_times; j++) {
if (j > min_times) {
result += " ";
}
result += sub + "?";
}
}
seq.back().first = result;
seq.back().second = false;
}
seq.back().first = build_repetition(
sub_is_literal ? "\"" + sub + "\"" : sub,
min_times,
max_times,
"",
sub_is_literal
);
seq.back().second = false;
} else {
std::string literal;
auto is_non_literal = [&](char c) {
@@ -417,7 +377,7 @@ private:
}
}
if (!literal.empty()) {
seq.emplace_back(literal, true);
seq.push_back(std::make_pair(literal, true));
}
}
}
@@ -464,7 +424,7 @@ private:
if (additional_properties.is_object() || (additional_properties.is_boolean() && additional_properties.get<bool>())) {
std::string sub_name = name + (name.empty() ? "" : "-") + "additional";
std::string value_rule = visit(additional_properties.is_object() ? additional_properties : json::object(), sub_name + "-value");
std::string kv_rule = _add_rule(sub_name + "-kv", _add_primitive("string", PRIMITIVE_RULES.at("string")) + " \":\" space " + value_rule);
std::string kv_rule = _add_rule(sub_name + "-kv", _add_rule("string", PRIMITIVE_RULES.at("string")) + " \":\" space " + value_rule);
prop_kv_rule_names["*"] = kv_rule;
optional_props.push_back("*");
}
@@ -526,25 +486,6 @@ private:
return rule;
}
std::string _add_primitive(const std::string & name, const BuiltinRule & rule) {
auto n = _add_rule(name, rule.content);
for (const auto & dep : rule.deps) {
BuiltinRule dep_rule;
auto it = PRIMITIVE_RULES.find(dep);
if (it == PRIMITIVE_RULES.end()) {
it = STRING_FORMAT_RULES.find(dep);
if (it == STRING_FORMAT_RULES.end()) {
_errors.push_back("Rule " + dep + " not known");
continue;
}
}
if (_rules.find(dep) == _rules.end()) {
_add_primitive(dep, it->second);
}
}
return n;
}
public:
SchemaConverter(
const std::function<json(const std::string &)> & fetch_json,
@@ -706,33 +647,49 @@ public:
return _add_rule(rule_name, rule);
} else {
std::string item_rule_name = visit(items, name + (name.empty() ? "" : "-") + "item");
std::string list_item_operator = "( \",\" space " + item_rule_name + " )";
std::string successive_items;
int min_items = schema.contains("minItems") ? schema["minItems"].get<int>() : 0;
json max_items_json = schema.contains("maxItems") ? schema["maxItems"] : json();
int max_items = max_items_json.is_number_integer() ? max_items_json.get<int>() : std::numeric_limits<int>::max();
return _add_rule(rule_name, "\"[\" space " + build_repetition(item_rule_name, min_items, max_items, "\",\" space") + " \"]\" space");
int max_items = max_items_json.is_number_integer() ? max_items_json.get<int>() : -1;
if (min_items > 0) {
successive_items += repeat(list_item_operator, min_items - 1);
min_items--;
}
if (max_items >= 0 && max_items > min_items) {
successive_items += repeat(list_item_operator + "?", max_items - min_items - 1);
} else {
successive_items += list_item_operator + "*";
}
std::string rule;
if (min_items == 0) {
rule = "\"[\" space ( " + item_rule_name + " " + successive_items + " )? \"]\" space";
} else {
rule = "\"[\" space " + item_rule_name + " " + successive_items + " \"]\" space";
}
return _add_rule(rule_name, rule);
}
} else if ((schema_type.is_null() || schema_type == "string") && schema.contains("pattern")) {
return _visit_pattern(schema["pattern"], rule_name);
} else if ((schema_type.is_null() || schema_type == "string") && std::regex_match(schema_format, std::regex("^uuid[1-5]?$"))) {
return _add_primitive(rule_name == "root" ? "root" : schema_format, PRIMITIVE_RULES.at("uuid"));
} else if ((schema_type.is_null() || schema_type == "string") && STRING_FORMAT_RULES.find(schema_format + "-string") != STRING_FORMAT_RULES.end()) {
auto prim_name = schema_format + "-string";
return _add_rule(rule_name, _add_primitive(prim_name, STRING_FORMAT_RULES.at(prim_name)));
} else if (schema_type == "string" && (schema.contains("minLength") || schema.contains("maxLength"))) {
std::string char_rule = _add_primitive("char", PRIMITIVE_RULES.at("char"));
int min_len = schema.contains("minLength") ? schema["minLength"].get<int>() : 0;
int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
return _add_rule(rule_name == "root" ? "root" : schema_format, PRIMITIVE_RULES.at("uuid"));
} else if ((schema_type.is_null() || schema_type == "string") && DATE_RULES.find(schema_format) != DATE_RULES.end()) {
for (const auto & kv : DATE_RULES) {
_add_rule(kv.first, kv.second);
}
return schema_format + "-string";
} else if (schema.empty() || schema_type == "object") {
return _add_rule(rule_name, _add_primitive("object", PRIMITIVE_RULES.at("object")));
for (const auto & n : OBJECT_RULE_NAMES) {
_add_rule(n, PRIMITIVE_RULES.at(n));
}
return _add_rule(rule_name, "object");
} else {
if (!schema_type.is_string() || PRIMITIVE_RULES.find(schema_type.get<std::string>()) == PRIMITIVE_RULES.end()) {
_errors.push_back("Unrecognized schema: " + schema.dump());
return "";
}
// TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
return _add_primitive(rule_name == "root" ? "root" : schema_type.get<std::string>(), PRIMITIVE_RULES.at(schema_type.get<std::string>()));
return _add_rule(rule_name == "root" ? "root" : schema_type.get<std::string>(), PRIMITIVE_RULES.at(schema_type.get<std::string>()));
}
}

View File

@@ -1,8 +1,4 @@
#pragma once
#include "ggml.h"
// Change JSON_ASSERT from assert() to GGML_ASSERT:
#define JSON_ASSERT GGML_ASSERT
#include "json.hpp"
std::string json_schema_to_grammar(const nlohmann::ordered_json& schema);

View File

@@ -211,7 +211,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
#define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
#else
#define LOG_FLF_FMT "[%24s:%5ld][%24s] "
#define LOG_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
#define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
#endif
#else
#define LOG_FLF_FMT "%s"
@@ -224,7 +224,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
#define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
#else
#define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] "
#define LOG_TEE_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
#define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
#endif
#else
#define LOG_TEE_FLF_FMT "%s"
@@ -234,7 +234,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
// INTERNAL, DO NOT USE
// USE LOG() INSTEAD
//
#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__)
#if !defined(_MSC_VER) or defined(__INTEL_LLVM_COMPILER)
#define LOG_IMPL(str, ...) \
do { \
if (LOG_TARGET != nullptr) \
@@ -257,7 +257,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
// INTERNAL, DO NOT USE
// USE LOG_TEE() INSTEAD
//
#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__)
#if !defined(_MSC_VER) or defined(__INTEL_LLVM_COMPILER)
#define LOG_TEE_IMPL(str, ...) \
do { \
if (LOG_TARGET != nullptr) \
@@ -294,7 +294,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
// Main LOG macro.
// behaves like printf, and supports arguments the exact same way.
//
#if !defined(_MSC_VER) || defined(__clang__)
#ifndef _MSC_VER
#define LOG(...) LOG_IMPL(__VA_ARGS__, "")
#else
#define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "")
@@ -308,14 +308,14 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
// Secondary target can be changed just like LOG_TARGET
// by defining LOG_TEE_TARGET
//
#if !defined(_MSC_VER) || defined(__clang__)
#ifndef _MSC_VER
#define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
#else
#define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "")
#endif
// LOG macro variants with auto endline.
#if !defined(_MSC_VER) || defined(__clang__)
#ifndef _MSC_VER
#define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
#define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
#else

View File

@@ -1,6 +1,4 @@
#define LLAMA_API_INTERNAL
#include "sampling.h"
#include <random>
struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) {
struct llama_sampling_context * result = new llama_sampling_context();
@@ -35,10 +33,6 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
result->prev.resize(params.n_prev);
result->n_valid = 0;
llama_sampling_set_rng_seed(result, params.seed);
return result;
}
@@ -66,14 +60,6 @@ void llama_sampling_reset(llama_sampling_context * ctx) {
std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
ctx->cur.clear();
ctx->n_valid = 0;
}
void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) {
if (seed == LLAMA_DEFAULT_SEED) {
seed = std::random_device{}();
}
ctx->rng.seed(seed);
}
void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst) {
@@ -125,7 +111,7 @@ std::string llama_sampling_order_print(const llama_sampling_params & params) {
std::string result = "CFG -> Penalties ";
if (params.mirostat == 0) {
for (auto sampler_type : params.samplers_sequence) {
const auto sampler_type_name = llama_sampling_type_to_str(sampler_type);
const auto sampler_type_name = sampler_type_to_name_string(sampler_type);
if (!sampler_type_name.empty()) {
result += "-> " + sampler_type_name + " ";
}
@@ -137,87 +123,6 @@ std::string llama_sampling_order_print(const llama_sampling_params & params) {
return result;
}
std::string llama_sampling_type_to_str(llama_sampler_type sampler_type) {
switch (sampler_type) {
case llama_sampler_type::TOP_K: return "top_k";
case llama_sampler_type::TFS_Z: return "tfs_z";
case llama_sampler_type::TYPICAL_P: return "typical_p";
case llama_sampler_type::TOP_P: return "top_p";
case llama_sampler_type::MIN_P: return "min_p";
case llama_sampler_type::TEMPERATURE: return "temperature";
default : return "";
}
}
std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
{"top_k", llama_sampler_type::TOP_K},
{"top_p", llama_sampler_type::TOP_P},
{"typical_p", llama_sampler_type::TYPICAL_P},
{"min_p", llama_sampler_type::MIN_P},
{"tfs_z", llama_sampler_type::TFS_Z},
{"temperature", llama_sampler_type::TEMPERATURE}
};
// since samplers names are written multiple ways
// make it ready for both system names and input names
std::unordered_map<std::string, llama_sampler_type> sampler_alt_name_map {
{"top-k", llama_sampler_type::TOP_K},
{"top-p", llama_sampler_type::TOP_P},
{"nucleus", llama_sampler_type::TOP_P},
{"typical-p", llama_sampler_type::TYPICAL_P},
{"typical", llama_sampler_type::TYPICAL_P},
{"min-p", llama_sampler_type::MIN_P},
{"tfs-z", llama_sampler_type::TFS_Z},
{"tfs", llama_sampler_type::TFS_Z},
{"temp", llama_sampler_type::TEMPERATURE}
};
std::vector<llama_sampler_type> sampler_types;
sampler_types.reserve(names.size());
for (const auto & name : names)
{
auto sampler_item = sampler_canonical_name_map.find(name);
if (sampler_item != sampler_canonical_name_map.end())
{
sampler_types.push_back(sampler_item->second);
}
else
{
if (allow_alt_names)
{
sampler_item = sampler_alt_name_map.find(name);
if (sampler_item != sampler_alt_name_map.end())
{
sampler_types.push_back(sampler_item->second);
}
}
}
}
return sampler_types;
}
std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string) {
std::unordered_map<char, llama_sampler_type> sampler_name_map {
{'k', llama_sampler_type::TOP_K},
{'p', llama_sampler_type::TOP_P},
{'y', llama_sampler_type::TYPICAL_P},
{'m', llama_sampler_type::MIN_P},
{'f', llama_sampler_type::TFS_Z},
{'t', llama_sampler_type::TEMPERATURE}
};
std::vector<llama_sampler_type> sampler_types;
sampler_types.reserve(names_string.size());
for (const auto & c : names_string) {
const auto sampler_item = sampler_name_map.find(c);
if (sampler_item != sampler_name_map.end()) {
sampler_types.push_back(sampler_item->second);
}
}
return sampler_types;
}
// no reasons to expose this function in header
static void sampler_queue(
struct llama_context * ctx_main,
@@ -260,7 +165,7 @@ static llama_token llama_sampling_sample_impl(
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,
const int idx,
bool is_resampling) {
bool is_resampling) { // Add a parameter to indicate if we are resampling
const llama_sampling_params & params = ctx_sampling->params;
const float temp = params.temp;
@@ -269,8 +174,8 @@ static llama_token llama_sampling_sample_impl(
const float mirostat_eta = params.mirostat_eta;
std::vector<float> original_logits;
auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, /* apply_grammar= */ is_resampling, &original_logits);
if (ctx_sampling->grammar != NULL && !is_resampling) {
auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, !is_resampling, &original_logits);
if (!is_resampling) {
GGML_ASSERT(!original_logits.empty());
}
llama_token id = 0;
@@ -298,7 +203,7 @@ static llama_token llama_sampling_sample_impl(
sampler_queue(ctx_main, params, cur_p, min_keep);
id = llama_sample_token_with_rng(ctx_main, &cur_p, ctx_sampling->rng);
id = llama_sample_token(ctx_main, &cur_p);
//{
// const int n_top = 10;
@@ -333,12 +238,10 @@ static llama_token llama_sampling_sample_impl(
// Restore logits from the copy
std::copy(original_logits.begin(), original_logits.end(), logits);
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ true);
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, true); // Pass true for is_resampling
}
}
ctx_sampling->n_valid = temp == 0.0f ? 0 : cur_p.size;
return id;
}
@@ -366,8 +269,7 @@ static llama_token_data_array llama_sampling_prepare_impl(
// Get a pointer to the logits
float * logits = llama_get_logits_ith(ctx_main, idx);
if (ctx_sampling->grammar != NULL && !apply_grammar) {
GGML_ASSERT(original_logits != NULL);
if (apply_grammar && original_logits != NULL) {
// Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this.
*original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))};
}
@@ -424,7 +326,7 @@ llama_token llama_sampling_sample(
struct llama_context * ctx_cfg,
const int idx) {
// Call the implementation function with is_resampling set to false by default
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ false);
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, false);
}
llama_token_data_array llama_sampling_prepare(

View File

@@ -4,10 +4,9 @@
#include "grammar-parser.h"
#include <random>
#include <string>
#include <unordered_map>
#include <vector>
#include <unordered_map>
// sampler types
enum class llama_sampler_type : char {
@@ -21,26 +20,25 @@ enum class llama_sampler_type : char {
// sampling parameters
typedef struct llama_sampling_params {
int32_t n_prev = 64; // number of previous tokens to remember
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
int32_t top_k = 40; // <= 0 to use vocab size
float top_p = 0.95f; // 1.0 = disabled
float min_p = 0.05f; // 0.0 = disabled
float tfs_z = 1.00f; // 1.0 = disabled
float typical_p = 1.00f; // 1.0 = disabled
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
float dynatemp_range = 0.00f; // 0.0 = disabled
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
float penalty_repeat = 1.00f; // 1.0 = disabled
float penalty_freq = 0.00f; // 0.0 = disabled
float penalty_present = 0.00f; // 0.0 = disabled
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
float mirostat_tau = 5.00f; // target entropy
float mirostat_eta = 0.10f; // learning rate
bool penalize_nl = false; // consider newlines as a repeatable token
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampling_context
int32_t n_prev = 64; // number of previous tokens to remember
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
int32_t top_k = 40; // <= 0 to use vocab size
float top_p = 0.95f; // 1.0 = disabled
float min_p = 0.05f; // 0.0 = disabled
float tfs_z = 1.00f; // 1.0 = disabled
float typical_p = 1.00f; // 1.0 = disabled
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
float dynatemp_range = 0.00f; // 0.0 = disabled
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
float penalty_repeat = 1.00f; // 1.0 = disabled
float penalty_freq = 0.00f; // 0.0 = disabled
float penalty_present = 0.00f; // 0.0 = disabled
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
float mirostat_tau = 5.00f; // target entropy
float mirostat_eta = 0.10f; // learning rate
bool penalize_nl = false; // consider newlines as a repeatable token
std::vector<llama_sampler_type> samplers_sequence = {
llama_sampler_type::TOP_K,
@@ -81,9 +79,6 @@ struct llama_sampling_context {
// TODO: replace with ring-buffer
std::vector<llama_token> prev;
std::vector<llama_token_data> cur;
size_t n_valid; // Number of correct top tokens with correct probabilities.
std::mt19937 rng;
};
#include "common.h"
@@ -98,9 +93,6 @@ void llama_sampling_free(struct llama_sampling_context * ctx);
// - reset grammar
void llama_sampling_reset(llama_sampling_context * ctx);
// Set the sampler seed
void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed);
// Copy the sampler context
void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst);
@@ -116,11 +108,6 @@ std::string llama_sampling_print(const llama_sampling_params & params);
// Print sampling order into a string
std::string llama_sampling_order_print(const llama_sampling_params & params);
std::string llama_sampling_type_to_str(llama_sampler_type sampler_type);
std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string);
// this is a common sampling function used across the examples for convenience
// it can serve as a starting point for implementing your own sampling function
// Note: When using multiple sequences, it is the caller's responsibility to call
@@ -142,7 +129,7 @@ llama_token llama_sampling_sample(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,
int idx = -1);
int idx = 0);
// Prepares and adjusts the set of token candidates for sampling based on penalties, biases, and sampling parameters.
llama_token_data_array llama_sampling_prepare(

View File

@@ -1052,7 +1052,7 @@ struct train_params_common get_default_train_params_common() {
params.custom_n_ctx = false;
params.use_flash = false;
params.use_flash = true;
params.use_checkpointing = true;
params.sample_start = "";
@@ -1380,7 +1380,7 @@ bool consume_common_train_arg(
void finish_processing_train_args(struct train_params_common * params) {
if (params->escape) {
string_process_escapes(params->sample_start);
process_escapes(params->sample_start);
}
}

View File

@@ -1,327 +0,0 @@
#!/usr/bin/env python3
# This script downloads the tokenizer models of the specified models from Huggingface and
# generates the get_vocab_base_pre() function for convert-hf-to-gguf.py
#
# This is necessary in order to analyze the type of pre-tokenizer used by the model and
# provide the necessary information to llama.cpp via the GGUF header in order to implement
# the same pre-tokenizer.
#
# ref: https://github.com/ggerganov/llama.cpp/pull/6920
#
# Instructions:
#
# - Add a new model to the "models" list
# - Run the script with your huggingface token:
#
# python3 convert-hf-to-gguf-update.py <huggingface_token>
#
# - Copy-paste the generated get_vocab_base_pre() function into convert-hf-to-gguf.py
# - Update llama.cpp with the new pre-tokenizer if necessary
#
# TODO: generate tokenizer tests for llama.cpp
#
import logging
import os
import pathlib
import re
import requests
import sys
import json
from hashlib import sha256
from enum import IntEnum, auto
from transformers import AutoTokenizer
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("convert-hf-to-gguf-update")
sess = requests.Session()
class TOKENIZER_TYPE(IntEnum):
SPM = auto()
BPE = auto()
WPM = auto()
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
# will be updated with time - contributions welcome
chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天 ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
if len(sys.argv) == 2:
token = sys.argv[1]
if not token.startswith("hf_"):
logger.info("Huggingface token seems invalid")
logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
sys.exit(1)
else:
logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
sys.exit(1)
# TODO: add models here, base models preferred
models = [
{"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
{"name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
{"name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
{"name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
{"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
{"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
{"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
{"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
{"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
{"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
{"name": "stablelm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", },
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
{"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
{"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
{"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
{"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
]
def download_file_with_auth(url, token, save_path):
headers = {"Authorization": f"Bearer {token}"}
response = sess.get(url, headers=headers)
response.raise_for_status()
os.makedirs(os.path.dirname(save_path), exist_ok=True)
with open(save_path, 'wb') as f:
f.write(response.content)
logger.info(f"File {save_path} downloaded successfully")
def download_model(model):
name = model["name"]
repo = model["repo"]
tokt = model["tokt"]
os.makedirs(f"models/tokenizers/{name}", exist_ok=True)
files = ["config.json", "tokenizer.json", "tokenizer_config.json"]
if tokt == TOKENIZER_TYPE.SPM:
files.append("tokenizer.model")
for file in files:
save_path = f"models/tokenizers/{name}/{file}"
if os.path.isfile(save_path):
logger.info(f"{name}: File {save_path} already exists - skipping")
continue
download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
for model in models:
try:
download_model(model)
except Exception as e:
logger.error(f"Failed to download model {model['name']}. Error: {e}")
# generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function:
src_ifs = ""
for model in models:
name = model["name"]
tokt = model["tokt"]
if tokt == TOKENIZER_TYPE.SPM:
continue
# Skip if the tokenizer folder does not exist or there are other download issues previously
if not os.path.exists(f"models/tokenizers/{name}"):
logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
continue
# create the tokenizer
try:
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
except OSError as e:
logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
continue # Skip to the next model if the tokenizer can't be loaded
chktok = tokenizer.encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest()
logger.info(f"model: {name}")
logger.info(f"tokt: {tokt}")
logger.info(f"repo: {model['repo']}")
logger.info(f"chktok: {chktok}")
logger.info(f"chkhsh: {chkhsh}")
# print the "pre_tokenizer" content from the tokenizer.json
with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
cfg = json.load(f)
normalizer = cfg["normalizer"]
logger.info("normalizer: " + json.dumps(normalizer, indent=4))
pre_tokenizer = cfg["pre_tokenizer"]
logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
if "ignore_merges" in cfg["model"]:
logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))
logger.info("")
src_ifs += f" if chkhsh == \"{chkhsh}\":\n"
src_ifs += f" # ref: {model['repo']}\n"
src_ifs += f" res = \"{name}\"\n"
src_func = f"""
def get_vocab_base_pre(self, tokenizer) -> str:
# encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
# is specific for the BPE pre-tokenizer used by the model
# we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
# use in llama.cpp to implement the same pre-tokenizer
chktxt = {repr(chktxt)}
chktok = tokenizer.encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest()
logger.debug(f"chktok: {{chktok}}")
logger.debug(f"chkhsh: {{chkhsh}}")
res = None
# NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script
# or pull the latest version of the model from Huggingface
# don't edit the hashes manually!
{src_ifs}
if res is None:
logger.warning("\\n")
logger.warning("**************************************************************************************")
logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
logger.warning("** There are 2 possible reasons for this:")
logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet")
logger.warning("** - the pre-tokenization config has changed upstream")
logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
logger.warning("**")
logger.warning(f"** chkhsh: {{chkhsh}}")
logger.warning("**************************************************************************************")
logger.warning("\\n")
raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
logger.debug(f"tokenizer.ggml.pre: {{repr(res)}}")
logger.debug(f"chkhsh: {{chkhsh}}")
return res
"""
convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
convert_py = convert_py_pth.read_text()
convert_py = re.sub(
r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
lambda m: m.group(1) + src_func + m.group(3),
convert_py,
flags=re.DOTALL | re.MULTILINE,
)
convert_py_pth.write_text(convert_py)
logger.info("+++ convert-hf-to-gguf.py was updated")
# generate tests for each tokenizer model
tests = [
"ied 4 ½ months",
"Führer",
"",
" ",
" ",
" ",
"\t",
"\n",
"\n\n",
"\n\n\n",
"\t\n",
"Hello world",
" Hello world",
"Hello World",
" Hello World",
" Hello World!",
"Hello, world!",
" Hello, world!",
" this is 🦙.cpp",
"w048 7tuijk dsdfhu",
"нещо на Български",
"កាន់តែពិសេសអាចខលចេញ",
"🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
"Hello",
" Hello",
" Hello",
" Hello",
" Hello",
" Hello\n Hello",
" (",
"\n =",
"' era",
"Hello, y'all! How are you 😁 ?我想在apple工作1314151天",
"3",
"33",
"333",
"3333",
"33333",
"333333",
"3333333",
"33333333",
"333333333",
# "Cửa Việt", # llama-bpe fails on this
chktxt,
]
# write the tests to ./models/ggml-vocab-{name}.gguf.inp
# the format is:
#
# test0
# __ggml_vocab_test__
# test1
# __ggml_vocab_test__
# ...
#
# with each model, encode all tests and write the results in ./models/ggml-vocab-{name}.gguf.out
# for each test, write the resulting tokens on a separate line
for model in models:
name = model["name"]
tokt = model["tokt"]
# Skip if the tokenizer folder does not exist or there are other download issues previously
if not os.path.exists(f"models/tokenizers/{name}"):
logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
continue
# create the tokenizer
try:
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
except OSError as e:
logger.error(f"Failed to load tokenizer for model {name}. Error: {e}")
continue # Skip this model and continue with the next one in the loop
with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
for text in tests:
f.write(f"{text}")
f.write("\n__ggml_vocab_test__\n")
with open(f"models/ggml-vocab-{name}.gguf.out", "w") as f:
for text in tests:
res = tokenizer.encode(text, add_special_tokens=False)
for r in res:
f.write(f" {r}")
f.write("\n")
logger.info(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")
# generate commands for creating vocab files
logger.info("\nRun the following commands to generate the vocab files for testing:\n")
for model in models:
name = model["name"]
print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") # noqa: NP100
logger.info("\n")

File diff suppressed because it is too large Load Diff

View File

@@ -1,7 +1,6 @@
#!/usr/bin/env python3
from __future__ import annotations
import logging
import argparse
import os
import struct
@@ -15,8 +14,6 @@ if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf
logger = logging.getLogger("ggml-to-gguf")
class GGMLFormat(IntEnum):
GGML = 0
@@ -128,6 +125,7 @@ class Tensor:
self.start_offset = offset
self.len_bytes = n_bytes
offset += n_bytes
# print(n_dims, name_len, dtype, self.dims, self.name, pad)
return offset - orig_offset
@@ -177,7 +175,7 @@ class GGMLModel:
offset += self.validate_header(data, offset)
hp = Hyperparameters()
offset += hp.load(data, offset)
logger.info(f'* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}')
print(f'* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}')
self.validate_conversion(hp.ftype)
vocab = Vocab(load_scores = self.file_format > GGMLFormat.GGML)
offset += vocab.load(data, offset, hp.n_vocab)
@@ -217,12 +215,12 @@ class GGMLToGGUF:
if float(hp.n_head) / float(x) == gqa:
n_kv_head = x
assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param"
logger.info(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}')
print(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}')
self.n_kv_head = n_kv_head
self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer)
def save(self):
logger.info('* Preparing to save GGUF file')
print('* Preparing to save GGUF file')
gguf_writer = gguf.GGUFWriter(
self.cfg.output,
gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA],
@@ -232,11 +230,11 @@ class GGMLToGGUF:
if self.special_vocab is not None:
self.special_vocab.add_to_gguf(gguf_writer)
self.add_tensors(gguf_writer)
logger.info(" gguf: write header")
print(" gguf: write header")
gguf_writer.write_header_to_file()
logger.info(" gguf: write metadata")
print(" gguf: write metadata")
gguf_writer.write_kv_data_to_file()
logger.info(" gguf: write tensors")
print(" gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
@@ -252,7 +250,7 @@ class GGMLToGGUF:
name = cfg.name if cfg.name is not None else cfg.input.name
except UnicodeDecodeError:
name = None
logger.info('* Adding model parameters and KV items')
print('* Adding model parameters and KV items')
if name is not None:
gguf_writer.add_name(name)
gguf_writer.add_description(desc)
@@ -283,13 +281,12 @@ class GGMLToGGUF:
def add_vocab(self, gguf_writer):
hp = self.model.hyperparameters
gguf_writer.add_tokenizer_model('llama')
gguf_writer.add_tokenizer_pre('default')
tokens = []
scores = []
toktypes = []
if self.vocab_override is not None:
vo = self.vocab_override
logger.info('* Adding vocab item(s)')
print('* Adding vocab item(s)')
for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
tokens.append(vbytes)
scores.append(score)
@@ -301,7 +298,7 @@ class GGMLToGGUF:
if len(toktypes) > 0:
gguf_writer.add_token_types(toktypes)
return
logger.info(f'* Adding {hp.n_vocab} vocab item(s)')
print(f'* Adding {hp.n_vocab} vocab item(s)')
assert len(self.model.vocab.items) >= 3, 'Cannot handle unexpectedly short model vocab'
for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items):
tt = 1 # Normal
@@ -336,7 +333,7 @@ class GGMLToGGUF:
def add_tensors(self, gguf_writer):
tensor_map = self.name_map
data = self.data
logger.info(f'* Adding {len(self.model.tensors)} tensor(s)')
print(f'* Adding {len(self.model.tensors)} tensor(s)')
for tensor in self.model.tensors:
name = str(tensor.name, 'UTF-8')
mapped_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
@@ -346,6 +343,7 @@ class GGMLToGGUF:
temp = tempdims[1]
tempdims[1] = tempdims[0]
tempdims[0] = temp
# print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}')
gguf_writer.add_tensor(
mapped_name,
data[tensor.start_offset:tensor.start_offset + tensor.len_bytes],
@@ -402,35 +400,33 @@ def handle_args():
help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
parser.add_argument("--vocabtype", default="spm,hfft",
help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm,hfft)")
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
return parser.parse_args()
def main():
cfg = handle_args()
logging.basicConfig(level=logging.DEBUG if cfg.verbose else logging.INFO)
logger.info(f'* Using config: {cfg}')
logger.warning('=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===')
print(f'* Using config: {cfg}')
print('\n=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===\n')
if cfg.model_metadata_dir is None and (cfg.gqa == 1 or cfg.eps == '5.0e-06'):
logger.info('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".')
print('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".')
data = np.memmap(cfg.input, mode = 'r')
model = GGMLModel()
logger.info('* Scanning GGML input file')
print('* Scanning GGML input file')
offset = model.load(data, 0) # noqa
logger.info(f'* GGML model hyperparameters: {model.hyperparameters}')
print(f'* GGML model hyperparameters: {model.hyperparameters}')
vocab_override = None
params_override = None
special_vocab = None
if cfg.model_metadata_dir is not None:
(params_override, vocab_override, special_vocab) = handle_metadata(cfg, model.hyperparameters)
logger.info('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.')
logger.info(f'* Overriding params: {params_override}')
logger.info(f'* Overriding vocab: {vocab_override}')
logger.info(f'* Special vocab: {special_vocab}')
print('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.')
print(f'* Overriding params: {params_override}')
print(f'* Overriding vocab: {vocab_override}')
print(f'* Special vocab: {special_vocab}')
else:
logger.warning('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
if model.file_format == GGMLFormat.GGML:
logger.info('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
converter = GGMLToGGUF(
model, data, cfg,
params_override = params_override,
@@ -438,7 +434,7 @@ def main():
special_vocab = special_vocab
)
converter.save()
logger.info(f'* Successful completion. Output saved to: {cfg.output}')
print(f'* Successful completion. Output saved to: {cfg.output}')
if __name__ == '__main__':

148
convert-lora-to-ggml.py Executable file
View File

@@ -0,0 +1,148 @@
#!/usr/bin/env python3
from __future__ import annotations
import json
import os
import struct
import sys
from pathlib import Path
from typing import Any, BinaryIO, Sequence
import numpy as np
import torch
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf
NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}
def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
fout.write(b"ggla"[::-1]) # magic (ggml lora)
fout.write(struct.pack("i", 1)) # file version
fout.write(struct.pack("i", params["r"]))
# https://opendelta.readthedocs.io/en/latest/modules/deltas.html says that `lora_alpha` is an int
# but some models ship a float value instead
# let's convert to int, but fail if lossless conversion is not possible
assert (
int(params["lora_alpha"]) == params["lora_alpha"]
), "cannot convert float to int losslessly"
fout.write(struct.pack("i", int(params["lora_alpha"])))
def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_type: np.dtype[Any]) -> None:
sname = name.encode("utf-8")
fout.write(
struct.pack(
"iii",
len(shape),
len(sname),
NUMPY_TYPE_TO_FTYPE[data_type.name],
)
)
fout.write(struct.pack("i" * len(shape), *shape[::-1]))
fout.write(sname)
fout.seek((fout.tell() + 31) & -32)
if __name__ == '__main__':
if len(sys.argv) < 2:
print(f"Usage: python {sys.argv[0]} <path> [arch]")
print(
"Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
)
print(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)")
sys.exit(1)
input_json = os.path.join(sys.argv[1], "adapter_config.json")
input_model = os.path.join(sys.argv[1], "adapter_model.bin")
output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")
if os.path.exists(input_model):
model = torch.load(input_model, map_location="cpu")
else:
input_model = os.path.join(sys.argv[1], "adapter_model.safetensors")
# lazy import load_file only if lora is in safetensors format.
from safetensors.torch import load_file
model = load_file(input_model, device="cpu")
arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"
if arch_name not in gguf.MODEL_ARCH_NAMES.values():
print(f"Error: unsupported architecture {arch_name}")
sys.exit(1)
arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)]
name_map = gguf.TensorNameMap(arch, 200) # 200 layers ought to be enough for anyone
with open(input_json, "r") as f:
params = json.load(f)
if params["peft_type"] != "LORA":
print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
sys.exit(1)
if params["fan_in_fan_out"] is True:
print("Error: param fan_in_fan_out is not supported")
sys.exit(1)
if params["bias"] is not None and params["bias"] != "none":
print("Error: param bias is not supported")
sys.exit(1)
# TODO: these seem to be layers that have been trained but without lora.
# doesn't seem widely used but eventually should be supported
if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0:
print("Error: param modules_to_save is not supported")
sys.exit(1)
with open(output_path, "wb") as fout:
fout.truncate()
write_file_header(fout, params)
for k, v in model.items():
orig_k = k
if k.endswith(".default.weight"):
k = k.replace(".default.weight", ".weight")
if k in ["llama_proj.weight", "llama_proj.bias"]:
continue
if k.endswith("lora_A.weight"):
if v.dtype != torch.float16 and v.dtype != torch.float32:
v = v.float()
v = v.T
else:
v = v.float()
t = v.detach().numpy()
prefix = "base_model.model."
if k.startswith(prefix):
k = k[len(prefix) :]
lora_suffixes = (".lora_A.weight", ".lora_B.weight")
if k.endswith(lora_suffixes):
suffix = k[-len(lora_suffixes[0]):]
k = k[: -len(lora_suffixes[0])]
else:
print(f"Error: unrecognized tensor name {orig_k}")
sys.exit(1)
tname = name_map.get_name(k)
if tname is None:
print(f"Error: could not map tensor name {orig_k}")
print(" Note: the arch parameter must be specified if the model is not llama")
sys.exit(1)
if suffix == ".lora_A.weight":
tname += ".weight.loraA"
elif suffix == ".lora_B.weight":
tname += ".weight.loraB"
else:
assert False
print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
write_tensor_header(fout, tname, t.shape, t.dtype)
t.tofile(fout)
print(f"Converted {input_json} and {input_model} to {output_path}")

136
convert-persimmon-to-gguf.py Executable file
View File

@@ -0,0 +1,136 @@
#!/usr/bin/env python3
import argparse
import os
import sys
from pathlib import Path
from pprint import pprint
import torch
from sentencepiece import SentencePieceProcessor
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf
def _flatten_dict(dct, tensors, prefix=None):
assert isinstance(dct, dict)
for key in dct.keys():
new_prefix = prefix + '.' + key if prefix is not None else key
if isinstance(dct[key], torch.Tensor):
tensors[new_prefix] = dct[key]
elif isinstance(dct[key], dict):
_flatten_dict(dct[key], tensors, new_prefix)
else:
raise ValueError(type(dct[key]))
return None
def _get_sentencepiece_tokenizer_info(dir_model: Path):
tokenizer_path = dir_model / 'adept_vocab.model'
print('gguf: getting sentencepiece tokenizer from', tokenizer_path)
tokenizer = SentencePieceProcessor(str(tokenizer_path))
print('gguf: adding tokens')
tokens: list[bytes] = []
scores: list[float] = []
toktypes: list[int] = []
for i in range(tokenizer.vocab_size()):
text: bytes
score: float
piece = tokenizer.id_to_piece(i)
text = piece.encode("utf-8")
score = tokenizer.get_score(i)
toktype = 1
if tokenizer.is_unknown(i):
toktype = 2
if tokenizer.is_control(i):
toktype = 3
if tokenizer.is_unused(i):
toktype = 5
if tokenizer.is_byte(i):
toktype = 6
tokens.append(text)
scores.append(score)
toktypes.append(toktype)
pass
return tokens, scores, toktypes
def main():
parser = argparse.ArgumentParser(description="Convert a Persimmon model from Adept (e.g. Persimmon 8b chat) to a GGML compatible file")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("--ckpt-path", type=Path, help="path to persimmon checkpoint .pt file")
parser.add_argument("--model-dir", type=Path, help="directory containing model e.g. 8b_chat_model_release")
parser.add_argument("--adept-inference-dir", type=str, help="path to adept-inference code directory")
args = parser.parse_args()
sys.path.append(str(args.adept_inference_dir))
persimmon_model = torch.load(args.ckpt_path)
hparams = persimmon_model['args']
pprint(hparams)
tensors: dict[str, torch.Tensor] = {}
_flatten_dict(persimmon_model['model'], tensors, None)
arch = gguf.MODEL_ARCH.PERSIMMON
gguf_writer = gguf.GGUFWriter(args.outfile, gguf.MODEL_ARCH_NAMES[arch])
block_count = hparams.num_layers
head_count = hparams.num_attention_heads
head_count_kv = head_count
ctx_length = hparams.seq_length
hidden_size = hparams.hidden_size
gguf_writer.add_name('persimmon-8b-chat')
gguf_writer.add_context_length(ctx_length)
gguf_writer.add_embedding_length(hidden_size)
gguf_writer.add_block_count(block_count)
gguf_writer.add_feed_forward_length(hparams.ffn_hidden_size)
# ref: https://github.com/ggerganov/llama.cpp/pull/4889/commits/eea19039fc52ea2dbd1aab45b59ab4e3e29a3443
gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2)
gguf_writer.add_head_count(head_count)
gguf_writer.add_head_count_kv(head_count_kv)
gguf_writer.add_rope_freq_base(hparams.rotary_emb_base)
gguf_writer.add_layer_norm_eps(hparams.layernorm_epsilon)
tokens, scores, toktypes = _get_sentencepiece_tokenizer_info(args.model_dir)
gguf_writer.add_tokenizer_model('llama')
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
gguf_writer.add_bos_token_id(71013)
gguf_writer.add_eos_token_id(71013)
tensor_map = gguf.get_tensor_name_map(arch, block_count)
print(tensor_map)
for name in tensors.keys():
data_torch = tensors[name]
if name.endswith(".self_attention.rotary_emb.inv_freq"):
continue
old_dtype = data_torch.dtype
# TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
data = data_torch.to(torch.float32).squeeze().numpy()
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
if new_name is None:
print("Can not map tensor '" + name + "'")
sys.exit()
n_dims = len(data.shape)
print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
gguf_writer.add_tensor(new_name, data)
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
print("gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
print(f"gguf: model successfully exported to '{args.outfile}'")
print("")
if __name__ == '__main__':
main()

View File

@@ -1,7 +1,6 @@
#!/usr/bin/env python3
from __future__ import annotations
import logging
import argparse
import concurrent.futures
import enum
@@ -24,21 +23,17 @@ from abc import ABC, abstractmethod
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, IO, Iterable, Literal, TypeVar, Optional
from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable
import numpy as np
from sentencepiece import SentencePieceProcessor
if 'NO_LOCAL_GGUF' not in os.environ:
# use .parent.parent since we are in "examples" directory
sys.path.insert(1, str(Path(__file__).parent.parent / 'gguf-py'))
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf
from gguf import BaseVocab, Vocab, NoVocab, BpeVocab, SentencePieceVocab, LlamaHfVocab
if TYPE_CHECKING:
from typing_extensions import Self, TypeAlias
logger = logging.getLogger("convert")
from typing import TypeAlias
if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
faulthandler.register(signal.SIGUSR1)
@@ -144,8 +139,7 @@ class GGMLFileType(enum.IntEnum):
dt = GGML_FILE_TYPE_TO_DATA_TYPE.get(self)
if dt is None:
raise ValueError(self)
# Convert all 1D tensors to F32. Most of the codebase that takes in 1D tensors only handles F32 tensors, and most of the outputs tensors are F32.
# Also The 1d tensors aren't much of a performance/size issue. So instead of having to have separate F32 and F16 implementations of both, just convert everything to F32 for now.
# 1D tensors are always F32.
return dt if len(tensor.shape) > 1 else DT_F32
@@ -286,7 +280,6 @@ class Params:
n_experts = None
n_experts_used = None
f_rope_freq_base = None
n_ff = None
# hack to determine LLaMA v1 vs v2 vs CodeLlama
if config.get("moe"):
@@ -311,8 +304,6 @@ class Params:
n_experts_used = config["moe"]["num_experts_per_tok"]
f_rope_freq_base = 1e6
assert n_ff is not None
return Params(
n_vocab = model["tok_embeddings.weight"].shape[0],
n_embd = config["dim"],
@@ -346,40 +337,297 @@ class Params:
return params
@dataclass
class Metadata:
name: Optional[str] = None
author: Optional[str] = None
version: Optional[str] = None
url: Optional[str] = None
description: Optional[str] = None
licence: Optional[str] = None
source_url: Optional[str] = None
source_hf_repo: Optional[str] = None
#
# vocab
#
@staticmethod
def load(metadata_path: Path) -> Metadata:
if metadata_path is None or not metadata_path.exists():
return Metadata()
@runtime_checkable
class BaseVocab(Protocol):
tokenizer_model: ClassVar[str]
name: ClassVar[str]
with open(metadata_path, 'r') as file:
data = json.load(file)
# Create a new Metadata instance
metadata = Metadata()
class NoVocab(BaseVocab):
tokenizer_model = "no_vocab"
name = "no_vocab"
# Assigning values to Metadata attributes if they exist in the JSON file
# This is based on LLM_KV_NAMES mapping in llama.cpp
metadata.name = data.get("general.name")
metadata.author = data.get("general.author")
metadata.version = data.get("general.version")
metadata.url = data.get("general.url")
metadata.description = data.get("general.description")
metadata.license = data.get("general.license")
metadata.source_url = data.get("general.source.url")
metadata.source_hf_repo = data.get("general.source.huggingface.repository")
def __repr__(self) -> str:
return "<NoVocab for a model without integrated vocabulary>"
return metadata
@runtime_checkable
class Vocab(BaseVocab, Protocol):
vocab_size: int
added_tokens_dict: dict[str, int]
added_tokens_list: list[str]
fname_tokenizer: Path
def __init__(self, base_path: Path): ...
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: ...
class BpeVocab(Vocab):
tokenizer_model = "gpt2"
name = "bpe"
def __init__(self, base_path: Path):
added_tokens: dict[str, int] = {}
if (fname_tokenizer := base_path / 'vocab.json').exists():
# "slow" tokenizer
with open(fname_tokenizer, encoding="utf-8") as f:
self.vocab = json.load(f)
try:
# FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
added_tokens = json.load(f)
except FileNotFoundError:
pass
else:
# "fast" tokenizer
fname_tokenizer = base_path / FAST_TOKENIZER_FILE
# if this fails, FileNotFoundError propagates to caller
with open(fname_tokenizer, encoding="utf-8") as f:
tokenizer_json = json.load(f)
tokenizer_model: dict[str, Any] = tokenizer_json['model']
if (
tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False)
or tokenizer_json['decoder']['type'] != 'ByteLevel'
):
raise FileNotFoundError('Cannot find GPT-2 BPE tokenizer')
self.vocab = tokenizer_model["vocab"]
if (added := tokenizer_json.get('added_tokens')) is not None:
# Added tokens here can be duplicates of the main vocabulary.
added_tokens = {item['content']: item['id']
for item in added
if item['content'] not in self.vocab}
vocab_size = len(self.vocab)
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
actual_ids = sorted(added_tokens.values())
if expected_ids != actual_ids:
expected_end_id = vocab_size + len(actual_ids) - 1
raise ValueError(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range "
f"{vocab_size} - {expected_end_id}; got {actual_ids}")
items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
self.added_tokens_dict = added_tokens
self.added_tokens_list = [text for (text, idx) in items]
self.vocab_size_base = vocab_size
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
self.fname_tokenizer = fname_tokenizer
def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
for i, _ in enumerate(self.vocab):
yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
for text in self.added_tokens_list:
score = -1000.0
yield text.encode("utf-8"), score, gguf.TokenType.CONTROL
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
yield from self.bpe_tokens()
yield from self.added_tokens()
def __repr__(self) -> str:
return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
class SentencePieceVocab(Vocab):
tokenizer_model = "llama"
name = "spm"
def __init__(self, base_path: Path):
added_tokens: dict[str, int] = {}
if (fname_tokenizer := base_path / 'tokenizer.model').exists():
# normal location
try:
with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
added_tokens = json.load(f)
except FileNotFoundError:
pass
elif not (fname_tokenizer := base_path.parent / 'tokenizer.model').exists():
# not found in alternate location either
raise FileNotFoundError('Cannot find tokenizer.model')
self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
vocab_size = self.sentencepiece_tokenizer.vocab_size()
new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
actual_new_ids = sorted(new_tokens.keys())
if expected_new_ids != actual_new_ids:
raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
# Token pieces that were added to the base vocabulary.
self.added_tokens_dict = added_tokens
self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
self.vocab_size_base = vocab_size
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
self.fname_tokenizer = fname_tokenizer
def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
tokenizer = self.sentencepiece_tokenizer
for i in range(tokenizer.vocab_size()):
piece = tokenizer.id_to_piece(i)
text = piece.encode("utf-8")
score: float = tokenizer.get_score(i)
toktype = gguf.TokenType.NORMAL
if tokenizer.is_unknown(i):
toktype = gguf.TokenType.UNKNOWN
if tokenizer.is_control(i):
toktype = gguf.TokenType.CONTROL
# NOTE: I think added_tokens are user defined.
# ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
# if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
if tokenizer.is_unused(i):
toktype = gguf.TokenType.UNUSED
if tokenizer.is_byte(i):
toktype = gguf.TokenType.BYTE
yield text, score, toktype
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
for text in self.added_tokens_list:
score = -1000.0
yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
yield from self.sentencepiece_tokens()
yield from self.added_tokens()
def __repr__(self) -> str:
return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
class LlamaHfVocab(Vocab):
tokenizer_model = "llama"
name = "hfft"
def __init__(self, base_path: Path, ignore_nonllama: bool = False):
fname_tokenizer = base_path / FAST_TOKENIZER_FILE
# if this fails, FileNotFoundError propagates to caller
with open(fname_tokenizer, encoding='utf-8') as f:
tokenizer_json = json.load(f)
# pre-check so we know if we need transformers
tokenizer_model: dict[str, Any] = tokenizer_json['model']
if ignore_nonllama:
pass # workaround incorrect use of this class for WordPiece
elif (
tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
or tokenizer_json['decoder']['type'] != 'Sequence'
):
raise FileNotFoundError('Cannot find Llama BPE tokenizer')
try:
from transformers import AutoTokenizer
except ImportError as e:
raise ImportError(
"To use LlamaHfVocab, please install the `transformers` package. "
"You can install it with `pip install transformers`."
) from e
# Allow the tokenizer to default to slow or fast versions.
# Explicitly set tokenizer to use local paths.
self.tokenizer = AutoTokenizer.from_pretrained(
base_path,
cache_dir=base_path,
local_files_only=True,
)
assert self.tokenizer.is_fast # assume tokenizer.json is used
# Initialize lists and dictionaries for added tokens
self.added_tokens_list = []
self.added_tokens_dict = dict()
self.added_tokens_ids = set()
# Process added tokens
for tok, tokidx in sorted(
self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]
):
# Only consider added tokens that are not in the base vocabulary
if tokidx >= self.tokenizer.vocab_size:
self.added_tokens_list.append(tok)
self.added_tokens_dict[tok] = tokidx
self.added_tokens_ids.add(tokidx)
# Store special tokens and their IDs
self.specials = {
tok: self.tokenizer.get_vocab()[tok]
for tok in self.tokenizer.all_special_tokens
}
self.special_ids = set(self.tokenizer.all_special_ids)
# Set vocabulary sizes
self.vocab_size_base = self.tokenizer.vocab_size
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
self.fname_tokenizer = fname_tokenizer
def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
reverse_vocab = {
id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()
}
for token_id in range(self.vocab_size_base):
# Skip processing added tokens here
if token_id in self.added_tokens_ids:
continue
# Convert token text to bytes
token_text = reverse_vocab[token_id].encode("utf-8")
# Yield token text, score, and type
yield token_text, self.get_token_score(token_id), self.get_token_type(
token_id, token_text, self.special_ids # Reuse already stored special IDs
)
def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType:
# Special case for byte tokens
if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
return gguf.TokenType.BYTE
# Determine token type based on whether it's a special token
return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
def get_token_score(self, token_id: int) -> float:
# Placeholder for actual logic to determine the token's score
# This needs to be implemented based on specific requirements
return -1000.0 # Default score
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
for text in self.added_tokens_list:
if text in self.specials:
toktype = self.get_token_type(self.specials[text], b'', self.special_ids)
score = self.get_token_score(self.specials[text])
else:
toktype = gguf.TokenType.USER_DEFINED
score = -1000.0
yield text.encode("utf-8"), score, toktype
def has_newline_token(self):
return "<0x0A>" in self.tokenizer.vocab or "\n" in self.tokenizer.vocab
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
yield from self.hf_tokens()
yield from self.added_tokens()
def __repr__(self) -> str:
return f"<LlamaHfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
#
@@ -389,6 +637,7 @@ class Metadata:
def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
# print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
if n_head_kv is not None and n_head != n_head_kv:
n_head = n_head_kv
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
@@ -397,17 +646,16 @@ def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
class Tensor(ABC):
ndarray: NDArray
data_type: DataType
@abstractmethod
def astype(self, data_type: DataType) -> Self: ...
def astype(self, data_type: DataType) -> Tensor: ...
@abstractmethod
def permute(self, n_head: int, n_head_kv: int) -> Self: ...
def permute(self, n_head: int, n_head_kv: int) -> Tensor: ...
@abstractmethod
def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> Self: ...
def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor: ...
@abstractmethod
def part(self, n_part: int) -> Self: ...
def part(self, n_part: int) -> UnquantizedTensor: ...
@abstractmethod
def to_ggml(self) -> GGMLCompatibleTensor: ...
@@ -424,13 +672,13 @@ class UnquantizedTensor(Tensor):
self.ndarray = ndarray
self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]
def astype(self, data_type: DataType) -> UnquantizedTensor:
def astype(self, data_type: DataType) -> Tensor:
dtype = data_type.dtype
if self.data_type == DT_BF16:
self.ndarray = bf16_to_fp32(self.ndarray)
return UnquantizedTensor(self.ndarray.astype(dtype))
def to_ggml(self) -> Self:
def to_ggml(self) -> UnquantizedTensor:
return self
def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor:
@@ -580,15 +828,6 @@ def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
return LazyTensor(load, s, lazy_tensor.data_type, 'part ' + lazy_tensor.description)
def pack_experts_lazy(lazy_tensors: list[LazyTensor]) -> LazyTensor:
def load() -> Tensor:
tensors = [lazy_tensor.load() for lazy_tensor in lazy_tensors]
return UnquantizedTensor(np.array([tensor.ndarray for tensor in tensors]))
s = lazy_tensors[0].shape.copy()
s.insert(0, len(lazy_tensors))
return LazyTensor(load, s, lazy_tensors[0].data_type, 'pack_experts ' + ' | '.join(lt.description for lt in lazy_tensors))
# Functionality that simulates `torch.load` but where individual tensors are
# only loaded into memory on demand, not all at once.
# PyTorch can't do this natively as of time of writing:
@@ -649,7 +888,7 @@ class LazyUnpickler(pickle.Unpickler):
def rebuild_from_type_v2(func, new_type, args, state):
return func(*args)
CLASSES: dict[tuple[str, str], type[LazyTensor] | LazyStorageKind] = {
CLASSES = {
# getattr used here as a workaround for mypy not being smart enough to determine
# the staticmethods have a __func__ attribute.
('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
@@ -778,12 +1017,12 @@ def check_vocab_size(params: Params, vocab: BaseVocab, pad_vocab: bool = False)
# Check for a vocab size mismatch
if params.n_vocab == vocab.vocab_size:
logger.warning("Ignoring added_tokens.json since model matches vocab size without it.")
print("Ignoring added_tokens.json since model matches vocab size without it.")
return
if pad_vocab and params.n_vocab > vocab.vocab_size:
pad_count = params.n_vocab - vocab.vocab_size
logger.debug(
print(
f"Padding vocab with {pad_count} token(s) - <dummy00001> through <dummy{pad_count:05}>"
)
for i in range(1, pad_count + 1):
@@ -805,42 +1044,21 @@ class OutputFile:
def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
def add_meta_model(self, params: Params, metadata: Metadata) -> None:
# Metadata About The Model And Its Provenence
name = "LLaMA"
if metadata is not None and metadata.name is not None:
name = metadata.name
elif params.path_model is not None:
name = params.path_model.name
elif params.n_ctx == 4096:
# Heuristic detection of LLaMA v2 model
name = "LLaMA v2"
self.gguf.add_name(name)
if metadata is not None:
if metadata.author is not None:
self.gguf.add_author(metadata.author)
if metadata.version is not None:
self.gguf.add_version(metadata.version)
if metadata.url is not None:
self.gguf.add_url(metadata.url)
if metadata.description is not None:
self.gguf.add_description(metadata.description)
if metadata.licence is not None:
self.gguf.add_licence(metadata.licence)
if metadata.source_url is not None:
self.gguf.add_source_url(metadata.source_url)
if metadata.source_hf_repo is not None:
self.gguf.add_source_hf_repo(metadata.source_hf_repo)
def add_meta_arch(self, params: Params) -> None:
# Metadata About The Neural Architecture Itself
self.gguf.add_vocab_size(params.n_vocab)
self.gguf.add_context_length(params.n_ctx)
self.gguf.add_embedding_length(params.n_embd)
self.gguf.add_block_count(params.n_layer)
self.gguf.add_feed_forward_length(params.n_ff)
name = "LLaMA"
# TODO: better logic to determine model name
if params.n_ctx == 4096:
name = "LLaMA v2"
elif params.path_model is not None:
name = str(params.path_model.parent).split('/')[-1]
self.gguf.add_name (name)
self.gguf.add_vocab_size (params.n_vocab)
self.gguf.add_context_length (params.n_ctx)
self.gguf.add_embedding_length (params.n_embd)
self.gguf.add_block_count (params.n_layer)
self.gguf.add_feed_forward_length (params.n_ff)
self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
self.gguf.add_head_count (params.n_head)
self.gguf.add_head_count_kv (params.n_head_kv)
@@ -932,7 +1150,7 @@ class OutputFile:
elapsed = time.time() - start
size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
padi = len(str(len(model)))
logger.info(
print(
f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
)
self.gguf.write_tensor_data(ndarray)
@@ -943,14 +1161,13 @@ class OutputFile:
@staticmethod
def write_vocab_only(
fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: Metadata = None,
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
) -> None:
check_vocab_size(params, vocab, pad_vocab=pad_vocab)
of = OutputFile(fname_out, endianess=endianess)
# meta data
of.add_meta_model(params, metadata)
of.add_meta_arch(params)
of.add_meta_vocab(vocab)
of.add_meta_special_vocab(svocab)
@@ -977,14 +1194,12 @@ class OutputFile:
fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
pad_vocab: bool = False,
metadata: Metadata = None,
) -> None:
check_vocab_size(params, vocab, pad_vocab=pad_vocab)
of = OutputFile(fname_out, endianess=endianess)
# meta data
of.add_meta_model(params, metadata)
of.add_meta_arch(params)
if isinstance(vocab, Vocab):
of.add_meta_vocab(vocab)
@@ -1020,37 +1235,6 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileT
raise ValueError(f"Unexpected combination of types: {name_to_type}")
def model_parameter_count(model: LazyModel) -> int:
total_model_parameters = 0
for i, (name, lazy_tensor) in enumerate(model.items()):
sum_weights_in_tensor = 1
for dim in lazy_tensor.shape:
sum_weights_in_tensor *= dim
total_model_parameters += sum_weights_in_tensor
return total_model_parameters
def model_parameter_count_rounded_notation(model_params_count: int) -> str:
if model_params_count > 1e12 :
# Trillions Of Parameters
scaled_model_params = model_params_count * 1e-12
scale_suffix = "T"
elif model_params_count > 1e9 :
# Billions Of Parameters
scaled_model_params = model_params_count * 1e-9
scale_suffix = "B"
elif model_params_count > 1e6 :
# Millions Of Parameters
scaled_model_params = model_params_count * 1e-6
scale_suffix = "M"
else:
# Thousands Of Parameters
scaled_model_params = model_params_count * 1e-3
scale_suffix = "K"
return f"{round(scaled_model_params)}{scale_suffix}"
def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
for (name, tensor) in model.items()}
@@ -1062,31 +1246,15 @@ def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) ->
tmp = model
# merge experts into one tensor
if params.n_experts and params.n_experts > 0:
for i_l in range(params.n_layer):
for w in range(1, 4):
experts = []
for e in range(params.n_experts):
if f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight" in model:
experts.append(model[f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight"])
del tmp[f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight"]
elif f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight" in model:
experts.append(model[f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight"])
del tmp[f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight"]
else:
raise ValueError(f"Expert tensor not found: layers.{i_l}.feed_forward.experts.{e}.w{w}.weight")
tmp[f"layers.{i_l}.feed_forward.experts.w{w}.weight"] = pack_experts_lazy(experts)
# HF models permut or pack some of the tensors, so we need to undo that
for i in itertools.count():
if f"model.layers.{i}.self_attn.q_proj.weight" in model:
logger.debug(f"Permuting layer {i}")
print(f"Permuting layer {i}")
tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head)
tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv)
# tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
logger.debug(f"Unpacking and permuting layer {i}")
print(f"Unpacking and permuting layer {i}")
tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head)
tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head, params.n_head_kv)
tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy (model[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
@@ -1099,15 +1267,15 @@ def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) ->
tensor_type, name_new = tmap.get_type_and_name(name, try_suffixes = (".weight", ".bias")) or (None, None)
if name_new is None:
if skip_unknown:
logger.warning(f"Unexpected tensor name: {name} - skipping")
print(f"Unexpected tensor name: {name} - skipping")
continue
raise ValueError(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)")
if tensor_type in should_skip:
logger.debug(f"skipping tensor {name_new}")
print(f"skipping tensor {name_new}")
continue
logger.debug(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}")
print(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}")
out[name_new] = lazy_tensor
return out
@@ -1157,7 +1325,7 @@ def load_some_model(path: Path) -> ModelPlus:
# Be extra-friendly and accept either a file or a directory:
if path.is_dir():
# Check if it's a set of safetensors files first
globs = ["model-00001-of-*.safetensors", "model.safetensors", "consolidated.safetensors"]
globs = ["model-00001-of-*.safetensors", "model.safetensors"]
files = [file for glob in globs for file in path.glob(glob)]
if not files:
# Try the PyTorch patterns too, with lower priority
@@ -1172,7 +1340,7 @@ def load_some_model(path: Path) -> ModelPlus:
paths = find_multifile_paths(path)
models_plus: list[ModelPlus] = []
for path in paths:
logger.info(f"Loading model file {path}")
print(f"Loading model file {path}")
models_plus.append(lazy_load_file(path))
model_plus = merge_multifile_models(models_plus)
@@ -1213,7 +1381,7 @@ class VocabFactory:
else:
raise FileNotFoundError(f"Could not find a tokenizer matching any of {vocab_types}")
logger.info(f"Loaded vocab file {vocab.fname_tokenizer!r}, type {vocab.name!r}")
print(f"Loaded vocab file {vocab.fname_tokenizer!r}, type {vocab.name!r}")
return vocab
def load_vocab(self, vocab_types: list[str] | None, model_parent_path: Path) -> tuple[BaseVocab, gguf.SpecialVocab]:
@@ -1230,49 +1398,27 @@ class VocabFactory:
return vocab, special_vocab
def default_convention_outfile(file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> str:
quantization = {
GGMLFileType.AllF32: "F32",
GGMLFileType.MostlyF16: "F16",
GGMLFileType.MostlyQ8_0: "Q8_0",
def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
namestr = {
GGMLFileType.AllF32: "f32",
GGMLFileType.MostlyF16: "f16",
GGMLFileType.MostlyQ8_0:"q8_0",
}[file_type]
parameters = model_parameter_count_rounded_notation(model_params_count)
expert_count = ""
if params.n_experts is not None:
expert_count = f"{params.n_experts}x"
version = ""
if metadata is not None and metadata.version is not None:
version = f"-{metadata.version}"
name = "ggml-model"
if metadata is not None and metadata.name is not None:
name = metadata.name
elif params.path_model is not None:
name = params.path_model.name
return f"{name}{version}-{expert_count}{parameters}-{quantization}"
def default_outfile(model_paths: list[Path], file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> Path:
default_filename = default_convention_outfile(file_type, params, model_params_count, metadata)
ret = model_paths[0].parent / f"{default_filename}.gguf"
ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf"
if ret in model_paths:
logger.error(
sys.stderr.write(
f"Error: Default output path ({ret}) would overwrite the input. "
"Please explicitly specify a path using --outfile.")
"Please explicitly specify a path using --outfile.\n")
sys.exit(1)
return ret
def do_dump_model(model_plus: ModelPlus) -> None:
print(f"model_plus.paths = {model_plus.paths!r}") # noqa: NP100
print(f"model_plus.format = {model_plus.format!r}") # noqa: NP100
print(f"model_plus.vocab = {model_plus.vocab!r}") # noqa: NP100
print(f"model_plus.paths = {model_plus.paths!r}")
print(f"model_plus.format = {model_plus.format!r}")
print(f"model_plus.vocab = {model_plus.vocab!r}")
for name, lazy_tensor in model_plus.model.items():
print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}") # noqa: NP100
print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}")
def main(args_in: list[str] | None = None) -> None:
@@ -1295,31 +1441,8 @@ def main(args_in: list[str] | None = None) -> None:
parser.add_argument("--big-endian", action="store_true", help="model is executed on big endian machine")
parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
parser.add_argument("--metadata", type=Path, help="Specify the path for a metadata file")
parser.add_argument("--get-outfile", action="store_true", help="get calculated default outfile name")
args = parser.parse_args(args_in)
if args.verbose:
logging.basicConfig(level=logging.DEBUG)
elif args.dump_single or args.dump or args.get_outfile:
# Avoid printing anything besides the dump output
logging.basicConfig(level=logging.WARNING)
else:
logging.basicConfig(level=logging.INFO)
metadata = Metadata.load(args.metadata)
if args.get_outfile:
model_plus = load_some_model(args.model)
params = Params.load(model_plus)
model = convert_model_names(model_plus.model, params, args.skip_unknown)
model_params_count = model_parameter_count(model_plus.model)
ftype = pick_output_type(model, args.outtype)
print(f"{default_convention_outfile(ftype, params, model_params_count, metadata)}") # noqa: NP100
return
if args.no_vocab and args.vocab_only:
raise ValueError("--vocab-only does not make sense with --no-vocab")
@@ -1333,38 +1456,32 @@ def main(args_in: list[str] | None = None) -> None:
else:
model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None)
model_params_count = model_parameter_count(model_plus.model)
logger.info(f"model parameters count : {model_params_count} ({model_parameter_count_rounded_notation(model_params_count)})")
if args.dump:
do_dump_model(model_plus)
return
endianess = gguf.GGUFEndian.LITTLE
if args.big_endian:
endianess = gguf.GGUFEndian.BIG
params = None
if args.pad_vocab or not args.vocab_only:
params = Params.load(model_plus)
if params.n_ctx == -1:
if args.ctx is None:
msg = """\
The model doesn't have a context size, and you didn't specify one with --ctx
Please specify one with --ctx:
- LLaMA v1: --ctx 2048
- LLaMA v2: --ctx 4096"""
parser.error(textwrap.dedent(msg))
params.n_ctx = args.ctx
params = Params.load(model_plus)
if params.n_ctx == -1:
if args.ctx is None:
msg = """\
The model doesn't have a context size, and you didn't specify one with --ctx
Please specify one with --ctx:
- LLaMA v1: --ctx 2048
- LLaMA v2: --ctx 4096"""
parser.error(textwrap.dedent(msg))
params.n_ctx = args.ctx
if args.outtype:
params.ftype = {
"f32": GGMLFileType.AllF32,
"f16": GGMLFileType.MostlyF16,
"q8_0": GGMLFileType.MostlyQ8_0,
}[args.outtype]
if args.outtype:
params.ftype = {
"f32": GGMLFileType.AllF32,
"f16": GGMLFileType.MostlyF16,
"q8_0": GGMLFileType.MostlyQ8_0,
}[args.outtype]
logger.info(f"params = {params}")
print(f"params = {params}")
model_parent_path = model_plus.paths[0].parent
vocab_path = Path(args.vocab_dir or args.model or model_parent_path)
@@ -1377,39 +1494,29 @@ def main(args_in: list[str] | None = None) -> None:
if not args.outfile:
raise ValueError("need --outfile if using --vocab-only")
outfile = args.outfile
if params is None:
params = Params(
n_vocab = vocab.vocab_size,
n_embd = 1,
n_layer = 1,
n_ctx = 1,
n_ff = 1,
n_head = 1,
n_head_kv = 1,
f_norm_eps = 1e-5,
)
OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
endianess=endianess, pad_vocab=args.pad_vocab, metadata=metadata)
logger.info(f"Wrote {outfile}")
endianess=endianess, pad_vocab=args.pad_vocab)
print(f"Wrote {outfile}")
return
if model_plus.vocab is not None and args.vocab_dir is None and not args.no_vocab:
vocab = model_plus.vocab
logger.info(f"Vocab info: {vocab}")
logger.info(f"Special vocab info: {special_vocab}")
print(f"Vocab info: {vocab}")
print(f"Special vocab info: {special_vocab}")
model = model_plus.model
model = convert_model_names(model, params, args.skip_unknown)
ftype = pick_output_type(model, args.outtype)
model = convert_to_output_type(model, ftype)
outfile = args.outfile or default_outfile(model_plus.paths, ftype, params, model_params_count, metadata)
outfile = args.outfile or default_outfile(model_plus.paths, ftype)
params.ftype = ftype
logger.info(f"Writing {outfile}, format {ftype}")
print(f"Writing {outfile}, format {ftype}")
OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab, metadata=metadata)
logger.info(f"Wrote {outfile}")
concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab)
print(f"Wrote {outfile}")
if __name__ == '__main__':

View File

@@ -23,7 +23,7 @@ Install BLIS:
sudo make install
```
We recommend using openmp since it's easier to modify the cores being used.
We recommend using openmp since it's easier to modify the cores been used.
### llama.cpp compilation

View File

@@ -1,119 +0,0 @@
## Add a new model architecture to `llama.cpp`
Adding a model requires few steps:
1. Convert the model to GGUF
2. Define the model architecture in `llama.cpp`
3. Build the GGML graph implementation
After following these steps, you can open PR.
Also, it is important to check that the examples and main ggml backends (CUDA, METAL, CPU) are working with the new architecture, especially:
- [main](../examples/main)
- [imatrix](../examples/imatrix)
- [quantize](../examples/quantize)
- [server](../examples/server)
### 1. Convert the model to GGUF
This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library.
Depending on the model architecture, you can use either [convert-hf-to-gguf.py](../convert-hf-to-gguf.py) or [examples/convert-legacy-llama.py](../examples/convert-legacy-llama.py) (for `llama/llama2` models in `.pth` format).
The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors.
The required steps to implement for an HF model are:
1. Define the model `Model.register` annotation in a new `Model` subclass, example:
```python
@Model.register("MyModelForCausalLM")
class MyModel(Model):
model_arch = gguf.MODEL_ARCH.GROK
```
2. Define the layout of the GGUF tensors in [constants.py](../gguf-py/gguf/constants.py)
Add an enum entry in `MODEL_ARCH`, the model human friendly name in `MODEL_ARCH_NAMES` and the GGUF tensor names in `MODEL_TENSORS`.
Example for `falcon` model:
```python
MODEL_ARCH.FALCON: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_NORM_2,
MODEL_TENSOR.ATTN_QKV,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
]
```
3. Map the original tensor names to the standardize equivalent in GGUF
As a general rule, before adding a new tensor name to GGUF, be sure the equivalent naming does not already exist.
Once you have found the GGUF tensor name equivalent, add it to the [tensor_mapping.py](../gguf-py/gguf/tensor_mapping.py) file.
If the tensor name is part of a repetitive layer/block, the key word `bid` substitutes it.
Example for the normalization tensor in attention layers:
```python
block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
# Attention norm
MODEL_TENSOR.ATTN_NORM: (
"gpt_neox.layers.{bid}.input_layernorm", # gptneox
"transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen
"transformer.blocks.{bid}.norm_1", # mpt
...
)
}
```
`transformer.blocks.{bid}.norm_1` will be mapped to `blk.{bid}.attn_norm` in GGUF.
Depending on the model configuration, tokenizer, code and tensors layout, you will have to override:
- `Model#set_gguf_parameters`
- `Model#set_vocab`
- `Model#write_tensors`
NOTE: Tensor names must end with `.weight` suffix, that is the convention and several tools like `quantize` expect this to proceed the weights.
### 2. Define the model architecture in `llama.cpp`
The model params and tensors layout must be defined in `llama.cpp`:
1. Define a new `llm_arch`
2. Define the tensors layout in `LLM_TENSOR_NAMES`
3. Add any non standard metadata in `llm_load_hparams`
4. Create the tensors for inference in `llm_load_tensors`
5. If the model has a RoPE operation, add the rope type in `llama_rope_type`
NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorch` dimensions.
### 3. Build the GGML graph implementation
This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`.
Have a look at existing implementation like `build_llama`, `build_dbrx` or `build_bert`.
When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support for missing backend operations can be added in another PR.
Note: to debug the inference graph: you can use [eval-callback](../examples/eval-callback).
## GGUF specification
https://github.com/ggerganov/ggml/blob/master/docs/gguf.md
## Resources
- YaRN RoPE scaling https://github.com/ggerganov/llama.cpp/pull/2268
- support Baichuan serial models https://github.com/ggerganov/llama.cpp/pull/3009
- support attention bias https://github.com/ggerganov/llama.cpp/pull/4283
- Mixtral support https://github.com/ggerganov/llama.cpp/pull/4406
- BERT embeddings https://github.com/ggerganov/llama.cpp/pull/5423
- Grok-1 support https://github.com/ggerganov/llama.cpp/pull/6204
- Command R Plus support https://github.com/ggerganov/llama.cpp/pull/6491
- support arch DBRX https://github.com/ggerganov/llama.cpp/pull/6515
- How to convert HuggingFace model to GGUF format https://github.com/ggerganov/llama.cpp/discussions/2948

View File

@@ -1,104 +0,0 @@
# Debugging Tests Tips
## How to run & execute or debug a specific test without anything else to keep the feedback loop short?
There is a script called debug-test.sh in the scripts folder whose parameter takes a REGEX and an optional test number.
For example, running the following command will output an interactive list from which you can select a test. It takes this form:
`debug-test.sh [OPTION]... <test_regex> <test_number>`
It will then build & run in the debugger for you.
To just execute a test and get back a PASS or FAIL message run:
```bash
./scripts/debug-test.sh test-tokenizer
```
To test in GDB use the `-g` flag to enable gdb test mode.
```bash
./scripts/debug-test.sh -g test-tokenizer
# Once in the debugger, i.e. at the chevrons prompt, setting a breakpoint could be as follows:
>>> b main
```
To speed up the testing loop, if you know your test number you can just run it similar to below:
```bash
./scripts/debug-test.sh test 23
```
For further reference use `debug-test.sh -h` to print help.
&nbsp;
### How does the script work?
If you want to be able to use the concepts contained in the script separately, the important ones are briefly outlined below.
#### Step 1: Reset and Setup folder context
From base of this repository, let's create `build-ci-debug` as our build context.
```bash
rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug
```
#### Step 2: Setup Build Environment and Compile Test Binaries
Setup and trigger a build under debug mode. You may adapt the arguments as needed, but in this case these are sane defaults.
```bash
cmake -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_FATAL_WARNINGS=ON ..
make -j
```
#### Step 3: Find all tests available that matches REGEX
The output of this command will give you the command & arguments needed to run GDB.
* `-R test-tokenizer` : looks for all the test files named `test-tokenizer*` (R=Regex)
* `-N` : "show-only" disables test execution & shows test commands that you can feed to GDB.
* `-V` : Verbose Mode
```bash
ctest -R "test-tokenizer" -V -N
```
This may return output similar to below (focusing on key lines to pay attention to):
```bash
...
1: Test command: ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf"
1: Working Directory: .
Labels: main
Test #1: test-tokenizer-0-llama-spm
...
4: Test command: ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-falcon.gguf"
4: Working Directory: .
Labels: main
Test #4: test-tokenizer-0-falcon
...
```
#### Step 4: Identify Test Command for Debugging
So for test #1 above we can tell these two pieces of relevant information:
* Test Binary: `~/llama.cpp/build-ci-debug/bin/test-tokenizer-0`
* Test GGUF Model: `~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf`
#### Step 5: Run GDB on test command
Based on the ctest 'test command' report above we can then run a gdb session via this command below:
```bash
gdb --args ${Test Binary} ${Test GGUF Model}
```
Example:
```bash
gdb --args ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf"
```

View File

@@ -19,7 +19,6 @@ else()
add_subdirectory(benchmark)
add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(embedding)
add_subdirectory(eval-callback)
add_subdirectory(finetune)
add_subdirectory(gritlm)
add_subdirectory(gguf-split)
@@ -49,7 +48,4 @@ else()
add_subdirectory(server)
endif()
add_subdirectory(export-lora)
if (LLAMA_RPC)
add_subdirectory(rpc)
endif()
endif()

View File

@@ -10,16 +10,16 @@ There are 2 modes of operation:
- `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)
```bash
./batched-bench MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>
./batched-bench MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>
# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 2048 512 0 99
./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 0 99
# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 2048 512 1 99
./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 1 99
# custom set of batches
./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 512 512 0 999 0 128,256,512 128,256 1,2,4,8,16,32
./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32
```
## Sample results

View File

@@ -32,16 +32,13 @@ int main(int argc, char ** argv) {
gpt_params params;
if (argc == 1 || argv[1][0] == '-') {
printf("usage: %s MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [FATTN] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
printf(" <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n");
printf(" example: %s ggml-model-f16.gguf 2048 2048 512 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
printf(" example: %s ggml-model-f16.gguf 2048 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
return 1 ;
}
int n_kv_max = 2048;
int n_batch = 2048;
int n_ubatch = 512;
bool flash_attn = false;
int is_pp_shared = 0;
int n_gpu_layers = 0;
@@ -59,35 +56,23 @@ int main(int argc, char ** argv) {
}
if (argc >= 4) {
n_batch = std::atoi(argv[3]);
is_pp_shared = std::atoi(argv[3]);
}
if (argc >= 5) {
n_ubatch = std::atoi(argv[4]);
n_gpu_layers = std::atoi(argv[4]);
}
if (argc >= 6) {
flash_attn = std::atoi(argv[5]);
n_pp = parse_list(argv[5]);
}
if (argc >= 7) {
is_pp_shared = std::atoi(argv[6]);
n_tg = parse_list(argv[6]);
}
if (argc >= 8) {
n_gpu_layers = std::atoi(argv[7]);
}
if (argc >= 9) {
n_pp = parse_list(argv[8]);
}
if (argc >= 10) {
n_tg = parse_list(argv[9]);
}
if (argc >= 11) {
n_pl = parse_list(argv[10]);
n_pl = parse_list(argv[7]);
}
// init LLM
@@ -113,11 +98,9 @@ int main(int argc, char ** argv) {
llama_context_params ctx_params = llama_context_default_params();
ctx_params.seed = 1234;
ctx_params.n_ctx = n_kv_max;
ctx_params.n_batch = n_batch;
ctx_params.n_ubatch = n_ubatch;
ctx_params.flash_attn = flash_attn;
ctx_params.seed = 1234;
ctx_params.n_ctx = n_kv_max;
ctx_params.n_batch = 512;
ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
@@ -175,7 +158,7 @@ int main(int argc, char ** argv) {
}
LOG_TEE("\n");
LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, n_batch, n_ubatch, flash_attn, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
LOG_TEE("\n");
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");

View File

@@ -153,7 +153,7 @@ while n_cur <= n_len {
// const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
// is it an end of stream? -> mark the stream as finished
if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
if new_token_id == llama_token_eos(model) || n_cur == n_len {
i_batch[i] = -1
// print("")
if n_parallel > 1 {
@@ -229,7 +229,7 @@ private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? {
var result = [CChar](repeating: 0, count: 8)
let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count), false)
let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count))
if nTokens < 0 {
let actualTokensCount = -Int(nTokens)
result = .init(repeating: 0, count: actualTokensCount)
@@ -237,8 +237,7 @@ private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String
model,
token,
&result,
Int32(result.count),
false
Int32(result.count)
)
assert(check == actualTokensCount)
} else {

View File

@@ -48,7 +48,7 @@ int main(int argc, char ** argv) {
params.prompt = "Hello my name is";
}
string_process_escapes(params.prompt);
process_escapes(params.prompt);
// init LLM
@@ -191,8 +191,8 @@ int main(int argc, char ** argv) {
//const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
// is it an end of generation? -> mark the stream as finished
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
// is it an end of stream? -> mark the stream as finished
if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
i_batch[i] = -1;
LOG_TEE("\n");
if (n_parallel > 1) {

View File

@@ -47,7 +47,7 @@ struct beam_search_callback_data {
// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
// For example, eob can be flagged due to maximum token length, stop words, etc.
static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
return n_tokens && llama_token_is_eog(llama_get_model(callback_data.ctx), tokens[n_tokens-1]);
return n_tokens && tokens[n_tokens-1] == llama_token_eos(llama_get_model(callback_data.ctx));
}
// Function matching type llama_beam_search_callback_fn_t.

View File

@@ -2,7 +2,7 @@
This example reads weights from project [llama2.c](https://github.com/karpathy/llama2.c) and saves them in ggml compatible format. The vocab that is available in `models/ggml-vocab.bin` is used by default.
To convert the model first download the models from the [llama2.c](https://github.com/karpathy/llama2.c) repository:
To convert the model first download the models from the [llma2.c](https://github.com/karpathy/llama2.c) repository:
`$ make -j`

View File

@@ -774,7 +774,7 @@ static struct train_params get_default_train_params() {
params.samples_start_after_nl = false;
params.use_adam = true;
params.use_flash = false;
params.use_flash = true;
params.use_scratch = true;
// only adam

View File

@@ -49,12 +49,6 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
}
float * out = output + batch.seq_id[i][0] * n_embd;
//TODO: I would also add a parameter here to enable normalization or not.
/*fprintf(stdout, "unnormalized_embedding:");
for (int hh = 0; hh < n_embd; hh++) {
fprintf(stdout, "%9.6f ", embd[hh]);
}
fprintf(stdout, "\n");*/
llama_embd_normalize(embd, out, n_embd);
}
}
@@ -80,7 +74,7 @@ int main(int argc, char ** argv) {
std::mt19937 rng(params.seed);
if (params.random_prompt) {
params.prompt = string_random_prompt(rng);
params.prompt = gpt_random_prompt(rng);
}
llama_backend_init();
@@ -107,7 +101,7 @@ int main(int argc, char ** argv) {
// print system information
{
fprintf(stderr, "\n");
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
fprintf(stderr, "%s\n", get_system_info(params).c_str());
}
// split the prompt into lines
@@ -129,12 +123,10 @@ int main(int argc, char ** argv) {
inputs.push_back(inp);
}
// check if the last token is SEP
// it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
// add eos if not present
for (auto & inp : inputs) {
if (inp.empty() || inp.back() != llama_token_sep(model)) {
fprintf(stderr, "%s: warning: last token in the prompt is not SEP\n", __func__);
fprintf(stderr, "%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
if (inp.empty() || inp.back() != llama_token_eos(model)) {
inp.push_back(llama_token_eos(model));
}
}
@@ -211,7 +203,6 @@ int main(int argc, char ** argv) {
// clean up
llama_print_timings(ctx);
llama_batch_free(batch);
llama_free(ctx);
llama_free_model(model);
llama_backend_free();

View File

@@ -1,9 +0,0 @@
set(TARGET eval-callback)
add_executable(${TARGET} eval-callback.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
set(TEST_TARGET test-eval-callback)
add_test(NAME ${TEST_TARGET} COMMAND eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)

View File

@@ -1,95 +0,0 @@
# llama.cpp/examples/eval-callback
A simple example which demonstrates how to use callback during the inference.
It simply prints to the console all operations and tensor data.
Usage:
```shell
eval-callback \
--hf-repo ggml-org/models \
--hf-file phi-2/ggml-model-q4_0.gguf \
--model phi-2-q4_0.gguf \
--prompt hello \
--seed 42 \
-ngl 33
```
Will print:
```shell
llm_load_tensors: offloaded 33/33 layers to GPU
...
llama_new_context_with_model: n_ctx = 512
...
llama_new_context_with_model: CUDA0 compute buffer size = 105.00 MiB
llama_new_context_with_model: CUDA_Host compute buffer size = 6.01 MiB
llama_new_context_with_model: graph nodes = 1225
llama_new_context_with_model: graph splits = 2
ggml_debug: inp_embd = (f32) GET_ROWS(token_embd.weight{2560, 51200, 1, 1}, inp_tokens{1, 1, 1, 1}}) = {2560, 1, 1, 1}
[
[
[ -0.0181, 0.0272, 0.0272, ...],
],
]
ggml_debug: norm-0 = (f32) NORM(CUDA0#inp_embd#0{2560, 1, 1, 1}, }) = {2560, 1, 1, 1}
[
[
[ -0.6989, 1.0636, 1.0636, ...],
],
]
ggml_debug: norm_w-0 = (f32) MUL(norm-0{2560, 1, 1, 1}, blk.0.attn_norm.weight{2560, 1, 1, 1}}) = {2560, 1, 1, 1}
[
[
[ -0.1800, 0.2817, 0.2632, ...],
],
]
ggml_debug: attn_norm-0 = (f32) ADD(norm_w-0{2560, 1, 1, 1}, blk.0.attn_norm.bias{2560, 1, 1, 1}}) = {2560, 1, 1, 1}
[
[
[ -0.1863, 0.2970, 0.2604, ...],
],
]
ggml_debug: wqkv-0 = (f32) MUL_MAT(blk.0.attn_qkv.weight{2560, 7680, 1, 1}, attn_norm-0{2560, 1, 1, 1}}) = {7680, 1, 1, 1}
[
[
[ -1.1238, 1.2876, -1.8086, ...],
],
]
ggml_debug: bqkv-0 = (f32) ADD(wqkv-0{7680, 1, 1, 1}, blk.0.attn_qkv.bias{7680, 1, 1, 1}}) = {7680, 1, 1, 1}
[
[
[ -1.1135, 1.4604, -1.9226, ...],
],
]
ggml_debug: bqkv-0 (view) = (f32) VIEW(bqkv-0{7680, 1, 1, 1}, }) = {2560, 1, 1, 1}
[
[
[ -1.1135, 1.4604, -1.9226, ...],
],
]
ggml_debug: Qcur-0 = (f32) CONT(bqkv-0 (view){2560, 1, 1, 1}, }) = {2560, 1, 1, 1}
[
[
[ -1.1135, 1.4604, -1.9226, ...],
],
]
ggml_debug: Qcur-0 (reshaped) = (f32) RESHAPE(Qcur-0{2560, 1, 1, 1}, }) = {80, 32, 1, 1}
[
[
[ -1.1135, 1.4604, -1.9226, ...],
[ -0.3608, 0.5076, -1.8866, ...],
[ 1.7643, 0.0273, -2.1065, ...],
...
],
]
ggml_debug: Qcur-0 = (f32) ROPE(Qcur-0 (reshaped){80, 32, 1, 1}, CUDA0#inp_pos#0{1, 1, 1, 1}}) = {80, 32, 1, 1}
[
[
[ -1.1135, 1.4604, -1.9226, ...],
[ -0.3608, 0.5076, -1.8866, ...],
[ 1.7643, 0.0273, -2.1065, ...],
...
],
]
```

View File

@@ -1,195 +0,0 @@
#include "common.h"
#include "llama.h"
#include "ggml.h"
#include <cstdio>
#include <random>
#include <string>
#include <tuple>
#include <vector>
/**
* This the arbitrary data which will be passed to each callback.
* Later on we can for example add operation or tensor name filter from the CLI arg, or a file descriptor to dump the tensor.
*/
struct callback_data {
std::vector<uint8_t> data;
};
static std::string ggml_ne_string(const ggml_tensor * t) {
std::string str;
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
str += std::to_string(t->ne[i]);
if (i + 1 < GGML_MAX_DIMS) {
str += ", ";
}
}
return str;
}
static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
GGML_ASSERT(n > 0);
float sum = 0;
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
printf(" [\n");
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
if (i2 == n && ne[2] > 2*n) {
printf(" ..., \n");
i2 = ne[2] - n;
}
printf(" [\n");
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
if (i1 == n && ne[1] > 2*n) {
printf(" ..., \n");
i1 = ne[1] - n;
}
printf(" [");
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
if (i0 == n && ne[0] > 2*n) {
printf("..., ");
i0 = ne[0] - n;
}
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
float v;
if (type == GGML_TYPE_F16) {
v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
} else if (type == GGML_TYPE_F32) {
v = *(float *) &data[i];
} else if (type == GGML_TYPE_I32) {
v = (float) *(int32_t *) &data[i];
} else if (type == GGML_TYPE_I16) {
v = (float) *(int16_t *) &data[i];
} else if (type == GGML_TYPE_I8) {
v = (float) *(int8_t *) &data[i];
} else {
GGML_ASSERT(false);
}
printf("%12.4f", v);
sum += v;
if (i0 < ne[0] - 1) printf(", ");
}
printf("],\n");
}
printf(" ],\n");
}
printf(" ]\n");
printf(" sum = %f\n", sum);
}
}
/**
* GGML operations callback during the graph execution.
*
* @param t current tensor
* @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
* if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
* see ggml_backend_sched_eval_callback
* @param user_data user data to pass at each call back
* @return true to receive data or continue the graph, false otherwise
*/
static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
auto * cb_data = (callback_data *) user_data;
const struct ggml_tensor * src0 = t->src[0];
const struct ggml_tensor * src1 = t->src[1];
if (ask) {
return true; // Always retrieve data
}
char src1_str[128] = {0};
if (src1) {
sprintf(src1_str, "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
}
printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
t->name, ggml_type_name(t->type), ggml_op_desc(t),
src0->name, ggml_ne_string(src0).c_str(),
src1 ? src1_str : "",
ggml_ne_string(t).c_str());
// copy the data from the GPU memory if needed
const bool is_host = ggml_backend_buffer_is_host(t->buffer);
if (!is_host) {
auto n_bytes = ggml_nbytes(t);
cb_data->data.resize(n_bytes);
ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
}
if (!ggml_is_quantized(t->type)) {
uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
}
return true;
}
static bool run(llama_context * ctx, const gpt_params & params) {
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return false;
}
return true;
}
int main(int argc, char ** argv) {
callback_data cb_data;
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
return 1;
}
print_build_info();
std::mt19937 rng(params.seed);
if (params.random_prompt) {
params.prompt = string_random_prompt(rng);
}
llama_backend_init();
llama_numa_init(params.numa);
// pass the callback to the backend scheduler
// it will be executed for each node during the graph computation
params.cb_eval = ggml_debug;
params.cb_eval_user_data = &cb_data;
params.warmup = false;
// init
llama_model * model;
llama_context * ctx;
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == nullptr || ctx == nullptr) {
fprintf(stderr, "%s : failed to init\n", __func__);
return 1;
}
// print system information
{
fprintf(stderr, "\n");
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
}
bool OK = run(ctx, params);
if (!OK) {
return 1;
}
llama_print_timings(ctx);
llama_free(ctx);
llama_free_model(model);
llama_backend_free();
return 0;
}

View File

@@ -563,8 +563,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
// not capturing these, to silcence warnings
const int rope_mode = 0;
return ggml_rope_ext(ctx,
t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx, 0,
return ggml_rope_custom(ctx,
t, KQ_pos, n_rot, rope_mode, n_ctx, 0,
rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
);
};
@@ -575,7 +575,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
GGML_ASSERT(tokens_input->type == GGML_TYPE_I32);
auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16 || a->type == GGML_TYPE_BF16) {
if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16) {
return ggml_add_cast(ctx, a, b, GGML_TYPE_F32);
} else if (a->type == GGML_TYPE_F32) {
return ggml_add(ctx, a, b);
@@ -643,8 +643,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
struct ggml_tensor * t15 = ggml_permute (ctx, t12, 0, 3, 1, 2); set_name(t15, "t15"); assert_shape_4d(t15, N, n_embd_head, n_head_kv, n_batch);
struct ggml_tensor * t16;
if (enable_flash_attn) {
GGML_ASSERT(false && "TODO: ggml_flash_attn_ext() not yet supported");
//t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd_head, N, n_head, n_batch);
t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd_head, N, n_head, n_batch);
} else {
struct ggml_tensor * t16_0 = ggml_mul_mat (ctx, t14, t13); set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch);
struct ggml_tensor * t16_1 = ggml_scale_inplace (ctx, t16_0, kv_scale); set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch);

View File

@@ -1,5 +0,0 @@
set(TARGET gbnf-validator)
add_executable(${TARGET} gbnf-validator.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common grammar-parser llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View File

@@ -1,132 +0,0 @@
#define LLAMA_API_INTERNAL
#include "grammar-parser.h"
#include "ggml.h"
#include "llama.h"
#include "unicode.h"
#include <cstdio>
#include <cstdlib>
#include <string>
#include <vector>
static bool llama_sample_grammar_string(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) {
auto decoded = decode_utf8(input_str, {});
const auto & code_points = decoded.first;
size_t pos = 0;
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
auto prev_stacks = grammar->stacks;
llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks);
if (grammar->stacks.empty()) {
error_pos = pos;
error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'";
grammar->stacks = prev_stacks;
return false;
}
++pos;
}
for (const auto & stack : grammar->stacks) {
if (stack.empty()) {
return true;
}
}
error_pos = pos;
error_msg = "Unexpected end of input";
return false;
}
static void print_error_message(const std::string & input_str, size_t error_pos, const std::string & error_msg) {
fprintf(stdout, "Input string is invalid according to the grammar.\n");
fprintf(stdout, "Error: %s at position %zu\n", error_msg.c_str(), error_pos);
fprintf(stdout, "\n");
fprintf(stdout, "Input string:\n");
fprintf(stdout, "%s", input_str.substr(0, error_pos).c_str());
if (error_pos < input_str.size()) {
fprintf(stdout, "\033[1;31m%c", input_str[error_pos]);
if (error_pos+1 < input_str.size()) {
fprintf(stdout, "\033[0;31m%s", input_str.substr(error_pos+1).c_str());
}
fprintf(stdout, "\033[0m\n");
}
}
int main(int argc, char** argv) {
if (argc != 3) {
fprintf(stdout, "Usage: %s <grammar_filename> <input_filename>\n", argv[0]);
return 1;
}
const std::string grammar_filename = argv[1];
const std::string input_filename = argv[2];
// Read the GBNF grammar file
FILE* grammar_file = fopen(grammar_filename.c_str(), "r");
if (!grammar_file) {
fprintf(stdout, "Failed to open grammar file: %s\n", grammar_filename.c_str());
return 1;
}
fseek(grammar_file, 0, SEEK_END);
size_t grammar_size = ftell(grammar_file);
fseek(grammar_file, 0, SEEK_SET);
std::string grammar_str(grammar_size, ' ');
fread(&grammar_str[0], 1, grammar_size, grammar_file);
fclose(grammar_file);
// Parse the GBNF grammar
auto parsed_grammar = grammar_parser::parse(grammar_str.c_str());
// will be empty (default) if there are parse errors
if (parsed_grammar.rules.empty()) {
fprintf(stdout, "%s: failed to parse grammar\n", __func__);
return 1;
}
// Ensure that there is a "root" node.
if (parsed_grammar.symbol_ids.find("root") == parsed_grammar.symbol_ids.end()) {
fprintf(stdout, "%s: grammar does not contain a 'root' symbol\n", __func__);
return 1;
}
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
// Create the LLAMA grammar
auto grammar = llama_grammar_init(
grammar_rules.data(),
grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
// Read the input file
FILE* input_file = fopen(input_filename.c_str(), "r");
if (!input_file) {
fprintf(stdout, "Failed to open input file: %s\n", input_filename.c_str());
return 1;
}
fseek(input_file, 0, SEEK_END);
size_t input_size = ftell(input_file);
fseek(input_file, 0, SEEK_SET);
std::string input_str(input_size, ' ');
fread(&input_str[0], 1, input_size, input_file);
fclose(input_file);
// Validate the input string against the grammar
size_t error_pos;
std::string error_msg;
bool is_valid = llama_sample_grammar_string(grammar, input_str, error_pos, error_msg);
if (is_valid) {
fprintf(stdout, "Input string is valid according to the grammar.\n");
} else {
print_error_message(input_str, error_pos, error_msg);
}
// Clean up
llama_grammar_free(grammar);
return 0;
}

View File

@@ -5,6 +5,5 @@ CLI to split / merge GGUF files.
**Command line options:**
- `--split`: split GGUF to multiple GGUF, default operation.
- `--split-max-size`: max size per split in `M` or `G`, f.ex. `500M` or `2G`.
- `--split-max-tensors`: maximum tensors in each split: default(128)
- `--merge`: merge multiple GGUF to a single GGUF.

View File

@@ -32,7 +32,6 @@ struct split_params {
int n_split_tensors = 128;
std::string input;
std::string output;
bool no_tensor_first_split = false;
bool dry_run = false;
};
@@ -50,7 +49,6 @@ static void split_print_usage(const char * executable) {
printf(" --merge merge multiple GGUF to a single GGUF\n");
printf(" --split-max-tensors max tensors in each split (default: %d)\n", default_params.n_split_tensors);
printf(" --split-max-size N(M|G) max size per split\n");
printf(" --no-tensor-first-split do not add tensors to the first split (disabled by default)\n");
printf(" --dry-run only print out a split plan and exit, without writing any new files\n");
printf("\n");
}
@@ -61,10 +59,10 @@ static size_t split_str_to_n_bytes(std::string str) {
int n;
if (str.back() == 'M') {
sscanf(str.c_str(), "%d", &n);
n_bytes = (size_t)n * 1024 * 1024; // megabytes
n_bytes = n * 1024 * 1024; // megabytes
} else if (str.back() == 'G') {
sscanf(str.c_str(), "%d", &n);
n_bytes = (size_t)n * 1024 * 1024 * 1024; // gigabytes
n_bytes = n * 1024 * 1024 * 1024; // gigabytes
} else {
throw std::invalid_argument("error: supported units are M (megabytes) or G (gigabytes), but got: " + std::string(1, str.back()));
}
@@ -102,10 +100,6 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p
arg_found = true;
params.dry_run = true;
}
if (arg == "--no-tensor-first-split") {
arg_found = true;
params.no_tensor_first_split = true;
}
if (is_op_set) {
throw std::invalid_argument("error: either --split or --merge can be specified, but not both");
@@ -206,10 +200,10 @@ struct split_strategy {
// because we need to know list of tensors for each file in advance, we will build all the ctx_out for all output splits
int i_split = -1;
struct gguf_context * ctx_out = NULL;
auto new_ctx_out = [&](bool allow_no_tensors) {
auto new_ctx_out = [&]() {
i_split++;
if (ctx_out != NULL) {
if (gguf_get_n_tensors(ctx_out) == 0 && !allow_no_tensors) {
if (gguf_get_n_tensors(ctx_out) == 0) {
fprintf(stderr, "error: one of splits have 0 tensors. Maybe size or tensors limit is too small\n");
exit(EXIT_FAILURE);
}
@@ -226,12 +220,7 @@ struct split_strategy {
};
// initialize ctx_out for the first split
new_ctx_out(false);
// skip first split if no_tensor_first_split is set
if (params.no_tensor_first_split) {
new_ctx_out(true);
}
new_ctx_out();
// process tensors one by one
size_t curr_tensors_size = 0; // current size by counting only tensors size (without metadata)
@@ -241,7 +230,7 @@ struct split_strategy {
size_t n_bytes = GGML_PAD(ggml_nbytes(t), GGUF_DEFAULT_ALIGNMENT);
size_t next_tensors_size = curr_tensors_size + n_bytes;
if (should_split(i, next_tensors_size)) {
new_ctx_out(false);
new_ctx_out();
curr_tensors_size = n_bytes;
} else {
curr_tensors_size = next_tensors_size;

View File

@@ -1,89 +0,0 @@
#!/bin/bash
set -eu
if [ $# -lt 1 ]
then
echo "usage: $0 path_to_build_binary [path_to_temp_folder]"
echo "example: $0 ../../build/bin ../../tmp"
exit 1
fi
if [ $# -gt 1 ]
then
TMP_DIR=$2
else
TMP_DIR=/tmp
fi
set -x
SPLIT=$1/gguf-split
MAIN=$1/main
WORK_PATH=$TMP_DIR/gguf-split
ROOT_DIR=$(realpath $(dirname $0)/../../)
mkdir -p "$WORK_PATH"
# Clean up in case of previously failed test
rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-merge*.gguf
# 1. Get a model
(
cd $WORK_PATH
"$ROOT_DIR"/scripts/hf.sh --repo ggml-org/gemma-1.1-2b-it-Q8_0-GGUF --file gemma-1.1-2b-it.Q8_0.gguf
)
echo PASS
# 2. Split with max tensors strategy
$SPLIT --split-max-tensors 28 $WORK_PATH/gemma-1.1-2b-it.Q8_0.gguf $WORK_PATH/ggml-model-split
echo PASS
echo
# 2b. Test the sharded model is loading properly
$MAIN --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --random-prompt --n-predict 32
echo PASS
echo
# 3. Merge
$SPLIT --merge $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-merge.gguf
echo PASS
echo
# 3b. Test the merged model is loading properly
$MAIN --model $WORK_PATH/ggml-model-merge.gguf --random-prompt --n-predict 32
echo PASS
echo
# 4. Split with no tensors in the first split
$SPLIT --split-max-tensors 32 --no-tensor-first-split $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-32-tensors
echo PASS
echo
# 4b. Test the sharded model is loading properly
$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --random-prompt --n-predict 32
echo PASS
echo
# 5. Merge
#$SPLIT --merge $WORK_PATH/ggml-model-split-32-tensors-00001-of-00006.gguf $WORK_PATH/ggml-model-merge-2.gguf
#echo PASS
#echo
# 5b. Test the merged model is loading properly
#$MAIN --model $WORK_PATH/ggml-model-merge-2.gguf --random-prompt --n-predict 32
#echo PASS
#echo
# 6. Split with size strategy
$SPLIT --split-max-size 2G $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-2G
echo PASS
echo
# 6b. Test the sharded model is loading properly
$MAIN --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --random-prompt --n-predict 32
echo PASS
echo
# Clean up
rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-merge*.gguf

View File

@@ -142,7 +142,7 @@ static bool gguf_ex_read_0(const std::string & fname) {
}
// read and create ggml_context containing the tensors and their data
static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
static bool gguf_ex_read_1(const std::string & fname) {
struct ggml_context * ctx_data = NULL;
struct gguf_init_params params = {
@@ -206,7 +206,7 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
printf("\n\n");
// check data
if (check_data) {
{
const float * data = (const float *) cur->data;
for (int j = 0; j < ggml_nelements(cur); ++j) {
if (data[j] != 100 + i) {
@@ -229,16 +229,9 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
int main(int argc, char ** argv) {
if (argc < 3) {
printf("usage: %s data.gguf r|w [n]\n", argv[0]);
printf("r: read data.gguf file\n");
printf("w: write data.gguf file\n");
printf("n: no check of tensor data\n");
printf("usage: %s data.gguf r|w\n", argv[0]);
return -1;
}
bool check_data = true;
if (argc == 4) {
check_data = false;
}
const std::string fname(argv[1]);
const std::string mode (argv[2]);
@@ -249,7 +242,7 @@ int main(int argc, char ** argv) {
GGML_ASSERT(gguf_ex_write(fname) && "failed to write gguf file");
} else if (mode == "r") {
GGML_ASSERT(gguf_ex_read_0(fname) && "failed to read gguf file");
GGML_ASSERT(gguf_ex_read_1(fname, check_data) && "failed to read gguf file");
GGML_ASSERT(gguf_ex_read_1(fname) && "failed to read gguf file");
}
return 0;

View File

@@ -21,12 +21,12 @@ not have to be performed at all.
### Running the example
Download a Grit model:
```console
$ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf --outdir models
$ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf
```
Run the example using the downloaded model:
```console
$ ./gritlm -m models/gritlm-7b_q4_1.gguf
$ ./gritlm -m gritlm-7b_q4_1.gguf
Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605
Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103

View File

@@ -19,12 +19,10 @@
struct Stats {
std::vector<float> values;
std::vector<int> counts;
int ncall = 0;
};
struct StatParams {
std::string dataset;
std::string ofile = "imatrix.dat";
int n_output_frequency = 10;
int verbosity = 1;
@@ -46,9 +44,9 @@ private:
std::mutex m_mutex;
int m_last_call = 0;
std::vector<float> m_src1_data;
std::vector<char> m_ids; // the expert ids from ggml_mul_mat_id
std::vector<int> m_ids; // the expert ids from ggml_mul_mat_id
//
void save_imatrix(const char * file_name, const char * dataset) const;
void save_imatrix(const char * file_name) const;
void keep_imatrix(int ncall) const;
};
@@ -83,7 +81,6 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
if (ask) {
if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications
if (t->op != GGML_OP_MUL_MAT) return false;
// why are small batches ignored (<16 tokens)?
if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
if (!(wname.substr(0, 4) == "blk." || (m_params.collect_output_weight && wname == "output.weight"))) return false;
return true;
@@ -101,59 +98,45 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
const float * data = is_host ? (const float *) src1->data : m_src1_data.data();
// this has been adapted to the new format of storing merged experts in a single 3d tensor
// ref: https://github.com/ggerganov/llama.cpp/pull/6387
if (t->op == GGML_OP_MUL_MAT_ID) {
// ids -> [n_experts_used, n_tokens]
// src1 -> [cols, n_expert_used, n_tokens]
const ggml_tensor * ids = t->src[2];
const int n_as = src0->ne[2];
const int n_ids = ids->ne[0];
const int idx = ((int32_t *) t->op_params)[0];
const int n_as = ((int32_t *) t->op_params)[1];
// the top-k selected expert ids are stored in the ids tensor
// for simplicity, always copy ids to host, because it is small
// take into account that ids is not contiguous!
// the top-k selected expert ids are stored in the src0 tensor
// for simplicity, always copy src0 to host, because it is small
// take into account that src0 is not contiguous!
GGML_ASSERT(src0->ne[1] == src1->ne[1]);
GGML_ASSERT(n_as*ggml_nrows(src0)*sizeof(int) == GGML_PAD(ggml_nbytes(src0), n_as*sizeof(int)));
m_ids.resize(ggml_nbytes(src0)/sizeof(int));
ggml_backend_tensor_get(src0, m_ids.data(), 0, ggml_nbytes(src0));
GGML_ASSERT(ids->ne[1] == src1->ne[2]);
m_ids.resize(ggml_nbytes(ids));
ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids));
auto & e = m_stats[wname];
++e.ncall;
if (e.values.empty()) {
e.values.resize(src1->ne[0]*n_as, 0);
e.counts.resize(src1->ne[0]*n_as, 0);
}
else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
exit(1); //GGML_ASSERT(false);
}
if (m_params.verbosity > 1) {
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
}
// loop over all possible experts, regardless if they are used or not in the batch
// this is necessary to guarantee equal number of "ncall" for each tensor
for (int ex = 0; ex < n_as; ++ex) {
size_t e_start = ex*src1->ne[0];
for (int idx = 0; idx < n_ids; ++idx) {
for (int row = 0; row < (int)src1->ne[2]; ++row) {
const int excur = *(const int32_t *) (m_ids.data() + row*ids->nb[1] + idx*ids->nb[0]);
GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check
if (excur != ex) continue;
const int64_t i11 = idx % src1->ne[1];
const int64_t i12 = row;
const float * x = (const float *)((const char *)data + i11*src1->nb[1] + i12*src1->nb[2]);
for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.values[e_start + j] += x[j]*x[j];
e.counts[e_start + j]++;
}
src0 = t->src[2 + ex];
wname = filter_tensor_name(src0->name);
auto& e = m_stats[wname];
if (e.values.empty()) {
e.values.resize(src1->ne[0], 0);
}
else if (e.values.size() != (size_t)src1->ne[0]) {
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
exit(1); //GGML_ASSERT(false);
}
// NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger
// using the following line, we can correct for that if needed
//if (idx == t->src[0]->ne[0] - 1) ++e.ncall;
++e.ncall;
if (m_params.verbosity > 1) {
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
}
for (int row = 0; row < (int)src1->ne[1]; ++row) {
const int excur = m_ids[row*n_as + idx];
GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check
if (excur != ex) continue;
const float * x = data + row * src1->ne[0];
for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.values[j] += x[j]*x[j];
}
}
if (e.ncall > m_last_call) {
@@ -170,7 +153,6 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
auto& e = m_stats[wname];
if (e.values.empty()) {
e.values.resize(src1->ne[0], 0);
e.counts.resize(src1->ne[0], 0);
}
else if (e.values.size() != (size_t)src1->ne[0]) {
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
@@ -184,7 +166,6 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
const float * x = data + row * src1->ne[0];
for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.values[j] += x[j]*x[j];
e.counts[j]++;
}
}
if (e.ncall > m_last_call) {
@@ -202,7 +183,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
}
void IMatrixCollector::save_imatrix() const {
save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str(), m_params.dataset.c_str());
save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str());
}
void IMatrixCollector::keep_imatrix(int ncall) const {
@@ -210,39 +191,24 @@ void IMatrixCollector::keep_imatrix(int ncall) const {
if (file_name.empty()) file_name = "imatrix.dat";
file_name += ".at_";
file_name += std::to_string(ncall);
save_imatrix(file_name.c_str(), m_params.dataset.c_str());
save_imatrix(file_name.c_str());
}
void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) const {
void IMatrixCollector::save_imatrix(const char * fname) const {
std::ofstream out(fname, std::ios::binary);
int n_entries = m_stats.size();
out.write((const char *) &n_entries, sizeof(n_entries));
for (const auto & p : m_stats) {
out.write((const char*)&n_entries, sizeof(n_entries));
for (auto& p : m_stats) {
int len = p.first.size();
out.write((const char *) &len, sizeof(len));
out.write((const char*)&len, sizeof(len));
out.write(p.first.c_str(), len);
out.write((const char *) &p.second.ncall, sizeof(p.second.ncall));
out.write((const char*)&p.second.ncall, sizeof(p.second.ncall));
int nval = p.second.values.size();
out.write((const char *) &nval, sizeof(nval));
if (nval > 0) {
std::vector<float> tmp(nval);
for (int i = 0; i < nval; i++) {
tmp[i] = (p.second.values[i] / static_cast<float>(p.second.counts[i])) * static_cast<float>(p.second.ncall);
}
out.write((const char*)tmp.data(), nval*sizeof(float));
}
out.write((const char*)&nval, sizeof(nval));
if (nval > 0) out.write((const char*)p.second.values.data(), nval*sizeof(float));
}
// Write the number of call the matrix was computed with
out.write((const char *) &m_last_call, sizeof(m_last_call));
// Write the dataset name at the end of the file to later on specify it in quantize
int n_dataset = strlen(dataset);
out.write((const char *) &n_dataset, sizeof(n_dataset));
out.write(dataset, n_dataset);
if (m_params.verbosity > 0) {
fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname);
fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n",__func__,m_last_call,fname);
}
}
@@ -278,28 +244,14 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
imatrix_data = {};
return false;
}
// When re-called from load_imatrix() with add set, this will already be created.
if (e.values.empty()) {
e.values.resize(nval, 0);
e.counts.resize(nval, 0);
}
std::vector<float> tmp(nval);
in.read((char*)tmp.data(), nval*sizeof(float));
e.values.resize(nval);
in.read((char*)e.values.data(), nval*sizeof(float));
if (in.fail()) {
printf("%s: failed reading data for entry %d\n",__func__,i);
imatrix_data = {};
return false;
}
// Recreate the state as expected by save_imatrix(), and corerct for weighted sum.
for (int i = 0; i < nval; i++) {
e.values[i] += tmp[i];
e.counts[i] += ncall;
}
e.ncall += ncall;
e.ncall = ncall;
}
return true;
}
@@ -394,13 +346,12 @@ static void process_logits(
static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) {
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
const int n_ctx = llama_n_ctx(ctx);
auto tim1 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
auto tim2 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
@@ -579,29 +530,6 @@ int main(int argc, char ** argv) {
}
}
gpt_params params;
params.n_batch = 512;
if (!gpt_params_parse(args.size(), args.data(), params)) {
return 1;
}
params.logits_all = true;
params.n_batch = std::min(params.n_batch, params.n_ctx);
print_build_info();
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed);
if (params.random_prompt) {
params.prompt = string_random_prompt(rng);
}
sparams.dataset = params.prompt_file;
g_collector.set_parameters(std::move(sparams));
if (!combine_files.empty()) {
@@ -640,21 +568,49 @@ int main(int argc, char ** argv) {
}
}
gpt_params params;
params.n_batch = 512;
if (!gpt_params_parse(args.size(), args.data(), params)) {
return 1;
}
params.logits_all = true;
params.n_batch = std::min(params.n_batch, params.n_ctx);
print_build_info();
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed);
if (params.random_prompt) {
params.prompt = gpt_random_prompt(rng);
}
llama_backend_init();
llama_numa_init(params.numa);
llama_model_params mparams = llama_model_params_from_gpt_params(params);
llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__);
return 1;
}
llama_context_params cparams = llama_context_params_from_gpt_params(params);
// pass the callback to the backend scheduler
// it will be executed for each node during the graph computation
params.cb_eval = ik_collect_imatrix;
params.cb_eval_user_data = NULL;
params.warmup = false;
cparams.cb_eval = ik_collect_imatrix;
cparams.cb_eval_user_data = NULL;
// init
llama_model * model;
llama_context * ctx;
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == nullptr || ctx == nullptr) {
fprintf(stderr, "%s : failed to init\n", __func__);
llama_context * ctx = llama_new_context_with_model(model, cparams);
if (ctx == NULL) {
fprintf(stderr, "%s: error: unable to create context\n", __func__);
return 1;
}
@@ -667,7 +623,7 @@ int main(int argc, char ** argv) {
// print system information
{
fprintf(stderr, "\n");
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
fprintf(stderr, "%s\n", get_system_info(params).c_str());
}
bool OK = compute_imatrix(ctx, params, compute_ppl, from_chunk);

View File

@@ -36,11 +36,6 @@ The `infill` program offers a seamless way to interact with LLaMA models, allowi
### Example
Download a model that supports infill, for example CodeLlama:
```console
scripts/hf.sh --repo TheBloke/CodeLlama-13B-GGUF --file codellama-13b.Q5_K_S.gguf --outdir models
```
```bash
./infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n "
```

View File

@@ -50,9 +50,9 @@ static void write_logfile(
return;
}
const std::string timestamp = string_get_sortable_timestamp();
const std::string timestamp = get_sortable_timestamp();
const bool success = fs_create_directory_with_parents(params.logdir);
const bool success = create_directory_with_parents(params.logdir);
if (!success) {
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
__func__, params.logdir.c_str());
@@ -70,7 +70,7 @@ static void write_logfile(
fprintf(logfile, "binary: infill\n");
char model_desc[128];
llama_model_desc(model, model_desc, sizeof(model_desc));
yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);
dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc);
fprintf(logfile, "\n");
fprintf(logfile, "######################\n");
@@ -78,8 +78,8 @@ static void write_logfile(
fprintf(logfile, "######################\n");
fprintf(logfile, "\n");
yaml_dump_string_multiline(logfile, "output", output.c_str());
yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
dump_string_yaml_multiline(logfile, "output", output.c_str());
dump_vector_int_yaml(logfile, "output_tokens", output_tokens);
llama_dump_timing_info_yaml(logfile, ctx);
fclose(logfile);
@@ -236,10 +236,9 @@ int main(int argc, char ** argv) {
// print system information
{
LOG_TEE("\n");
LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
LOG_TEE("%s\n", get_system_info(params).c_str());
}
const bool add_bos = llama_should_add_bos_token(model);
GGML_ASSERT(llama_add_eos_token(model) != 1);
LOG("add_bos: %d\n", add_bos);
bool suff_rm_leading_spc = params.escape;
@@ -280,10 +279,10 @@ int main(int argc, char ** argv) {
if (ctx_guidance) {
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true);
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos);
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
original_prompt_len = original_inp.size();
@@ -586,7 +585,7 @@ int main(int argc, char ** argv) {
// deal with eot token in infill mode
if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){
if (is_interacting && !params.interactive_first) {
if(is_interacting && !params.interactive_first) {
// print an eot token
printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
}
@@ -621,8 +620,8 @@ int main(int argc, char ** argv) {
if (params.escape) {
//process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here
string_process_escapes(params.input_prefix);
string_process_escapes(params.input_suffix);
process_escapes(params.input_prefix);
process_escapes(params.input_suffix);
}
suff_rm_leading_spc = params.escape;
if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
@@ -651,8 +650,8 @@ int main(int argc, char ** argv) {
// LOG_TEE("took new input\n");
is_interacting = false;
}
// deal with end of generation tokens in interactive mode
else if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) {
// deal with end of text token in interactive mode
else if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) {
LOG("found EOS token\n");
if (params.interactive) {
@@ -731,8 +730,8 @@ int main(int argc, char ** argv) {
}
}
// end of generation
if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !params.interactive) {
// end of text token
if (!embd.empty() && embd.back() == llama_token_eos(model) && !params.interactive) {
break;
}

View File

@@ -6,94 +6,37 @@ import re
import sys
from typing import Any, Dict, List, Set, Tuple, Union
def _build_repetition(item_rule, min_items, max_items, separator_rule=None, item_rule_is_literal=False):
if not separator_rule:
if min_items == 0 and max_items == 1:
return f'{item_rule}?'
elif min_items == 1 and max_items is None:
return f'{item_rule}+'
result = ''
if min_items > 0:
if item_rule_is_literal and separator_rule is None:
result = '"' + (item_rule[1:-1] * min_items) + '"'
else:
result = (f' {separator_rule} ' if separator_rule else ' ').join([item_rule] * min_items)
def opt_repetitions(up_to_n, prefix_with_sep=False):
'''
- n=4, no sep: '(a (a (a (a)?)?)?)?'
- n=4, sep=',', prefix: '("," a ("," a ("," a ("," a)?)?)?)?'
- n=4, sep=',', no prefix: '(a ("," a ("," a ("," a)?)?)?)?'
'''
content = f'{separator_rule} {item_rule}' if prefix_with_sep and separator_rule else item_rule
if up_to_n == 0:
return ''
elif up_to_n == 1:
return f'({content})?'
elif separator_rule and not prefix_with_sep:
return f'({content} {opt_repetitions(up_to_n - 1, prefix_with_sep=True)})?'
else:
return (f'({content} ' * up_to_n).rstrip() + (')?' * up_to_n)
if min_items > 0 and max_items != min_items:
result += ' '
if max_items is not None:
result += opt_repetitions(max_items - min_items, prefix_with_sep=min_items > 0)
else:
item_operator = f'({separator_rule + " " if separator_rule else ""}{item_rule})'
if min_items == 0 and separator_rule:
result = f'({item_rule} {item_operator}*)?'
else:
result += f'{item_operator}*'
return result
class BuiltinRule:
def __init__(self, content: str, deps: list = None):
self.content = content
self.deps = deps or []
_up_to_15_digits = _build_repetition('[0-9]', 0, 15)
# whitespace is constrained to a single space char to prevent model "running away" in
# whitespace. Also maybe improves generation quality?
SPACE_RULE = '" "?'
PRIMITIVE_RULES = {
'boolean' : BuiltinRule('("true" | "false") space', []),
'decimal-part' : BuiltinRule('[0-9] ' + _up_to_15_digits, []),
'integral-part': BuiltinRule('[0-9] | [1-9] ' + _up_to_15_digits, []),
'number' : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']),
'integer' : BuiltinRule('("-"? integral-part) space', ['integral-part']),
'value' : BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']),
'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
'array' : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
'uuid' : BuiltinRule(r'"\"" ' + ' "-" '.join('[0-9a-fA-F]' * n for n in [8, 4, 4, 4, 12]) + r' "\"" space', []),
'char' : BuiltinRule(r'[^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])', []),
'string' : BuiltinRule(r'"\"" char* "\"" space', ['char']),
'null' : BuiltinRule('"null" space', []),
'boolean': '("true" | "false") space',
'number': '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space',
'integer': '("-"? ([0-9] | [1-9] [0-9]*)) space',
'value' : 'object | array | string | number | boolean',
'object' : '"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space',
'array' : '"[" space ( value ("," space value)* )? "]" space',
'uuid' : '"\\"" ' + ' "-" '.join('[0-9a-fA-F]' * n for n in [8, 4, 4, 4, 12]) + ' "\\"" space',
'string': r''' "\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space''',
'null': '"null" space',
}
OBJECT_RULE_NAMES = ['object', 'array', 'string', 'number', 'boolean', 'null', 'value']
# TODO: support "uri", "email" string formats
STRING_FORMAT_RULES = {
'date' : BuiltinRule('[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
'time' : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
'date-time' : BuiltinRule('date "T" time', ['date', 'time']),
'date-string' : BuiltinRule('"\\"" date "\\"" space', ['date']),
'time-string' : BuiltinRule('"\\"" time "\\"" space', ['time']),
'date-time-string': BuiltinRule('"\\"" date-time "\\"" space', ['date-time']),
DATE_RULES = {
'date' : '[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )',
'time' : '([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )',
'date-time': 'date "T" time',
'date-string': '"\\"" date "\\"" space',
'time-string': '"\\"" time "\\"" space',
'date-time-string': '"\\"" date-time "\\"" space',
}
DOTALL = '[\\U00000000-\\U0010FFFF]'
DOT = '[^\\x0A\\x0D]'
RESERVED_NAMES = set(["root", "dot", *PRIMITIVE_RULES.keys(), *STRING_FORMAT_RULES.keys()])
RESERVED_NAMES = set(["root", *PRIMITIVE_RULES.keys(), *DATE_RULES.keys()])
INVALID_RULE_CHARS_RE = re.compile(r'[^a-zA-Z0-9-]+')
GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"]')
@@ -103,6 +46,8 @@ GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']'
NON_LITERAL_SET = set('|.()[]{}*+?')
ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('[]()|{}*+?')
DATE_PATTERN = '[0-9]{4}-(0[1-9]|1[0-2])-([0-2][0-9]|3[0-1])'
TIME_PATTERN = '([01][0-9]|2[0-3])(:[0-5][0-9]){2}(\\.[0-9]{1,3})?(Z|[+-](([01][0-9]|2[0-3]):[0-5][0-9]))' # Cap millisecond precision w/ 3 digits
class SchemaConverter:
def __init__(self, *, prop_order, allow_fetch, dotall, raw_pattern):
@@ -110,9 +55,7 @@ class SchemaConverter:
self._allow_fetch = allow_fetch
self._dotall = dotall
self._raw_pattern = raw_pattern
self._rules = {
'space': SPACE_RULE,
}
self._rules = {'space': SPACE_RULE}
self._refs = {}
self._refs_being_resolved = set()
@@ -122,29 +65,6 @@ class SchemaConverter:
)
return f'"{escaped}"'
def not_literal(self, literal: str, dotall: bool = True, maybe_escaped_underscores = False) -> str:
'''
not_literal('a') -> '[^a]'
not_literal('abc') -> '([^a] | "a" ([^b] | "b" ([^c])?)?)?'
'''
assert len(literal) > 0, 'Empty literal not supported'
def recurse(i: int):
c = literal[i]
if maybe_escaped_underscores and c == '_':
yield f'[^{c}\\\\]'
yield ' | '
yield f'"\\\\"? "{c}"'
else:
yield f'[^{c}]'
if i < len(literal) - 1:
yield ' | '
yield self._format_literal(c)
yield ' ('
yield from recurse(i + 1)
yield ')?'
return ''.join(('(', *recurse(0), ')'))
def _add_rule(self, name, rule):
esc_name = INVALID_RULE_CHARS_RE.sub('-', name)
if esc_name not in self._rules or self._rules[esc_name] == rule:
@@ -249,10 +169,10 @@ class SchemaConverter:
def get_dot():
if self._dotall:
rule = DOTALL
rule = '[\\U00000000-\\U0010FFFF]'
else:
# Accept any character... except \n and \r line break chars (\x0A and \xOD)
rule = DOT
rule = '[\\U00000000-\\x09\\x0B\\x0C\\x0E-\\U0010FFFF]'
return self._add_rule(f'dot', rule)
def join_seq():
@@ -326,14 +246,26 @@ class SchemaConverter:
(sub, sub_is_literal) = seq[-1]
if not sub_is_literal:
id = sub_rule_ids.get(sub)
if id is None:
id = self._add_rule(f'{name}-{len(sub_rule_ids) + 1}', sub)
sub_rule_ids[sub] = id
sub = id
if min_times == 0 and max_times is None:
seq[-1] = (f'{sub}*', False)
elif min_times == 0 and max_times == 1:
seq[-1] = (f'{sub}?', False)
elif min_times == 1 and max_times is None:
seq[-1] = (f'{sub}+', False)
else:
if not sub_is_literal:
id = sub_rule_ids.get(sub)
if id is None:
id = self._add_rule(f'{name}-{len(sub_rule_ids) + 1}', sub)
sub_rule_ids[sub] = id
sub = id
seq[-1] = (_build_repetition(f'"{sub}"' if sub_is_literal else sub, min_times, max_times, item_rule_is_literal=sub_is_literal), False)
seq[-1] = (
' '.join(
([f'"{sub[1:-1] * min_times}"'] if sub_is_literal else [sub] * min_times) +
([f'{sub}?'] * (max_times - min_times) if max_times is not None else [f'{sub}*'])),
False
)
else:
literal = ''
while i < length:
@@ -441,47 +373,49 @@ class SchemaConverter:
' "]" space')
else:
item_rule_name = self.visit(items, f'{name}{"-" if name else ""}item')
list_item_operator = f'( "," space {item_rule_name} )'
successive_items = ""
min_items = schema.get("minItems", 0)
max_items = schema.get("maxItems")
return self._add_rule(rule_name, '"[" space ' + _build_repetition(item_rule_name, min_items, max_items, separator_rule='"," space') + ' "]" space')
if min_items > 0:
successive_items = list_item_operator * (min_items - 1)
min_items -= 1
if max_items is not None and max_items > min_items:
successive_items += (list_item_operator + "?") * (max_items - min_items - 1)
else:
successive_items += list_item_operator + "*"
if min_items == 0:
rule = f'"[" space ( {item_rule_name} {successive_items} )? "]" space'
else:
rule = f'"[" space {item_rule_name} {successive_items} "]" space'
return self._add_rule(rule_name, rule)
elif schema_type in (None, 'string') and 'pattern' in schema:
return self._visit_pattern(schema['pattern'], rule_name)
elif schema_type in (None, 'string') and re.match(r'^uuid[1-5]?$', schema_format or ''):
return self._add_primitive(
return self._add_rule(
'root' if rule_name == 'root' else schema_format,
PRIMITIVE_RULES['uuid']
)
elif schema_type in (None, 'string') and f'{schema_format}-string' in STRING_FORMAT_RULES:
prim_name = f'{schema_format}-string'
return self._add_rule(rule_name, self._add_primitive(prim_name, STRING_FORMAT_RULES[prim_name]))
elif schema_type == 'string' and ('minLength' in schema or 'maxLength' in schema):
char_rule = self._add_primitive('char', PRIMITIVE_RULES['char'])
min_len = schema.get('minLength', 0)
max_len = schema.get('maxLength')
return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\"" space')
elif schema_type in (None, 'string') and schema_format in DATE_RULES:
for t, r in DATE_RULES.items():
self._add_rule(t, r)
return schema_format + '-string'
elif (schema_type == 'object') or (len(schema) == 0):
return self._add_rule(rule_name, self._add_primitive('object', PRIMITIVE_RULES['object']))
for n in OBJECT_RULE_NAMES:
self._add_rule(n, PRIMITIVE_RULES[n])
return self._add_rule(rule_name, 'object')
else:
assert schema_type in PRIMITIVE_RULES, f'Unrecognized schema: {schema}'
# TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
return self._add_primitive('root' if rule_name == 'root' else schema_type, PRIMITIVE_RULES[schema_type])
def _add_primitive(self, name: str, rule: BuiltinRule):
n = self._add_rule(name, rule.content)
for dep in rule.deps:
dep_rule = PRIMITIVE_RULES.get(dep) or STRING_FORMAT_RULES.get(dep)
assert dep_rule, f'Rule {dep} not known'
if dep not in self._rules:
self._add_primitive(dep, dep_rule)
return n
return self._add_rule(
'root' if rule_name == 'root' else schema_type,
PRIMITIVE_RULES[schema_type]
)
def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], name: str, additional_properties: Union[bool, Any]):
prop_order = self._prop_order
@@ -503,7 +437,7 @@ class SchemaConverter:
value_rule = self.visit({} if additional_properties == True else additional_properties, f'{sub_name}-value')
prop_kv_rule_names["*"] = self._add_rule(
f'{sub_name}-kv',
self._add_primitive('string', PRIMITIVE_RULES['string']) + f' ":" space {value_rule}'
self._add_rule('string', PRIMITIVE_RULES['string']) + f' ":" space {value_rule}'
)
optional_props.append("*")

View File

@@ -26,21 +26,16 @@ options:
-m, --model <filename> (default: models/7B/ggml-model-q4_0.gguf)
-p, --n-prompt <n> (default: 512)
-n, --n-gen <n> (default: 128)
-pg <pp,tg> (default: 512,128)
-b, --batch-size <n> (default: 2048)
-ub, --ubatch-size <n> (default: 512)
-ctk, --cache-type-k <t> (default: f16)
-ctv, --cache-type-v <t> (default: f16)
-t, --threads <n> (default: 16)
-b, --batch-size <n> (default: 512)
-ctk <t>, --cache-type-k <t> (default: f16)
-ctv <t>, --cache-type-v <t> (default: f16)
-t, --threads <n> (default: 112)
-ngl, --n-gpu-layers <n> (default: 99)
-sm, --split-mode <none|layer|row> (default: layer)
-mg, --main-gpu <i> (default: 0)
-nkvo, --no-kv-offload <0|1> (default: 0)
-fa, --flash-attn <0|1> (default: 0)
-mmp, --mmap <0|1> (default: 1)
--numa <distribute|isolate|numactl> (default: disabled)
-embd, --embeddings <0|1> (default: 0)
-ts, --tensor-split <ts0/ts1/..> (default: 0)
-ts, --tensor_split <ts0/ts1/..> (default: 0)
-r, --repetitions <n> (default: 5)
-o, --output <csv|json|md|sql> (default: md)
-v, --verbose (default: 0)
@@ -48,11 +43,10 @@ options:
Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.
```
llama-bench can perform three types of tests:
llama-bench can perform two types of tests:
- Prompt processing (pp): processing a prompt in batches (`-p`)
- Text generation (tg): generating a sequence of tokens (`-n`)
- Prompt processing + text generation (pg): processing a prompt followed by generating a sequence of tokens (`-pg`)
With the exception of `-r`, `-o` and `-v`, all options can be specified multiple times to run multiple tests. Each pp and tg test is run with all combinations of the specified options. To specify multiple values for an option, the values can be separated by commas (e.g. `-n 16,32`), or the option can be specified multiple times (e.g. `-n 16 -n 32`).

View File

@@ -161,32 +161,22 @@ static const char * split_mode_str(llama_split_mode mode) {
}
}
static std::string pair_str(const std::pair<int, int> & p) {
static char buf[32];
snprintf(buf, sizeof(buf), "%d,%d", p.first, p.second);
return buf;
}
struct cmd_params {
std::vector<std::string> model;
std::vector<int> n_prompt;
std::vector<int> n_gen;
std::vector<std::pair<int, int>> n_pg;
std::vector<int> n_batch;
std::vector<int> n_ubatch;
std::vector<ggml_type> type_k;
std::vector<ggml_type> type_v;
std::vector<int> n_threads;
std::vector<int> n_gpu_layers;
std::vector<std::string> rpc_servers;
std::vector<llama_split_mode> split_mode;
std::vector<int> main_gpu;
std::vector<bool> no_kv_offload;
std::vector<bool> flash_attn;
std::vector<std::vector<float>> tensor_split;
std::vector<bool> use_mmap;
std::vector<bool> embeddings;
ggml_numa_strategy numa;
int reps;
bool verbose;
output_formats output_format;
@@ -196,22 +186,18 @@ static const cmd_params cmd_params_defaults = {
/* model */ {"models/7B/ggml-model-q4_0.gguf"},
/* n_prompt */ {512},
/* n_gen */ {128},
/* n_pg */ {},
/* n_batch */ {2048},
/* n_ubatch */ {512},
/* type_k */ {GGML_TYPE_F16},
/* type_v */ {GGML_TYPE_F16},
/* n_threads */ {cpu_get_num_math()},
/* n_threads */ {get_num_physical_cores()},
/* n_gpu_layers */ {99},
/* rpc_servers */ {""},
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
/* main_gpu */ {0},
/* no_kv_offload */ {false},
/* flash_attn */ {false},
/* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
/* use_mmap */ {true},
/* embeddings */ {false},
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
/* reps */ 5,
/* verbose */ false,
/* output_format */ MARKDOWN
@@ -225,20 +211,16 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
printf(" -pg <pp,tg> (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
printf(" -ub, --ubatch-size <n> (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
printf(" -ub N, --ubatch-size <n> (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
printf(" -ctk <t>, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
printf(" -ctv <t>, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
@@ -316,17 +298,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
}
auto p = split<int>(argv[i], split_delim);
params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
} else if (arg == "-pg") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split<std::string>(argv[i], ',');
if (p.size() != 2) {
invalid_param = true;
break;
}
params.n_pg.push_back({std::stoi(p[0]), std::stoi(p[1])});
} else if (arg == "-b" || arg == "--batch-size") {
if (++i >= argc) {
invalid_param = true;
@@ -387,12 +358,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
}
auto p = split<int>(argv[i], split_delim);
params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
} else if (arg == "-rpc" || arg == "--rpc") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.rpc_servers.push_back(argv[i]);
} else if (arg == "-sm" || arg == "--split-mode") {
if (++i >= argc) {
invalid_param = true;
@@ -428,24 +393,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
}
auto p = split<bool>(argv[i], split_delim);
params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
} else if (arg == "--numa") {
if (++i >= argc) {
invalid_param = true;
break;
} else {
std::string value(argv[i]);
/**/ if (value == "distribute" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
else { invalid_param = true; break; }
}
} else if (arg == "-fa" || arg == "--flash-attn") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split<bool>(argv[i], split_delim);
params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end());
} else if (arg == "-mmp" || arg == "--mmap") {
if (++i >= argc) {
invalid_param = true;
@@ -522,17 +469,14 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
if (params.model.empty()) { params.model = cmd_params_defaults.model; }
if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; }
if (params.n_gen.empty()) { params.n_gen = cmd_params_defaults.n_gen; }
if (params.n_pg.empty()) { params.n_pg = cmd_params_defaults.n_pg; }
if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; }
if (params.n_ubatch.empty()) { params.n_ubatch = cmd_params_defaults.n_ubatch; }
if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; }
if (params.type_v.empty()) { params.type_v = cmd_params_defaults.type_v; }
if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
if (params.rpc_servers.empty()) { params.rpc_servers = cmd_params_defaults.rpc_servers; }
if (params.split_mode.empty()) { params.split_mode = cmd_params_defaults.split_mode; }
if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
if (params.flash_attn.empty()) { params.flash_attn = cmd_params_defaults.flash_attn; }
if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; }
@@ -551,11 +495,9 @@ struct cmd_params_instance {
ggml_type type_v;
int n_threads;
int n_gpu_layers;
std::string rpc_servers;
llama_split_mode split_mode;
int main_gpu;
bool no_kv_offload;
bool flash_attn;
std::vector<float> tensor_split;
bool use_mmap;
bool embeddings;
@@ -564,9 +506,6 @@ struct cmd_params_instance {
llama_model_params mparams = llama_model_default_params();
mparams.n_gpu_layers = n_gpu_layers;
if (!rpc_servers.empty()) {
mparams.rpc_servers = rpc_servers.c_str();
}
mparams.split_mode = split_mode;
mparams.main_gpu = main_gpu;
mparams.tensor_split = tensor_split.data();
@@ -578,7 +517,6 @@ struct cmd_params_instance {
bool equal_mparams(const cmd_params_instance & other) const {
return model == other.model &&
n_gpu_layers == other.n_gpu_layers &&
rpc_servers == other.rpc_servers &&
split_mode == other.split_mode &&
main_gpu == other.main_gpu &&
use_mmap == other.use_mmap &&
@@ -594,7 +532,6 @@ struct cmd_params_instance {
cparams.type_k = type_k;
cparams.type_v = type_v;
cparams.offload_kqv = !no_kv_offload;
cparams.flash_attn = flash_attn;
cparams.embeddings = embeddings;
return cparams;
@@ -607,7 +544,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
// this ordering minimizes the number of times that each model needs to be reloaded
for (const auto & m : params.model)
for (const auto & nl : params.n_gpu_layers)
for (const auto & rpc : params.rpc_servers)
for (const auto & sm : params.split_mode)
for (const auto & mg : params.main_gpu)
for (const auto & ts : params.tensor_split)
@@ -618,7 +554,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
for (const auto & tk : params.type_k)
for (const auto & tv : params.type_v)
for (const auto & nkvo : params.no_kv_offload)
for (const auto & fa : params.flash_attn)
for (const auto & nt : params.n_threads) {
for (const auto & n_prompt : params.n_prompt) {
if (n_prompt == 0) {
@@ -634,11 +569,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .type_v = */ tv,
/* .n_threads = */ nt,
/* .n_gpu_layers = */ nl,
/* .rpc_servers = */ rpc,
/* .split_mode = */ sm,
/* .main_gpu = */ mg,
/* .no_kv_offload= */ nkvo,
/* .flash_attn = */ fa,
/* .tensor_split = */ ts,
/* .use_mmap = */ mmp,
/* .embeddings = */ embd,
@@ -660,37 +593,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .type_v = */ tv,
/* .n_threads = */ nt,
/* .n_gpu_layers = */ nl,
/* .rpc_servers = */ rpc,
/* .split_mode = */ sm,
/* .main_gpu = */ mg,
/* .no_kv_offload= */ nkvo,
/* .flash_attn = */ fa,
/* .tensor_split = */ ts,
/* .use_mmap = */ mmp,
/* .embeddings = */ embd,
};
instances.push_back(instance);
}
for (const auto & n_pg : params.n_pg) {
if (n_pg.first == 0 && n_pg.second == 0) {
continue;
}
cmd_params_instance instance = {
/* .model = */ m,
/* .n_prompt = */ n_pg.first,
/* .n_gen = */ n_pg.second,
/* .n_batch = */ nb,
/* .n_ubatch = */ nub,
/* .type_k = */ tk,
/* .type_v = */ tv,
/* .n_threads = */ nt,
/* .n_gpu_layers = */ nl,
/* .rpc_servers = */ rpc,
/* .split_mode = */ sm,
/* .main_gpu = */ mg,
/* .no_kv_offload= */ nkvo,
/* .flash_attn = */ fa,
/* .tensor_split = */ ts,
/* .use_mmap = */ mmp,
/* .embeddings = */ embd,
@@ -711,7 +616,6 @@ struct test {
static const bool kompute;
static const bool metal;
static const bool sycl;
static const bool rpc;
static const bool gpu_blas;
static const bool blas;
static const std::string cpu_info;
@@ -729,7 +633,6 @@ struct test {
llama_split_mode split_mode;
int main_gpu;
bool no_kv_offload;
bool flash_attn;
std::vector<float> tensor_split;
bool use_mmap;
bool embeddings;
@@ -754,7 +657,6 @@ struct test {
split_mode = inst.split_mode;
main_gpu = inst.main_gpu;
no_kv_offload = inst.no_kv_offload;
flash_attn = inst.flash_attn;
tensor_split = inst.tensor_split;
use_mmap = inst.use_mmap;
embeddings = inst.embeddings;
@@ -810,9 +712,6 @@ struct test {
if (sycl) {
return GGML_SYCL_NAME;
}
if (rpc) {
return "RPC";
}
if (gpu_blas) {
return "GPU BLAS";
}
@@ -826,13 +725,13 @@ struct test {
static const std::vector<std::string> & get_fields() {
static const std::vector<std::string> fields = {
"build_commit", "build_number",
"cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "blas",
"cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas",
"cpu_info", "gpu_info",
"model_filename", "model_type", "model_size", "model_n_params",
"n_batch", "n_ubatch",
"n_threads", "type_k", "type_v",
"n_gpu_layers", "split_mode",
"main_gpu", "no_kv_offload", "flash_attn",
"main_gpu", "no_kv_offload",
"tensor_split", "use_mmap", "embeddings",
"n_prompt", "n_gen", "test_time",
"avg_ns", "stddev_ns",
@@ -854,7 +753,7 @@ struct test {
}
if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" ||
field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
field == "use_mmap" || field == "embeddings") {
return BOOL;
}
if (field == "avg_ts" || field == "stddev_ts") {
@@ -882,13 +781,13 @@ struct test {
std::vector<std::string> values = {
build_commit, std::to_string(build_number),
std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(vulkan),
std::to_string(metal), std::to_string(sycl), std::to_string(rpc), std::to_string(gpu_blas), std::to_string(blas),
std::to_string(metal), std::to_string(sycl), std::to_string(gpu_blas), std::to_string(blas),
cpu_info, gpu_info,
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
std::to_string(n_batch), std::to_string(n_ubatch),
std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
std::to_string(n_gpu_layers), split_mode_str(split_mode),
std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
std::to_string(main_gpu), std::to_string(no_kv_offload),
tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
std::to_string(n_prompt), std::to_string(n_gen), test_time,
std::to_string(avg_ns()), std::to_string(stdev_ns()),
@@ -917,7 +816,6 @@ const bool test::metal = !!ggml_cpu_has_metal();
const bool test::gpu_blas = !!ggml_cpu_has_gpublas();
const bool test::blas = !!ggml_cpu_has_blas();
const bool test::sycl = !!ggml_cpu_has_sycl();
const bool test::rpc = !!ggml_cpu_has_rpc();
const std::string test::cpu_info = get_cpu_info();
const std::string test::gpu_info = get_gpu_info();
@@ -1035,9 +933,6 @@ struct markdown_printer : public printer {
if (field == "n_gpu_layers") {
return 3;
}
if (field == "test") {
return 13;
}
int width = std::max((int)field.length(), 10);
@@ -1060,9 +955,6 @@ struct markdown_printer : public printer {
if (field == "no_kv_offload") {
return "nkvo";
}
if (field == "flash_attn") {
return "fa";
}
if (field == "use_mmap") {
return "mmap";
}
@@ -1109,9 +1001,6 @@ struct markdown_printer : public printer {
if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) {
fields.emplace_back("no_kv_offload");
}
if (params.flash_attn.size() > 1 || params.flash_attn != cmd_params_defaults.flash_attn) {
fields.emplace_back("flash_attn");
}
if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
fields.emplace_back("tensor_split");
}
@@ -1164,11 +1053,12 @@ struct markdown_printer : public printer {
value = test::get_backend();
} else if (field == "test") {
if (t.n_prompt > 0 && t.n_gen == 0) {
snprintf(buf, sizeof(buf), "pp%d", t.n_prompt);
snprintf(buf, sizeof(buf), "pp %d", t.n_prompt);
} else if (t.n_gen > 0 && t.n_prompt == 0) {
snprintf(buf, sizeof(buf), "tg%d", t.n_gen);
snprintf(buf, sizeof(buf), "tg %d", t.n_gen);
} else {
snprintf(buf, sizeof(buf), "pp%d+tg%d", t.n_prompt, t.n_gen);
assert(false);
exit(1);
}
value = buf;
} else if (field == "t/s") {
@@ -1301,7 +1191,6 @@ int main(int argc, char ** argv) {
llama_log_set(llama_null_log_callback, NULL);
}
llama_backend_init();
llama_numa_init(params.numa);
// initialize printer
std::unique_ptr<printer> p;
@@ -1369,7 +1258,6 @@ int main(int argc, char ** argv) {
llama_kv_cache_clear(ctx);
uint64_t t_start = get_time_ns();
if (t.n_prompt > 0) {
test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
}

View File

@@ -7,6 +7,8 @@ android {
namespace = "com.example.llama"
compileSdk = 34
ndkVersion = "26.1.10909125"
defaultConfig {
applicationId = "com.example.llama"
minSdk = 33
@@ -18,6 +20,17 @@ android {
vectorDrawables {
useSupportLibrary = true
}
ndk {
// Add NDK properties if wanted, e.g.
// abiFilters += listOf("arm64-v8a")
}
externalNativeBuild {
cmake {
arguments += "-DCMAKE_BUILD_TYPE=Release"
cppFlags += listOf()
arguments += listOf()
}
}
}
buildTypes {
@@ -42,6 +55,17 @@ android {
composeOptions {
kotlinCompilerExtensionVersion = "1.5.1"
}
packaging {
resources {
excludes += "/META-INF/{AL2.0,LGPL2.1}"
}
}
externalNativeBuild {
cmake {
path = file("src/main/cpp/CMakeLists.txt")
version = "3.22.1"
}
}
}
dependencies {
@@ -54,7 +78,6 @@ dependencies {
implementation("androidx.compose.ui:ui-graphics")
implementation("androidx.compose.ui:ui-tooling-preview")
implementation("androidx.compose.material3:material3")
implementation(project(":llama"))
testImplementation("junit:junit:4.13.2")
androidTestImplementation("androidx.test.ext:junit:1.1.5")
androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")

Some files were not shown because too many files have changed in this diff Show More