mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-02-05 13:53:23 +02:00
Compare commits
186 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
33eff40240 | ||
|
|
17512a94d6 | ||
|
|
611aa914ef | ||
|
|
0cf6725e9f | ||
|
|
27ebfcacba | ||
|
|
5c86c9ed3e | ||
|
|
efb8b47eda | ||
|
|
0527771dd8 | ||
|
|
2189fd3b63 | ||
|
|
3f96aeff39 | ||
|
|
b486ba05bf | ||
|
|
02115dcd9a | ||
|
|
d9c4accaff | ||
|
|
15e03282bb | ||
|
|
f05a6d71a0 | ||
|
|
ee01d71e58 | ||
|
|
8c83449cb7 | ||
|
|
1a844be132 | ||
|
|
0ccc121354 | ||
|
|
6562e5a4d6 | ||
|
|
51fb96b1ff | ||
|
|
70a6991edf | ||
|
|
f061021206 | ||
|
|
8733e0cf6e | ||
|
|
814f795e06 | ||
|
|
d879433824 | ||
|
|
13b0a04597 | ||
|
|
bba9d945c1 | ||
|
|
bc4e1128f7 | ||
|
|
39e73ae0d6 | ||
|
|
1f73301b63 | ||
|
|
4773d7a02f | ||
|
|
6c7fd67b64 | ||
|
|
141a908a59 | ||
|
|
32916a4907 | ||
|
|
ffc727203a | ||
|
|
91a86a6f35 | ||
|
|
f4ed10b69c | ||
|
|
1e333d5bba | ||
|
|
2f54e348ad | ||
|
|
2356fb1d53 | ||
|
|
764b85627b | ||
|
|
15a28ec8c7 | ||
|
|
a7366faa5b | ||
|
|
9070365020 | ||
|
|
233461f812 | ||
|
|
b34c859146 | ||
|
|
9b61acf060 | ||
|
|
5215b91e93 | ||
|
|
ae803bfc3d | ||
|
|
66645a5285 | ||
|
|
27aa259532 | ||
|
|
9fdfcdaedd | ||
|
|
6eb7d25c70 | ||
|
|
86bd60d3fe | ||
|
|
9f2da5871f | ||
|
|
93c4e23905 | ||
|
|
8afbd96818 | ||
|
|
8ae5ebcf85 | ||
|
|
3e959f0976 | ||
|
|
36667c8edc | ||
|
|
3bf785f3ef | ||
|
|
1d36b3670b | ||
|
|
b34443923c | ||
|
|
a75cb30dc9 | ||
|
|
3f3769ba76 | ||
|
|
2f567611c0 | ||
|
|
7d2123484e | ||
|
|
074e42ab31 | ||
|
|
c642bc014c | ||
|
|
cb06a3c363 | ||
|
|
626083faf7 | ||
|
|
2af6880178 | ||
|
|
e84773ab60 | ||
|
|
fab647e884 | ||
|
|
dcf886007d | ||
|
|
d24d592808 | ||
|
|
8efbdadc61 | ||
|
|
f057808ffa | ||
|
|
d7a14c42a1 | ||
|
|
b6e4ff69b8 | ||
|
|
e0f572c846 | ||
|
|
79f26e9e12 | ||
|
|
fc727bcdd5 | ||
|
|
b0ecbd434b | ||
|
|
b1dd4d08e8 | ||
|
|
99881f77d8 | ||
|
|
b5769d92b4 | ||
|
|
8936784f7a | ||
|
|
13c9a3319b | ||
|
|
a70183eb00 | ||
|
|
8d33d740c3 | ||
|
|
4254bb4951 | ||
|
|
9998540149 | ||
|
|
e1e8e0991f | ||
|
|
6f67cf1f48 | ||
|
|
16a457facd | ||
|
|
3e168bede4 | ||
|
|
ceda28ef8e | ||
|
|
3b127c7385 | ||
|
|
e5007a5edf | ||
|
|
416313773b | ||
|
|
07c2e2f76c | ||
|
|
44cd8d91ff | ||
|
|
5933e6fdc9 | ||
|
|
da84c04d8f | ||
|
|
a0f7016d17 | ||
|
|
19e899ce21 | ||
|
|
e2e1ddb93a | ||
|
|
d9d398f84f | ||
|
|
5a63980117 | ||
|
|
cdf76586b2 | ||
|
|
7d3af70b08 | ||
|
|
00e3e5a194 | ||
|
|
e98b3692be | ||
|
|
b6ce7430b7 | ||
|
|
5f5e39e1ba | ||
|
|
eaea325324 | ||
|
|
43ddab6eee | ||
|
|
1831f538f7 | ||
|
|
4e87962e34 | ||
|
|
fb0471d175 | ||
|
|
d2b2031e5f | ||
|
|
5fa9e63be8 | ||
|
|
a4c340f974 | ||
|
|
d0a417f3c7 | ||
|
|
43f2b07193 | ||
|
|
e5d6c2554e | ||
|
|
f0dd6a1926 | ||
|
|
69699be48a | ||
|
|
85f36e5e71 | ||
|
|
c0a97b762e | ||
|
|
ced44be342 | ||
|
|
e291450b76 | ||
|
|
59e991c23c | ||
|
|
ca2bb89eac | ||
|
|
2d451c8059 | ||
|
|
4753791e70 | ||
|
|
77d5e9a76a | ||
|
|
d5fe4e81bd | ||
|
|
295354ea68 | ||
|
|
558a764713 | ||
|
|
edb18b6e8f | ||
|
|
514c45608f | ||
|
|
553a5c3a9f | ||
|
|
13be08daf9 | ||
|
|
226251ed56 | ||
|
|
87616f0680 | ||
|
|
63b4911494 | ||
|
|
c6e8cc28c1 | ||
|
|
b10d8bfdb1 | ||
|
|
13b4548877 | ||
|
|
572b3141d3 | ||
|
|
7c727fbe39 | ||
|
|
80982e815e | ||
|
|
7604a7d6b8 | ||
|
|
b3b6d862cf | ||
|
|
5630406959 | ||
|
|
ecda2ec4b3 | ||
|
|
eb1776b15a | ||
|
|
2cca6c01e4 | ||
|
|
658987cfc9 | ||
|
|
dc39a5e7a8 | ||
|
|
ab47dec3d3 | ||
|
|
7b53389c24 | ||
|
|
243453533e | ||
|
|
1d735c0b4f | ||
|
|
5368ddda7a | ||
|
|
84a9bf2fc2 | ||
|
|
2016f07bd1 | ||
|
|
6602304814 | ||
|
|
66168204be | ||
|
|
4ba9d711ba | ||
|
|
00137157fc | ||
|
|
fb28f4f80e | ||
|
|
37b9f0d29d | ||
|
|
6408210082 | ||
|
|
aff9d107b0 | ||
|
|
35370ba945 | ||
|
|
8d66005763 | ||
|
|
b9154ecff9 | ||
|
|
2db9ba1464 | ||
|
|
2f74c354c0 | ||
|
|
207c22ec2d | ||
|
|
7a395f67a7 | ||
|
|
971f245b3b |
@@ -13,6 +13,7 @@ Checks: >
|
||||
-readability-magic-numbers,
|
||||
-readability-uppercase-literal-suffix,
|
||||
-readability-simplify-boolean-expr,
|
||||
-readability-math-missing-parentheses,
|
||||
clang-analyzer-*,
|
||||
-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
|
||||
performance-*,
|
||||
|
||||
@@ -14,9 +14,9 @@ WORKDIR /app
|
||||
COPY . .
|
||||
|
||||
RUN if [ "$TARGETARCH" = "amd64" ]; then \
|
||||
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
|
||||
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
|
||||
elif [ "$TARGETARCH" = "arm64" ]; then \
|
||||
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \
|
||||
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \
|
||||
else \
|
||||
echo "Unsupported architecture"; \
|
||||
exit 1; \
|
||||
|
||||
@@ -21,7 +21,7 @@ COPY . .
|
||||
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
|
||||
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
|
||||
fi && \
|
||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
|
||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
|
||||
cmake --build build --config Release -j$(nproc)
|
||||
|
||||
RUN mkdir -p /app/lib && \
|
||||
|
||||
@@ -17,7 +17,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
|
||||
&& export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
|
||||
fi && \
|
||||
echo "Building with dynamic libs" && \
|
||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON ${OPT_SYCL_F16} && \
|
||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${OPT_SYCL_F16} && \
|
||||
cmake --build build --config Release -j$(nproc)
|
||||
|
||||
RUN mkdir -p /app/lib && \
|
||||
|
||||
@@ -22,7 +22,7 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
|
||||
|
||||
RUN echo "Building with static libs" && \
|
||||
source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
|
||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF && \
|
||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_TESTS=OFF && \
|
||||
cmake --build build --config Release --target llama-cli
|
||||
|
||||
# TODO: use image with NNRT
|
||||
|
||||
@@ -35,7 +35,7 @@ COPY . .
|
||||
RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
|
||||
export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
|
||||
fi && \
|
||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
|
||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
|
||||
cmake --build build --config Release -j$(nproc)
|
||||
|
||||
RUN mkdir -p /app/lib && \
|
||||
|
||||
@@ -40,7 +40,7 @@ WORKDIR /app
|
||||
COPY . .
|
||||
|
||||
RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
|
||||
cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON \
|
||||
cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_BUILD_TESTS=OFF \
|
||||
&& cmake --build build --config Release -j$(nproc)
|
||||
|
||||
RUN mkdir -p /app/lib \
|
||||
|
||||
@@ -16,7 +16,7 @@ WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_CURL=1 -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON && \
|
||||
RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON && \
|
||||
cmake --build build --config Release -j$(nproc)
|
||||
|
||||
RUN mkdir -p /app/lib && \
|
||||
|
||||
@@ -21,15 +21,15 @@ indent_style = tab
|
||||
[prompts/*.txt]
|
||||
insert_final_newline = unset
|
||||
|
||||
[examples/server/public/*]
|
||||
[tools/server/public/*]
|
||||
indent_size = 2
|
||||
|
||||
[examples/server/public/deps_*]
|
||||
[tools/server/public/deps_*]
|
||||
trim_trailing_whitespace = unset
|
||||
indent_style = unset
|
||||
indent_size = unset
|
||||
|
||||
[examples/server/deps_*]
|
||||
[tools/server/deps_*]
|
||||
trim_trailing_whitespace = unset
|
||||
indent_style = unset
|
||||
indent_size = unset
|
||||
@@ -37,7 +37,7 @@ indent_size = unset
|
||||
[examples/llama.swiftui/llama.swiftui.xcodeproj/*]
|
||||
indent_style = tab
|
||||
|
||||
[examples/cvector-generator/*.txt]
|
||||
[tools/cvector-generator/*.txt]
|
||||
trim_trailing_whitespace = unset
|
||||
insert_final_newline = unset
|
||||
|
||||
|
||||
3
.flake8
3
.flake8
@@ -2,8 +2,9 @@
|
||||
max-line-length = 125
|
||||
ignore = E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503
|
||||
exclude =
|
||||
# Do not traverse examples
|
||||
# Do not traverse examples and tools
|
||||
examples,
|
||||
tools,
|
||||
# Do not include package initializers
|
||||
__init__.py,
|
||||
# No need to traverse our git directory
|
||||
|
||||
22
.github/actions/get-tag-name/action.yml
vendored
Normal file
22
.github/actions/get-tag-name/action.yml
vendored
Normal file
@@ -0,0 +1,22 @@
|
||||
name: "Determine tag name"
|
||||
description: "Determine the tag name to use for a release"
|
||||
outputs:
|
||||
name:
|
||||
description: "The name of the tag"
|
||||
value: ${{ steps.tag.outputs.name }}
|
||||
|
||||
runs:
|
||||
using: "composite"
|
||||
steps:
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
shell: bash
|
||||
run: |
|
||||
BUILD_NUMBER="$(git rev-list --count HEAD)"
|
||||
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
|
||||
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
|
||||
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
|
||||
else
|
||||
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
|
||||
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
67
.github/actions/windows-setup-cuda/action.yml
vendored
Normal file
67
.github/actions/windows-setup-cuda/action.yml
vendored
Normal file
@@ -0,0 +1,67 @@
|
||||
name: "Windows - Setup CUDA Toolkit"
|
||||
description: "Setup CUDA Toolkit for Windows"
|
||||
inputs:
|
||||
cuda_version:
|
||||
description: "CUDA toolkit version"
|
||||
required: true
|
||||
|
||||
runs:
|
||||
using: "composite"
|
||||
steps:
|
||||
- name: Install Cuda Toolkit 11.7
|
||||
if: ${{ inputs.cuda_version == '11.7' }}
|
||||
shell: pwsh
|
||||
run: |
|
||||
mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
|
||||
choco install unzip -y
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-11.7.99-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-11.7.99-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-11.7.99-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-11.7.4.6-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-11.7.91-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-11.7.91-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-11.7.101-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-11.7.91-archive.zip"
|
||||
unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cudart-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvcc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvrtc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libcublas-windows-x86_64-11.7.4.6-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvtx-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\visual_studio_integration-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvprof-windows-x86_64-11.7.101-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cccl-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
echo "CUDA_PATH_V11_7=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
|
||||
- name: Install Cuda Toolkit 12.4
|
||||
if: ${{ inputs.cuda_version == '12.4' }}
|
||||
shell: pwsh
|
||||
run: |
|
||||
mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
|
||||
choco install unzip -y
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.4.127-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-12.4.131-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-12.4.127-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.4.5.8-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-12.4.127-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-12.4.127-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-12.4.127-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-12.4.127-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-12.4.127-archive.zip"
|
||||
unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cudart-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvcc-windows-x86_64-12.4.131-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvrtc-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libcublas-windows-x86_64-12.4.5.8-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvtx-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_profiler_api-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\visual_studio_integration-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvprof-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cccl-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
6
.github/labeler.yml
vendored
6
.github/labeler.yml
vendored
@@ -45,7 +45,9 @@ build:
|
||||
- CMakePresets.json
|
||||
examples:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file: examples/**
|
||||
- any-glob-to-any-file:
|
||||
- examples/**
|
||||
- tools/**
|
||||
devops:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
@@ -70,7 +72,7 @@ android:
|
||||
server:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- examples/server/**
|
||||
- tools/server/**
|
||||
ggml:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
|
||||
30
.github/workflows/bench.yml.disabled
vendored
30
.github/workflows/bench.yml.disabled
vendored
@@ -27,10 +27,10 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
|
||||
paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'tools/server/*.h*', 'tools/server/*.cpp']
|
||||
pull_request_target:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
|
||||
paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'tools/server/*.h*', 'tools/server/*.cpp']
|
||||
schedule:
|
||||
- cron: '04 2 * * *'
|
||||
|
||||
@@ -69,7 +69,7 @@ jobs:
|
||||
- name: Install python env
|
||||
id: pipenv
|
||||
run: |
|
||||
cd examples/server/bench
|
||||
cd tools/server/bench
|
||||
python3 -m venv venv
|
||||
source venv/bin/activate
|
||||
pip install -r requirements.txt
|
||||
@@ -79,7 +79,7 @@ jobs:
|
||||
run: |
|
||||
wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
|
||||
tar xzf prometheus*.tar.gz --strip-components=1
|
||||
./prometheus --config.file=examples/server/bench/prometheus.yml &
|
||||
./prometheus --config.file=tools/server/bench/prometheus.yml &
|
||||
while ! nc -z localhost 9090; do
|
||||
sleep 0.1
|
||||
done
|
||||
@@ -92,7 +92,7 @@ jobs:
|
||||
- name: Install k6 and xk6-sse
|
||||
id: k6_installation
|
||||
run: |
|
||||
cd examples/server/bench
|
||||
cd tools/server/bench
|
||||
go install go.k6.io/xk6/cmd/xk6@latest
|
||||
xk6 build master \
|
||||
--with github.com/phymbert/xk6-sse
|
||||
@@ -116,7 +116,7 @@ jobs:
|
||||
- name: Download the dataset
|
||||
id: download_dataset
|
||||
run: |
|
||||
cd examples/server/bench
|
||||
cd tools/server/bench
|
||||
wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||
|
||||
- name: Server bench
|
||||
@@ -126,7 +126,7 @@ jobs:
|
||||
run: |
|
||||
set -eux
|
||||
|
||||
cd examples/server/bench
|
||||
cd tools/server/bench
|
||||
source venv/bin/activate
|
||||
python bench.py \
|
||||
--runner-label ${{ env.RUNNER_LABEL }} \
|
||||
@@ -157,9 +157,9 @@ jobs:
|
||||
name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
|
||||
compression-level: 9
|
||||
path: |
|
||||
examples/server/bench/*.jpg
|
||||
examples/server/bench/*.json
|
||||
examples/server/bench/*.log
|
||||
tools/server/bench/*.jpg
|
||||
tools/server/bench/*.json
|
||||
tools/server/bench/*.log
|
||||
|
||||
- name: Commit status
|
||||
uses: Sibz/github-status-action@v1
|
||||
@@ -178,17 +178,17 @@ jobs:
|
||||
with:
|
||||
client_id: ${{secrets.IMGUR_CLIENT_ID}}
|
||||
path: |
|
||||
examples/server/bench/prompt_tokens_seconds.jpg
|
||||
examples/server/bench/predicted_tokens_seconds.jpg
|
||||
examples/server/bench/kv_cache_usage_ratio.jpg
|
||||
examples/server/bench/requests_processing.jpg
|
||||
tools/server/bench/prompt_tokens_seconds.jpg
|
||||
tools/server/bench/predicted_tokens_seconds.jpg
|
||||
tools/server/bench/kv_cache_usage_ratio.jpg
|
||||
tools/server/bench/requests_processing.jpg
|
||||
|
||||
- name: Extract mermaid
|
||||
id: set_mermaid
|
||||
run: |
|
||||
set -eux
|
||||
|
||||
cd examples/server/bench
|
||||
cd tools/server/bench
|
||||
PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
|
||||
echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
|
||||
echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
|
||||
|
||||
66
.github/workflows/build-linux-cross.yml
vendored
66
.github/workflows/build-linux-cross.yml
vendored
@@ -4,18 +4,25 @@ on:
|
||||
workflow_call:
|
||||
|
||||
jobs:
|
||||
ubuntu-latest-riscv64-cpu-cross:
|
||||
runs-on: ubuntu-latest
|
||||
ubuntu-24-riscv64-cpu-cross:
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Setup Riscv
|
||||
run: |
|
||||
sudo dpkg --add-architecture riscv64
|
||||
sudo sed -i 's|http://azure.archive.ubuntu.com/ubuntu|http://ports.ubuntu.com/ubuntu-ports|g' \
|
||||
/etc/apt/sources.list /etc/apt/apt-mirrors.txt
|
||||
sudo apt-get clean
|
||||
sudo apt-get update
|
||||
|
||||
# Add arch-specific repositories for non-amd64 architectures
|
||||
cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list
|
||||
deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
|
||||
deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
|
||||
deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
|
||||
deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
|
||||
EOF
|
||||
|
||||
sudo apt-get update || true ;# Prevent failure due to missing URLs.
|
||||
|
||||
sudo apt-get install -y --no-install-recommends \
|
||||
build-essential \
|
||||
gcc-14-riscv64-linux-gnu \
|
||||
@@ -27,6 +34,7 @@ jobs:
|
||||
cmake -B build -DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_OPENMP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=ON \
|
||||
-DLLAMA_BUILD_TOOLS=ON \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DCMAKE_SYSTEM_NAME=Linux \
|
||||
-DCMAKE_SYSTEM_PROCESSOR=riscv64 \
|
||||
@@ -40,21 +48,25 @@ jobs:
|
||||
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
ubuntu-latest-riscv64-vulkan-cross:
|
||||
runs-on: ubuntu-latest
|
||||
ubuntu-24-riscv64-vulkan-cross:
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Setup Riscv
|
||||
run: |
|
||||
sudo dpkg --add-architecture riscv64
|
||||
sudo sed -i 's|http://azure.archive.ubuntu.com/ubuntu|http://ports.ubuntu.com/ubuntu-ports|g' \
|
||||
/etc/apt/sources.list /etc/apt/apt-mirrors.txt
|
||||
sudo apt-get clean
|
||||
sudo apt-get update
|
||||
|
||||
# Add arch-specific repositories for non-amd64 architectures
|
||||
cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list
|
||||
deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
|
||||
deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
|
||||
deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
|
||||
deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
|
||||
EOF
|
||||
|
||||
sudo apt-get update || true ;# Prevent failure due to missing URLs.
|
||||
|
||||
sudo apt-get install -y --no-install-recommends \
|
||||
build-essential \
|
||||
glslc \
|
||||
@@ -69,6 +81,7 @@ jobs:
|
||||
-DGGML_VULKAN=ON \
|
||||
-DGGML_OPENMP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=ON \
|
||||
-DLLAMA_BUILD_TOOLS=ON \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DCMAKE_SYSTEM_NAME=Linux \
|
||||
-DCMAKE_SYSTEM_PROCESSOR=riscv64 \
|
||||
@@ -82,21 +95,25 @@ jobs:
|
||||
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
ubuntu-latest-arm64-vulkan-cross:
|
||||
runs-on: ubuntu-latest
|
||||
ubuntu-24-arm64-vulkan-cross:
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Setup Arm64
|
||||
run: |
|
||||
sudo dpkg --add-architecture arm64
|
||||
sudo sed -i 's|http://azure.archive.ubuntu.com/ubuntu|http://ports.ubuntu.com/ubuntu-ports|g' \
|
||||
/etc/apt/sources.list /etc/apt/apt-mirrors.txt
|
||||
sudo apt-get clean
|
||||
sudo apt-get update
|
||||
|
||||
# Add arch-specific repositories for non-amd64 architectures
|
||||
cat << EOF | sudo tee /etc/apt/sources.list.d/arm64-ports.list
|
||||
deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
|
||||
deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
|
||||
deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
|
||||
deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
|
||||
EOF
|
||||
|
||||
sudo apt-get update || true ;# Prevent failure due to missing URLs.
|
||||
|
||||
sudo apt-get install -y --no-install-recommends \
|
||||
build-essential \
|
||||
glslc \
|
||||
@@ -110,6 +127,7 @@ jobs:
|
||||
-DGGML_VULKAN=ON \
|
||||
-DGGML_OPENMP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=ON \
|
||||
-DLLAMA_BUILD_TOOLS=ON \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DCMAKE_SYSTEM_NAME=Linux \
|
||||
-DCMAKE_SYSTEM_PROCESSOR=aarch64 \
|
||||
|
||||
796
.github/workflows/build.yml
vendored
796
.github/workflows/build.yml
vendored
File diff suppressed because it is too large
Load Diff
8
.github/workflows/docker.yml
vendored
8
.github/workflows/docker.yml
vendored
@@ -36,10 +36,14 @@ jobs:
|
||||
matrix:
|
||||
config:
|
||||
# Multi-stage build
|
||||
- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false }
|
||||
# Note: the arm64 images are failing, which prevents the amd64 images from being built
|
||||
# https://github.com/ggml-org/llama.cpp/issues/11888
|
||||
#- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false }
|
||||
- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
|
||||
- { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
|
||||
- { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
|
||||
- { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
|
||||
# Note: the intel images are failing due to an out of disk space error
|
||||
# - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
|
||||
- { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
|
||||
# Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
|
||||
#- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: true }
|
||||
|
||||
709
.github/workflows/release.yml
vendored
Normal file
709
.github/workflows/release.yml
vendored
Normal file
@@ -0,0 +1,709 @@
|
||||
name: Create Release
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
inputs:
|
||||
create_release:
|
||||
description: 'Create new release'
|
||||
required: true
|
||||
type: boolean
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: ['.github/workflows/release.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||
CMAKE_ARGS: "-DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=ON -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON"
|
||||
|
||||
jobs:
|
||||
macOS-arm64:
|
||||
runs-on: macos-14
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
with:
|
||||
key: macOS-latest-cmake-arm64
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
continue-on-error: true
|
||||
run: |
|
||||
brew update
|
||||
brew install curl
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
sysctl -a
|
||||
cmake -B build \
|
||||
-DCMAKE_BUILD_RPATH="@loader_path" \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DGGML_RPC=ON \
|
||||
${{ env.CMAKE_ARGS }}
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
|
||||
name: llama-bin-macos-arm64.zip
|
||||
|
||||
macOS-x64:
|
||||
runs-on: macos-13
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
with:
|
||||
key: macOS-latest-cmake-x64
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
continue-on-error: true
|
||||
run: |
|
||||
brew update
|
||||
brew install curl
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
sysctl -a
|
||||
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
|
||||
# https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
|
||||
cmake -B build \
|
||||
-DCMAKE_BUILD_RPATH="@loader_path" \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DGGML_METAL=OFF \
|
||||
-DGGML_RPC=ON
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
|
||||
name: llama-bin-macos-x64.zip
|
||||
|
||||
ubuntu-22-cpu:
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- build: 'x64'
|
||||
os: ubuntu-22.04
|
||||
- build: 'arm64'
|
||||
os: ubuntu-22.04-arm
|
||||
|
||||
runs-on: ${{ matrix.os }}
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
with:
|
||||
key: ubuntu-cpu-cmake
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential libcurl4-openssl-dev
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
${{ env.CMAKE_ARGS }}
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/*
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip
|
||||
name: llama-bin-ubuntu-${{ matrix.build }}.zip
|
||||
|
||||
ubuntu-22-vulkan:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
with:
|
||||
key: ubuntu-22-cmake-vulkan
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
|
||||
sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
|
||||
sudo apt-get update -y
|
||||
sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libcurl4-openssl-dev
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DGGML_VULKAN=ON \
|
||||
${{ env.CMAKE_ARGS }}
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/*
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip
|
||||
name: llama-bin-ubuntu-vulkan-x64.zip
|
||||
|
||||
windows:
|
||||
runs-on: windows-latest
|
||||
|
||||
env:
|
||||
OPENBLAS_VERSION: 0.3.23
|
||||
VULKAN_VERSION: 1.4.309.0
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- build: 'cpu-x64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF'
|
||||
#- build: 'openblas-x64'
|
||||
# defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
|
||||
- build: 'vulkan-x64'
|
||||
defines: '-DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
|
||||
- build: 'cpu-arm64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF'
|
||||
- build: 'opencl-adreno-arm64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
with:
|
||||
key: windows-latest-cmake-${{ matrix.build }}
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Download OpenBLAS
|
||||
id: get_openblas
|
||||
if: ${{ matrix.build == 'openblas-x64' }}
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
|
||||
curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"
|
||||
mkdir $env:RUNNER_TEMP/openblas
|
||||
tar.exe -xvf $env:RUNNER_TEMP/openblas.zip -C $env:RUNNER_TEMP/openblas
|
||||
$vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
|
||||
$msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
|
||||
$lib = $(join-path $msvc 'bin\Hostx64\x64\lib.exe')
|
||||
& $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll
|
||||
|
||||
- name: Install Vulkan SDK
|
||||
id: get_vulkan
|
||||
if: ${{ matrix.build == 'vulkan-x64' }}
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
|
||||
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
|
||||
Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
|
||||
Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
|
||||
|
||||
- name: Install Ninja
|
||||
id: install_ninja
|
||||
run: |
|
||||
choco install ninja
|
||||
|
||||
- name: Install OpenCL Headers and Libs
|
||||
id: install_opencl
|
||||
if: ${{ matrix.build == 'opencl-adreno-arm64' }}
|
||||
run: |
|
||||
git clone https://github.com/KhronosGroup/OpenCL-Headers
|
||||
cd OpenCL-Headers
|
||||
cmake -B build `
|
||||
-DBUILD_TESTING=OFF `
|
||||
-DOPENCL_HEADERS_BUILD_TESTING=OFF `
|
||||
-DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
|
||||
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
|
||||
cmake --build build --target install
|
||||
git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
|
||||
cd OpenCL-ICD-Loader
|
||||
cmake -B build-arm64-release `
|
||||
-A arm64 `
|
||||
-DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
|
||||
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
|
||||
cmake --build build-arm64-release --target install --config release
|
||||
|
||||
- name: libCURL
|
||||
id: get_libcurl
|
||||
uses: ./.github/actions/windows-setup-curl
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
env:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
cmake -S . -B build ${{ matrix.defines }} `
|
||||
-DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" `
|
||||
${{ env.CMAKE_ARGS }}
|
||||
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
|
||||
|
||||
- name: Add libopenblas.dll
|
||||
id: add_libopenblas_dll
|
||||
if: ${{ matrix.build == 'openblas-x64' }}
|
||||
run: |
|
||||
cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
|
||||
cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
env:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
Copy-Item $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
|
||||
name: llama-bin-win-${{ matrix.build }}.zip
|
||||
|
||||
windows-cuda:
|
||||
runs-on: windows-2019
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
cuda: ['12.4', '11.7']
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Install ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
with:
|
||||
key: windows-cuda-${{ matrix.cuda }}
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Install Cuda Toolkit
|
||||
uses: ./.github/actions/windows-setup-cuda
|
||||
with:
|
||||
cuda_version: ${{ matrix.cuda }}
|
||||
|
||||
- name: Install Ninja
|
||||
id: install_ninja
|
||||
run: |
|
||||
choco install ninja
|
||||
|
||||
- name: libCURL
|
||||
id: get_libcurl
|
||||
uses: ./.github/actions/windows-setup-curl
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
shell: cmd
|
||||
env:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
|
||||
cmake -S . -B build -G "Ninja Multi-Config" ^
|
||||
-DGGML_NATIVE=OFF ^
|
||||
-DGGML_BACKEND_DL=ON ^
|
||||
-DGGML_CPU_ALL_VARIANTS=ON ^
|
||||
-DGGML_CUDA=ON ^
|
||||
-DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include" ^
|
||||
${{ env.CMAKE_ARGS }}
|
||||
set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
|
||||
cmake --build build --config Release -j %NINJA_JOBS% -t ggml
|
||||
cmake --build build --config Release
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
env:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-cuda${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-cuda${{ matrix.cuda }}-x64.zip
|
||||
name: llama-bin-win-cuda${{ matrix.cuda }}-x64.zip
|
||||
|
||||
- name: Copy and pack Cuda runtime
|
||||
run: |
|
||||
echo "Cuda install location: ${{ env.CUDA_PATH }}"
|
||||
$dst='.\build\bin\cudart\'
|
||||
robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
|
||||
robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
|
||||
7z a cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip $dst\*
|
||||
|
||||
- name: Upload Cuda runtime
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip
|
||||
name: cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip
|
||||
|
||||
windows-sycl:
|
||||
runs-on: windows-latest
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
|
||||
env:
|
||||
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b380d914-366b-4b77-a74a-05e3c38b3514/intel-oneapi-base-toolkit-2025.0.0.882_offline.exe
|
||||
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
|
||||
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
with:
|
||||
key: windows-latest-cmake-sycl
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Install
|
||||
run: |
|
||||
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
||||
|
||||
# TODO: add libcurl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: examples/sycl/win-build-sycl.bat
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Build the release package
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
|
||||
|
||||
echo "cp oneAPI running time dll files to ./build/bin done"
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
|
||||
|
||||
- name: Upload the release package
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
|
||||
name: llama-bin-win-sycl-x64.zip
|
||||
|
||||
windows-hip:
|
||||
runs-on: windows-latest
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
gpu_target: [gfx1100, gfx1101, gfx1030]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Clone rocWMMA repository
|
||||
id: clone_rocwmma
|
||||
run: |
|
||||
git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
with:
|
||||
key: windows-latest-cmake-hip-release
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Install
|
||||
id: depends
|
||||
run: |
|
||||
$ErrorActionPreference = "Stop"
|
||||
write-host "Downloading AMD HIP SDK Installer"
|
||||
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
|
||||
write-host "Installing AMD HIP SDK"
|
||||
Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
|
||||
write-host "Completed AMD HIP SDK installation"
|
||||
|
||||
- name: Verify ROCm
|
||||
id: verify
|
||||
run: |
|
||||
& 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
|
||||
|
||||
- name: libCURL
|
||||
id: get_libcurl
|
||||
uses: ./.github/actions/windows-setup-curl
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
env:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
|
||||
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
|
||||
cmake -G "Unix Makefiles" -B build -S . `
|
||||
-DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
|
||||
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
|
||||
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/" `
|
||||
-DCMAKE_BUILD_TYPE=Release `
|
||||
-DAMDGPU_TARGETS=${{ matrix.gpu_target }} `
|
||||
-DGGML_HIP_ROCWMMA_FATTN=ON `
|
||||
-DGGML_HIP=ON `
|
||||
-DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" `
|
||||
${{ env.CMAKE_ARGS }}
|
||||
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
|
||||
md "build\bin\rocblas\library\"
|
||||
cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
|
||||
cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
|
||||
cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
env:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\libcurl-x64.dll
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\*
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
|
||||
name: llama-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
|
||||
|
||||
ios-xcode-build:
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
sysctl -a
|
||||
cmake -B build -G Xcode \
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DLLAMA_BUILD_SERVER=OFF \
|
||||
-DCMAKE_SYSTEM_NAME=iOS \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
|
||||
|
||||
- name: xcodebuild for swift package
|
||||
id: xcodebuild
|
||||
run: |
|
||||
./build-xcframework.sh
|
||||
|
||||
- name: Build Xcode project
|
||||
run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
|
||||
name: llama-${{ steps.tag.outputs.name }}-xcframework
|
||||
|
||||
release:
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
|
||||
# Fine-grant permission
|
||||
# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
|
||||
permissions:
|
||||
contents: write # for creating release
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
needs:
|
||||
- ubuntu-22-cpu
|
||||
- ubuntu-22-vulkan
|
||||
- windows
|
||||
- windows-cuda
|
||||
- windows-sycl
|
||||
- windows-hip
|
||||
- macOS-arm64
|
||||
- macOS-x64
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Download artifacts
|
||||
id: download-artifact
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
path: ./artifact
|
||||
|
||||
- name: Move artifacts
|
||||
id: move_artifacts
|
||||
run: mkdir -p ./artifact/release && mv ./artifact/*/*.zip ./artifact/release
|
||||
|
||||
- name: Create release
|
||||
id: create_release
|
||||
uses: ggml-org/action-create-release@v1
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
with:
|
||||
tag_name: ${{ steps.tag.outputs.name }}
|
||||
|
||||
- name: Upload release
|
||||
id: upload_release
|
||||
uses: actions/github-script@v3
|
||||
with:
|
||||
github-token: ${{secrets.GITHUB_TOKEN}}
|
||||
script: |
|
||||
const path = require('path');
|
||||
const fs = require('fs');
|
||||
const release_id = '${{ steps.create_release.outputs.id }}';
|
||||
for (let file of await fs.readdirSync('./artifact/release')) {
|
||||
if (path.extname(file) === '.zip') {
|
||||
console.log('uploadReleaseAsset', file);
|
||||
await github.repos.uploadReleaseAsset({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
release_id: release_id,
|
||||
name: file,
|
||||
data: await fs.readFileSync(`./artifact/release/${file}`)
|
||||
});
|
||||
}
|
||||
}
|
||||
24
.github/workflows/server.yml
vendored
24
.github/workflows/server.yml
vendored
@@ -15,10 +15,10 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
|
||||
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
|
||||
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']
|
||||
|
||||
env:
|
||||
LLAMA_LOG_COLORS: 1
|
||||
@@ -74,7 +74,7 @@ jobs:
|
||||
- name: Tests dependencies
|
||||
id: test_dependencies
|
||||
run: |
|
||||
pip install -r examples/server/tests/requirements.txt
|
||||
pip install -r tools/server/tests/requirements.txt
|
||||
|
||||
# Setup nodejs (to be used for verifying bundled index.html)
|
||||
- uses: actions/setup-node@v4
|
||||
@@ -84,14 +84,14 @@ jobs:
|
||||
- name: WebUI - Install dependencies
|
||||
id: webui_lint
|
||||
run: |
|
||||
cd examples/server/webui
|
||||
cd tools/server/webui
|
||||
npm ci
|
||||
|
||||
- name: WebUI - Check code format
|
||||
id: webui_format
|
||||
run: |
|
||||
git config --global --add safe.directory $(realpath .)
|
||||
cd examples/server/webui
|
||||
cd tools/server/webui
|
||||
git status
|
||||
|
||||
npm run format
|
||||
@@ -108,7 +108,7 @@ jobs:
|
||||
id: verify_server_index_html
|
||||
run: |
|
||||
git config --global --add safe.directory $(realpath .)
|
||||
cd examples/server/webui
|
||||
cd tools/server/webui
|
||||
git status
|
||||
|
||||
npm run build
|
||||
@@ -161,21 +161,21 @@ jobs:
|
||||
env:
|
||||
GITHUB_ACTIONS: "true"
|
||||
run: |
|
||||
cd examples/server/tests
|
||||
cd tools/server/tests
|
||||
./tests.sh
|
||||
|
||||
- name: Tests (sanitizers)
|
||||
id: server_integration_tests_sanitizers
|
||||
if: ${{ matrix.sanitizer != '' }}
|
||||
run: |
|
||||
cd examples/server/tests
|
||||
cd tools/server/tests
|
||||
LLAMA_SANITIZE=1 ./tests.sh
|
||||
|
||||
- name: Slow tests
|
||||
id: server_integration_tests_slow
|
||||
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
|
||||
run: |
|
||||
cd examples/server/tests
|
||||
cd tools/server/tests
|
||||
SLOW_TESTS=1 ./tests.sh
|
||||
|
||||
|
||||
@@ -211,7 +211,7 @@ jobs:
|
||||
- name: Tests dependencies
|
||||
id: test_dependencies
|
||||
run: |
|
||||
pip install -r examples/server/tests/requirements.txt
|
||||
pip install -r tools/server/tests/requirements.txt
|
||||
|
||||
- name: Copy Libcurl
|
||||
id: prepare_libcurl
|
||||
@@ -224,7 +224,7 @@ jobs:
|
||||
id: server_integration_tests
|
||||
if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
|
||||
run: |
|
||||
cd examples/server/tests
|
||||
cd tools/server/tests
|
||||
$env:PYTHONIOENCODING = ":replace"
|
||||
pytest -v -x -m "not slow"
|
||||
|
||||
@@ -232,6 +232,6 @@ jobs:
|
||||
id: server_integration_tests_slow
|
||||
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
|
||||
run: |
|
||||
cd examples/server/tests
|
||||
cd tools/server/tests
|
||||
$env:SLOW_TESTS = "1"
|
||||
pytest -v -x
|
||||
|
||||
12
.gitignore
vendored
12
.gitignore
vendored
@@ -96,11 +96,11 @@ perf-*.txt
|
||||
# Examples
|
||||
|
||||
examples/jeopardy/results.txt
|
||||
examples/server/*.css.hpp
|
||||
examples/server/*.html.hpp
|
||||
examples/server/*.js.hpp
|
||||
examples/server/*.mjs.hpp
|
||||
examples/server/*.gz.hpp
|
||||
tools/server/*.css.hpp
|
||||
tools/server/*.html.hpp
|
||||
tools/server/*.js.hpp
|
||||
tools/server/*.mjs.hpp
|
||||
tools/server/*.gz.hpp
|
||||
!build_64.sh
|
||||
!examples/*.bat
|
||||
!examples/*/*.kts
|
||||
@@ -110,7 +110,7 @@ examples/server/*.gz.hpp
|
||||
|
||||
# Server Web UI temporary files
|
||||
node_modules
|
||||
examples/server/webui/dist
|
||||
tools/server/webui/dist
|
||||
|
||||
# Python
|
||||
|
||||
|
||||
@@ -77,6 +77,7 @@ option(LLAMA_BUILD_COMMON "llama: build common utils library" ${LLAMA_STANDALONE
|
||||
|
||||
# extra artifacts
|
||||
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
|
||||
|
||||
@@ -187,6 +188,10 @@ if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
|
||||
add_subdirectory(pocs)
|
||||
endif()
|
||||
|
||||
if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TOOLS)
|
||||
add_subdirectory(tools)
|
||||
endif()
|
||||
|
||||
#
|
||||
# install
|
||||
#
|
||||
@@ -247,20 +252,3 @@ configure_file(cmake/llama.pc.in
|
||||
|
||||
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
|
||||
DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
|
||||
|
||||
#
|
||||
# copy the license files
|
||||
#
|
||||
|
||||
# Check if running in GitHub Actions
|
||||
if(DEFINED ENV{GITHUB_ACTIONS} AND "$ENV{GITHUB_ACTIONS}" STREQUAL "true")
|
||||
message(STATUS "Running inside GitHub Actions - copying license files")
|
||||
|
||||
# Copy all files from licenses/ to build/bin/
|
||||
file(GLOB LICENSE_FILES "${CMAKE_SOURCE_DIR}/licenses/*")
|
||||
foreach(LICENSE_FILE ${LICENSE_FILES})
|
||||
get_filename_component(FILENAME ${LICENSE_FILE} NAME)
|
||||
configure_file(${LICENSE_FILE} "${CMAKE_BINARY_DIR}/bin/${FILENAME}" COPYONLY)
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
|
||||
@@ -38,15 +38,6 @@
|
||||
}
|
||||
},
|
||||
|
||||
{
|
||||
"name": "arm64-windows-msvc", "hidden": true,
|
||||
"architecture": { "value": "arm64", "strategy": "external" },
|
||||
"toolset": { "value": "host=x64", "strategy": "external" },
|
||||
"cacheVariables": {
|
||||
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
|
||||
}
|
||||
},
|
||||
|
||||
{
|
||||
"name": "arm64-windows-llvm", "hidden": true,
|
||||
"architecture": { "value": "arm64", "strategy": "external" },
|
||||
@@ -73,10 +64,6 @@
|
||||
{ "name": "arm64-apple-clang-release", "inherits": [ "base", "arm64-apple-clang", "reldbg" ] },
|
||||
{ "name": "arm64-apple-clang+static-release", "inherits": [ "base", "arm64-apple-clang", "reldbg", "static" ] },
|
||||
|
||||
{ "name": "arm64-windows-msvc-debug", "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
|
||||
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] },
|
||||
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] },
|
||||
|
||||
{ "name": "x64-windows-llvm-debug", "inherits": [ "base", "x64-windows-llvm", "debug" ] },
|
||||
{ "name": "x64-windows-llvm-release", "inherits": [ "base", "x64-windows-llvm", "release" ] },
|
||||
{ "name": "x64-windows-llvm-reldbg", "inherits": [ "base", "x64-windows-llvm", "reldbg" ] },
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
/ci/ @ggerganov
|
||||
/.devops/*.Dockerfile @ngxson
|
||||
/examples/server/ @ngxson
|
||||
/tools/server/ @ngxson
|
||||
/ggml/src/ggml-cuda/fattn* @JohannesGaessler
|
||||
/ggml/src/ggml-cuda/mmq.* @JohannesGaessler
|
||||
/ggml/src/ggml-cuda/mmv.* @JohannesGaessler
|
||||
|
||||
97
Makefile
97
Makefile
@@ -1156,10 +1156,10 @@ $(LIB_COMMON_S): $(OBJ_COMMON)
|
||||
|
||||
# Clean generated server assets
|
||||
clean-server-assets:
|
||||
find examples/server -type f -name "*.js.hpp" -delete
|
||||
find examples/server -type f -name "*.mjs.hpp" -delete
|
||||
find examples/server -type f -name "*.css.hpp" -delete
|
||||
find examples/server -type f -name "*.html.hpp" -delete
|
||||
find tools/server -type f -name "*.js.hpp" -delete
|
||||
find tools/server -type f -name "*.mjs.hpp" -delete
|
||||
find tools/server -type f -name "*.css.hpp" -delete
|
||||
find tools/server -type f -name "*.html.hpp" -delete
|
||||
|
||||
# Clean rule
|
||||
clean: clean-server-assets
|
||||
@@ -1179,7 +1179,7 @@ clean: clean-server-assets
|
||||
# Helper function that replaces .c, .cpp, and .cu file endings with .o:
|
||||
GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1))))
|
||||
|
||||
llama-cli: examples/main/main.cpp \
|
||||
llama-cli: tools/main/main.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
@@ -1187,12 +1187,7 @@ llama-cli: examples/main/main.cpp \
|
||||
@echo '==== Run ./llama-cli -h for help. ===='
|
||||
@echo
|
||||
|
||||
llama-infill: examples/infill/infill.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-run: examples/run/run.cpp \
|
||||
llama-run: tools/run/run.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
@@ -1207,7 +1202,7 @@ llama-simple-chat: examples/simple-chat/simple-chat.cpp \
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-tokenize: examples/tokenize/tokenize.cpp \
|
||||
llama-tokenize: tools/tokenize/tokenize.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
@@ -1217,27 +1212,27 @@ llama-batched: examples/batched/batched.cpp \
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-batched-bench: examples/batched-bench/batched-bench.cpp \
|
||||
llama-batched-bench: tools/batched-bench/batched-bench.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-quantize: examples/quantize/quantize.cpp \
|
||||
llama-quantize: tools/quantize/quantize.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-quantize-stats: examples/quantize-stats/quantize-stats.cpp \
|
||||
llama-quantize-stats: tools/quantize-stats/quantize-stats.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-perplexity: examples/perplexity/perplexity.cpp \
|
||||
llama-perplexity: tools/perplexity/perplexity.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-imatrix: examples/imatrix/imatrix.cpp \
|
||||
llama-imatrix: tools/imatrix/imatrix.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
@@ -1279,7 +1274,7 @@ llama-gguf-hash: examples/gguf-hash/gguf-hash.cpp examples/gguf-hash/deps/sha1/s
|
||||
$(CXX) $(CXXFLAGS) -Iexamples/gguf-hash/deps -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-gguf-split: examples/gguf-split/gguf-split.cpp \
|
||||
llama-gguf-split: tools/gguf-split/gguf-split.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
@@ -1289,7 +1284,7 @@ llama-eval-callback: examples/eval-callback/eval-callback.cpp \
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \
|
||||
llama-cvector-generator: tools/cvector-generator/cvector-generator.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
@@ -1299,12 +1294,12 @@ llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-bench: examples/llama-bench/llama-bench.cpp \
|
||||
llama-bench: tools/llama-bench/llama-bench.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-export-lora: examples/export-lora/export-lora.cpp \
|
||||
llama-export-lora: tools/export-lora/export-lora.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
@@ -1360,17 +1355,17 @@ llama-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp \
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
ifdef GGML_RPC
|
||||
rpc-server: examples/rpc/rpc-server.cpp \
|
||||
rpc-server: tools/rpc/rpc-server.cpp \
|
||||
$(OBJ_GGML)
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
endif # GGML_RPC
|
||||
|
||||
llama-server: \
|
||||
examples/server/server.cpp \
|
||||
examples/server/utils.hpp \
|
||||
examples/server/httplib.h \
|
||||
examples/server/index.html.hpp \
|
||||
examples/server/loading.html.hpp \
|
||||
tools/server/server.cpp \
|
||||
tools/server/utils.hpp \
|
||||
tools/server/httplib.h \
|
||||
tools/server/index.html.hpp \
|
||||
tools/server/loading.html.hpp \
|
||||
common/chat.cpp \
|
||||
common/chat.h \
|
||||
common/chat-template.hpp \
|
||||
@@ -1378,10 +1373,10 @@ llama-server: \
|
||||
common/minja.hpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Itools/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
|
||||
|
||||
# Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
|
||||
examples/server/%.hpp: examples/server/public/% FORCE Makefile
|
||||
# Portable equivalent of `cd tools/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
|
||||
tools/server/%.hpp: tools/server/public/% FORCE Makefile
|
||||
@( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \
|
||||
echo "unsigned char $${NAME}[] = {" && \
|
||||
cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \
|
||||
@@ -1394,36 +1389,36 @@ llama-gen-docs: examples/gen-docs/gen-docs.cpp \
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
libllava.a: examples/llava/llava.cpp \
|
||||
examples/llava/llava.h \
|
||||
examples/llava/clip.cpp \
|
||||
examples/llava/clip.h \
|
||||
libllava.a: tools/mtmd/llava.cpp \
|
||||
tools/mtmd/llava.h \
|
||||
tools/mtmd/clip.cpp \
|
||||
tools/mtmd/clip.h \
|
||||
common/stb_image.h \
|
||||
common/base64.hpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual
|
||||
|
||||
llama-llava-cli: examples/llava/llava-cli.cpp \
|
||||
examples/llava/llava.cpp \
|
||||
examples/llava/llava.h \
|
||||
examples/llava/clip.cpp \
|
||||
examples/llava/clip.h \
|
||||
llama-llava-cli: tools/mtmd/llava-cli.cpp \
|
||||
tools/mtmd/llava.cpp \
|
||||
tools/mtmd/llava.h \
|
||||
tools/mtmd/clip.cpp \
|
||||
tools/mtmd/clip.h \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
|
||||
|
||||
llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \
|
||||
examples/llava/llava.cpp \
|
||||
examples/llava/llava.h \
|
||||
examples/llava/clip.cpp \
|
||||
examples/llava/clip.h \
|
||||
llama-minicpmv-cli: tools/mtmd/minicpmv-cli.cpp \
|
||||
tools/mtmd/llava.cpp \
|
||||
tools/mtmd/llava.h \
|
||||
tools/mtmd/clip.cpp \
|
||||
tools/mtmd/clip.h \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
|
||||
|
||||
llama-qwen2vl-cli: examples/llava/qwen2vl-cli.cpp \
|
||||
examples/llava/llava.cpp \
|
||||
examples/llava/llava.h \
|
||||
examples/llava/clip.cpp \
|
||||
examples/llava/clip.h \
|
||||
llama-qwen2vl-cli: tools/mtmd/qwen2vl-cli.cpp \
|
||||
tools/mtmd/llava.cpp \
|
||||
tools/mtmd/llava.h \
|
||||
tools/mtmd/clip.cpp \
|
||||
tools/mtmd/clip.h \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
|
||||
|
||||
@@ -1480,12 +1475,12 @@ tests/test-double-float: tests/test-double-float.cpp
|
||||
|
||||
tests/test-json-schema-to-grammar: tests/test-json-schema-to-grammar.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) -Itools/server -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-chat: tests/test-chat.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) -Itools/server -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-opt: tests/test-opt.cpp \
|
||||
|
||||
26
README.md
26
README.md
@@ -16,8 +16,10 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
|
||||
|
||||
## Hot topics
|
||||
|
||||
- **How to use [MTLResidencySet](https://developer.apple.com/documentation/metal/mtlresidencyset?language=objc) to keep the GPU memory active?** https://github.com/ggml-org/llama.cpp/pull/11427
|
||||
- **VS Code extension for FIM completions:** https://github.com/ggml-org/llama.vscode
|
||||
- 🔥 Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
|
||||
- **GGML developer experience survey (organized and reviewed by NVIDIA):** [link](https://forms.gle/Gasw3cRgyhNEnrwK9)
|
||||
- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141](https://github.com/ggml-org/llama.cpp/pull/13141)), `libllava` will be deprecated
|
||||
- VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
|
||||
- Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
|
||||
- Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
|
||||
- Introducing GGUF-my-LoRA https://github.com/ggml-org/llama.cpp/discussions/10123
|
||||
@@ -241,7 +243,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
||||
| [Vulkan](docs/build.md#vulkan) | GPU |
|
||||
| [CANN](docs/build.md#cann) | Ascend NPU |
|
||||
| [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
|
||||
| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/examples/rpc) | All |
|
||||
| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
|
||||
|
||||
## Building the project
|
||||
|
||||
@@ -275,9 +277,9 @@ The Hugging Face platform provides a variety of online tools for converting, qua
|
||||
- Use the [GGUF-editor space](https://huggingface.co/spaces/CISCai/gguf-editor) to edit GGUF meta data in the browser (more info: https://github.com/ggml-org/llama.cpp/discussions/9268)
|
||||
- Use the [Inference Endpoints](https://ui.endpoints.huggingface.co/) to directly host `llama.cpp` in the cloud (more info: https://github.com/ggml-org/llama.cpp/discussions/9669)
|
||||
|
||||
To learn more about model quantization, [read this documentation](examples/quantize/README.md)
|
||||
To learn more about model quantization, [read this documentation](tools/quantize/README.md)
|
||||
|
||||
## [`llama-cli`](examples/main)
|
||||
## [`llama-cli`](tools/main)
|
||||
|
||||
#### A CLI tool for accessing and experimenting with most of `llama.cpp`'s functionality.
|
||||
|
||||
@@ -340,7 +342,7 @@ To learn more about model quantization, [read this documentation](examples/quant
|
||||
</details>
|
||||
|
||||
|
||||
## [`llama-server`](examples/server)
|
||||
## [`llama-server`](tools/server)
|
||||
|
||||
#### A lightweight, [OpenAI API](https://github.com/openai/openai-openapi) compatible, HTTP server for serving LLMs.
|
||||
|
||||
@@ -410,7 +412,7 @@ To learn more about model quantization, [read this documentation](examples/quant
|
||||
</details>
|
||||
|
||||
|
||||
## [`llama-perplexity`](examples/perplexity)
|
||||
## [`llama-perplexity`](tools/perplexity)
|
||||
|
||||
#### A tool for measuring the perplexity [^1][^2] (and other quality metrics) of a model over a given text.
|
||||
|
||||
@@ -435,10 +437,10 @@ To learn more about model quantization, [read this documentation](examples/quant
|
||||
|
||||
</details>
|
||||
|
||||
[^1]: [examples/perplexity/README.md](./examples/perplexity/README.md)
|
||||
[^1]: [tools/perplexity/README.md](./tools/perplexity/README.md)
|
||||
[^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)
|
||||
|
||||
## [`llama-bench`](examples/llama-bench)
|
||||
## [`llama-bench`](tools/llama-bench)
|
||||
|
||||
#### Benchmark the performance of the inference for various parameters.
|
||||
|
||||
@@ -459,7 +461,7 @@ To learn more about model quantization, [read this documentation](examples/quant
|
||||
|
||||
</details>
|
||||
|
||||
## [`llama-run`](examples/run)
|
||||
## [`llama-run`](tools/run)
|
||||
|
||||
#### A comprehensive example for running `llama.cpp` models. Useful for inferencing. Used with RamaLama [^3].
|
||||
|
||||
@@ -503,8 +505,8 @@ To learn more about model quantization, [read this documentation](examples/quant
|
||||
|
||||
## Other documentation
|
||||
|
||||
- [main (cli)](examples/main/README.md)
|
||||
- [server](examples/server/README.md)
|
||||
- [main (cli)](tools/main/README.md)
|
||||
- [server](tools/server/README.md)
|
||||
- [GBNF grammars](grammars/README.md)
|
||||
|
||||
#### Development documentation
|
||||
|
||||
@@ -40,7 +40,8 @@ To protect sensitive data from potential leaks or unauthorized access, it is cru
|
||||
### Untrusted environments or networks
|
||||
|
||||
If you can't run your models in a secure and isolated environment or if it must be exposed to an untrusted network, make sure to take the following security precautions:
|
||||
* Confirm the hash of any downloaded artifact (e.g. pre-trained model weights) matches a known-good value
|
||||
* Do not use the RPC backend, [rpc-server](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) and [llama-server](https://github.com/ggml-org/llama.cpp/tree/master/tools/server) functionality (see https://github.com/ggml-org/llama.cpp/pull/13061).
|
||||
* Confirm the hash of any downloaded artifact (e.g. pre-trained model weights) matches a known-good value.
|
||||
* Encrypt your data if sending it over the network.
|
||||
|
||||
### Multi-Tenant environments
|
||||
|
||||
@@ -8,6 +8,7 @@ TVOS_MIN_OS_VERSION=16.4
|
||||
|
||||
BUILD_SHARED_LIBS=OFF
|
||||
LLAMA_BUILD_EXAMPLES=OFF
|
||||
LLAMA_BUILD_TOOLS=OFF
|
||||
LLAMA_BUILD_TESTS=OFF
|
||||
LLAMA_BUILD_SERVER=OFF
|
||||
GGML_METAL=ON
|
||||
@@ -31,6 +32,7 @@ COMMON_CMAKE_ARGS=(
|
||||
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||
-DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS}
|
||||
-DLLAMA_BUILD_EXAMPLES=${LLAMA_BUILD_EXAMPLES}
|
||||
-DLLAMA_BUILD_TOOLS=${LLAMA_BUILD_TOOLS}
|
||||
-DLLAMA_BUILD_TESTS=${LLAMA_BUILD_TESTS}
|
||||
-DLLAMA_BUILD_SERVER=${LLAMA_BUILD_SERVER}
|
||||
-DGGML_METAL_EMBED_LIBRARY=${GGML_METAL_EMBED_LIBRARY}
|
||||
|
||||
@@ -187,8 +187,8 @@ function gg_run_test_scripts_debug {
|
||||
|
||||
set -e
|
||||
|
||||
(cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
|
||||
(cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
|
||||
(cd ./tools/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
|
||||
(cd ./tools/quantize && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
|
||||
|
||||
set +e
|
||||
}
|
||||
@@ -211,8 +211,8 @@ function gg_run_test_scripts_release {
|
||||
|
||||
set -e
|
||||
|
||||
(cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
|
||||
(cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
|
||||
(cd ./tools/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
|
||||
(cd ./tools/quantize && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
|
||||
|
||||
set +e
|
||||
}
|
||||
|
||||
@@ -1,6 +0,0 @@
|
||||
set( CMAKE_SYSTEM_NAME Windows )
|
||||
set( CMAKE_SYSTEM_PROCESSOR arm64 )
|
||||
|
||||
set( target arm64-pc-windows-msvc )
|
||||
set( CMAKE_C_COMPILER_TARGET ${target} )
|
||||
set( CMAKE_CXX_COMPILER_TARGET ${target} )
|
||||
@@ -41,14 +41,20 @@ endif()
|
||||
|
||||
if(MSVC)
|
||||
set(BUILD_COMPILER "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
|
||||
set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
|
||||
if (CMAKE_VS_PLATFORM_NAME)
|
||||
set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
|
||||
else()
|
||||
set(BUILD_TARGET "${CMAKE_SYSTEM_NAME} ${CMAKE_SYSTEM_PROCESSOR}")
|
||||
endif()
|
||||
else()
|
||||
execute_process(
|
||||
COMMAND sh -c "\"$@\" --version | head -1" _ ${CMAKE_C_COMPILER}
|
||||
COMMAND ${CMAKE_C_COMPILER} --version
|
||||
OUTPUT_VARIABLE OUT
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE
|
||||
)
|
||||
string(REGEX REPLACE " *\n.*" "" OUT "${OUT}")
|
||||
set(BUILD_COMPILER ${OUT})
|
||||
|
||||
execute_process(
|
||||
COMMAND ${CMAKE_C_COMPILER} -dumpmachine
|
||||
OUTPUT_VARIABLE OUT
|
||||
|
||||
@@ -3,9 +3,3 @@ set( CMAKE_SYSTEM_PROCESSOR x86_64 )
|
||||
|
||||
set( CMAKE_C_COMPILER clang )
|
||||
set( CMAKE_CXX_COMPILER clang++ )
|
||||
|
||||
set( arch_c_flags "-march=native" )
|
||||
|
||||
set( CMAKE_C_FLAGS_INIT "${arch_c_flags}" )
|
||||
set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags}" )
|
||||
|
||||
|
||||
@@ -39,7 +39,9 @@ add_custom_command(
|
||||
COMMENT "Generating build details from Git"
|
||||
COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
|
||||
-DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
|
||||
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake"
|
||||
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
|
||||
-DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME} -DCMAKE_SYSTEM_PROCESSOR=${CMAKE_SYSTEM_PROCESSOR}
|
||||
-P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake"
|
||||
WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
|
||||
DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
|
||||
VERBATIM
|
||||
@@ -142,3 +144,27 @@ endif ()
|
||||
target_include_directories(${TARGET} PUBLIC .)
|
||||
target_compile_features (${TARGET} PUBLIC cxx_std_17)
|
||||
target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
|
||||
|
||||
|
||||
#
|
||||
# copy the license files
|
||||
#
|
||||
|
||||
# Check if running in GitHub Actions
|
||||
if (DEFINED ENV{GITHUB_ACTIONS} AND "$ENV{GITHUB_ACTIONS}" STREQUAL "true")
|
||||
message(STATUS "Running inside GitHub Actions - copying license files")
|
||||
|
||||
# Copy all files from licenses/ to build/bin/
|
||||
file(GLOB LICENSE_FILES "${CMAKE_SOURCE_DIR}/licenses/*")
|
||||
foreach(LICENSE_FILE ${LICENSE_FILES})
|
||||
get_filename_component(FILENAME ${LICENSE_FILE} NAME)
|
||||
add_custom_command(
|
||||
POST_BUILD
|
||||
TARGET ${TARGET}
|
||||
COMMAND ${CMAKE_COMMAND} -E copy_if_different
|
||||
"${LICENSE_FILE}"
|
||||
"$<TARGET_FILE_DIR:llama>/${FILENAME}"
|
||||
COMMENT "Copying ${FILENAME} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}")
|
||||
message(STATUS "Copying ${LICENSE_FILE} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${FILENAME}")
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
370
common/arg.cpp
370
common/arg.cpp
@@ -38,6 +38,30 @@
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
std::initializer_list<enum llama_example> mmproj_examples = {
|
||||
LLAMA_EXAMPLE_LLAVA,
|
||||
LLAMA_EXAMPLE_SERVER,
|
||||
};
|
||||
|
||||
static std::string read_file(const std::string & fname) {
|
||||
std::ifstream file(fname);
|
||||
if (!file) {
|
||||
throw std::runtime_error(string_format("error: failed to open file '%s'\n", fname.c_str()));
|
||||
}
|
||||
std::string content((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
|
||||
file.close();
|
||||
return content;
|
||||
}
|
||||
|
||||
static void write_file(const std::string & fname, const std::string & content) {
|
||||
std::ofstream file(fname);
|
||||
if (!file) {
|
||||
throw std::runtime_error(string_format("error: failed to open file '%s'\n", fname.c_str()));
|
||||
}
|
||||
file << content;
|
||||
file.close();
|
||||
}
|
||||
|
||||
common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
|
||||
this->examples = std::move(examples);
|
||||
return *this;
|
||||
@@ -157,6 +181,10 @@ struct common_hf_file_res {
|
||||
|
||||
#ifdef LLAMA_USE_CURL
|
||||
|
||||
bool common_has_curl() {
|
||||
return true;
|
||||
}
|
||||
|
||||
#ifdef __linux__
|
||||
#include <linux/limits.h>
|
||||
#elif defined(_WIN32)
|
||||
@@ -189,11 +217,11 @@ struct curl_slist_ptr {
|
||||
#define CURL_MAX_RETRY 3
|
||||
#define CURL_RETRY_DELAY_SECONDS 2
|
||||
|
||||
static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
|
||||
static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds, const char * method_name) {
|
||||
int remaining_attempts = max_attempts;
|
||||
|
||||
while (remaining_attempts > 0) {
|
||||
LOG_INF("%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
|
||||
LOG_INF("%s: %s %s (attempt %d of %d)...\n", __func__ , method_name, url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
|
||||
|
||||
CURLcode res = curl_easy_perform(curl);
|
||||
if (res == CURLE_OK) {
|
||||
@@ -204,6 +232,7 @@ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int ma
|
||||
LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
|
||||
|
||||
remaining_attempts--;
|
||||
if (remaining_attempts == 0) break;
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
|
||||
}
|
||||
|
||||
@@ -222,8 +251,6 @@ static bool common_download_file_single(const std::string & url, const std::stri
|
||||
return false;
|
||||
}
|
||||
|
||||
bool force_download = false;
|
||||
|
||||
// Set the URL, allow to follow http redirection
|
||||
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
|
||||
curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
|
||||
@@ -247,7 +274,7 @@ static bool common_download_file_single(const std::string & url, const std::stri
|
||||
|
||||
// If the file exists, check its JSON metadata companion file.
|
||||
std::string metadata_path = path + ".json";
|
||||
nlohmann::json metadata;
|
||||
nlohmann::json metadata; // TODO @ngxson : get rid of this json, use regex instead
|
||||
std::string etag;
|
||||
std::string last_modified;
|
||||
|
||||
@@ -257,14 +284,7 @@ static bool common_download_file_single(const std::string & url, const std::stri
|
||||
if (metadata_in.good()) {
|
||||
try {
|
||||
metadata_in >> metadata;
|
||||
LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
|
||||
if (metadata.contains("url") && metadata.at("url").is_string()) {
|
||||
auto previous_url = metadata.at("url").get<std::string>();
|
||||
if (previous_url != url) {
|
||||
LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
|
||||
if (metadata.contains("etag") && metadata.at("etag").is_string()) {
|
||||
etag = metadata.at("etag");
|
||||
}
|
||||
@@ -272,10 +292,10 @@ static bool common_download_file_single(const std::string & url, const std::stri
|
||||
last_modified = metadata.at("lastModified");
|
||||
}
|
||||
} catch (const nlohmann::json::exception & e) {
|
||||
LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
|
||||
return false;
|
||||
LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
|
||||
}
|
||||
}
|
||||
// if we cannot open the metadata file, we assume that the downloaded file is not valid (etag and last-modified are left empty, so we will download it again)
|
||||
} else {
|
||||
LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
|
||||
}
|
||||
@@ -287,7 +307,10 @@ static bool common_download_file_single(const std::string & url, const std::stri
|
||||
};
|
||||
|
||||
common_load_model_from_url_headers headers;
|
||||
bool head_request_ok = false;
|
||||
bool should_download = !file_exists; // by default, we should download if the file does not exist
|
||||
|
||||
// get ETag to see if the remote file has changed
|
||||
{
|
||||
typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
|
||||
auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
|
||||
@@ -316,23 +339,28 @@ static bool common_download_file_single(const std::string & url, const std::stri
|
||||
curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
|
||||
curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
|
||||
|
||||
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
|
||||
// we only allow retrying once for HEAD requests
|
||||
// this is for the use case of using running offline (no internet), retrying can be annoying
|
||||
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), 1, 0, "HEAD");
|
||||
if (!was_perform_successful) {
|
||||
return false;
|
||||
head_request_ok = false;
|
||||
}
|
||||
|
||||
long http_code = 0;
|
||||
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
|
||||
if (http_code != 200) {
|
||||
// HEAD not supported, we don't know if the file has changed
|
||||
// force trigger downloading
|
||||
force_download = true;
|
||||
LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
|
||||
if (http_code == 200) {
|
||||
head_request_ok = true;
|
||||
} else {
|
||||
LOG_WRN("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
|
||||
head_request_ok = false;
|
||||
}
|
||||
}
|
||||
|
||||
bool should_download = !file_exists || force_download;
|
||||
if (!should_download) {
|
||||
// if head_request_ok is false, we don't have the etag or last-modified headers
|
||||
// we leave should_download as-is, which is true if the file does not exist
|
||||
if (head_request_ok) {
|
||||
// check if ETag or Last-Modified headers are different
|
||||
// if it is, we need to download the file again
|
||||
if (!etag.empty() && etag != headers.etag) {
|
||||
LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
|
||||
should_download = true;
|
||||
@@ -341,6 +369,7 @@ static bool common_download_file_single(const std::string & url, const std::stri
|
||||
should_download = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (should_download) {
|
||||
std::string path_temporary = path + ".downloadInProgress";
|
||||
if (file_exists) {
|
||||
@@ -394,7 +423,7 @@ static bool common_download_file_single(const std::string & url, const std::stri
|
||||
// start the download
|
||||
LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
|
||||
llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
|
||||
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
|
||||
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS, "GET");
|
||||
if (!was_perform_successful) {
|
||||
return false;
|
||||
}
|
||||
@@ -415,13 +444,15 @@ static bool common_download_file_single(const std::string & url, const std::stri
|
||||
{"etag", headers.etag},
|
||||
{"lastModified", headers.last_modified}
|
||||
});
|
||||
std::ofstream(metadata_path) << metadata.dump(4);
|
||||
LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
|
||||
write_file(metadata_path, metadata.dump(4));
|
||||
LOG_DBG("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
|
||||
|
||||
if (rename(path_temporary.c_str(), path.c_str()) != 0) {
|
||||
LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
LOG_INF("%s: using cached file: %s\n", __func__, path.c_str());
|
||||
}
|
||||
|
||||
return true;
|
||||
@@ -522,6 +553,50 @@ static bool common_download_model(
|
||||
return true;
|
||||
}
|
||||
|
||||
std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params) {
|
||||
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
|
||||
curl_slist_ptr http_headers;
|
||||
std::vector<char> res_buffer;
|
||||
|
||||
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
|
||||
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
|
||||
curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
|
||||
typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
|
||||
auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
|
||||
auto data_vec = static_cast<std::vector<char> *>(data);
|
||||
data_vec->insert(data_vec->end(), (char *)ptr, (char *)ptr + size * nmemb);
|
||||
return size * nmemb;
|
||||
};
|
||||
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
|
||||
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_buffer);
|
||||
#if defined(_WIN32)
|
||||
curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
|
||||
#endif
|
||||
if (params.timeout > 0) {
|
||||
curl_easy_setopt(curl.get(), CURLOPT_TIMEOUT, params.timeout);
|
||||
}
|
||||
if (params.max_size > 0) {
|
||||
curl_easy_setopt(curl.get(), CURLOPT_MAXFILESIZE, params.max_size);
|
||||
}
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
|
||||
for (const auto & header : params.headers) {
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, header.c_str());
|
||||
}
|
||||
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
|
||||
|
||||
CURLcode res = curl_easy_perform(curl.get());
|
||||
|
||||
if (res != CURLE_OK) {
|
||||
std::string error_msg = curl_easy_strerror(res);
|
||||
throw std::runtime_error("error: cannot make GET request: " + error_msg);
|
||||
}
|
||||
|
||||
long res_code;
|
||||
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
|
||||
|
||||
return { res_code, std::move(res_buffer) };
|
||||
}
|
||||
|
||||
/**
|
||||
* Allow getting the HF file from the HF repo with tag (like ollama), for example:
|
||||
* - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
|
||||
@@ -541,46 +616,48 @@ static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_
|
||||
throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
|
||||
}
|
||||
|
||||
// fetch model info from Hugging Face Hub API
|
||||
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
|
||||
curl_slist_ptr http_headers;
|
||||
std::string res_str;
|
||||
std::string url = get_model_endpoint() + "v2/" + hf_repo + "/manifests/" + tag;
|
||||
|
||||
std::string model_endpoint = get_model_endpoint();
|
||||
|
||||
std::string url = model_endpoint + "v2/" + hf_repo + "/manifests/" + tag;
|
||||
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
|
||||
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
|
||||
typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
|
||||
auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
|
||||
static_cast<std::string *>(data)->append((char * ) ptr, size * nmemb);
|
||||
return size * nmemb;
|
||||
};
|
||||
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
|
||||
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str);
|
||||
#if defined(_WIN32)
|
||||
curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
|
||||
#endif
|
||||
// headers
|
||||
std::vector<std::string> headers;
|
||||
headers.push_back("Accept: application/json");
|
||||
if (!bearer_token.empty()) {
|
||||
std::string auth_header = "Authorization: Bearer " + bearer_token;
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
|
||||
headers.push_back("Authorization: Bearer " + bearer_token);
|
||||
}
|
||||
// Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json");
|
||||
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
|
||||
// User-Agent header is already set in common_remote_get_content, no need to set it here
|
||||
|
||||
CURLcode res = curl_easy_perform(curl.get());
|
||||
// we use "=" to avoid clashing with other component, while still being allowed on windows
|
||||
std::string cached_response_fname = "manifest=" + hf_repo + "=" + tag + ".json";
|
||||
string_replace_all(cached_response_fname, "/", "_");
|
||||
std::string cached_response_path = fs_get_cache_file(cached_response_fname);
|
||||
|
||||
if (res != CURLE_OK) {
|
||||
throw std::runtime_error("error: cannot make GET request to HF API");
|
||||
// make the request
|
||||
common_remote_params params;
|
||||
params.headers = headers;
|
||||
long res_code = 0;
|
||||
std::string res_str;
|
||||
bool use_cache = false;
|
||||
try {
|
||||
auto res = common_remote_get_content(url, params);
|
||||
res_code = res.first;
|
||||
res_str = std::string(res.second.data(), res.second.size());
|
||||
} catch (const std::exception & e) {
|
||||
LOG_WRN("error: failed to get manifest: %s\n", e.what());
|
||||
LOG_WRN("try reading from cache\n");
|
||||
// try to read from cache
|
||||
try {
|
||||
res_str = read_file(cached_response_path);
|
||||
res_code = 200;
|
||||
use_cache = true;
|
||||
} catch (const std::exception & e) {
|
||||
throw std::runtime_error("error: failed to get manifest (check your internet connection)");
|
||||
}
|
||||
}
|
||||
std::string ggufFile;
|
||||
std::string mmprojFile;
|
||||
|
||||
long res_code;
|
||||
std::string ggufFile = "";
|
||||
std::string mmprojFile = "";
|
||||
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
|
||||
if (res_code == 200) {
|
||||
if (res_code == 200 || res_code == 304) {
|
||||
// extract ggufFile.rfilename in json, using regex
|
||||
{
|
||||
std::regex pattern("\"ggufFile\"[\\s\\S]*?\"rfilename\"\\s*:\\s*\"([^\"]+)\"");
|
||||
@@ -597,6 +674,10 @@ static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_
|
||||
mmprojFile = match[1].str();
|
||||
}
|
||||
}
|
||||
if (!use_cache) {
|
||||
// if not using cached response, update the cache file
|
||||
write_file(cached_response_path, res_str);
|
||||
}
|
||||
} else if (res_code == 401) {
|
||||
throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
|
||||
} else {
|
||||
@@ -613,6 +694,10 @@ static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_
|
||||
|
||||
#else
|
||||
|
||||
bool common_has_curl() {
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool common_download_file_single(const std::string &, const std::string &, const std::string &) {
|
||||
LOG_ERR("error: built without CURL, cannot download model from internet\n");
|
||||
return false;
|
||||
@@ -635,17 +720,30 @@ static struct common_hf_file_res common_get_hf_file(const std::string &, const s
|
||||
return {};
|
||||
}
|
||||
|
||||
std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params &) {
|
||||
if (!url.empty()) {
|
||||
throw std::runtime_error("error: built without CURL, cannot download model from the internet");
|
||||
}
|
||||
|
||||
return {};
|
||||
}
|
||||
|
||||
#endif // LLAMA_USE_CURL
|
||||
|
||||
//
|
||||
// utils
|
||||
//
|
||||
|
||||
static void common_params_handle_model(
|
||||
struct handle_model_result {
|
||||
bool found_mmproj = false;
|
||||
common_params_model mmproj;
|
||||
};
|
||||
|
||||
static handle_model_result common_params_handle_model(
|
||||
struct common_params_model & model,
|
||||
const std::string & bearer_token,
|
||||
const std::string & model_path_default,
|
||||
bool is_mmproj = false) { // TODO: move is_mmproj to an enum when we have more files?
|
||||
const std::string & model_path_default) {
|
||||
handle_model_result result;
|
||||
// handle pre-fill default model path and url based on hf_repo and hf_file
|
||||
{
|
||||
if (!model.hf_repo.empty()) {
|
||||
@@ -657,7 +755,12 @@ static void common_params_handle_model(
|
||||
exit(1); // built without CURL, error message already printed
|
||||
}
|
||||
model.hf_repo = auto_detected.repo;
|
||||
model.hf_file = is_mmproj ? auto_detected.mmprojFile : auto_detected.ggufFile;
|
||||
model.hf_file = auto_detected.ggufFile;
|
||||
if (!auto_detected.mmprojFile.empty()) {
|
||||
result.found_mmproj = true;
|
||||
result.mmproj.hf_repo = model.hf_repo;
|
||||
result.mmproj.hf_file = auto_detected.mmprojFile;
|
||||
}
|
||||
} else {
|
||||
model.hf_file = model.path;
|
||||
}
|
||||
@@ -694,6 +797,8 @@ static void common_params_handle_model(
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
const std::vector<ggml_type> kv_cache_types = {
|
||||
@@ -827,16 +932,25 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
||||
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
|
||||
}
|
||||
|
||||
common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH);
|
||||
common_params_handle_model(params.speculative.model, params.hf_token, "");
|
||||
common_params_handle_model(params.vocoder.model, params.hf_token, "");
|
||||
|
||||
// allow --mmproj to be set from -hf
|
||||
// assuming that mmproj is always in the same repo as text model
|
||||
if (!params.model.hf_repo.empty() && ctx_arg.ex == LLAMA_EXAMPLE_LLAVA) {
|
||||
params.mmproj.hf_repo = params.model.hf_repo;
|
||||
// handle model and download
|
||||
{
|
||||
auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH);
|
||||
if (params.no_mmproj) {
|
||||
params.mmproj = {};
|
||||
} else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
|
||||
// optionally, handle mmproj model when -hf is specified
|
||||
params.mmproj = res.mmproj;
|
||||
}
|
||||
// only download mmproj if the current example is using it
|
||||
for (auto & ex : mmproj_examples) {
|
||||
if (ctx_arg.ex == ex) {
|
||||
common_params_handle_model(params.mmproj, params.hf_token, "");
|
||||
break;
|
||||
}
|
||||
}
|
||||
common_params_handle_model(params.speculative.model, params.hf_token, "");
|
||||
common_params_handle_model(params.vocoder.model, params.hf_token, "");
|
||||
}
|
||||
common_params_handle_model(params.mmproj, params.hf_token, "", true);
|
||||
|
||||
if (params.escape) {
|
||||
string_process_escapes(params.prompt);
|
||||
@@ -968,7 +1082,6 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
|
||||
"llama-embedding",
|
||||
"llama-eval-callback",
|
||||
"llama-export-lora",
|
||||
"llama-gbnf-validator",
|
||||
"llama-gen-docs",
|
||||
"llama-gguf",
|
||||
"llama-gguf-hash",
|
||||
@@ -976,20 +1089,18 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
|
||||
"llama-gritlm",
|
||||
"llama-imatrix",
|
||||
"llama-infill",
|
||||
"llama-llava-cli",
|
||||
"llama-mtmd-cli",
|
||||
"llama-llava-clip-quantize-cli",
|
||||
"llama-lookahead",
|
||||
"llama-lookup",
|
||||
"llama-lookup-create",
|
||||
"llama-lookup-merge",
|
||||
"llama-lookup-stats",
|
||||
"llama-minicpmv-cli",
|
||||
"llama-parallel",
|
||||
"llama-passkey",
|
||||
"llama-perplexity",
|
||||
"llama-q8dot",
|
||||
"llama-quantize",
|
||||
"llama-quantize-stats",
|
||||
"llama-qwen2vl-cli",
|
||||
"llama-retrieval",
|
||||
"llama-run",
|
||||
@@ -1078,6 +1189,9 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
|
||||
fprintf(stderr, "%s\n", ex.what());
|
||||
ctx_arg.params = params_org;
|
||||
return false;
|
||||
} catch (std::exception & ex) {
|
||||
fprintf(stderr, "%s\n", ex.what());
|
||||
exit(1); // for other exceptions, we exit with status code 1
|
||||
}
|
||||
|
||||
return true;
|
||||
@@ -1169,7 +1283,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params) {
|
||||
params.use_color = true;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
|
||||
add_opt(common_arg(
|
||||
{"-t", "--threads"}, "N",
|
||||
string_format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads),
|
||||
@@ -1302,7 +1416,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
add_opt(common_arg(
|
||||
{"-n", "--predict", "--n-predict"}, "N",
|
||||
string_format(
|
||||
ex == LLAMA_EXAMPLE_MAIN || ex == LLAMA_EXAMPLE_INFILL
|
||||
ex == LLAMA_EXAMPLE_MAIN
|
||||
? "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)"
|
||||
: "number of tokens to predict (default: %d, -1 = infinity)",
|
||||
params.n_predict),
|
||||
@@ -1378,13 +1492,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
{"-f", "--file"}, "FNAME",
|
||||
"a file containing the prompt (default: none)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
std::ifstream file(value);
|
||||
if (!file) {
|
||||
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
|
||||
}
|
||||
params.prompt = read_file(value);
|
||||
// store the external file name in params
|
||||
params.prompt_file = value;
|
||||
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
|
||||
if (!params.prompt.empty() && params.prompt.back() == '\n') {
|
||||
params.prompt.pop_back();
|
||||
}
|
||||
@@ -1394,11 +1504,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
{"-sysf", "--system-prompt-file"}, "FNAME",
|
||||
"a file containing the system prompt (default: none)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
std::ifstream file(value);
|
||||
if (!file) {
|
||||
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
|
||||
}
|
||||
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.system_prompt));
|
||||
params.system_prompt = read_file(value);
|
||||
if (!params.system_prompt.empty() && params.system_prompt.back() == '\n') {
|
||||
params.system_prompt.pop_back();
|
||||
}
|
||||
@@ -1549,7 +1655,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.input_prefix = value;
|
||||
params.enable_chat_template = false;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
||||
add_opt(common_arg(
|
||||
{"--in-suffix"}, "STRING",
|
||||
"string to suffix after user inputs with (default: empty)",
|
||||
@@ -1557,7 +1663,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.input_suffix = value;
|
||||
params.enable_chat_template = false;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
||||
add_opt(common_arg(
|
||||
{"--no-warmup"},
|
||||
"skip warming up the model with an empty run",
|
||||
@@ -1574,7 +1680,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params) {
|
||||
params.spm_infill = true;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL}));
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
{"--samplers"}, "SAMPLERS",
|
||||
string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),
|
||||
@@ -1823,15 +1929,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
{"--grammar-file"}, "FNAME",
|
||||
"file to read grammar from",
|
||||
[](common_params & params, const std::string & value) {
|
||||
std::ifstream file(value);
|
||||
if (!file) {
|
||||
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
|
||||
}
|
||||
std::copy(
|
||||
std::istreambuf_iterator<char>(file),
|
||||
std::istreambuf_iterator<char>(),
|
||||
std::back_inserter(params.sampling.grammar)
|
||||
);
|
||||
params.sampling.grammar = read_file(value);
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
@@ -1841,6 +1939,23 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.sampling.grammar = json_schema_to_grammar(json::parse(value));
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
{"-jf", "--json-schema-file"}, "FILE",
|
||||
"File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
|
||||
[](common_params & params, const std::string & value) {
|
||||
std::ifstream file(value);
|
||||
if (!file) {
|
||||
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
|
||||
}
|
||||
std::string schema;
|
||||
std::copy(
|
||||
std::istreambuf_iterator<char>(file),
|
||||
std::istreambuf_iterator<char>(),
|
||||
std::back_inserter(schema)
|
||||
);
|
||||
params.sampling.grammar = json_schema_to_grammar(json::parse(schema));
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
{"--pooling"}, "{none,mean,cls,last,rank}",
|
||||
"pooling type for embeddings, use model default if unspecified",
|
||||
@@ -1982,13 +2097,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.cache_type_v = kv_cache_type_from_str(value);
|
||||
}
|
||||
).set_env("LLAMA_ARG_CACHE_TYPE_V"));
|
||||
add_opt(common_arg(
|
||||
{"--perplexity", "--all-logits"},
|
||||
string_format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"),
|
||||
[](common_params & params) {
|
||||
params.logits_all = true;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
||||
add_opt(common_arg(
|
||||
{"--hellaswag"},
|
||||
"compute HellaSwag score over random tasks from datafile supplied with -f",
|
||||
@@ -2096,18 +2204,32 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
|
||||
add_opt(common_arg(
|
||||
{"--mmproj"}, "FILE",
|
||||
"path to a multimodal projector file for LLaVA. see examples/llava/README.md",
|
||||
"path to a multimodal projector file. see tools/mtmd/README.md",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.mmproj.path = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_LLAVA}));
|
||||
).set_examples(mmproj_examples));
|
||||
add_opt(common_arg(
|
||||
{"--mmproj-url"}, "URL",
|
||||
"URL to a multimodal projector file for LLaVA. see examples/llava/README.md",
|
||||
"URL to a multimodal projector file. see tools/mtmd/README.md",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.mmproj.url = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_LLAVA}));
|
||||
).set_examples(mmproj_examples));
|
||||
add_opt(common_arg(
|
||||
{"--no-mmproj"},
|
||||
"explicitly disable multimodal projector, useful when using -hf",
|
||||
[](common_params & params) {
|
||||
params.no_mmproj = true;
|
||||
}
|
||||
).set_examples(mmproj_examples));
|
||||
add_opt(common_arg(
|
||||
{"--no-mmproj-offload"},
|
||||
"do not offload multimodal projector to GPU",
|
||||
[](common_params & params) {
|
||||
params.mmproj_use_gpu = false;
|
||||
}
|
||||
).set_examples(mmproj_examples));
|
||||
add_opt(common_arg(
|
||||
{"--image"}, "FILE",
|
||||
"path to an image file. use with multimodal models. Specify multiple times for batching",
|
||||
@@ -2382,6 +2504,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
add_opt(common_arg(
|
||||
{"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
|
||||
"Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
|
||||
"mmproj is also downloaded automatically if available. to disable, add --no-mmproj\n"
|
||||
"example: unsloth/phi-4-GGUF:q4_k_m\n"
|
||||
"(default: unused)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
@@ -2504,6 +2627,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.i_chunk = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
||||
add_opt(common_arg(
|
||||
{"--parse-special"},
|
||||
string_format("prase special tokens (chat, tool, etc) (default: %s)", params.parse_special ? "true" : "false"),
|
||||
[](common_params & params) {
|
||||
params.parse_special = true;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
||||
add_opt(common_arg(
|
||||
{"-pps"},
|
||||
string_format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"),
|
||||
@@ -2653,7 +2783,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
|
||||
add_opt(common_arg(
|
||||
{"--cache-reuse"}, "N",
|
||||
string_format("min chunk size to attempt reusing from the cache via KV shifting (default: %d)", params.n_cache_reuse),
|
||||
string_format(
|
||||
"min chunk size to attempt reusing from the cache via KV shifting (default: %d)\n"
|
||||
"[(card)](https://ggml.ai/f0.png)", params.n_cache_reuse
|
||||
),
|
||||
[](common_params & params, int value) {
|
||||
params.n_cache_reuse = value;
|
||||
}
|
||||
@@ -2726,7 +2859,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.chat_template = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_LLAVA}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
|
||||
add_opt(common_arg(
|
||||
{"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
|
||||
string_format(
|
||||
@@ -2736,14 +2869,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
"list of built-in templates:\n%s", list_builtin_chat_templates().c_str()
|
||||
),
|
||||
[](common_params & params, const std::string & value) {
|
||||
std::ifstream file(value);
|
||||
if (!file) {
|
||||
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
|
||||
}
|
||||
std::copy(
|
||||
std::istreambuf_iterator<char>(file),
|
||||
std::istreambuf_iterator<char>(),
|
||||
std::back_inserter(params.chat_template));
|
||||
params.chat_template = read_file(value);
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
|
||||
add_opt(common_arg(
|
||||
@@ -2766,7 +2892,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params) {
|
||||
params.simple_io = true;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
||||
add_opt(common_arg(
|
||||
{"--positive-file"}, "FNAME",
|
||||
string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),
|
||||
|
||||
@@ -78,3 +78,12 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
|
||||
|
||||
// function to be used by test-arg-parser
|
||||
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
|
||||
bool common_has_curl();
|
||||
|
||||
struct common_remote_params {
|
||||
std::vector<std::string> headers;
|
||||
long timeout = 0; // CURLOPT_TIMEOUT, in seconds ; 0 means no timeout
|
||||
long max_size = 0; // max size of the response ; unlimited if 0 ; max is 2GB
|
||||
};
|
||||
// get remote file content, returns <http_code, raw_response_body>
|
||||
std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params);
|
||||
|
||||
@@ -125,7 +125,9 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
|
||||
msgs.push_back(msg);
|
||||
}
|
||||
} catch (const std::exception & e) {
|
||||
throw std::runtime_error("Failed to parse messages: " + std::string(e.what()) + "; messages = " + messages.dump(2));
|
||||
// @ngxson : disable otherwise it's bloating the API response
|
||||
// printf("%s\n", std::string("; messages = ") + messages.dump(2));
|
||||
throw std::runtime_error("Failed to parse messages: " + std::string(e.what()));
|
||||
}
|
||||
|
||||
return msgs;
|
||||
|
||||
@@ -1096,7 +1096,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
|
||||
cparams.n_threads = params.cpuparams.n_threads;
|
||||
cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
|
||||
params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
|
||||
cparams.logits_all = params.logits_all;
|
||||
cparams.embeddings = params.embedding;
|
||||
cparams.rope_scaling_type = params.rope_scaling_type;
|
||||
cparams.rope_freq_base = params.rope_freq_base;
|
||||
|
||||
@@ -66,7 +66,6 @@ enum llama_example {
|
||||
LLAMA_EXAMPLE_COMMON,
|
||||
LLAMA_EXAMPLE_SPECULATIVE,
|
||||
LLAMA_EXAMPLE_MAIN,
|
||||
LLAMA_EXAMPLE_INFILL,
|
||||
LLAMA_EXAMPLE_EMBEDDING,
|
||||
LLAMA_EXAMPLE_PERPLEXITY,
|
||||
LLAMA_EXAMPLE_RETRIEVAL,
|
||||
@@ -96,6 +95,7 @@ enum common_sampler_type {
|
||||
COMMON_SAMPLER_TYPE_XTC = 8,
|
||||
COMMON_SAMPLER_TYPE_INFILL = 9,
|
||||
COMMON_SAMPLER_TYPE_PENALTIES = 10,
|
||||
COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
|
||||
};
|
||||
|
||||
// dimensionality reduction methods, used by cvector-generator
|
||||
@@ -161,6 +161,7 @@ struct common_params_sampling {
|
||||
std::vector<enum common_sampler_type> samplers = {
|
||||
COMMON_SAMPLER_TYPE_PENALTIES,
|
||||
COMMON_SAMPLER_TYPE_DRY,
|
||||
COMMON_SAMPLER_TYPE_TOP_N_SIGMA,
|
||||
COMMON_SAMPLER_TYPE_TOP_K,
|
||||
COMMON_SAMPLER_TYPE_TYPICAL_P,
|
||||
COMMON_SAMPLER_TYPE_TOP_P,
|
||||
@@ -323,7 +324,6 @@ struct common_params {
|
||||
bool ctx_shift = true; // context shift on inifinite text generation
|
||||
|
||||
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
|
||||
bool logits_all = false; // return logits for all tokens in the batch
|
||||
bool use_mmap = true; // use mmap for faster loads
|
||||
bool use_mlock = false; // use mlock to keep model in memory
|
||||
bool verbose_prompt = false; // print prompt tokens before generation
|
||||
@@ -340,8 +340,10 @@ struct common_params {
|
||||
|
||||
common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
|
||||
|
||||
// multimodal models (see examples/llava)
|
||||
// multimodal models (see tools/mtmd)
|
||||
struct common_params_model mmproj;
|
||||
bool mmproj_use_gpu = true; // use GPU for multimodal model
|
||||
bool no_mmproj = false; // explicitly disable multimodal model
|
||||
std::vector<std::string> image; // path to image file(s)
|
||||
|
||||
// embedding
|
||||
@@ -407,13 +409,14 @@ struct common_params {
|
||||
|
||||
bool process_output = false; // collect data for the output tensor
|
||||
bool compute_ppl = true; // whether to compute perplexity
|
||||
bool parse_special = false; // whether to parse special tokens during imatrix tokenization
|
||||
|
||||
// cvector-generator params
|
||||
int n_pca_batch = 100;
|
||||
int n_pca_iterations = 1000;
|
||||
dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
|
||||
std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
|
||||
std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
|
||||
std::string cvector_positive_file = "tools/cvector-generator/positive.txt";
|
||||
std::string cvector_negative_file = "tools/cvector-generator/negative.txt";
|
||||
|
||||
bool spm_infill = false; // suffix/prefix/middle pattern for infill
|
||||
|
||||
|
||||
@@ -16,6 +16,9 @@ using json = nlohmann::ordered_json;
|
||||
static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
|
||||
auto has_max = max_items != std::numeric_limits<int>::max();
|
||||
|
||||
if (max_items == 0) {
|
||||
return "";
|
||||
}
|
||||
if (min_items == 0 && max_items == 1) {
|
||||
return item_rule + "?";
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
#include "sampling.h"
|
||||
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <unordered_map>
|
||||
@@ -229,51 +230,48 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
|
||||
params.logit_bias.data()));
|
||||
|
||||
if (params.mirostat == 0) {
|
||||
if (params.top_n_sigma >= 0) {
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_temp (params.temp));
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
|
||||
} else {
|
||||
for (const auto & cnstr : params.samplers) {
|
||||
switch (cnstr) {
|
||||
case COMMON_SAMPLER_TYPE_DRY:
|
||||
{
|
||||
std::vector<const char *> c_breakers;
|
||||
c_breakers.reserve(params.dry_sequence_breakers.size());
|
||||
for (const auto & str : params.dry_sequence_breakers) {
|
||||
c_breakers.push_back(str.c_str());
|
||||
}
|
||||
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
|
||||
for (const auto & cnstr : params.samplers) {
|
||||
switch (cnstr) {
|
||||
case COMMON_SAMPLER_TYPE_DRY:
|
||||
{
|
||||
std::vector<const char *> c_breakers;
|
||||
c_breakers.reserve(params.dry_sequence_breakers.size());
|
||||
for (const auto & str : params.dry_sequence_breakers) {
|
||||
c_breakers.push_back(str.c_str());
|
||||
}
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TOP_K:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TOP_P:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_MIN_P:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_XTC:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TYPICAL_P:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TEMPERATURE:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_INFILL:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_PENALTIES:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false && "unknown sampler type");
|
||||
}
|
||||
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
|
||||
}
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TOP_K:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TOP_P:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TOP_N_SIGMA:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_MIN_P:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_XTC:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TYPICAL_P:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TEMPERATURE:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_INFILL:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_PENALTIES:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_penalties (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false && "unknown sampler type");
|
||||
}
|
||||
}
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
|
||||
@@ -475,6 +473,7 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
|
||||
case COMMON_SAMPLER_TYPE_TOP_K: return 'k';
|
||||
case COMMON_SAMPLER_TYPE_TYPICAL_P: return 'y';
|
||||
case COMMON_SAMPLER_TYPE_TOP_P: return 'p';
|
||||
case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return 's';
|
||||
case COMMON_SAMPLER_TYPE_MIN_P: return 'm';
|
||||
case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
|
||||
case COMMON_SAMPLER_TYPE_XTC: return 'x';
|
||||
@@ -490,6 +489,7 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
|
||||
case COMMON_SAMPLER_TYPE_TOP_K: return "top_k";
|
||||
case COMMON_SAMPLER_TYPE_TYPICAL_P: return "typ_p";
|
||||
case COMMON_SAMPLER_TYPE_TOP_P: return "top_p";
|
||||
case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return "top_n_sigma";
|
||||
case COMMON_SAMPLER_TYPE_MIN_P: return "min_p";
|
||||
case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
|
||||
case COMMON_SAMPLER_TYPE_XTC: return "xtc";
|
||||
@@ -504,6 +504,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
|
||||
{ "dry", COMMON_SAMPLER_TYPE_DRY },
|
||||
{ "top_k", COMMON_SAMPLER_TYPE_TOP_K },
|
||||
{ "top_p", COMMON_SAMPLER_TYPE_TOP_P },
|
||||
{ "top_n_sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
|
||||
{ "typ_p", COMMON_SAMPLER_TYPE_TYPICAL_P },
|
||||
{ "min_p", COMMON_SAMPLER_TYPE_MIN_P },
|
||||
{ "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
|
||||
@@ -517,6 +518,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
|
||||
std::unordered_map<std::string, common_sampler_type> sampler_alt_name_map {
|
||||
{ "top-k", COMMON_SAMPLER_TYPE_TOP_K },
|
||||
{ "top-p", COMMON_SAMPLER_TYPE_TOP_P },
|
||||
{ "top-n-sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
|
||||
{ "nucleus", COMMON_SAMPLER_TYPE_TOP_P },
|
||||
{ "typical-p", COMMON_SAMPLER_TYPE_TYPICAL_P },
|
||||
{ "typical", COMMON_SAMPLER_TYPE_TYPICAL_P },
|
||||
@@ -533,14 +535,16 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
|
||||
auto sampler = sampler_canonical_name_map.find(name);
|
||||
if (sampler != sampler_canonical_name_map.end()) {
|
||||
samplers.push_back(sampler->second);
|
||||
} else {
|
||||
if (allow_alt_names) {
|
||||
sampler = sampler_alt_name_map.find(name);
|
||||
if (sampler != sampler_alt_name_map.end()) {
|
||||
samplers.push_back(sampler->second);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if (allow_alt_names) {
|
||||
sampler = sampler_alt_name_map.find(name);
|
||||
if (sampler != sampler_alt_name_map.end()) {
|
||||
samplers.push_back(sampler->second);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
LOG_WRN("%s: unable to match sampler by name '%s'\n", __func__, name.c_str());
|
||||
}
|
||||
|
||||
return samplers;
|
||||
@@ -552,6 +556,7 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K), COMMON_SAMPLER_TYPE_TOP_K },
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P), COMMON_SAMPLER_TYPE_TYPICAL_P },
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P), COMMON_SAMPLER_TYPE_TOP_P },
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_N_SIGMA), COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P), COMMON_SAMPLER_TYPE_MIN_P },
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC },
|
||||
@@ -566,6 +571,8 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
|
||||
const auto sampler = sampler_name_map.find(c);
|
||||
if (sampler != sampler_name_map.end()) {
|
||||
samplers.push_back(sampler->second);
|
||||
} else {
|
||||
LOG_WRN("%s: unable to match sampler by char '%c'\n", __func__, c);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -115,6 +115,7 @@ models = [
|
||||
{"name": "bailingmoe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-lite", },
|
||||
{"name": "llama4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", },
|
||||
{"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", },
|
||||
{"name": "pixtral", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", },
|
||||
]
|
||||
|
||||
|
||||
|
||||
@@ -24,7 +24,7 @@ if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
import gguf
|
||||
|
||||
# reuse model definitions from convert_hf_to_gguf.py
|
||||
from convert_hf_to_gguf import LazyTorchTensor, Model
|
||||
from convert_hf_to_gguf import LazyTorchTensor, ModelBase
|
||||
|
||||
logger = logging.getLogger("lora-to-gguf")
|
||||
|
||||
@@ -340,11 +340,11 @@ if __name__ == '__main__':
|
||||
sys.exit(1)
|
||||
else:
|
||||
logger.info(f"Loading base model: {dir_base_model.name}")
|
||||
hparams = Model.load_hparams(dir_base_model)
|
||||
hparams = ModelBase.load_hparams(dir_base_model)
|
||||
|
||||
with torch.inference_mode():
|
||||
try:
|
||||
model_class = Model.from_model_architecture(hparams["architectures"][0])
|
||||
model_class = ModelBase.from_model_architecture(hparams["architectures"][0])
|
||||
except NotImplementedError:
|
||||
logger.error(f"Model {hparams['architectures'][0]} is not supported")
|
||||
sys.exit(1)
|
||||
|
||||
@@ -9,10 +9,10 @@ Adding a model requires few steps:
|
||||
After following these steps, you can open PR.
|
||||
|
||||
Also, it is important to check that the examples and main ggml backends (CUDA, METAL, CPU) are working with the new architecture, especially:
|
||||
- [main](/examples/main/)
|
||||
- [imatrix](/examples/imatrix/)
|
||||
- [quantize](/examples/quantize/)
|
||||
- [server](/examples/server/)
|
||||
- [main](/tools/main/)
|
||||
- [imatrix](/tools/imatrix/)
|
||||
- [quantize](/tools/quantize/)
|
||||
- [server](/tools/server/)
|
||||
|
||||
### 1. Convert the model to GGUF
|
||||
|
||||
|
||||
69
docs/multimodal.md
Normal file
69
docs/multimodal.md
Normal file
@@ -0,0 +1,69 @@
|
||||
# Multimodal
|
||||
|
||||
llama.cpp supports multimodal input via `libmtmd`. Currently, there are 2 tools support this feature:
|
||||
- [llama-mtmd-cli](../tools/mtmd/README.md)
|
||||
- [llama-server](../tools/server/README.md) via OpenAI-compatible `/chat/completions` API
|
||||
|
||||
To enable it, can use use one of the 2 methods below:
|
||||
|
||||
- Use `-hf` option with a [supported model](../../docs/multimodal.md)
|
||||
- To load a model using `-hf` while disabling multimodal, use `--no-mmproj`
|
||||
- To load a model using `-hf` while using a custom mmproj file, use `--mmproj local_file.gguf`
|
||||
- Use `-m model.gguf` option with `--mmproj file.gguf` to specify text and multimodal projector respectively
|
||||
|
||||
By default, multimodal projector will be offloaded to GPU. To disable this, add `--no-mmproj-offload`
|
||||
|
||||
For example:
|
||||
|
||||
```sh
|
||||
# simple usage with CLI
|
||||
llama-mtmd-cli -hf ggml-org/gemma-3-4b-it-GGUF
|
||||
|
||||
# simple usage with server
|
||||
llama-server -hf ggml-org/gemma-3-4b-it-GGUF
|
||||
|
||||
# using local file
|
||||
llama-server -m gemma-3-4b-it-Q4_K_M.gguf --mmproj mmproj-gemma-3-4b-it-Q4_K_M.gguf
|
||||
|
||||
# no GPU offload
|
||||
llama-server -hf ggml-org/gemma-3-4b-it-GGUF --no-mmproj-offload
|
||||
```
|
||||
|
||||
## Pre-quantized models
|
||||
|
||||
These are ready-to-use models, most of them come with `Q4_K_M` quantization by default.
|
||||
|
||||
Replaces the `(tool_name)` with the name of binary you want to use. For example, `llama-mtmd-cli` or `llama-server`
|
||||
|
||||
NOTE: some models may require large context window, for example: `-c 8192`
|
||||
|
||||
```sh
|
||||
# Gemma 3
|
||||
(tool_name) -hf ggml-org/gemma-3-4b-it-GGUF
|
||||
(tool_name) -hf ggml-org/gemma-3-12b-it-GGUF
|
||||
(tool_name) -hf ggml-org/gemma-3-27b-it-GGUF
|
||||
|
||||
# SmolVLM
|
||||
(tool_name) -hf ggml-org/SmolVLM-Instruct-GGUF
|
||||
(tool_name) -hf ggml-org/SmolVLM-256M-Instruct-GGUF
|
||||
(tool_name) -hf ggml-org/SmolVLM-500M-Instruct-GGUF
|
||||
(tool_name) -hf ggml-org/SmolVLM2-2.2B-Instruct-GGUF
|
||||
(tool_name) -hf ggml-org/SmolVLM2-256M-Video-Instruct-GGUF
|
||||
(tool_name) -hf ggml-org/SmolVLM2-500M-Video-Instruct-GGUF
|
||||
|
||||
# Pixtral 12B
|
||||
(tool_name) -hf ggml-org/pixtral-12b-GGUF
|
||||
|
||||
# Qwen 2 VL
|
||||
(tool_name) -hf ggml-org/Qwen2-VL-2B-Instruct-GGUF
|
||||
(tool_name) -hf ggml-org/Qwen2-VL-7B-Instruct-GGUF
|
||||
|
||||
# Qwen 2.5 VL
|
||||
(tool_name) -hf ggml-org/Qwen2.5-VL-3B-Instruct-GGUF
|
||||
(tool_name) -hf ggml-org/Qwen2.5-VL-7B-Instruct-GGUF
|
||||
(tool_name) -hf ggml-org/Qwen2.5-VL-32B-Instruct-GGUF
|
||||
(tool_name) -hf ggml-org/Qwen2.5-VL-72B-Instruct-GGUF
|
||||
|
||||
# Mistral Small 3.1 24B (IQ2_M quantization)
|
||||
(tool_name) -hf ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF
|
||||
```
|
||||
@@ -9,15 +9,15 @@ The implementation is based on llava, and is compatible with llava and mobileVLM
|
||||
Notice: The overall process of model inference for both **MobileVLM** and **MobileVLM_V2** models is the same, but the process of model conversion is a little different. Therefore, using **MobileVLM-1.7B** as an example, the different conversion step will be shown.
|
||||
|
||||
## Usage
|
||||
Build with cmake or run `make llama-llava-cli` to build it.
|
||||
|
||||
After building, run: `./llama-llava-cli` to see the usage. For example:
|
||||
Build the `llama-mtmd-cli` binary.
|
||||
|
||||
After building, run: `./llama-mtmd-cli` to see the usage. For example:
|
||||
|
||||
```sh
|
||||
./llama-llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
|
||||
./llama-mtmd-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
|
||||
--mmproj MobileVLM-1.7B/mmproj-model-f16.gguf \
|
||||
--image path/to/an/image.jpg \
|
||||
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? Answer the question using a single word or phrase. ASSISTANT:"
|
||||
--chat-template deepseek
|
||||
```
|
||||
|
||||
## Model conversion
|
||||
@@ -33,13 +33,13 @@ git clone https://huggingface.co/openai/clip-vit-large-patch14-336
|
||||
2. Use `llava_surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
|
||||
|
||||
```sh
|
||||
python ./examples/llava/llava_surgery.py -m path/to/MobileVLM-1.7B
|
||||
python ./tools/mtmd/llava_surgery.py -m path/to/MobileVLM-1.7B
|
||||
```
|
||||
|
||||
3. Use `convert_image_encoder_to_gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:
|
||||
|
||||
```sh
|
||||
python ./examples/llava/convert_image_encoder_to_gguf.py \
|
||||
python ./tools/mtmd/convert_image_encoder_to_gguf.py \
|
||||
-m path/to/clip-vit-large-patch14-336 \
|
||||
--llava-projector path/to/MobileVLM-1.7B/llava.projector \
|
||||
--output-dir path/to/MobileVLM-1.7B \
|
||||
@@ -47,7 +47,7 @@ python ./examples/llava/convert_image_encoder_to_gguf.py \
|
||||
```
|
||||
|
||||
```sh
|
||||
python ./examples/llava/convert_image_encoder_to_gguf.py \
|
||||
python ./tools/mtmd/convert_image_encoder_to_gguf.py \
|
||||
-m path/to/clip-vit-large-patch14-336 \
|
||||
--llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \
|
||||
--output-dir path/to/MobileVLM-1.7B_V2 \
|
||||
@@ -69,10 +69,10 @@ Now both the LLaMA part and the image encoder is in the `MobileVLM-1.7B` directo
|
||||
|
||||
## Android compile and run
|
||||
### compile
|
||||
refer to `examples/llava/android/build_64.sh`
|
||||
refer to `tools/mtmd/android/build_64.sh`
|
||||
```sh
|
||||
mkdir examples/llava/android/build_64
|
||||
cd examples/llava/android/build_64
|
||||
mkdir tools/mtmd/android/build_64
|
||||
cd tools/mtmd/android/build_64
|
||||
../build_64.sh
|
||||
```
|
||||
### run on Android
|
||||
@@ -82,7 +82,7 @@ refer to `android/adb_run.sh`, modify resources' `name` and `path`
|
||||
### case 1
|
||||
**input**
|
||||
```sh
|
||||
/data/local/tmp/llama-llava-cli \
|
||||
/data/local/tmp/llama-mtmd-cli \
|
||||
-m /data/local/tmp/ggml-model-q4_k.gguf \
|
||||
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
|
||||
-t 4 \
|
||||
@@ -102,7 +102,7 @@ llama_print_timings: total time = 34731.93 ms
|
||||
### case 2
|
||||
**input**
|
||||
```sh
|
||||
/data/local/tmp/llama-llava-cli \
|
||||
/data/local/tmp/llama-mtmd-cli \
|
||||
-m /data/local/tmp/ggml-model-q4_k.gguf \
|
||||
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
|
||||
-t 4 \
|
||||
@@ -123,10 +123,10 @@ llama_print_timings: total time = 34570.79 ms
|
||||
|
||||
## Some result on Android with `Snapdragon 778G` chip
|
||||
### MobileVLM-1.7B case
|
||||
#### llava-cli release-b2005
|
||||
#### mtmd-cli release-b2005
|
||||
**input**
|
||||
```sh
|
||||
/data/local/tmp/llama-llava-cli \
|
||||
/data/local/tmp/llama-mtmd-cli \
|
||||
-m /data/local/tmp/ggml-model-q4_k.gguf \
|
||||
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
|
||||
-t 4 \
|
||||
@@ -147,7 +147,7 @@ llama_print_timings: prompt eval time = 8119.49 ms / 191 tokens ( 42.51 m
|
||||
llama_print_timings: eval time = 1005.75 ms / 14 runs ( 71.84 ms per token, 13.92 tokens per second)
|
||||
llama_print_timings: total time = 28038.34 ms / 205 tokens
|
||||
```
|
||||
#### llava-cli latest-version
|
||||
#### mtmd-cli latest-version
|
||||
**input**
|
||||
|
||||
Just the same as above.
|
||||
@@ -169,7 +169,7 @@ llama_print_timings: eval time = 43894.02 ms / 13 runs ( 3376.46 m
|
||||
llama_print_timings: total time = 865441.76 ms / 204 tokens
|
||||
```
|
||||
### MobileVLM_V2-1.7B case
|
||||
#### llava-cli release-2005b
|
||||
#### mtmd-cli release-2005b
|
||||
**input**
|
||||
|
||||
Just the same as above.
|
||||
@@ -200,7 +200,7 @@ make GGML_CUDA=1 CUDA_DOCKER_ARCH=sm_87 GGML_CUDA_F16=1 -j 32
|
||||
### case 1
|
||||
**input**
|
||||
```sh
|
||||
./llama-llava-cli \
|
||||
./llama-mtmd-cli \
|
||||
-m /data/local/tmp/ggml-model-q4_k.gguf \
|
||||
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
|
||||
--image /data/local/tmp/demo.jpeg \
|
||||
@@ -224,7 +224,7 @@ llama_print_timings: total time = 1352.63 ms / 252 tokens
|
||||
### case 2
|
||||
**input**
|
||||
```sh
|
||||
./llama-llava-cli \
|
||||
./llama-mtmd-cli \
|
||||
-m /data/local/tmp/ggml-model-q4_k.gguf \
|
||||
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
|
||||
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:" \
|
||||
@@ -11,26 +11,27 @@ You can use pre-quantized model from [ggml-org](https://huggingface.co/ggml-org)
|
||||
```bash
|
||||
# build
|
||||
cmake -B build
|
||||
cmake --build build --target llama-gemma3-cli
|
||||
cmake --build build --target llama-mtmd-cli
|
||||
|
||||
# alternatively, install from brew (MacOS)
|
||||
brew install llama.cpp
|
||||
|
||||
# run it
|
||||
llama-gemma3-cli -hf ggml-org/gemma-3-4b-it-GGUF
|
||||
llama-gemma3-cli -hf ggml-org/gemma-3-12b-it-GGUF
|
||||
llama-gemma3-cli -hf ggml-org/gemma-3-27b-it-GGUF
|
||||
llama-mtmd-cli -hf ggml-org/gemma-3-4b-it-GGUF
|
||||
llama-mtmd-cli -hf ggml-org/gemma-3-12b-it-GGUF
|
||||
llama-mtmd-cli -hf ggml-org/gemma-3-27b-it-GGUF
|
||||
|
||||
# note: 1B model does not support vision
|
||||
```
|
||||
|
||||
## How to get mmproj.gguf?
|
||||
|
||||
Simply to add `--mmproj` in when converting model via `convert_hf_to_gguf.py`:
|
||||
|
||||
```bash
|
||||
cd gemma-3-4b-it
|
||||
python ../llama.cpp/examples/llava/gemma3_convert_encoder_to_gguf.py .
|
||||
|
||||
# output file is mmproj.gguf
|
||||
python ../llama.cpp/convert_hf_to_gguf.py --outfile model.gguf --outtype f16 --mmproj .
|
||||
# output file: mmproj-model.gguf
|
||||
```
|
||||
|
||||
## How to run it?
|
||||
@@ -43,8 +44,8 @@ What you need:
|
||||
```bash
|
||||
# build
|
||||
cmake -B build
|
||||
cmake --build build --target llama-gemma3-cli
|
||||
cmake --build build --target llama-mtmd-cli
|
||||
|
||||
# run it
|
||||
./build/bin/llama-gemma3-cli -m {text_model}.gguf --mmproj mmproj.gguf --image your_image.jpg
|
||||
./build/bin/llama-mtmd-cli -m {text_model}.gguf --mmproj mmproj.gguf --image your_image.jpg
|
||||
```
|
||||
@@ -3,12 +3,12 @@
|
||||
Currently this implementation supports [glm-edge-v-2b](https://huggingface.co/THUDM/glm-edge-v-2b) and [glm-edge-v-5b](https://huggingface.co/THUDM/glm-edge-v-5b).
|
||||
|
||||
## Usage
|
||||
Build with cmake or run `make llama-llava-cli` to build it.
|
||||
Build the `llama-mtmd-cli` binary.
|
||||
|
||||
After building, run: `./llama-llava-cli` to see the usage. For example:
|
||||
After building, run: `./llama-mtmd-cli` to see the usage. For example:
|
||||
|
||||
```sh
|
||||
./llama-llava-cli -m model_path/ggml-model-f16.gguf --mmproj model_path/mmproj-model-f16.gguf --image img_path/image.jpg -p "<|system|>\n system prompt <image><|user|>\n prompt <|assistant|>\n"
|
||||
./llama-mtmd-cli -m model_path/ggml-model-f16.gguf --mmproj model_path/mmproj-model-f16.gguf
|
||||
```
|
||||
|
||||
**note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so.
|
||||
@@ -25,13 +25,13 @@ git clone https://huggingface.co/THUDM/glm-edge-v-5b or https://huggingface.co/T
|
||||
2. Use `glmedge-surgery.py` to split the GLMV-EDGE model to LLM and multimodel projector constituents:
|
||||
|
||||
```sh
|
||||
python ./examples/llava/glmedge-surgery.py -m ../model_path
|
||||
python ./tools/mtmd/glmedge-surgery.py -m ../model_path
|
||||
```
|
||||
|
||||
4. Use `glmedge-convert-image-encoder-to-gguf.py` to convert the GLMV-EDGE image encoder to GGUF:
|
||||
|
||||
```sh
|
||||
python ./examples/llava/glmedge-convert-image-encoder-to-gguf.py -m ../model_path --llava-projector ../model_path/glm.projector --output-dir ../model_path
|
||||
python ./tools/mtmd/glmedge-convert-image-encoder-to-gguf.py -m ../model_path --llava-projector ../model_path/glm.projector --output-dir ../model_path
|
||||
```
|
||||
|
||||
5. Use `examples/convert_hf_to_gguf.py` to convert the LLM part of GLMV-EDGE to GGUF:
|
||||
@@ -176,15 +176,11 @@ Note that currently you cannot quantize the visual encoder because granite visio
|
||||
|
||||
|
||||
### 5. Running the Model in Llama cpp
|
||||
Build llama cpp normally; you should have a target binary named `llama-llava-cli`, which you can pass two binaries to. As an example, we pass the the llama.cpp banner.
|
||||
Build llama cpp normally; you should have a target binary named `llama-mtmd-cli`, which you can pass two binaries to. As an example, we pass the the llama.cpp banner.
|
||||
|
||||
```bash
|
||||
$ ./build/bin/llama-llava-cli -m $LLM_GGUF_PATH \
|
||||
$ ./build/bin/llama-mtmd-cli -m $LLM_GGUF_PATH \
|
||||
--mmproj $VISUAL_GGUF_PATH \
|
||||
--image ./media/llama0-banner.png \
|
||||
-c 16384 \
|
||||
-p "<|system|>\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n<|user|>\n\<image>\nWhat does the text in this image say?\n<|assistant|>\n" \
|
||||
--temp 0
|
||||
```
|
||||
|
||||
Sample output: `The text in the image reads "LLAMA C++ Can it run DOOM Llama?"`
|
||||
@@ -11,12 +11,14 @@ For llava-1.6 a variety of prepared gguf models are available as well [7b-34b](h
|
||||
After API is confirmed, more models will be supported / uploaded.
|
||||
|
||||
## Usage
|
||||
Build with cmake or run `make llama-llava-cli` to build it.
|
||||
Build the `llama-mtmd-cli` binary.
|
||||
|
||||
After building, run: `./llama-llava-cli` to see the usage. For example:
|
||||
After building, run: `./llama-mtmd-cli` to see the usage. For example:
|
||||
|
||||
```sh
|
||||
./llama-llava-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
|
||||
./llama-mtmd-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf \
|
||||
--mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf \
|
||||
--chat-template vicuna
|
||||
```
|
||||
|
||||
**note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so.
|
||||
@@ -35,19 +37,19 @@ git clone https://huggingface.co/openai/clip-vit-large-patch14-336
|
||||
2. Install the required Python packages:
|
||||
|
||||
```sh
|
||||
pip install -r examples/llava/requirements.txt
|
||||
pip install -r tools/mtmd/requirements.txt
|
||||
```
|
||||
|
||||
3. Use `llava_surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
|
||||
|
||||
```sh
|
||||
python ./examples/llava/llava_surgery.py -m ../llava-v1.5-7b
|
||||
python ./tools/mtmd/llava_surgery.py -m ../llava-v1.5-7b
|
||||
```
|
||||
|
||||
4. Use `convert_image_encoder_to_gguf.py` to convert the LLaVA image encoder to GGUF:
|
||||
|
||||
```sh
|
||||
python ./examples/llava/convert_image_encoder_to_gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
|
||||
python ./tools/mtmd/convert_image_encoder_to_gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
|
||||
```
|
||||
|
||||
5. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF:
|
||||
@@ -67,12 +69,12 @@ git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b
|
||||
2) Install the required Python packages:
|
||||
|
||||
```sh
|
||||
pip install -r examples/llava/requirements.txt
|
||||
pip install -r tools/mtmd/requirements.txt
|
||||
```
|
||||
|
||||
3) Use `llava_surgery_v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models:
|
||||
```console
|
||||
python examples/llava/llava_surgery_v2.py -C -m ../llava-v1.6-vicuna-7b/
|
||||
python tools/mtmd/llava_surgery_v2.py -C -m ../llava-v1.6-vicuna-7b/
|
||||
```
|
||||
- you will find a llava.projector and a llava.clip file in your model directory
|
||||
|
||||
@@ -86,7 +88,7 @@ curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.jso
|
||||
|
||||
5) Create the visual gguf model:
|
||||
```console
|
||||
python ./examples/llava/convert_image_encoder_to_gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
|
||||
python ./tools/mtmd/convert_image_encoder_to_gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
|
||||
```
|
||||
- This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP
|
||||
|
||||
@@ -97,7 +99,7 @@ python ./examples/convert_legacy_llama.py ../llava-v1.6-vicuna-7b/ --skip-unknow
|
||||
|
||||
7) And finally we can run the llava cli using the 1.6 model version:
|
||||
```console
|
||||
./llama-llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096
|
||||
./llama-mtmd-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf
|
||||
```
|
||||
|
||||
**note** llava-1.6 needs more context than llava-1.5, at least 3000 is needed (just run it at -c 4096)
|
||||
@@ -122,17 +124,9 @@ model.language_model.save_pretrained(llm_export_path)
|
||||
|
||||
Then, you can convert the LLM using the `convert_hf_to_gguf.py` script, which handles more LLM architectures.
|
||||
|
||||
## llava-cli templating and llava-1.6 prompting
|
||||
## Chat template
|
||||
|
||||
llava-1.5 models all use the same vicuna prompt, here you can just add your image question like `-p "Provide a full description."`
|
||||
For llava-1.5 models which are not vicuna (mistral and Yi) you need to adapt system prompt as well as user prompt, for this purpose llava-cli has a basic templating system:
|
||||
|
||||
**For Mistral and using llava-cli binary:**
|
||||
Add this: `-p "<image>\nUSER:\nProvide a full description.\nASSISTANT:\n"`
|
||||
The mistral template for llava-1.6 seems to be no system print and a USER/ASSISTANT role
|
||||
|
||||
**For the 34B this should work:**
|
||||
Add this: `-e -p <|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n<image>\nProvide a full description.<|im_end|><|im_start|>assistant\n`
|
||||
For llava-1.5 and llava-1.6, you need to use `vicuna` chat template. Simply add `--chat-template vicuna` to activate this template.
|
||||
|
||||
|
||||
## How to know if you are running in llava-1.5 or llava-1.6 mode
|
||||
@@ -147,12 +141,3 @@ When running llava-cli you will see a visual information right before the prompt
|
||||
|
||||
|
||||
Alternatively just pay notice to how many "tokens" have been used for your prompt, it will also show 1000+ tokens for llava-1.6
|
||||
|
||||
|
||||
|
||||
|
||||
## TODO
|
||||
|
||||
- [x] Support non-CPU backend for the image encoding part.
|
||||
- [ ] Support different sampling methods.
|
||||
- [ ] Support more model variants.
|
||||
@@ -29,8 +29,8 @@ cmake --build build --config Release
|
||||
Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf) by us)
|
||||
|
||||
```bash
|
||||
python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-o-2_6
|
||||
python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-2_6 --minicpmv-projector ../MiniCPM-o-2_6/minicpmv.projector --output-dir ../MiniCPM-o-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 4
|
||||
python ./tools/mtmd/minicpmv-surgery.py -m ../MiniCPM-o-2_6
|
||||
python ./tools/mtmd/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-2_6 --minicpmv-projector ../MiniCPM-o-2_6/minicpmv.projector --output-dir ../MiniCPM-o-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 4
|
||||
python ./convert_hf_to_gguf.py ../MiniCPM-o-2_6/model
|
||||
|
||||
# quantize int4 version
|
||||
@@ -40,9 +40,9 @@ python ./convert_hf_to_gguf.py ../MiniCPM-o-2_6/model
|
||||
|
||||
Inference on Linux or Mac
|
||||
```bash
|
||||
# run f16 version
|
||||
./build/bin/llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
|
||||
# run in single-turn mode
|
||||
./build/bin/llama-mtmd-cli -m ../MiniCPM-o-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
|
||||
|
||||
# run quantized int4 version
|
||||
./build/bin/llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
|
||||
# run in conversation mode
|
||||
./build/bin/llama-mtmd-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf
|
||||
```
|
||||
@@ -28,8 +28,8 @@ cmake --build build --config Release
|
||||
Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) by us)
|
||||
|
||||
```bash
|
||||
python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
|
||||
python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
|
||||
python ./tools/mtmd/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
|
||||
python ./tools/mtmd/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
|
||||
python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model
|
||||
|
||||
# quantize int4 version
|
||||
@@ -39,9 +39,9 @@ python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model
|
||||
|
||||
Inference on Linux or Mac
|
||||
```bash
|
||||
# run f16 version
|
||||
./build/bin/llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
|
||||
# run in single-turn mode
|
||||
./build/bin/llama-mtmd-cli -m ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
|
||||
|
||||
# run quantized int4 version
|
||||
./build/bin/llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
|
||||
# run in conversation mode
|
||||
./build/bin/llama-mtmd-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf
|
||||
```
|
||||
@@ -28,8 +28,8 @@ cmake --build build --config Release
|
||||
Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) by us)
|
||||
|
||||
```bash
|
||||
python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-V-2_6
|
||||
python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 3
|
||||
python ./tools/mtmd/minicpmv-surgery.py -m ../MiniCPM-V-2_6
|
||||
python ./tools/mtmd/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 3
|
||||
python ./convert_hf_to_gguf.py ../MiniCPM-V-2_6/model
|
||||
|
||||
# quantize int4 version
|
||||
@@ -39,9 +39,9 @@ python ./convert_hf_to_gguf.py ../MiniCPM-V-2_6/model
|
||||
|
||||
Inference on Linux or Mac
|
||||
```bash
|
||||
# run f16 version
|
||||
./build/bin/llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
|
||||
# run in single-turn mode
|
||||
./build/bin/llama-mtmd-cli -m ../MiniCPM-V-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
|
||||
|
||||
# run quantized int4 version
|
||||
./build/bin/llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
|
||||
# run in conversation mode
|
||||
./build/bin/llama-mtmd-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf
|
||||
```
|
||||
@@ -12,60 +12,29 @@ llama_add_compile_flags()
|
||||
|
||||
# examples
|
||||
|
||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
|
||||
|
||||
if (EMSCRIPTEN)
|
||||
else()
|
||||
add_subdirectory(batched-bench)
|
||||
add_subdirectory(batched)
|
||||
add_subdirectory(embedding)
|
||||
add_subdirectory(eval-callback)
|
||||
|
||||
if (NOT WIN32)
|
||||
# disabled on Windows because it uses internal functions not exported with LLAMA_API
|
||||
add_subdirectory(gbnf-validator)
|
||||
endif()
|
||||
|
||||
add_subdirectory(gguf-hash)
|
||||
add_subdirectory(gguf-split)
|
||||
add_subdirectory(gguf)
|
||||
add_subdirectory(gritlm)
|
||||
add_subdirectory(imatrix)
|
||||
add_subdirectory(infill)
|
||||
add_subdirectory(llama-bench)
|
||||
add_subdirectory(lookahead)
|
||||
add_subdirectory(lookup)
|
||||
add_subdirectory(main)
|
||||
add_subdirectory(parallel)
|
||||
add_subdirectory(passkey)
|
||||
add_subdirectory(perplexity)
|
||||
add_subdirectory(quantize)
|
||||
add_subdirectory(retrieval)
|
||||
if (LLAMA_BUILD_SERVER)
|
||||
add_subdirectory(server)
|
||||
endif()
|
||||
add_subdirectory(save-load-state)
|
||||
add_subdirectory(run)
|
||||
add_subdirectory(simple)
|
||||
add_subdirectory(simple-chat)
|
||||
add_subdirectory(speculative)
|
||||
add_subdirectory(speculative-simple)
|
||||
add_subdirectory(tokenize)
|
||||
add_subdirectory(tts)
|
||||
add_subdirectory(gen-docs)
|
||||
if (NOT GGML_BACKEND_DL)
|
||||
# these examples use the backends directly and cannot be built with dynamic loading
|
||||
add_subdirectory(convert-llama2c-to-ggml)
|
||||
add_subdirectory(cvector-generator)
|
||||
add_subdirectory(export-lora)
|
||||
if (NOT WIN32)
|
||||
# disabled on Windows because it uses internal functions not exported with LLAMA_API
|
||||
add_subdirectory(quantize-stats)
|
||||
endif()
|
||||
add_subdirectory(llava)
|
||||
if (GGML_RPC)
|
||||
add_subdirectory(rpc)
|
||||
endif()
|
||||
# these examples use the backends directly and cannot be built with dynamic loading
|
||||
if (GGML_SYCL)
|
||||
add_subdirectory(sycl)
|
||||
endif()
|
||||
|
||||
@@ -35,23 +35,14 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
|
||||
|
||||
static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
|
||||
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
|
||||
const struct llama_model * model = llama_get_model(ctx);
|
||||
|
||||
// clear previous kv_cache values (irrelevant for embeddings)
|
||||
llama_kv_self_clear(ctx);
|
||||
|
||||
// run model
|
||||
LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
|
||||
if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) {
|
||||
// encoder-only model
|
||||
if (llama_encode(ctx, batch) < 0) {
|
||||
LOG_ERR("%s : failed to encode\n", __func__);
|
||||
}
|
||||
} else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
|
||||
// decoder-only model
|
||||
if (llama_decode(ctx, batch) < 0) {
|
||||
LOG_ERR("%s : failed to decode\n", __func__);
|
||||
}
|
||||
if (llama_encode(ctx, batch) < 0) {
|
||||
LOG_ERR("%s : failed to encode\n", __func__);
|
||||
}
|
||||
|
||||
for (int i = 0; i < batch.n_tokens; i++) {
|
||||
@@ -89,6 +80,13 @@ int main(int argc, char ** argv) {
|
||||
common_init();
|
||||
|
||||
params.embedding = true;
|
||||
|
||||
// utilize the full context
|
||||
if (params.n_batch < params.n_ctx) {
|
||||
LOG_WRN("%s: setting batch size to %d\n", __func__, params.n_ctx);
|
||||
params.n_batch = params.n_ctx;
|
||||
}
|
||||
|
||||
// For non-causal models, batch size must be equal to ubatch size
|
||||
params.n_ubatch = params.n_batch;
|
||||
|
||||
@@ -134,7 +132,6 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// max batch size
|
||||
const uint64_t n_batch = params.n_batch;
|
||||
GGML_ASSERT(params.n_batch >= params.n_ctx);
|
||||
|
||||
// tokenize the prompts and trim
|
||||
std::vector<std::vector<int32_t>> inputs;
|
||||
|
||||
@@ -1,5 +0,0 @@
|
||||
set(TARGET llama-gbnf-validator)
|
||||
add_executable(${TARGET} gbnf-validator.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
@@ -1,5 +0,0 @@
|
||||
set(TARGET llama-infill)
|
||||
add_executable(${TARGET} infill.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
@@ -1,47 +0,0 @@
|
||||
# llama.cpp/example/infill
|
||||
|
||||
This example shows how to use the infill mode with Code Llama models supporting infill mode.
|
||||
Currently the 7B and 13B models support infill mode.
|
||||
|
||||
Infill supports most of the options available in the main example.
|
||||
|
||||
For further information have a look at the main README.md in llama.cpp/example/main/README.md
|
||||
|
||||
## Common Options
|
||||
|
||||
In this section, we cover the most commonly used options for running the `infill` program with the LLaMA models:
|
||||
|
||||
- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
|
||||
- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
|
||||
- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
|
||||
- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 4096, but if a LLaMA model was built with a longer context, increasing this value will provide better results for longer input/inference.
|
||||
- `--spm-infill`: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.
|
||||
|
||||
## Input Prompts
|
||||
|
||||
The `infill` program provides several ways to interact with the LLaMA models using input prompts:
|
||||
|
||||
- `--in-prefix PROMPT_BEFORE_CURSOR`: Provide the prefix directly as a command-line option.
|
||||
- `--in-suffix PROMPT_AFTER_CURSOR`: Provide the suffix directly as a command-line option.
|
||||
- `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.)
|
||||
|
||||
## Interaction
|
||||
|
||||
The `infill` program offers a seamless way to interact with LLaMA models, allowing users to receive real-time infill suggestions. The interactive mode can be triggered using `--interactive`, and `--interactive-first`
|
||||
|
||||
### Interaction Options
|
||||
|
||||
- `-i, --interactive`: Run the program in interactive mode, allowing users to get real time code suggestions from model.
|
||||
- `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation.
|
||||
- `--color`: Enable colorized output to differentiate visually distinguishing between prompts, user input, and generated text.
|
||||
|
||||
### Example
|
||||
|
||||
Download a model that supports infill, for example CodeLlama:
|
||||
```console
|
||||
scripts/hf.sh --repo TheBloke/CodeLlama-13B-GGUF --file codellama-13b.Q5_K_S.gguf --outdir models
|
||||
```
|
||||
|
||||
```bash
|
||||
./llama-infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n "
|
||||
```
|
||||
@@ -1,590 +0,0 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "console.h"
|
||||
#include "sampling.h"
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cinttypes>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
||||
#include <signal.h>
|
||||
#include <unistd.h>
|
||||
#elif defined (_WIN32)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#ifndef NOMINMAX
|
||||
#define NOMINMAX
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#include <signal.h>
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
static llama_context ** g_ctx;
|
||||
static llama_model ** g_model;
|
||||
static common_sampler ** g_smpl;
|
||||
static common_params * g_params;
|
||||
static std::vector<llama_token> * g_input_tokens;
|
||||
static std::ostringstream * g_output_ss;
|
||||
static std::vector<llama_token> * g_output_tokens;
|
||||
|
||||
static bool is_interacting = false;
|
||||
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
||||
static void sigint_handler(int signo) {
|
||||
if (signo == SIGINT) {
|
||||
if (!is_interacting) {
|
||||
is_interacting = true;
|
||||
} else {
|
||||
console::cleanup();
|
||||
LOG("\n");
|
||||
common_perf_print(*g_ctx, *g_smpl);
|
||||
|
||||
// make sure all logs are flushed
|
||||
LOG("Interrupted by user\n");
|
||||
common_log_pause(common_log_main());
|
||||
|
||||
_exit(130);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
common_params params;
|
||||
g_params = ¶ms;
|
||||
|
||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
common_init();
|
||||
|
||||
auto & sparams = params.sampling;
|
||||
|
||||
console::init(params.simple_io, params.use_color);
|
||||
atexit([]() { console::cleanup(); });
|
||||
|
||||
if (params.logits_all) {
|
||||
LOG_ERR("\n************\n");
|
||||
LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
|
||||
LOG_ERR("************\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (params.embedding) {
|
||||
LOG_ERR("\n************\n");
|
||||
LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
|
||||
LOG_ERR("************\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (params.n_ctx != 0 && params.n_ctx < 8) {
|
||||
LOG_WRN("%s: minimum context size is 8, using minimum size.\n", __func__);
|
||||
params.n_ctx = 8;
|
||||
}
|
||||
|
||||
if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
|
||||
LOG_ERR("\n************\n");
|
||||
LOG_ERR("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
|
||||
LOG_ERR("************\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (params.rope_freq_base != 0.0) {
|
||||
LOG_WRN("%s: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
|
||||
}
|
||||
|
||||
if (params.rope_freq_scale != 0.0) {
|
||||
LOG_WRN("%s: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
|
||||
}
|
||||
|
||||
LOG_INF("%s: llama backend init\n", __func__);
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
llama_model * model = nullptr;
|
||||
llama_context * ctx = nullptr;
|
||||
common_sampler * smpl = nullptr;
|
||||
|
||||
g_model = &model;
|
||||
g_ctx = &ctx;
|
||||
g_smpl = &smpl;
|
||||
|
||||
// load the model and apply lora adapter, if any
|
||||
LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
|
||||
common_init_result llama_init = common_init_from_params(params);
|
||||
|
||||
model = llama_init.model.get();
|
||||
ctx = llama_init.context.get();
|
||||
|
||||
if (model == NULL) {
|
||||
LOG_ERR("%s: unable to load model\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
|
||||
const int n_ctx_train = llama_model_n_ctx_train(model);
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
LOG_DBG("n_ctx: %d\n", n_ctx);
|
||||
|
||||
if (n_ctx > n_ctx_train) {
|
||||
LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
|
||||
}
|
||||
|
||||
// print system information
|
||||
{
|
||||
LOG_INF("\n");
|
||||
LOG_INF("%s\n", common_params_get_system_info(params).c_str());
|
||||
}
|
||||
const bool add_bos = llama_vocab_get_add_bos(vocab);
|
||||
GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
|
||||
|
||||
std::vector<llama_token> embd_inp;
|
||||
std::vector<llama_token> embd_end;
|
||||
std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
|
||||
std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);
|
||||
|
||||
GGML_ASSERT(llama_vocab_fim_pre(vocab) >= 0);
|
||||
GGML_ASSERT(llama_vocab_fim_suf(vocab) >= 0);
|
||||
|
||||
inp_pfx.insert(inp_pfx.begin(), llama_vocab_fim_pre(vocab));
|
||||
inp_sfx.insert(inp_sfx.begin(), llama_vocab_fim_suf(vocab));
|
||||
|
||||
embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
|
||||
embd_end = params.spm_infill ? inp_pfx : inp_sfx;
|
||||
if (add_bos) {
|
||||
embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
|
||||
}
|
||||
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
|
||||
|
||||
const llama_token middle_token = llama_vocab_fim_mid(vocab);
|
||||
if (middle_token >= 0) {
|
||||
embd_inp.push_back(middle_token);
|
||||
}
|
||||
|
||||
LOG_DBG("add_bos: %d\n", add_bos);
|
||||
LOG_DBG("prefix: \"%s\"\n", params.input_prefix.c_str());
|
||||
LOG_DBG("suffix: \"%s\"\n", params.input_suffix.c_str());
|
||||
LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
|
||||
|
||||
// Should not run without any tokens
|
||||
if (embd_inp.empty()) {
|
||||
embd_inp.push_back(llama_vocab_bos(vocab));
|
||||
LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
|
||||
}
|
||||
|
||||
if ((int) embd_inp.size() > n_ctx - 4) {
|
||||
LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// number of tokens to keep when resetting context
|
||||
if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size()) {
|
||||
params.n_keep = (int)embd_inp.size();
|
||||
}
|
||||
|
||||
LOG_INF("inp_pfx: %s\n", string_from(ctx, inp_pfx).c_str());
|
||||
LOG_INF("inp_sfx: %s\n", string_from(ctx, inp_sfx).c_str());
|
||||
|
||||
// enable interactive mode if interactive start is specified
|
||||
if (params.interactive_first) {
|
||||
params.interactive = true;
|
||||
}
|
||||
|
||||
if (params.verbose_prompt) {
|
||||
LOG_INF("\n");
|
||||
LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
||||
LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
||||
for (int i = 0; i < (int) embd_inp.size(); i++) {
|
||||
LOG_INF("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||
}
|
||||
|
||||
if (params.n_keep > 0) {
|
||||
LOG_INF("%s: static prompt based on n_keep: '", __func__);
|
||||
for (int i = 0; i < params.n_keep; i++) {
|
||||
LOG_CNT("%s", common_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||
}
|
||||
LOG_CNT("'\n");
|
||||
}
|
||||
LOG_INF("\n");
|
||||
}
|
||||
|
||||
if (params.interactive) {
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
||||
struct sigaction sigint_action;
|
||||
sigint_action.sa_handler = sigint_handler;
|
||||
sigemptyset (&sigint_action.sa_mask);
|
||||
sigint_action.sa_flags = 0;
|
||||
sigaction(SIGINT, &sigint_action, NULL);
|
||||
#elif defined (_WIN32)
|
||||
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
|
||||
return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
|
||||
};
|
||||
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
||||
#endif
|
||||
|
||||
LOG_INF("%s: interactive mode on.\n", __func__);
|
||||
|
||||
if (params.input_prefix_bos) {
|
||||
LOG_INF("Input prefix with BOS\n");
|
||||
}
|
||||
|
||||
if (!params.input_prefix.empty()) {
|
||||
LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str());
|
||||
}
|
||||
|
||||
if (!params.input_suffix.empty()) {
|
||||
LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
|
||||
}
|
||||
}
|
||||
smpl = common_sampler_init(model, sparams);
|
||||
|
||||
LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl));
|
||||
LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
|
||||
LOG_INF("sampler chain: %s\n", common_sampler_print(smpl).c_str());
|
||||
|
||||
LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
||||
|
||||
LOG_INF("\n");
|
||||
LOG_INF("\n##### Infill mode #####\n\n");
|
||||
if (params.interactive) {
|
||||
const char *control_message;
|
||||
if (params.multiline_input) {
|
||||
control_message = " - To return control to LLaMA, end your input with '\\'.\n"
|
||||
" - To return control without starting a new line, end your input with '/'.\n";
|
||||
} else {
|
||||
control_message = " - Press Return to return control to LLaMA.\n"
|
||||
" - To return control without starting a new line, end your input with '/'.\n"
|
||||
" - If you want to submit another line, end your input with '\\'.\n";
|
||||
}
|
||||
LOG_INF("== Running in interactive mode. ==\n");
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
||||
LOG_INF( " - Press Ctrl+C to interject at any time.\n");
|
||||
#endif
|
||||
LOG_INF( "%s\n", control_message);
|
||||
|
||||
is_interacting = params.interactive_first;
|
||||
}
|
||||
|
||||
bool input_echo = true;
|
||||
|
||||
int n_past = 0;
|
||||
int n_remain = params.n_predict;
|
||||
int n_consumed = 0;
|
||||
|
||||
std::vector<int> input_tokens; g_input_tokens = &input_tokens;
|
||||
std::vector<int> output_tokens; g_output_tokens = &output_tokens;
|
||||
std::ostringstream output_ss; g_output_ss = &output_ss;
|
||||
|
||||
// the first thing we will do is to output the prompt, so set color accordingly
|
||||
console::set_display(console::prompt);
|
||||
|
||||
std::vector<llama_token> embd;
|
||||
|
||||
while (n_remain != 0 || params.interactive) {
|
||||
// predict
|
||||
if (!embd.empty()) {
|
||||
// Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
|
||||
// --prompt or --file which uses the same value.
|
||||
int max_embd_size = n_ctx - 4;
|
||||
|
||||
// Ensure the input doesn't exceed the context size by truncating embd if necessary.
|
||||
if ((int) embd.size() > max_embd_size) {
|
||||
const int skipped_tokens = (int) embd.size() - max_embd_size;
|
||||
embd.resize(max_embd_size);
|
||||
|
||||
console::set_display(console::error);
|
||||
LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
|
||||
console::set_display(console::reset);
|
||||
}
|
||||
|
||||
// infinite text generation via context swapping
|
||||
// if we run out of context:
|
||||
// - take the n_keep first tokens from the original prompt (via n_past)
|
||||
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
|
||||
if (n_past + (int) embd.size() > n_ctx) {
|
||||
if (params.n_predict == -2) {
|
||||
LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
|
||||
break;
|
||||
}
|
||||
|
||||
const int n_left = n_past - params.n_keep - 1;
|
||||
const int n_discard = n_left/2;
|
||||
|
||||
LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
|
||||
n_past, n_left, n_ctx, params.n_keep, n_discard);
|
||||
|
||||
llama_kv_self_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
|
||||
llama_kv_self_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
|
||||
|
||||
n_past -= n_discard;
|
||||
|
||||
LOG_DBG("after swap: n_past = %d\n", n_past);
|
||||
|
||||
LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
|
||||
|
||||
}
|
||||
|
||||
// evaluate tokens in batches
|
||||
// embd is typically prepared beforehand to fit within a batch, but not always
|
||||
for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
|
||||
int n_eval = (int) embd.size() - i;
|
||||
if (n_eval > params.n_batch) {
|
||||
n_eval = params.n_batch;
|
||||
}
|
||||
|
||||
LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
|
||||
|
||||
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) {
|
||||
LOG_ERR("%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
n_past += n_eval;
|
||||
|
||||
LOG_DBG("n_past = %d\n", n_past);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
embd.clear();
|
||||
|
||||
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
|
||||
const llama_token id = common_sampler_sample(smpl, ctx, -1);
|
||||
|
||||
common_sampler_accept(smpl, id, true);
|
||||
|
||||
// LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
|
||||
|
||||
embd.push_back(id);
|
||||
|
||||
// echo this to console
|
||||
input_echo = true;
|
||||
|
||||
// decrement remaining sampling budget
|
||||
--n_remain;
|
||||
|
||||
LOG_DBG("n_remain: %d\n", n_remain);
|
||||
} else {
|
||||
// some user input remains from prompt or interaction, forward it to processing
|
||||
LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
|
||||
while ((int) embd_inp.size() > n_consumed) {
|
||||
embd.push_back(embd_inp[n_consumed]);
|
||||
|
||||
// push the prompt in the sampling context in order to apply repetition penalties later
|
||||
// for the prompt, we don't apply grammar rules
|
||||
common_sampler_accept(smpl, embd_inp[n_consumed], false);
|
||||
|
||||
++n_consumed;
|
||||
if ((int) embd.size() >= params.n_batch) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// display text
|
||||
if (input_echo) {
|
||||
for (auto id : embd) {
|
||||
const std::string token_str = common_token_to_piece(ctx, id);
|
||||
LOG("%s", token_str.c_str());
|
||||
|
||||
if (embd.size() > 1) {
|
||||
input_tokens.push_back(id);
|
||||
} else {
|
||||
output_tokens.push_back(id);
|
||||
output_ss << token_str;
|
||||
}
|
||||
}
|
||||
}
|
||||
// reset color to default if we there is no pending user input
|
||||
if (input_echo && (int) embd_inp.size() == n_consumed) {
|
||||
console::set_display(console::reset);
|
||||
}
|
||||
|
||||
// if not currently processing queued inputs;
|
||||
if ((int) embd_inp.size() <= n_consumed) {
|
||||
// deal with eot token in infill mode
|
||||
if ((common_sampler_last(smpl) == llama_vocab_eot(vocab) || is_interacting) && params.interactive){
|
||||
if (is_interacting && !params.interactive_first) {
|
||||
// print an eot token
|
||||
LOG("%s", common_token_to_piece(ctx, llama_vocab_eot(vocab)).c_str());
|
||||
}
|
||||
LOG("\n");
|
||||
console::set_display(console::user_input);
|
||||
std::string buffer;
|
||||
std::string line;
|
||||
bool another_line=true;
|
||||
// set a new prefix via stdin
|
||||
do {
|
||||
another_line = console::readline(line, params.multiline_input);
|
||||
buffer += line;
|
||||
} while (another_line);
|
||||
// check if we got an empty line, if so we use the old input
|
||||
if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
|
||||
params.input_prefix = buffer;
|
||||
}
|
||||
buffer.clear();
|
||||
// set a new suffix via stdin
|
||||
do {
|
||||
another_line = console::readline(line, params.multiline_input);
|
||||
buffer += line;
|
||||
} while (another_line);
|
||||
// check if we got an empty line
|
||||
if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
|
||||
params.input_suffix = buffer;
|
||||
}
|
||||
buffer.clear();
|
||||
// done taking input, reset color
|
||||
console::set_display(console::reset);
|
||||
|
||||
if (params.escape) {
|
||||
//process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here
|
||||
string_process_escapes(params.input_prefix);
|
||||
string_process_escapes(params.input_suffix);
|
||||
}
|
||||
|
||||
// tokenize new prefix and suffix
|
||||
std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
|
||||
std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);
|
||||
|
||||
inp_pfx.insert(inp_pfx.begin(), llama_vocab_fim_pre(vocab));
|
||||
inp_sfx.insert(inp_sfx.begin(), llama_vocab_fim_suf(vocab));
|
||||
|
||||
embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
|
||||
embd_end = params.spm_infill ? inp_pfx : inp_sfx;
|
||||
if (add_bos) {
|
||||
embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
|
||||
}
|
||||
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
|
||||
|
||||
if (middle_token >= 0) {
|
||||
embd_inp.push_back(middle_token);
|
||||
}
|
||||
|
||||
embd.clear();
|
||||
n_remain = params.n_predict;
|
||||
n_past = 0;
|
||||
n_consumed = 0;
|
||||
is_interacting = false;
|
||||
}
|
||||
// deal with end of generation tokens in interactive mode
|
||||
else if (llama_vocab_is_eog(vocab, common_sampler_last(smpl))) {
|
||||
LOG_DBG("found EOS token\n");
|
||||
|
||||
if (params.interactive) {
|
||||
|
||||
is_interacting = true;
|
||||
LOG("\n");
|
||||
console::set_display(console::user_input);
|
||||
}
|
||||
}
|
||||
|
||||
if (n_past > 0 && is_interacting && !params.interactive) {
|
||||
LOG_DBG("waiting for user input\n");
|
||||
|
||||
if (params.input_prefix_bos) {
|
||||
LOG_DBG("adding input prefix BOS token\n");
|
||||
embd_inp.push_back(llama_vocab_bos(vocab));
|
||||
}
|
||||
|
||||
std::string buffer;
|
||||
if (!params.input_prefix.empty()) {
|
||||
LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
|
||||
buffer += params.input_prefix;
|
||||
LOG("%s", buffer.c_str());
|
||||
}
|
||||
|
||||
std::string line;
|
||||
bool another_line = true;
|
||||
do {
|
||||
another_line = console::readline(line, params.multiline_input);
|
||||
buffer += line;
|
||||
} while (another_line);
|
||||
|
||||
// done taking input, reset color
|
||||
console::set_display(console::reset);
|
||||
|
||||
// Add tokens to embd only if the input buffer is non-empty
|
||||
// Entering a empty line lets the user pass control back
|
||||
if (buffer.length() > 1) {
|
||||
// append input suffix if any
|
||||
if (!params.input_suffix.empty()) {
|
||||
LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
|
||||
buffer += params.input_suffix;
|
||||
LOG("%s", params.input_suffix.c_str());
|
||||
}
|
||||
|
||||
LOG_DBG("buffer: '%s'\n", buffer.c_str());
|
||||
|
||||
const size_t original_size = embd_inp.size();
|
||||
|
||||
const auto line_inp = common_tokenize(ctx, buffer, false);
|
||||
LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
|
||||
|
||||
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
|
||||
|
||||
for (size_t i = original_size; i < embd_inp.size(); ++i) {
|
||||
const llama_token token = embd_inp[i];
|
||||
output_tokens.push_back(token);
|
||||
output_ss << common_token_to_piece(ctx, token);
|
||||
}
|
||||
|
||||
n_remain -= line_inp.size();
|
||||
LOG_DBG("n_remain: %d\n", n_remain);
|
||||
} else {
|
||||
LOG_DBG("empty line, passing control back\n");
|
||||
}
|
||||
|
||||
input_echo = false; // do not echo this again
|
||||
}
|
||||
|
||||
if (n_past > 0) {
|
||||
if (is_interacting) {
|
||||
common_sampler_reset(smpl);
|
||||
}
|
||||
is_interacting = false;
|
||||
}
|
||||
}
|
||||
|
||||
// end of generation
|
||||
if (!embd.empty() && llama_vocab_is_eog(vocab, embd.back()) && !params.interactive) {
|
||||
break;
|
||||
}
|
||||
|
||||
// In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
|
||||
// We skip this logic when n_predict == -1 (infinite) or -2 (stop at context size).
|
||||
if (params.interactive && n_remain <= 0 && params.n_predict >= 0) {
|
||||
n_remain = params.n_predict;
|
||||
is_interacting = true;
|
||||
}
|
||||
}
|
||||
if (!params.interactive && n_remain <= 0) {
|
||||
LOG("%s", common_token_to_piece(ctx, llama_vocab_eot(vocab)).c_str());
|
||||
}
|
||||
|
||||
LOG("\n");
|
||||
common_perf_print(ctx, smpl);
|
||||
|
||||
common_sampler_free(smpl);
|
||||
llama_backend_free();
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -10,6 +10,9 @@ from typing import Any, List, Optional, Set, Tuple, Union
|
||||
|
||||
def _build_repetition(item_rule, min_items, max_items, separator_rule=None):
|
||||
|
||||
if max_items == 0:
|
||||
return ""
|
||||
|
||||
if min_items == 0 and max_items == 1:
|
||||
return f'{item_rule}?'
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,307 +0,0 @@
|
||||
import gguf
|
||||
import argparse
|
||||
import logging
|
||||
import sys
|
||||
import torch
|
||||
import json
|
||||
import os
|
||||
import numpy as np
|
||||
from typing import cast, ContextManager, Any, Iterator
|
||||
from pathlib import Path
|
||||
from torch import Tensor
|
||||
|
||||
logger = logging.getLogger("gemma3-mmproj")
|
||||
|
||||
|
||||
# (copied from convert_hf_to_gguf.py)
|
||||
# tree of lazy tensors
|
||||
class LazyTorchTensor(gguf.LazyBase):
|
||||
_tensor_type = torch.Tensor
|
||||
# to keep the type-checker happy
|
||||
dtype: torch.dtype
|
||||
shape: torch.Size
|
||||
|
||||
# only used when converting a torch.Tensor to a np.ndarray
|
||||
_dtype_map: dict[torch.dtype, type] = {
|
||||
torch.float16: np.float16,
|
||||
torch.float32: np.float32,
|
||||
}
|
||||
|
||||
# used for safetensors slices
|
||||
# ref: https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046
|
||||
# TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734
|
||||
_dtype_str_map: dict[str, torch.dtype] = {
|
||||
"F64": torch.float64,
|
||||
"F32": torch.float32,
|
||||
"BF16": torch.bfloat16,
|
||||
"F16": torch.float16,
|
||||
# "U64": torch.uint64,
|
||||
"I64": torch.int64,
|
||||
# "U32": torch.uint32,
|
||||
"I32": torch.int32,
|
||||
# "U16": torch.uint16,
|
||||
"I16": torch.int16,
|
||||
"U8": torch.uint8,
|
||||
"I8": torch.int8,
|
||||
"BOOL": torch.bool,
|
||||
"F8_E4M3": torch.float8_e4m3fn,
|
||||
"F8_E5M2": torch.float8_e5m2,
|
||||
}
|
||||
|
||||
def numpy(self) -> gguf.LazyNumpyTensor:
|
||||
dtype = self._dtype_map[self.dtype]
|
||||
return gguf.LazyNumpyTensor(
|
||||
meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape),
|
||||
args=(self,),
|
||||
func=(lambda s: s.numpy())
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: tuple[int, ...]) -> Tensor:
|
||||
return torch.empty(size=shape, dtype=dtype, device="meta")
|
||||
|
||||
@classmethod
|
||||
def from_safetensors_slice(cls, st_slice: Any) -> Tensor:
|
||||
dtype = cls._dtype_str_map[st_slice.get_dtype()]
|
||||
shape: tuple[int, ...] = tuple(st_slice.get_shape())
|
||||
lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[:])
|
||||
return cast(torch.Tensor, lazy)
|
||||
|
||||
@classmethod
|
||||
def __torch_function__(cls, func, types, args=(), kwargs=None):
|
||||
del types # unused
|
||||
|
||||
if kwargs is None:
|
||||
kwargs = {}
|
||||
|
||||
if func is torch.Tensor.numpy:
|
||||
return args[0].numpy()
|
||||
|
||||
return cls._wrap_fn(func)(*args, **kwargs)
|
||||
|
||||
|
||||
class Gemma3VisionTower:
|
||||
hparams: dict
|
||||
gguf_writer: gguf.GGUFWriter
|
||||
fname_out: Path
|
||||
ftype: gguf.LlamaFileType
|
||||
|
||||
@staticmethod
|
||||
def load_hparams(dir_model: Path):
|
||||
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
||||
return json.load(f)
|
||||
|
||||
@staticmethod
|
||||
def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]:
|
||||
part_names: list[str] = []
|
||||
for filename in os.listdir(dir_model):
|
||||
if filename.startswith(prefix) and filename.endswith(suffix):
|
||||
part_names.append(filename)
|
||||
part_names.sort()
|
||||
return part_names
|
||||
|
||||
def __init__(self,
|
||||
dir_model: Path,
|
||||
fname_out: Path,
|
||||
ftype: gguf.LlamaFileType,
|
||||
is_big_endian: bool,):
|
||||
hparams = Gemma3VisionTower.load_hparams(dir_model)
|
||||
self.hparams = hparams
|
||||
self.fname_out = fname_out
|
||||
self.ftype = ftype
|
||||
endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
|
||||
self.gguf_writer = gguf.GGUFWriter(path=None, arch="clip", endianess=endianess)
|
||||
|
||||
text_config = hparams["text_config"]
|
||||
vision_config = hparams["vision_config"]
|
||||
|
||||
assert hparams["architectures"][0] == "Gemma3ForConditionalGeneration"
|
||||
assert text_config is not None
|
||||
assert vision_config is not None
|
||||
|
||||
self.gguf_writer.add_string ("clip.projector_type", "gemma3")
|
||||
self.gguf_writer.add_bool ("clip.has_text_encoder", False)
|
||||
self.gguf_writer.add_bool ("clip.has_vision_encoder", True)
|
||||
self.gguf_writer.add_bool ("clip.has_llava_projector", False) # legacy
|
||||
self.gguf_writer.add_uint32 ("clip.vision.image_size", vision_config["image_size"])
|
||||
self.gguf_writer.add_uint32 ("clip.vision.patch_size", vision_config["patch_size"])
|
||||
self.gguf_writer.add_uint32 ("clip.vision.embedding_length", vision_config["hidden_size"])
|
||||
self.gguf_writer.add_uint32 ("clip.vision.feed_forward_length", vision_config["intermediate_size"])
|
||||
self.gguf_writer.add_uint32 ("clip.vision.projection_dim", text_config["hidden_size"])
|
||||
self.gguf_writer.add_uint32 ("clip.vision.block_count", vision_config["num_hidden_layers"])
|
||||
self.gguf_writer.add_uint32 ("clip.vision.attention.head_count", vision_config["num_attention_heads"])
|
||||
self.gguf_writer.add_float32("clip.vision.attention.layer_norm_epsilon", vision_config.get("layer_norm_eps", 1e-6))
|
||||
# default values taken from HF tranformers code
|
||||
self.gguf_writer.add_array ("clip.vision.image_mean", [0.5, 0.5, 0.5])
|
||||
self.gguf_writer.add_array ("clip.vision.image_std", [0.5, 0.5, 0.5])
|
||||
self.gguf_writer.add_bool ("clip.use_gelu", True)
|
||||
|
||||
# load tensors
|
||||
for name, data_torch in self.get_tensors(dir_model):
|
||||
# convert any unsupported data types to float32
|
||||
if data_torch.dtype not in (torch.float16, torch.float32):
|
||||
data_torch = data_torch.to(torch.float32)
|
||||
self.add_tensor(name, data_torch)
|
||||
|
||||
def get_tensors(self, dir_model: Path) -> Iterator[tuple[str, Tensor]]:
|
||||
part_names = Gemma3VisionTower.get_model_part_names(dir_model, "model", ".safetensors")
|
||||
tensor_names_from_parts: set[str] = set()
|
||||
for part_name in part_names:
|
||||
logger.info(f"gguf: loading model part '{part_name}'")
|
||||
from safetensors import safe_open
|
||||
ctx = cast(ContextManager[Any], safe_open(dir_model / part_name, framework="pt", device="cpu"))
|
||||
with ctx as model_part:
|
||||
tensor_names_from_parts.update(model_part.keys())
|
||||
|
||||
for name in model_part.keys():
|
||||
data = model_part.get_slice(name)
|
||||
data = LazyTorchTensor.from_safetensors_slice(data)
|
||||
yield name, data
|
||||
|
||||
def add_tensor(self, name: str, data_torch: Tensor):
|
||||
is_1d = len(data_torch.shape) == 1
|
||||
is_embd = ".embeddings." in name
|
||||
old_dtype = data_torch.dtype
|
||||
can_quantize = not is_1d and not is_embd
|
||||
data_qtype = gguf.GGMLQuantizationType.F32
|
||||
|
||||
# this is to support old checkpoint
|
||||
# TODO: remove this when we have the final model
|
||||
name = name.replace("vision_model.vision_model.", "vision_tower.vision_model.")
|
||||
name = name.replace("multimodal_projector.", "multi_modal_projector.")
|
||||
|
||||
# filter only vision tensors
|
||||
if not name.startswith("vision_tower.vision_model.") and not name.startswith("multi_modal_projector."):
|
||||
return
|
||||
# prefix
|
||||
name = name.replace("vision_tower.vision_model.encoder.layers.", "v.blk.")
|
||||
name = name.replace("vision_tower.vision_model.", "v.")
|
||||
# projector and input embd
|
||||
name = name.replace(".embeddings.patch_embedding.", ".patch_embd.")
|
||||
name = name.replace(".embeddings.position_embedding.", ".position_embd.")
|
||||
name = name.replace(
|
||||
"multi_modal_projector.mm_input_projection_weight",
|
||||
"mm.input_projection.weight"
|
||||
)
|
||||
name = name.replace(
|
||||
"multi_modal_projector.mm_soft_emb_norm.weight",
|
||||
"mm.soft_emb_norm.weight"
|
||||
)
|
||||
name = name.replace("post_layernorm.", "post_ln.")
|
||||
# each block
|
||||
name = name.replace(".self_attn.k_proj.", ".attn_k.")
|
||||
name = name.replace(".self_attn.v_proj.", ".attn_v.")
|
||||
name = name.replace(".self_attn.q_proj.", ".attn_q.")
|
||||
name = name.replace(".self_attn.out_proj.", ".attn_out.")
|
||||
name = name.replace(".layer_norm1.", ".ln1.")
|
||||
name = name.replace(".layer_norm2.", ".ln2.")
|
||||
name = name.replace(".mlp.fc1.", ".ffn_down.")
|
||||
name = name.replace(".mlp.fc2.", ".ffn_up.")
|
||||
|
||||
if can_quantize:
|
||||
if self.ftype == gguf.LlamaFileType.ALL_F32:
|
||||
data_qtype = gguf.GGMLQuantizationType.F32
|
||||
elif self.ftype == gguf.LlamaFileType.MOSTLY_F16:
|
||||
data_qtype = gguf.GGMLQuantizationType.F16
|
||||
elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
|
||||
data_qtype = gguf.GGMLQuantizationType.BF16
|
||||
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
|
||||
data_qtype = gguf.GGMLQuantizationType.Q8_0
|
||||
else:
|
||||
raise ValueError(f"Unsupported file type: {self.ftype}")
|
||||
|
||||
# corrent norm value ; only this "soft_emb_norm" need to be corrected as it's part of Gemma projector
|
||||
# the other norm values are part of SigLIP model, and they are already correct
|
||||
# ref code: Gemma3RMSNorm
|
||||
if "soft_emb_norm.weight" in name:
|
||||
logger.info(f"Correcting norm value for '{name}'")
|
||||
data_torch = data_torch + 1
|
||||
|
||||
data = data_torch.numpy()
|
||||
|
||||
try:
|
||||
data = gguf.quants.quantize(data, data_qtype)
|
||||
except Exception as e:
|
||||
logger.error(f"Error quantizing tensor '{name}': {e}, fallback to F16")
|
||||
data_qtype = gguf.GGMLQuantizationType.F16
|
||||
data = gguf.quants.quantize(data, data_qtype)
|
||||
|
||||
# reverse shape to make it similar to the internal ggml dimension order
|
||||
shape_str = f"{{{', '.join(str(n) for n in reversed(data_torch.shape))}}}"
|
||||
logger.info(f"{f'%-32s' % f'{name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
|
||||
|
||||
self.gguf_writer.add_tensor(name, data, raw_dtype=data_qtype)
|
||||
|
||||
def write(self):
|
||||
self.gguf_writer.write_header_to_file(path=self.fname_out)
|
||||
self.gguf_writer.write_kv_data_to_file()
|
||||
self.gguf_writer.write_tensors_to_file(progress=True)
|
||||
self.gguf_writer.close()
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Convert Gemma 3 vision tower safetensors to GGUF format",)
|
||||
parser.add_argument(
|
||||
"--outfile", type=Path, default="mmproj.gguf",
|
||||
help="path to write to",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0"], default="f16",
|
||||
help="output format",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--bigendian", action="store_true",
|
||||
help="model is executed on big endian machine",
|
||||
)
|
||||
parser.add_argument(
|
||||
"model", type=Path,
|
||||
help="directory containing model file",
|
||||
nargs="?",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--verbose", action="store_true",
|
||||
help="increase output verbosity",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
if args.model is None:
|
||||
parser.error("the following arguments are required: model")
|
||||
return args
|
||||
|
||||
|
||||
def main() -> None:
|
||||
args = parse_args()
|
||||
|
||||
if args.verbose:
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
else:
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
dir_model = args.model
|
||||
|
||||
if not dir_model.is_dir():
|
||||
logger.error(f'Error: {args.model} is not a directory')
|
||||
sys.exit(1)
|
||||
|
||||
ftype_map: dict[str, gguf.LlamaFileType] = {
|
||||
"f32": gguf.LlamaFileType.ALL_F32,
|
||||
"f16": gguf.LlamaFileType.MOSTLY_F16,
|
||||
"bf16": gguf.LlamaFileType.MOSTLY_BF16,
|
||||
"q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
|
||||
}
|
||||
|
||||
logger.info(f"Loading model: {dir_model.name}")
|
||||
|
||||
with torch.inference_mode():
|
||||
gemma3_vision_tower = Gemma3VisionTower(
|
||||
dir_model=dir_model,
|
||||
fname_out=args.outfile,
|
||||
ftype=ftype_map[args.outtype],
|
||||
is_big_endian=args.bigendian,
|
||||
)
|
||||
gemma3_vision_tower.write()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
||||
@@ -1,332 +0,0 @@
|
||||
#include "arg.h"
|
||||
#include "base64.hpp"
|
||||
#include "log.h"
|
||||
#include "common.h"
|
||||
#include "sampling.h"
|
||||
#include "clip.h"
|
||||
#include "llava.h"
|
||||
#include "llama.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <vector>
|
||||
|
||||
static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
|
||||
int N = (int) tokens.size();
|
||||
for (int i = 0; i < N; i += n_batch) {
|
||||
int n_eval = (int) tokens.size() - i;
|
||||
if (n_eval > n_batch) {
|
||||
n_eval = n_batch;
|
||||
}
|
||||
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval))) {
|
||||
LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
|
||||
return false;
|
||||
}
|
||||
*n_past += n_eval;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
|
||||
std::vector<llama_token> tokens;
|
||||
tokens.push_back(id);
|
||||
return eval_tokens(ctx_llama, tokens, 1, n_past);
|
||||
}
|
||||
|
||||
static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
|
||||
std::string str2 = str;
|
||||
std::vector<llama_token> embd_inp = common_tokenize(ctx_llama, str2, add_bos, true);
|
||||
eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
|
||||
return true;
|
||||
}
|
||||
|
||||
static const char * sample(struct common_sampler * smpl,
|
||||
struct llama_context * ctx_llama,
|
||||
int * n_past) {
|
||||
const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
|
||||
common_sampler_accept(smpl, id, true);
|
||||
|
||||
const llama_model * model = llama_get_model(ctx_llama);
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
|
||||
static std::string ret;
|
||||
if (llama_vocab_is_eog(vocab, id)) {
|
||||
ret = "</s>";
|
||||
} else {
|
||||
ret = common_token_to_piece(ctx_llama, id);
|
||||
}
|
||||
eval_id(ctx_llama, id, n_past);
|
||||
return ret.c_str();
|
||||
}
|
||||
|
||||
static const char* IMG_BASE64_TAG_BEGIN = "<img src=\"data:image/jpeg;base64,";
|
||||
static const char* IMG_BASE64_TAG_END = "\">";
|
||||
|
||||
static void find_image_tag_in_prompt(const std::string& prompt, size_t& begin_out, size_t& end_out) {
|
||||
begin_out = prompt.find(IMG_BASE64_TAG_BEGIN);
|
||||
end_out = prompt.find(IMG_BASE64_TAG_END, (begin_out == std::string::npos) ? 0UL : begin_out);
|
||||
}
|
||||
|
||||
static bool prompt_contains_image(const std::string& prompt) {
|
||||
size_t begin, end;
|
||||
find_image_tag_in_prompt(prompt, begin, end);
|
||||
return (begin != std::string::npos);
|
||||
}
|
||||
|
||||
// replaces the base64 image tag in the prompt with `replacement`
|
||||
static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip_ctx * ctx_clip, int n_threads, const std::string& prompt) {
|
||||
size_t img_base64_str_start, img_base64_str_end;
|
||||
find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
|
||||
if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
|
||||
LOG_ERR("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
auto base64_bytes_start = img_base64_str_start + strlen(IMG_BASE64_TAG_BEGIN);
|
||||
auto base64_bytes_count = img_base64_str_end - base64_bytes_start;
|
||||
auto base64_str = prompt.substr(base64_bytes_start, base64_bytes_count );
|
||||
|
||||
auto required_bytes = base64::required_encode_size(base64_str.size());
|
||||
auto img_bytes = std::vector<unsigned char>(required_bytes);
|
||||
base64::decode(base64_str.begin(), base64_str.end(), img_bytes.begin());
|
||||
|
||||
auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
|
||||
if (!embed) {
|
||||
LOG_ERR("%s: could not load image from base64 string.\n", __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return embed;
|
||||
}
|
||||
|
||||
static std::string remove_image_from_prompt(const std::string& prompt, const char * replacement = "") {
|
||||
size_t begin, end;
|
||||
find_image_tag_in_prompt(prompt, begin, end);
|
||||
if (begin == std::string::npos || end == std::string::npos) {
|
||||
return prompt;
|
||||
}
|
||||
auto pre = prompt.substr(0, begin);
|
||||
auto post = prompt.substr(end + strlen(IMG_BASE64_TAG_END));
|
||||
return pre + replacement + post;
|
||||
}
|
||||
|
||||
struct llava_context {
|
||||
struct clip_ctx * ctx_clip = NULL;
|
||||
struct llama_context * ctx_llama = NULL;
|
||||
struct llama_model * model = NULL;
|
||||
};
|
||||
|
||||
static void print_usage(int, char ** argv) {
|
||||
LOG("\n example usage:\n");
|
||||
LOG("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
|
||||
LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
|
||||
}
|
||||
|
||||
static struct llava_image_embed * load_image(llava_context * ctx_llava, common_params * params, const std::string & fname) {
|
||||
|
||||
// load and preprocess the image
|
||||
llava_image_embed * embed = NULL;
|
||||
auto prompt = params->prompt;
|
||||
if (prompt_contains_image(prompt)) {
|
||||
if (!params->image.empty()) {
|
||||
LOG_INF("using base64 encoded image instead of command line image path\n");
|
||||
}
|
||||
embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
|
||||
if (!embed) {
|
||||
LOG_ERR("%s: can't load image from prompt\n", __func__);
|
||||
return NULL;
|
||||
}
|
||||
params->prompt = remove_image_from_prompt(prompt);
|
||||
} else {
|
||||
embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str());
|
||||
if (!embed) {
|
||||
fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
return embed;
|
||||
}
|
||||
|
||||
static void process_prompt(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, common_params * params, const std::string & prompt) {
|
||||
int n_past = 0;
|
||||
|
||||
const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
|
||||
|
||||
std::string system_prompt, user_prompt;
|
||||
size_t image_pos = prompt.find("<image>");
|
||||
if (image_pos != std::string::npos) {
|
||||
// new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
|
||||
system_prompt = prompt.substr(0, image_pos);
|
||||
user_prompt = prompt.substr(image_pos + std::string("<image>").length());
|
||||
LOG_INF("system_prompt: %s\n", system_prompt.c_str());
|
||||
if (params->verbose_prompt) {
|
||||
auto tmp = common_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
|
||||
for (int i = 0; i < (int) tmp.size(); i++) {
|
||||
LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
|
||||
}
|
||||
}
|
||||
LOG_INF("user_prompt: %s\n", user_prompt.c_str());
|
||||
if (params->verbose_prompt) {
|
||||
auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
|
||||
for (int i = 0; i < (int) tmp.size(); i++) {
|
||||
LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// llava-1.5 native mode
|
||||
system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:";
|
||||
user_prompt = prompt + "\nASSISTANT:";
|
||||
if (params->verbose_prompt) {
|
||||
auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
|
||||
for (int i = 0; i < (int) tmp.size(); i++) {
|
||||
LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, true);
|
||||
llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
|
||||
eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
|
||||
|
||||
// generate the response
|
||||
|
||||
LOG("\n");
|
||||
|
||||
struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling);
|
||||
if (!smpl) {
|
||||
LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
std::string response = "";
|
||||
for (int i = 0; i < max_tgt_len; i++) {
|
||||
const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past);
|
||||
response += tmp;
|
||||
if (strcmp(tmp, "</s>") == 0) break;
|
||||
if (strstr(tmp, "###")) break; // Yi-VL behavior
|
||||
LOG("%s", tmp);
|
||||
if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
|
||||
if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
|
||||
if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
|
||||
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
common_sampler_free(smpl);
|
||||
LOG("\n");
|
||||
}
|
||||
|
||||
static struct llama_model * llava_init(common_params * params) {
|
||||
llama_backend_init();
|
||||
llama_numa_init(params->numa);
|
||||
|
||||
llama_model_params model_params = common_model_params_to_llama(*params);
|
||||
|
||||
llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params);
|
||||
if (model == NULL) {
|
||||
LOG_ERR("%s: unable to load model\n" , __func__);
|
||||
return NULL;
|
||||
}
|
||||
return model;
|
||||
}
|
||||
|
||||
static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
|
||||
const char * clip_path = params->mmproj.path.c_str();
|
||||
|
||||
auto prompt = params->prompt;
|
||||
if (prompt.empty()) {
|
||||
prompt = "describe the image in detail.";
|
||||
}
|
||||
|
||||
auto ctx_clip = clip_model_load(clip_path, GGML_LOG_LEVEL_INFO);
|
||||
|
||||
llama_context_params ctx_params = common_context_params_to_llama(*params);
|
||||
ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
|
||||
|
||||
llama_context * ctx_llama = llama_init_from_model(model, ctx_params);
|
||||
|
||||
if (ctx_llama == NULL) {
|
||||
LOG_ERR("%s: failed to create the llama_context\n" , __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
|
||||
|
||||
ctx_llava->ctx_llama = ctx_llama;
|
||||
ctx_llava->ctx_clip = ctx_clip;
|
||||
ctx_llava->model = model;
|
||||
return ctx_llava;
|
||||
}
|
||||
|
||||
static void llava_free(struct llava_context * ctx_llava) {
|
||||
if (ctx_llava->ctx_clip) {
|
||||
clip_free(ctx_llava->ctx_clip);
|
||||
ctx_llava->ctx_clip = NULL;
|
||||
}
|
||||
|
||||
llama_free(ctx_llava->ctx_llama);
|
||||
llama_model_free(ctx_llava->model);
|
||||
llama_backend_free();
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
ggml_time_init();
|
||||
|
||||
common_params params;
|
||||
|
||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
common_init();
|
||||
|
||||
if (params.mmproj.path.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
|
||||
print_usage(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
|
||||
auto * model = llava_init(¶ms);
|
||||
if (model == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (prompt_contains_image(params.prompt)) {
|
||||
auto * ctx_llava = llava_init_context(¶ms, model);
|
||||
|
||||
auto * image_embed = load_image(ctx_llava, ¶ms, "");
|
||||
|
||||
// process the prompt
|
||||
process_prompt(ctx_llava, image_embed, ¶ms, params.prompt);
|
||||
|
||||
llama_perf_context_print(ctx_llava->ctx_llama);
|
||||
llava_image_embed_free(image_embed);
|
||||
ctx_llava->model = NULL;
|
||||
llava_free(ctx_llava);
|
||||
} else {
|
||||
for (auto & image : params.image) {
|
||||
auto * ctx_llava = llava_init_context(¶ms, model);
|
||||
|
||||
auto * image_embed = load_image(ctx_llava, ¶ms, image);
|
||||
if (!image_embed) {
|
||||
LOG_ERR("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str());
|
||||
return 1;
|
||||
}
|
||||
|
||||
// process the prompt
|
||||
process_prompt(ctx_llava, image_embed, ¶ms, params.prompt);
|
||||
|
||||
llama_perf_context_print(ctx_llava->ctx_llama);
|
||||
llava_image_embed_free(image_embed);
|
||||
ctx_llava->model = NULL;
|
||||
llava_free(ctx_llava);
|
||||
}
|
||||
}
|
||||
|
||||
llama_model_free(model);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -1,354 +0,0 @@
|
||||
#include "arg.h"
|
||||
#include "log.h"
|
||||
#include "common.h"
|
||||
#include "sampling.h"
|
||||
#include "clip.h"
|
||||
#include "llava.h"
|
||||
#include "llama.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <vector>
|
||||
#include <iostream> // TODO: remove me
|
||||
|
||||
struct llava_context {
|
||||
struct clip_ctx * ctx_clip = NULL;
|
||||
struct llama_context * ctx_llama = NULL;
|
||||
struct llama_model * model = NULL;
|
||||
};
|
||||
|
||||
static void show_additional_info(int /*argc*/, char ** argv) {
|
||||
LOG("\nexample usage:\n\n%s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
|
||||
LOG("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n");
|
||||
}
|
||||
|
||||
static struct llama_model * llava_init(common_params * params) {
|
||||
llama_backend_init();
|
||||
llama_numa_init(params->numa);
|
||||
|
||||
llama_model_params model_params = common_model_params_to_llama(*params);
|
||||
|
||||
llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params);
|
||||
if (model == NULL) {
|
||||
LOG_ERR("%s: unable to load model\n" , __func__);
|
||||
return NULL;
|
||||
}
|
||||
return model;
|
||||
}
|
||||
|
||||
static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
|
||||
auto prompt = params->prompt;
|
||||
if (prompt.empty()) {
|
||||
prompt = "describe the image in detail.";
|
||||
}
|
||||
|
||||
llama_context_params ctx_params = common_context_params_to_llama(*params);
|
||||
if (params->n_ctx < 2048) {
|
||||
// warn user here, "Image processing requires at least 2048 context, setting context to 2048"
|
||||
LOG_WRN("%s: Image processing requires at least 2048 context, setting context to 2048\n" , __func__);
|
||||
ctx_params.n_ctx = 2048;
|
||||
} else {
|
||||
ctx_params.n_ctx = params->n_ctx;
|
||||
}
|
||||
|
||||
llama_context * ctx_llama = llama_init_from_model(model, ctx_params);
|
||||
|
||||
if (ctx_llama == NULL) {
|
||||
LOG_ERR("%s: failed to create the llama_context\n" , __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
|
||||
|
||||
ctx_llava->ctx_llama = ctx_llama;
|
||||
ctx_llava->model = model;
|
||||
return ctx_llava;
|
||||
}
|
||||
|
||||
static void llava_free(struct llava_context * ctx_llava) {
|
||||
if (ctx_llava->ctx_clip) {
|
||||
clip_free(ctx_llava->ctx_clip);
|
||||
ctx_llava->ctx_clip = NULL;
|
||||
}
|
||||
|
||||
llama_free(ctx_llava->ctx_llama);
|
||||
llama_model_free(ctx_llava->model);
|
||||
llama_backend_free();
|
||||
}
|
||||
|
||||
static struct clip_ctx * clip_init_context(common_params * params) {
|
||||
const char * clip_path = params->mmproj.path.c_str();
|
||||
|
||||
auto prompt = params->prompt;
|
||||
if (prompt.empty()) {
|
||||
prompt = "describe the image in detail.";
|
||||
}
|
||||
struct clip_context_params clip_params = {
|
||||
/* use_gpu */ params->n_gpu_layers != 0,
|
||||
/* verbosity */ GGML_LOG_LEVEL_INFO, // TODO: make this configurable
|
||||
};
|
||||
auto * ctx_clip = clip_init(clip_path, clip_params);
|
||||
return ctx_clip;
|
||||
}
|
||||
|
||||
static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
|
||||
int N = (int) tokens.size();
|
||||
for (int i = 0; i < N; i += n_batch) {
|
||||
int n_eval = (int) tokens.size() - i;
|
||||
if (n_eval > n_batch) {
|
||||
n_eval = n_batch;
|
||||
}
|
||||
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval))) {
|
||||
LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
|
||||
return false;
|
||||
}
|
||||
*n_past += n_eval;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
|
||||
std::vector<llama_token> tokens;
|
||||
tokens.push_back(id);
|
||||
return eval_tokens(ctx_llama, tokens, 1, n_past);
|
||||
}
|
||||
|
||||
static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
|
||||
std::string str2 = str;
|
||||
std::vector<llama_token> embd_inp = common_tokenize(ctx_llama, str2, add_bos, true);
|
||||
return eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
|
||||
}
|
||||
|
||||
static void process_eval_image_embed(struct llava_context * ctx_llava, const struct llava_image_embed * embeds, int n_batch, int * n_past, int idx) {
|
||||
float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip));
|
||||
std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip));
|
||||
|
||||
auto * slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
|
||||
slice_embed->embed = image_embed;
|
||||
slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip);
|
||||
llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past);
|
||||
llava_image_embed_free(slice_embed);
|
||||
}
|
||||
|
||||
static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, common_params * params, int &n_past) {
|
||||
std::string system_prompt;
|
||||
int idx = 0;
|
||||
int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip);
|
||||
int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
|
||||
if (has_minicpmv_projector == 2) {
|
||||
system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n";
|
||||
}
|
||||
else if (has_minicpmv_projector == 3) {
|
||||
system_prompt = "<|im_start|>user\n";
|
||||
}
|
||||
else if (has_minicpmv_projector == 4) {
|
||||
system_prompt = "<|im_start|>user\n";
|
||||
}
|
||||
LOG_INF("%s: image token past: %d\n", __func__, n_past);
|
||||
eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
|
||||
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
|
||||
eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
|
||||
if (num_image_embeds > 1) {
|
||||
if (has_minicpmv_projector == 2) {
|
||||
size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
|
||||
eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
|
||||
for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
|
||||
for (size_t j = 0; j < num_image_embeds_col; ++j) {
|
||||
eval_string(ctx_llava->ctx_llama, std::string("<image>").c_str(), params->n_batch, &n_past, false);
|
||||
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
|
||||
eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
|
||||
if (j == num_image_embeds_col - 1) {
|
||||
eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
|
||||
}
|
||||
}
|
||||
}
|
||||
eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
|
||||
}
|
||||
else if (has_minicpmv_projector == 3 || has_minicpmv_projector == 4) {
|
||||
size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
|
||||
for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
|
||||
for (size_t j = 0; j < num_image_embeds_col; ++j) {
|
||||
eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
|
||||
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
|
||||
eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
|
||||
if (j == num_image_embeds_col - 1) {
|
||||
eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
LOG_INF("%s: image token past: %d\n", __func__, n_past);
|
||||
}
|
||||
|
||||
static const char * sample(struct common_sampler * smpl,
|
||||
struct llama_context * ctx_llama,
|
||||
int * n_past) {
|
||||
const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
|
||||
common_sampler_accept(smpl, id, true);
|
||||
|
||||
const llama_model * model = llama_get_model(ctx_llama);
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
|
||||
static std::string ret;
|
||||
if (llama_vocab_is_eog(vocab, id)) {
|
||||
ret = "</s>";
|
||||
} else {
|
||||
ret = common_token_to_piece(ctx_llama, id);
|
||||
}
|
||||
eval_id(ctx_llama, id, n_past);
|
||||
return ret.c_str();
|
||||
}
|
||||
|
||||
static struct llava_context * minicpmv_init(common_params * params, const std::string & fname, int &n_past){
|
||||
auto * ctx_clip = clip_init_context(params);
|
||||
auto * embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
|
||||
if (!embeds) {
|
||||
LOG_ERR("failed to load image %s. Terminating\n\n", fname.c_str());
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// process the prompt
|
||||
if (params->prompt.empty() && params->interactive == false) {
|
||||
LOG_ERR("prompt should be given or interactive mode should be on");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
auto * model = llava_init(params);
|
||||
if (model == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__);
|
||||
return NULL;
|
||||
}
|
||||
const int64_t t_llava_init_start_us = ggml_time_us();
|
||||
auto * ctx_llava = llava_init_context(params, model);
|
||||
ctx_llava->ctx_clip = ctx_clip;
|
||||
const int64_t t_llava_init_end_us = ggml_time_us();
|
||||
float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0;
|
||||
LOG_INF("%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);
|
||||
|
||||
const int64_t t_process_image_start_us = ggml_time_us();
|
||||
process_image(ctx_llava, embeds, params, n_past);
|
||||
const int64_t t_process_image_end_us = ggml_time_us();
|
||||
float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
|
||||
LOG_INF("%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
|
||||
|
||||
llava_image_embed_free(embeds);
|
||||
return ctx_llava;
|
||||
}
|
||||
|
||||
static struct common_sampler * llama_init(struct llava_context * ctx_llava, common_params * params, const std::string & prompt, int & n_past, bool is_first = false){
|
||||
std::string user_prompt = prompt;
|
||||
int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
|
||||
if (!is_first) {
|
||||
if (has_minicpmv_projector == 2) {
|
||||
user_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + prompt;
|
||||
}
|
||||
else if (has_minicpmv_projector == 3) {
|
||||
user_prompt = "<|im_start|>user\n" + prompt;
|
||||
}
|
||||
else if (has_minicpmv_projector == 4) {
|
||||
user_prompt = "<|im_start|>user\n" + prompt;
|
||||
}
|
||||
}
|
||||
|
||||
eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
|
||||
if (has_minicpmv_projector == 2) {
|
||||
eval_string(ctx_llava->ctx_llama, "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", params->n_batch, &n_past, false);
|
||||
}
|
||||
else if (has_minicpmv_projector == 3) {
|
||||
eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
|
||||
}
|
||||
else if (has_minicpmv_projector == 4) {
|
||||
eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
|
||||
}
|
||||
|
||||
// generate the response
|
||||
|
||||
LOG_INF("\n");
|
||||
|
||||
struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling);
|
||||
return smpl;
|
||||
}
|
||||
|
||||
static const char * llama_loop(struct llava_context * ctx_llava,struct common_sampler * smpl, int &n_past){
|
||||
|
||||
const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past);
|
||||
return tmp;
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
ggml_time_init();
|
||||
|
||||
common_params params;
|
||||
|
||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
common_init();
|
||||
|
||||
if (params.mmproj.path.empty() || (params.image.empty())) {
|
||||
show_additional_info(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
|
||||
for (auto & image : params.image) {
|
||||
int n_past = 0;
|
||||
auto * ctx_llava = minicpmv_init(¶ms, image, n_past);
|
||||
|
||||
if (!params.prompt.empty()) {
|
||||
LOG("<user>%s\n", params.prompt.c_str());
|
||||
LOG("<assistant>");
|
||||
auto * smpl = llama_init(ctx_llava, ¶ms, params.prompt, n_past, true);
|
||||
const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
|
||||
std::string response;
|
||||
bool have_tmp = false;
|
||||
for (int i = 0; i < max_tgt_len; i++) {
|
||||
const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
|
||||
response += tmp;
|
||||
if (strcmp(tmp, "</s>") == 0){
|
||||
if (!have_tmp) {
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (strstr(tmp, "###")) break; // Yi-VL behavior
|
||||
have_tmp = true;
|
||||
printf("%s", tmp);
|
||||
if (strstr(response.c_str(), "<user>")) break; // minicpm-v
|
||||
|
||||
fflush(stdout);
|
||||
}
|
||||
common_sampler_free(smpl);
|
||||
}else {
|
||||
while (true) {
|
||||
LOG("<user>");
|
||||
std::string prompt;
|
||||
std::getline(std::cin, prompt);
|
||||
LOG("<assistant>");
|
||||
auto * smpl = llama_init(ctx_llava, ¶ms, prompt, n_past, true);
|
||||
const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
|
||||
std::string response;
|
||||
for (int i = 0; i < max_tgt_len; i++) {
|
||||
const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
|
||||
response += tmp;
|
||||
if (strcmp(tmp, "</s>") == 0) break;
|
||||
printf("%s", tmp);// mistral llava-1.6
|
||||
if (strstr(response.c_str(), "<user>")) break; // minicpm-v
|
||||
fflush(stdout);
|
||||
}
|
||||
common_sampler_free(smpl);
|
||||
}
|
||||
}
|
||||
printf("\n");
|
||||
llama_perf_context_print(ctx_llava->ctx_llama);
|
||||
|
||||
ctx_llava->model = NULL;
|
||||
llava_free(ctx_llava);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -1,341 +0,0 @@
|
||||
#include "clip.h"
|
||||
#include "clip-impl.h"
|
||||
#include "mtmd.h"
|
||||
|
||||
#include "llama.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cerrno>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <limits>
|
||||
#include <vector>
|
||||
|
||||
struct mtmd_context {
|
||||
struct clip_ctx * ctx_clip;
|
||||
const struct llama_model * text_model;
|
||||
std::vector<float> image_embd_v; // image embedding vector
|
||||
bool print_timings;
|
||||
int n_threads;
|
||||
std::string image_marker;
|
||||
|
||||
// TODO @ngxson : add timings
|
||||
|
||||
mtmd_context(const char * mmproj_fname,
|
||||
const llama_model * text_model,
|
||||
const mtmd_context_params & ctx_params) : print_timings(ctx_params.print_timings), n_threads(ctx_params.n_threads), image_marker(ctx_params.image_marker) {
|
||||
clip_context_params ctx_clip_params;
|
||||
ctx_clip_params.use_gpu = ctx_params.use_gpu;
|
||||
ctx_clip_params.verbosity = ctx_params.verbosity;
|
||||
ctx_clip = clip_init(mmproj_fname, ctx_clip_params);
|
||||
if (!ctx_clip) {
|
||||
throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
|
||||
}
|
||||
this->text_model = text_model;
|
||||
}
|
||||
|
||||
~mtmd_context() {
|
||||
clip_free(ctx_clip);
|
||||
}
|
||||
};
|
||||
|
||||
struct mtmd_image_tokens_data {
|
||||
clip_image_f32_batch batch_f32; // preprocessed image patches
|
||||
};
|
||||
|
||||
struct mtmd_image_tokens {
|
||||
uint32_t nx; // number of tokens in x direction
|
||||
uint32_t ny; // number of tokens in y direction
|
||||
uint32_t n_tokens() const { return nx * ny; }
|
||||
clip_image_f32_batch batch_f32; // preprocessed image patches
|
||||
};
|
||||
|
||||
mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
|
||||
const struct llama_model * text_model,
|
||||
const struct mtmd_context_params ctx_params) {
|
||||
try {
|
||||
return new mtmd_context(mmproj_fname, text_model, ctx_params);
|
||||
} catch (const std::exception & e) {
|
||||
LOG_ERR("%s: error: %s\n", __func__, e.what());
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
void mtmd_free(mtmd_context * ctx) {
|
||||
if (ctx) {
|
||||
delete ctx;
|
||||
}
|
||||
}
|
||||
|
||||
// copied from common_tokenize
|
||||
static std::vector<llama_token> mtmd_tokenize_text_internal(
|
||||
const struct llama_vocab * vocab,
|
||||
const std::string & text,
|
||||
bool add_special,
|
||||
bool parse_special) {
|
||||
// upper limit for the number of tokens
|
||||
int n_tokens = text.length() + 2 * add_special;
|
||||
std::vector<llama_token> result(n_tokens);
|
||||
n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
||||
if (n_tokens < 0) {
|
||||
result.resize(-n_tokens);
|
||||
int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
||||
GGML_ASSERT(check == -n_tokens);
|
||||
} else {
|
||||
result.resize(n_tokens);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
|
||||
const mtmd_input_text & text,
|
||||
const std::vector<mtmd_bitmap> & bitmaps) {
|
||||
mtmd_input_chunks * output = new mtmd_input_chunks;
|
||||
auto vocab = llama_model_get_vocab(ctx->text_model);
|
||||
|
||||
std::string prompt_modified(text.text);
|
||||
std::string marker_modified(ctx->image_marker);
|
||||
projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
|
||||
// a bit hacky here, but works for now
|
||||
// for some models, we need to add prefix and suffix to the image embeddings
|
||||
if (proj_type == PROJECTOR_TYPE_GEMMA3) {
|
||||
// <start_of_image> ... (image embeddings) ... <end_of_image>
|
||||
marker_modified = "<start_of_image>" + ctx->image_marker + "<end_of_image>";
|
||||
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
|
||||
}
|
||||
|
||||
std::vector<std::string> parts = string_split_str(text.text, ctx->image_marker);
|
||||
output->clear();
|
||||
output->reserve(parts.size());
|
||||
|
||||
size_t i_img = 0;
|
||||
|
||||
for (const auto & part : parts) {
|
||||
//printf("tokenizing part: %s\n", part.c_str());
|
||||
bool add_bos = &parts.front() == ∂
|
||||
auto tokens = mtmd_tokenize_text_internal(vocab, part, text.add_special && add_bos, text.parse_special);
|
||||
if (tokens.empty()) {
|
||||
continue;
|
||||
}
|
||||
mtmd_input_chunk chunk{
|
||||
MTMD_INPUT_CHUNK_TYPE_TEXT,
|
||||
std::move(tokens),
|
||||
{},
|
||||
};
|
||||
output->emplace_back(std::move(chunk));
|
||||
|
||||
if (&parts.back() != &part) {
|
||||
// add image token to middle of 2 parts
|
||||
|
||||
if (i_img >= bitmaps.size()) {
|
||||
LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size());
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// shim layer
|
||||
clip_image_u8_ptr img_u8(clip_image_u8_init());
|
||||
img_u8->nx = bitmaps[i_img].nx;
|
||||
img_u8->ny = bitmaps[i_img].ny;
|
||||
img_u8->buf.resize(bitmaps[i_img].data.size());
|
||||
std::memcpy(img_u8->buf.data(), bitmaps[i_img].data.data(), img_u8->nx * img_u8->ny * 3);
|
||||
|
||||
// preprocess image
|
||||
clip_image_f32_batch batch_f32;
|
||||
bool ok = clip_image_preprocess(ctx->ctx_clip, img_u8.get(), &batch_f32);
|
||||
if (!ok) {
|
||||
LOG_ERR("Unable to preprocess image\n");
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
mtmd_image_tokens * image_tokens = new mtmd_image_tokens;
|
||||
image_tokens->nx = clip_n_patches(ctx->ctx_clip); // TODO @ngxson : use clip_n_patches_by_image
|
||||
image_tokens->ny = 1; // TODO
|
||||
image_tokens->batch_f32 = std::move(batch_f32);
|
||||
|
||||
mtmd_input_chunk chunk{
|
||||
MTMD_INPUT_CHUNK_TYPE_IMAGE,
|
||||
{},
|
||||
image_tokens,
|
||||
};
|
||||
output->emplace_back(std::move(chunk));
|
||||
i_img++;
|
||||
}
|
||||
}
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
void mtmd_input_chunks_free(mtmd_input_chunks * chunks) {
|
||||
for (auto & chunk : *chunks) {
|
||||
if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE && chunk.tokens_image) {
|
||||
delete chunk.tokens_image;
|
||||
}
|
||||
}
|
||||
delete chunks;
|
||||
}
|
||||
|
||||
int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
|
||||
int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
|
||||
ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
|
||||
bool ok = clip_image_batch_encode(
|
||||
ctx->ctx_clip,
|
||||
ctx->n_threads,
|
||||
&image_tokens->batch_f32,
|
||||
ctx->image_embd_v.data());
|
||||
return ok ? 0 : 1;
|
||||
}
|
||||
|
||||
float * mtmd_get_output_embd(mtmd_context * ctx) {
|
||||
return ctx->image_embd_v.data();
|
||||
}
|
||||
|
||||
size_t mtmd_helper_get_n_tokens(mtmd_input_chunks * chunks) {
|
||||
size_t n_tokens = 0;
|
||||
for (auto & chunk : *chunks) {
|
||||
if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
||||
n_tokens += chunk.tokens_text.size();
|
||||
} else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
||||
n_tokens += chunk.tokens_image->n_tokens();
|
||||
} else {
|
||||
GGML_ASSERT(false && "chunk type not supported");
|
||||
}
|
||||
}
|
||||
return n_tokens;
|
||||
}
|
||||
|
||||
// helper struct to make working with embd batch easier
|
||||
// note: this will be removed after llama_batch_ext refactoring
|
||||
struct decode_embd_batch {
|
||||
std::vector<llama_pos> pos;
|
||||
std::vector<int32_t> n_seq_id;
|
||||
std::vector<llama_seq_id> seq_id_0;
|
||||
std::vector<llama_seq_id *> seq_ids;
|
||||
std::vector<int8_t> logits;
|
||||
llama_batch batch;
|
||||
decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
|
||||
pos .resize(n_tokens);
|
||||
n_seq_id.resize(n_tokens);
|
||||
seq_ids .resize(n_tokens + 1);
|
||||
logits .resize(n_tokens);
|
||||
seq_id_0.resize(1);
|
||||
seq_id_0[0] = seq_id;
|
||||
seq_ids [n_tokens] = nullptr;
|
||||
batch = {
|
||||
/*n_tokens =*/ n_tokens,
|
||||
/*tokens =*/ nullptr,
|
||||
/*embd =*/ embd,
|
||||
/*pos =*/ pos.data(),
|
||||
/*n_seq_id =*/ n_seq_id.data(),
|
||||
/*seq_id =*/ seq_ids.data(),
|
||||
/*logits =*/ logits.data(),
|
||||
};
|
||||
for (int i = 0; i < n_tokens; i++) {
|
||||
batch.pos [i] = pos_0 + i;
|
||||
batch.n_seq_id[i] = 1;
|
||||
batch.seq_id [i] = seq_id_0.data();
|
||||
batch.logits [i] = false;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
int32_t mtmd_helper_eval(mtmd_context * ctx,
|
||||
llama_context * lctx,
|
||||
mtmd_input_chunks * chunks,
|
||||
llama_pos pos0,
|
||||
llama_seq_id seq_id,
|
||||
int32_t n_batch) {
|
||||
int32_t ret;
|
||||
llama_pos n_past = pos0;
|
||||
llama_batch text_batch = llama_batch_init(n_batch, 0, 1);
|
||||
|
||||
for (auto & chunk : *chunks) {
|
||||
bool is_last = &chunk == &chunks->back();
|
||||
if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
||||
// TODO @ngxson : may need to split into smaller batches
|
||||
text_batch.n_tokens = chunk.tokens_text.size();
|
||||
for (size_t i = 0; i < chunk.tokens_text.size(); i++) {
|
||||
text_batch.token [i] = chunk.tokens_text[i];
|
||||
text_batch.pos [i] = n_past++;
|
||||
text_batch.n_seq_id[i] = 1;
|
||||
text_batch.seq_id [i][0] = seq_id;
|
||||
text_batch.logits [i] = false;
|
||||
}
|
||||
if (is_last) {
|
||||
// always get logits for last input chunk
|
||||
text_batch.logits[text_batch.n_tokens - 1] = true;
|
||||
}
|
||||
ret = llama_decode(lctx, text_batch);
|
||||
if (ret != 0) {
|
||||
LOG_ERR("failed to decode text\n");
|
||||
llama_batch_free(text_batch);
|
||||
return ret;
|
||||
}
|
||||
|
||||
} else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
||||
GGML_ASSERT(!is_last && "logits for last image chunk is not yet support");
|
||||
GGML_ASSERT(chunk.tokens_image != nullptr);
|
||||
int64_t t0 = ggml_time_ms();
|
||||
if (ctx->print_timings) {
|
||||
LOG_INF("encoding image...\n");
|
||||
}
|
||||
ret = mtmd_encode(ctx, chunk.tokens_image);
|
||||
if (ret != 0) {
|
||||
LOG_ERR("failed to encode image\n");
|
||||
llama_batch_free(text_batch);
|
||||
return ret;
|
||||
}
|
||||
if (ctx->print_timings) {
|
||||
LOG_INF("image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
|
||||
}
|
||||
|
||||
int32_t n_tokens = chunk.tokens_image->n_tokens();
|
||||
float * embd = mtmd_get_output_embd(ctx);
|
||||
decode_embd_batch batch_img(embd, n_tokens, n_past, 0);
|
||||
int64_t t1 = ggml_time_ms();
|
||||
ret = llama_decode(lctx, batch_img.batch);
|
||||
if (ret != 0) {
|
||||
LOG_ERR("failed to decode image\n");
|
||||
llama_batch_free(text_batch);
|
||||
return ret;
|
||||
}
|
||||
if (ctx->print_timings) {
|
||||
LOG_INF("image decoded in %" PRId64 " ms\n", ggml_time_ms() - t1);
|
||||
}
|
||||
|
||||
n_past += n_tokens;
|
||||
|
||||
} else {
|
||||
GGML_ASSERT(false && "chunk type not supported");
|
||||
}
|
||||
}
|
||||
|
||||
llama_batch_free(text_batch);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int32_t mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len, mtmd_bitmap & output) {
|
||||
clip_image_u8_ptr img_u8(clip_image_u8_init());
|
||||
bool ok = clip_image_load_from_bytes(buf, len, img_u8.get());
|
||||
if (!ok) {
|
||||
LOG_ERR("Unable to load image from buffer\n");
|
||||
return 1;
|
||||
}
|
||||
unsigned char * data = clip_image_u8_get_data(img_u8.get(), &output.nx, &output.ny);
|
||||
output.data.resize(output.nx * output.ny * 3);
|
||||
std::memcpy(output.data.data(), data, output.nx * output.ny * 3);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int32_t mtmd_helper_bitmap_init_from_file(const char * fname, mtmd_bitmap & output) {
|
||||
clip_image_u8_ptr img_u8(clip_image_u8_init());
|
||||
bool ok = clip_image_load_from_file(fname, img_u8.get());
|
||||
if (!ok) {
|
||||
LOG_ERR("Unable to load image %s\n", fname);
|
||||
return 1;
|
||||
}
|
||||
unsigned char * data = clip_image_u8_get_data(img_u8.get(), &output.nx, &output.ny);
|
||||
output.data.resize(output.nx * output.ny * 3);
|
||||
std::memcpy(output.data.data(), data, output.nx * output.ny * 3);
|
||||
return 0;
|
||||
}
|
||||
@@ -1,146 +0,0 @@
|
||||
#ifndef MTMD_H
|
||||
#define MTMD_H
|
||||
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
#include "clip.h"
|
||||
|
||||
#include <vector>
|
||||
#include <cinttypes>
|
||||
#include <memory>
|
||||
|
||||
#ifdef LLAMA_SHARED
|
||||
# if defined(_WIN32) && !defined(__MINGW32__)
|
||||
# ifdef LLAMA_BUILD
|
||||
# define MTMD_API __declspec(dllexport)
|
||||
# else
|
||||
# define MTMD_API __declspec(dllimport)
|
||||
# endif
|
||||
# else
|
||||
# define MTMD_API __attribute__ ((visibility ("default")))
|
||||
# endif
|
||||
#else
|
||||
# define MTMD_API
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
enum mtmd_input_chunk_type {
|
||||
MTMD_INPUT_CHUNK_TYPE_TEXT,
|
||||
MTMD_INPUT_CHUNK_TYPE_IMAGE,
|
||||
};
|
||||
|
||||
struct mtmd_context;
|
||||
struct mtmd_image_tokens;
|
||||
|
||||
// represents raw image data, layout is RGBRGBRGB...
|
||||
// length of data must be nx * ny * 3
|
||||
struct mtmd_bitmap {
|
||||
uint32_t nx;
|
||||
uint32_t ny;
|
||||
std::vector<unsigned char> data;
|
||||
};
|
||||
|
||||
struct mtmd_input_chunk {
|
||||
mtmd_input_chunk_type type;
|
||||
std::vector<llama_token> tokens_text;
|
||||
mtmd_image_tokens * tokens_image = nullptr;
|
||||
};
|
||||
|
||||
using mtmd_input_chunks = std::vector<mtmd_input_chunk>;
|
||||
|
||||
struct mtmd_context_params {
|
||||
bool use_gpu = true;
|
||||
bool print_timings = true;
|
||||
int n_threads = 4;
|
||||
enum ggml_log_level verbosity = GGML_LOG_LEVEL_INFO;
|
||||
const char * image_marker = "<__image__>";
|
||||
};
|
||||
|
||||
struct mtmd_input_text {
|
||||
std::string text;
|
||||
bool add_special;
|
||||
bool parse_special;
|
||||
};
|
||||
|
||||
// initialize the mtmd context
|
||||
// return nullptr on failure
|
||||
MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
|
||||
const llama_model * text_model,
|
||||
const mtmd_context_params ctx_params);
|
||||
|
||||
MTMD_API void mtmd_free(mtmd_context * ctx);
|
||||
|
||||
// tokenize an input text prompt and an image
|
||||
// the prompt must have the input image marker (default: "<__image__>") in it
|
||||
// the marker will be replaced with the image tokens
|
||||
// for example:
|
||||
// "here is an image: <__image__>\ndescribe it in detail."
|
||||
// this will gives 3 chunks:
|
||||
// 1. "here is an image: <start_of_image>"
|
||||
// 2. (image tokens)
|
||||
// 3. "<end_of_image>\ndescribe it in detail."
|
||||
// number of bitmaps must be equal to the number of image markers in the prompt
|
||||
// this function is thread-safe (shared ctx)
|
||||
MTMD_API mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
|
||||
const mtmd_input_text & text,
|
||||
const std::vector<mtmd_bitmap> & bitmaps);
|
||||
|
||||
// free image chunk data
|
||||
MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks);
|
||||
|
||||
// returns 0 on success
|
||||
MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
|
||||
const mtmd_image_tokens * image_tokens);
|
||||
|
||||
// get output embeddings from the last encode pass
|
||||
MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
|
||||
|
||||
//
|
||||
// helper functions (can be implemented based on other functions)
|
||||
//
|
||||
|
||||
// helper to count the total number of tokens from a list of chunks, useful to keep track of n_past
|
||||
MTMD_API size_t mtmd_helper_get_n_tokens(mtmd_input_chunks * chunks);
|
||||
|
||||
// helper function that automatically:
|
||||
// 1. run llama_decode() on text chunks
|
||||
// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
|
||||
// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error
|
||||
// otherwise, returns 0 on success
|
||||
MTMD_API int32_t mtmd_helper_eval(mtmd_context * ctx,
|
||||
llama_context * lctx,
|
||||
mtmd_input_chunks * chunks,
|
||||
llama_pos pos0,
|
||||
llama_seq_id seq_id,
|
||||
int32_t n_batch);
|
||||
|
||||
// helper function to construct a mtmd_bitmap from a file
|
||||
// returns 0 on success
|
||||
// this function is thread-safe
|
||||
MTMD_API int32_t mtmd_helper_bitmap_init_from_file(const char * fname, mtmd_bitmap & output);
|
||||
|
||||
// helper function to construct a mtmd_bitmap from a buffer
|
||||
// the buffer must be an image in format supported by stb_image (jpg, png, bmp, gif, etc.)
|
||||
// returns 0 on success
|
||||
// this function is thread-safe
|
||||
MTMD_API int32_t mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len, mtmd_bitmap & output);
|
||||
|
||||
// convenient unique_ptr wrappers
|
||||
struct mtmd_context_deleter {
|
||||
void operator()(mtmd_context * val) { mtmd_free(val); }
|
||||
};
|
||||
using mtmd_context_ptr = std::unique_ptr<mtmd_context, mtmd_context_deleter>;
|
||||
|
||||
struct mtmd_input_chunks_deleter {
|
||||
void operator()(mtmd_input_chunks * val) { mtmd_input_chunks_free(val); }
|
||||
};
|
||||
using mtmd_input_chunks_ptr = std::unique_ptr<mtmd_input_chunks, mtmd_input_chunks_deleter>;
|
||||
|
||||
#else
|
||||
|
||||
static_assert(false && "C header is not yet supported by this library");
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@@ -1,165 +0,0 @@
|
||||
import argparse
|
||||
from typing import Dict
|
||||
|
||||
import torch
|
||||
import numpy as np
|
||||
from gguf import *
|
||||
from transformers import (
|
||||
Qwen2VLForConditionalGeneration,
|
||||
Qwen2VLProcessor,
|
||||
AutoProcessor,
|
||||
Qwen2VLConfig
|
||||
)
|
||||
|
||||
|
||||
VISION = "clip.vision"
|
||||
|
||||
|
||||
def k(raw_key: str, arch: str) -> str:
|
||||
return raw_key.format(arch=arch)
|
||||
|
||||
|
||||
def to_gguf_name(name: str) -> str:
|
||||
og = name
|
||||
name = name.replace("text_model", "t").replace("vision_model", "v")
|
||||
name = name.replace("blocks", "blk").replace("embeddings.", "")
|
||||
name = name.replace("attn.", "attn_")
|
||||
name = name.replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("proj.", "out.")
|
||||
# name = name.replace("layrnorm", "ln").replace("layer_norm", "ln").replace("layernorm", "ln")
|
||||
name = name.replace("norm1", "ln1").replace("norm2", "ln2")
|
||||
name = name.replace("merger.mlp", 'mm')
|
||||
print(f"[to_gguf_name] {og} --> {name}")
|
||||
return name
|
||||
|
||||
|
||||
def find_vision_tensors(qwen2vl, dtype) -> Dict[str, np.ndarray]:
|
||||
vision_model = qwen2vl.visual
|
||||
tensor_map = {}
|
||||
for name, ten in vision_model.state_dict().items():
|
||||
ten = ten.numpy()
|
||||
if 'qkv' in name:
|
||||
if ten.ndim == 2: # weight
|
||||
c3, _ = ten.shape
|
||||
else: # bias
|
||||
c3 = ten.shape[0]
|
||||
assert c3 % 3 == 0
|
||||
c = c3 // 3
|
||||
wq = ten[:c]
|
||||
wk = ten[c: c * 2]
|
||||
wv = ten[c * 2:]
|
||||
tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "q")] = wq
|
||||
tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "k")] = wk
|
||||
tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "v")] = wv
|
||||
elif 'merger' in name:
|
||||
if name.endswith("ln_q.weight"):
|
||||
tensor_map['v.post_ln.weight'] = ten
|
||||
elif name.endswith("ln_q.bias"):
|
||||
tensor_map['v.post_ln.bias'] = ten
|
||||
else:
|
||||
# "merger.mlp.%d.weight/bias" --> "mm.%d.weight/bias"
|
||||
tensor_map[to_gguf_name(name)] = ten
|
||||
elif 'patch_embed.proj.weight' in name:
|
||||
# NOTE: split Conv3D into Conv2Ds
|
||||
c1, c2, kt, kh, kw = ten.shape
|
||||
assert kt == 2, "Current implmentation only support temporal_patch_size of 2"
|
||||
tensor_map["v.patch_embd.weight"] = ten[:, :, 0, ...]
|
||||
tensor_map["v.patch_embd.weight.1"] = ten[:, :, 1, ...]
|
||||
else:
|
||||
tensor_map[to_gguf_name(f"vision_model.{name}")] = ten
|
||||
|
||||
for new_name, ten in tensor_map.items():
|
||||
if ten.ndim <= 1 or new_name.endswith("_norm.weight"):
|
||||
tensor_map[new_name] = ten.astype(np.float32)
|
||||
else:
|
||||
tensor_map[new_name] = ten.astype(dtype)
|
||||
tensor_map["v.position_embd.weight"] = np.zeros([10, 10], dtype=np.float32) # dummy tensor, just here as a placeholder
|
||||
return tensor_map
|
||||
|
||||
|
||||
def main(args):
|
||||
if args.data_type == 'fp32':
|
||||
dtype = torch.float32
|
||||
np_dtype = np.float32
|
||||
ftype = 0
|
||||
elif args.data_type == 'fp16':
|
||||
dtype = torch.float32
|
||||
np_dtype = np.float16
|
||||
ftype = 1
|
||||
else:
|
||||
raise ValueError()
|
||||
|
||||
local_model = False
|
||||
model_path = ""
|
||||
model_name = args.model_name
|
||||
print("model_name: ", model_name)
|
||||
qwen2vl = Qwen2VLForConditionalGeneration.from_pretrained(
|
||||
model_name, torch_dtype=dtype, device_map="cpu"
|
||||
)
|
||||
cfg: Qwen2VLConfig = qwen2vl.config # type: ignore[reportAssignmentType]
|
||||
vcfg = cfg.vision_config
|
||||
|
||||
if os.path.isdir(model_name):
|
||||
local_model = True
|
||||
if model_name.endswith(os.sep):
|
||||
model_name = model_name[:-1]
|
||||
model_path = model_name
|
||||
model_name = os.path.basename(model_name)
|
||||
fname_out = f"{model_name.replace('/', '-').lower()}-vision.gguf"
|
||||
|
||||
fout = GGUFWriter(path=fname_out, arch="clip")
|
||||
fout.add_description("image encoder for Qwen2VL")
|
||||
|
||||
fout.add_file_type(ftype)
|
||||
fout.add_bool("clip.has_text_encoder", False)
|
||||
fout.add_bool("clip.has_vision_encoder", True)
|
||||
fout.add_bool("clip.has_qwen2vl_merger", True)
|
||||
fout.add_string("clip.projector_type", "qwen2vl_merger")
|
||||
|
||||
print(cfg.vision_config)
|
||||
if 'silu' in cfg.vision_config.hidden_act.lower():
|
||||
fout.add_bool("clip.use_silu", True)
|
||||
fout.add_bool("clip.use_gelu", False)
|
||||
elif 'gelu' in cfg.vision_config.hidden_act.lower():
|
||||
fout.add_bool("clip.use_silu", False)
|
||||
fout.add_bool("clip.use_gelu", 'quick' not in cfg.vision_config.hidden_act.lower())
|
||||
else:
|
||||
raise ValueError()
|
||||
|
||||
tensor_map = find_vision_tensors(qwen2vl, np_dtype)
|
||||
for name, data in tensor_map.items():
|
||||
fout.add_tensor(name, data)
|
||||
|
||||
fout.add_uint32("clip.vision.patch_size", vcfg.patch_size)
|
||||
fout.add_uint32("clip.vision.image_size", 14 * 40) # some reasonable size that is divable by (14*2)
|
||||
fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.embed_dim)
|
||||
fout.add_uint32("clip.vision.projection_dim", vcfg.hidden_size)
|
||||
fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), vcfg.num_heads)
|
||||
fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
|
||||
fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), vcfg.depth)
|
||||
fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), 0) # not sure what this does, put 0 here as a placeholder
|
||||
fout.add_name(model_name)
|
||||
"""
|
||||
HACK: Since vision rope related parameter aren't stored in the `Qwen2VLConfig,
|
||||
it will be hardcoded in the `clip_image_build_graph` from `clip.cpp`.
|
||||
"""
|
||||
|
||||
if local_model:
|
||||
processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_path)
|
||||
else:
|
||||
processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_name)
|
||||
fout.add_array("clip.vision.image_mean", processor.image_processor.image_mean) # type: ignore[reportAttributeAccessIssue]
|
||||
fout.add_array("clip.vision.image_std", processor.image_processor.image_std) # type: ignore[reportAttributeAccessIssue]
|
||||
|
||||
fout.write_header_to_file()
|
||||
fout.write_kv_data_to_file()
|
||||
fout.write_tensors_to_file()
|
||||
fout.close()
|
||||
print("save model as: ", fname_out)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("model_name", nargs='?', default="Qwen/Qwen2-VL-2B-Instruct")
|
||||
parser.add_argument("--data_type", nargs='?', choices=['fp32', 'fp16'], default="fp32")
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
@@ -1,81 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# make sure we are in the right directory
|
||||
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
|
||||
cd $SCRIPT_DIR
|
||||
|
||||
#export LLAMA_CACHE="$SCRIPT_DIR/tmp"
|
||||
|
||||
set -eux
|
||||
|
||||
mkdir -p $SCRIPT_DIR/output
|
||||
|
||||
PROJ_ROOT="$SCRIPT_DIR/../.."
|
||||
cd $PROJ_ROOT
|
||||
|
||||
###############
|
||||
|
||||
arr_bin=()
|
||||
arr_hf=()
|
||||
|
||||
add_test() {
|
||||
local bin=$1
|
||||
local hf=$2
|
||||
arr_bin+=("$bin")
|
||||
arr_hf+=("$hf")
|
||||
}
|
||||
|
||||
add_test "llama-gemma3-cli" "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"
|
||||
add_test "llama-llava-cli" "cmp-nct/Yi-VL-6B-GGUF:Q5_K"
|
||||
add_test "llama-llava-cli" "guinmoon/MobileVLM-3B-GGUF:Q4_K_M"
|
||||
add_test "llama-llava-cli" "THUDM/glm-edge-v-5b-gguf:Q4_K_M"
|
||||
add_test "llama-llava-cli" "second-state/Llava-v1.5-7B-GGUF:Q2_K"
|
||||
add_test "llama-llava-cli" "cjpais/llava-1.6-mistral-7b-gguf:Q3_K"
|
||||
add_test "llama-llava-cli" "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M"
|
||||
add_test "llama-minicpmv-cli" "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K" # model from openbmb is corrupted
|
||||
add_test "llama-minicpmv-cli" "openbmb/MiniCPM-V-2_6-gguf:Q2_K"
|
||||
add_test "llama-minicpmv-cli" "openbmb/MiniCPM-o-2_6-gguf:Q4_0"
|
||||
add_test "llama-qwen2vl-cli" "bartowski/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
|
||||
|
||||
###############
|
||||
|
||||
cmake --build build -j --target "${arr_bin[@]}"
|
||||
|
||||
arr_res=()
|
||||
|
||||
for i in "${!arr_bin[@]}"; do
|
||||
bin="${arr_bin[$i]}"
|
||||
hf="${arr_hf[$i]}"
|
||||
|
||||
echo "Running test with binary: $bin and HF model: $hf"
|
||||
echo ""
|
||||
echo ""
|
||||
|
||||
output=$("$PROJ_ROOT/build/bin/$bin" -hf "$hf" --image $SCRIPT_DIR/test-1.jpeg -p "what is the publisher name of the newspaper?" --temp 0 2>&1 | tee /dev/tty)
|
||||
|
||||
echo "$output" > $SCRIPT_DIR/output/$bin-$(echo "$hf" | tr '/' '-').log
|
||||
|
||||
if echo "$output" | grep -iq "new york"; then
|
||||
result="\033[32mOK\033[0m: $bin $hf"
|
||||
else
|
||||
result="\033[31mFAIL\033[0m: $bin $hf"
|
||||
fi
|
||||
echo -e "$result"
|
||||
arr_res+=("$result")
|
||||
|
||||
echo ""
|
||||
echo ""
|
||||
echo ""
|
||||
echo "#################################################"
|
||||
echo "#################################################"
|
||||
echo ""
|
||||
echo ""
|
||||
done
|
||||
|
||||
set +x
|
||||
|
||||
for i in "${!arr_res[@]}"; do
|
||||
echo -e "${arr_res[$i]}"
|
||||
done
|
||||
echo ""
|
||||
echo "Output logs are saved in $SCRIPT_DIR/output"
|
||||
@@ -23,7 +23,7 @@ def create_completion(host, prompt, gbnf_grammar):
|
||||
"""Calls the /completion API on llama-server.
|
||||
|
||||
See
|
||||
https://github.com/ggml-org/llama.cpp/tree/HEAD/examples/server#api-endpoints
|
||||
https://github.com/ggml-org/llama.cpp/tree/HEAD/tools/server#api-endpoints
|
||||
"""
|
||||
print(f" Request:\n Grammar:\n{textwrap.indent(gbnf_grammar, ' ')}\n Prompt:\n{textwrap.indent(prompt.rstrip(), ' ')}")
|
||||
headers = {"Content-Type": "application/json"}
|
||||
|
||||
@@ -1,6 +0,0 @@
|
||||
set(TARGET llama-quantize-stats)
|
||||
add_executable(${TARGET} quantize-stats.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_include_directories(${TARGET} PRIVATE ../../common)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
Binary file not shown.
@@ -1,178 +0,0 @@
|
||||
import { useEffect, useState } from 'react';
|
||||
import StorageUtils from '../utils/storage';
|
||||
import { useAppContext } from '../utils/app.context';
|
||||
import { classNames } from '../utils/misc';
|
||||
import daisyuiThemes from 'daisyui/theme/object';
|
||||
import { THEMES } from '../Config';
|
||||
import { useNavigate } from 'react-router';
|
||||
|
||||
export default function Header() {
|
||||
const navigate = useNavigate();
|
||||
const [selectedTheme, setSelectedTheme] = useState(StorageUtils.getTheme());
|
||||
const { setShowSettings } = useAppContext();
|
||||
|
||||
const setTheme = (theme: string) => {
|
||||
StorageUtils.setTheme(theme);
|
||||
setSelectedTheme(theme);
|
||||
};
|
||||
|
||||
useEffect(() => {
|
||||
document.body.setAttribute('data-theme', selectedTheme);
|
||||
document.body.setAttribute(
|
||||
'data-color-scheme',
|
||||
daisyuiThemes[selectedTheme]?.['color-scheme'] ?? 'auto'
|
||||
);
|
||||
}, [selectedTheme]);
|
||||
|
||||
const { isGenerating, viewingChat } = useAppContext();
|
||||
const isCurrConvGenerating = isGenerating(viewingChat?.conv.id ?? '');
|
||||
|
||||
const removeConversation = () => {
|
||||
if (isCurrConvGenerating || !viewingChat) return;
|
||||
const convId = viewingChat?.conv.id;
|
||||
if (window.confirm('Are you sure to delete this conversation?')) {
|
||||
StorageUtils.remove(convId);
|
||||
navigate('/');
|
||||
}
|
||||
};
|
||||
|
||||
const downloadConversation = () => {
|
||||
if (isCurrConvGenerating || !viewingChat) return;
|
||||
const convId = viewingChat?.conv.id;
|
||||
const conversationJson = JSON.stringify(viewingChat, null, 2);
|
||||
const blob = new Blob([conversationJson], { type: 'application/json' });
|
||||
const url = URL.createObjectURL(blob);
|
||||
const a = document.createElement('a');
|
||||
a.href = url;
|
||||
a.download = `conversation_${convId}.json`;
|
||||
document.body.appendChild(a);
|
||||
a.click();
|
||||
document.body.removeChild(a);
|
||||
URL.revokeObjectURL(url);
|
||||
};
|
||||
|
||||
return (
|
||||
<div className="flex flex-row items-center pt-6 pb-6 sticky top-0 z-10 bg-base-100">
|
||||
{/* open sidebar button */}
|
||||
<label htmlFor="toggle-drawer" className="btn btn-ghost lg:hidden">
|
||||
<svg
|
||||
xmlns="http://www.w3.org/2000/svg"
|
||||
width="16"
|
||||
height="16"
|
||||
fill="currentColor"
|
||||
className="bi bi-list"
|
||||
viewBox="0 0 16 16"
|
||||
>
|
||||
<path
|
||||
fillRule="evenodd"
|
||||
d="M2.5 12a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5m0-4a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5m0-4a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5"
|
||||
/>
|
||||
</svg>
|
||||
</label>
|
||||
|
||||
<div className="grow text-2xl font-bold ml-2">llama.cpp</div>
|
||||
|
||||
{/* action buttons (top right) */}
|
||||
<div className="flex items-center">
|
||||
{viewingChat && (
|
||||
<div className="dropdown dropdown-end">
|
||||
{/* "..." button */}
|
||||
<button
|
||||
tabIndex={0}
|
||||
role="button"
|
||||
className="btn m-1"
|
||||
disabled={isCurrConvGenerating}
|
||||
>
|
||||
<svg
|
||||
xmlns="http://www.w3.org/2000/svg"
|
||||
width="16"
|
||||
height="16"
|
||||
fill="currentColor"
|
||||
className="bi bi-three-dots-vertical"
|
||||
viewBox="0 0 16 16"
|
||||
>
|
||||
<path d="M9.5 13a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0m0-5a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0m0-5a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0" />
|
||||
</svg>
|
||||
</button>
|
||||
{/* dropdown menu */}
|
||||
<ul
|
||||
tabIndex={0}
|
||||
className="dropdown-content menu bg-base-100 rounded-box z-[1] w-52 p-2 shadow"
|
||||
>
|
||||
<li onClick={downloadConversation}>
|
||||
<a>Download</a>
|
||||
</li>
|
||||
<li className="text-error" onClick={removeConversation}>
|
||||
<a>Delete</a>
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
)}
|
||||
|
||||
<div className="tooltip tooltip-bottom" data-tip="Settings">
|
||||
<button className="btn" onClick={() => setShowSettings(true)}>
|
||||
{/* settings button */}
|
||||
<svg
|
||||
xmlns="http://www.w3.org/2000/svg"
|
||||
width="16"
|
||||
height="16"
|
||||
fill="currentColor"
|
||||
className="bi bi-gear"
|
||||
viewBox="0 0 16 16"
|
||||
>
|
||||
<path d="M8 4.754a3.246 3.246 0 1 0 0 6.492 3.246 3.246 0 0 0 0-6.492M5.754 8a2.246 2.246 0 1 1 4.492 0 2.246 2.246 0 0 1-4.492 0" />
|
||||
<path d="M9.796 1.343c-.527-1.79-3.065-1.79-3.592 0l-.094.319a.873.873 0 0 1-1.255.52l-.292-.16c-1.64-.892-3.433.902-2.54 2.541l.159.292a.873.873 0 0 1-.52 1.255l-.319.094c-1.79.527-1.79 3.065 0 3.592l.319.094a.873.873 0 0 1 .52 1.255l-.16.292c-.892 1.64.901 3.434 2.541 2.54l.292-.159a.873.873 0 0 1 1.255.52l.094.319c.527 1.79 3.065 1.79 3.592 0l.094-.319a.873.873 0 0 1 1.255-.52l.292.16c1.64.893 3.434-.902 2.54-2.541l-.159-.292a.873.873 0 0 1 .52-1.255l.319-.094c1.79-.527 1.79-3.065 0-3.592l-.319-.094a.873.873 0 0 1-.52-1.255l.16-.292c.893-1.64-.902-3.433-2.541-2.54l-.292.159a.873.873 0 0 1-1.255-.52zm-2.633.283c.246-.835 1.428-.835 1.674 0l.094.319a1.873 1.873 0 0 0 2.693 1.115l.291-.16c.764-.415 1.6.42 1.184 1.185l-.159.292a1.873 1.873 0 0 0 1.116 2.692l.318.094c.835.246.835 1.428 0 1.674l-.319.094a1.873 1.873 0 0 0-1.115 2.693l.16.291c.415.764-.42 1.6-1.185 1.184l-.291-.159a1.873 1.873 0 0 0-2.693 1.116l-.094.318c-.246.835-1.428.835-1.674 0l-.094-.319a1.873 1.873 0 0 0-2.692-1.115l-.292.16c-.764.415-1.6-.42-1.184-1.185l.159-.291A1.873 1.873 0 0 0 1.945 8.93l-.319-.094c-.835-.246-.835-1.428 0-1.674l.319-.094A1.873 1.873 0 0 0 3.06 4.377l-.16-.292c-.415-.764.42-1.6 1.185-1.184l.292.159a1.873 1.873 0 0 0 2.692-1.115z" />
|
||||
</svg>
|
||||
</button>
|
||||
</div>
|
||||
|
||||
{/* theme controller is copied from https://daisyui.com/components/theme-controller/ */}
|
||||
<div className="tooltip tooltip-bottom" data-tip="Themes">
|
||||
<div className="dropdown dropdown-end dropdown-bottom">
|
||||
<div tabIndex={0} role="button" className="btn m-1">
|
||||
<svg
|
||||
xmlns="http://www.w3.org/2000/svg"
|
||||
width="16"
|
||||
height="16"
|
||||
fill="currentColor"
|
||||
className="bi bi-palette2"
|
||||
viewBox="0 0 16 16"
|
||||
>
|
||||
<path d="M0 .5A.5.5 0 0 1 .5 0h5a.5.5 0 0 1 .5.5v5.277l4.147-4.131a.5.5 0 0 1 .707 0l3.535 3.536a.5.5 0 0 1 0 .708L10.261 10H15.5a.5.5 0 0 1 .5.5v5a.5.5 0 0 1-.5.5H3a3 3 0 0 1-2.121-.879A3 3 0 0 1 0 13.044m6-.21 7.328-7.3-2.829-2.828L6 7.188zM4.5 13a1.5 1.5 0 1 0-3 0 1.5 1.5 0 0 0 3 0M15 15v-4H9.258l-4.015 4zM0 .5v12.495zm0 12.495V13z" />
|
||||
</svg>
|
||||
</div>
|
||||
<ul
|
||||
tabIndex={0}
|
||||
className="dropdown-content bg-base-300 rounded-box z-[1] w-52 p-2 shadow-2xl h-80 overflow-y-auto"
|
||||
>
|
||||
<li>
|
||||
<button
|
||||
className={classNames({
|
||||
'btn btn-sm btn-block btn-ghost justify-start': true,
|
||||
'btn-active': selectedTheme === 'auto',
|
||||
})}
|
||||
onClick={() => setTheme('auto')}
|
||||
>
|
||||
auto
|
||||
</button>
|
||||
</li>
|
||||
{THEMES.map((theme) => (
|
||||
<li key={theme}>
|
||||
<input
|
||||
type="radio"
|
||||
name="theme-dropdown"
|
||||
className="theme-controller btn btn-sm btn-block btn-ghost justify-start"
|
||||
aria-label={theme}
|
||||
value={theme}
|
||||
checked={selectedTheme === theme}
|
||||
onChange={(e) => e.target.checked && setTheme(theme)}
|
||||
/>
|
||||
</li>
|
||||
))}
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
@@ -1,96 +0,0 @@
|
||||
import { useEffect, useState } from 'react';
|
||||
import { classNames } from '../utils/misc';
|
||||
import { Conversation } from '../utils/types';
|
||||
import StorageUtils from '../utils/storage';
|
||||
import { useNavigate, useParams } from 'react-router';
|
||||
|
||||
export default function Sidebar() {
|
||||
const params = useParams();
|
||||
const navigate = useNavigate();
|
||||
|
||||
const [conversations, setConversations] = useState<Conversation[]>([]);
|
||||
const [currConv, setCurrConv] = useState<Conversation | null>(null);
|
||||
|
||||
useEffect(() => {
|
||||
StorageUtils.getOneConversation(params.convId ?? '').then(setCurrConv);
|
||||
}, [params.convId]);
|
||||
|
||||
useEffect(() => {
|
||||
const handleConversationChange = async () => {
|
||||
setConversations(await StorageUtils.getAllConversations());
|
||||
};
|
||||
StorageUtils.onConversationChanged(handleConversationChange);
|
||||
handleConversationChange();
|
||||
return () => {
|
||||
StorageUtils.offConversationChanged(handleConversationChange);
|
||||
};
|
||||
}, []);
|
||||
|
||||
return (
|
||||
<>
|
||||
<input
|
||||
id="toggle-drawer"
|
||||
type="checkbox"
|
||||
className="drawer-toggle"
|
||||
defaultChecked
|
||||
/>
|
||||
|
||||
<div className="drawer-side h-screen lg:h-screen z-50 lg:max-w-64">
|
||||
<label
|
||||
htmlFor="toggle-drawer"
|
||||
aria-label="close sidebar"
|
||||
className="drawer-overlay"
|
||||
></label>
|
||||
<div className="flex flex-col bg-base-200 min-h-full max-w-64 py-4 px-4">
|
||||
<div className="flex flex-row items-center justify-between mb-4 mt-4">
|
||||
<h2 className="font-bold ml-4">Conversations</h2>
|
||||
|
||||
{/* close sidebar button */}
|
||||
<label htmlFor="toggle-drawer" className="btn btn-ghost lg:hidden">
|
||||
<svg
|
||||
xmlns="http://www.w3.org/2000/svg"
|
||||
width="16"
|
||||
height="16"
|
||||
fill="currentColor"
|
||||
className="bi bi-arrow-bar-left"
|
||||
viewBox="0 0 16 16"
|
||||
>
|
||||
<path
|
||||
fillRule="evenodd"
|
||||
d="M12.5 15a.5.5 0 0 1-.5-.5v-13a.5.5 0 0 1 1 0v13a.5.5 0 0 1-.5.5M10 8a.5.5 0 0 1-.5.5H3.707l2.147 2.146a.5.5 0 0 1-.708.708l-3-3a.5.5 0 0 1 0-.708l3-3a.5.5 0 1 1 .708.708L3.707 7.5H9.5a.5.5 0 0 1 .5.5"
|
||||
/>
|
||||
</svg>
|
||||
</label>
|
||||
</div>
|
||||
|
||||
{/* list of conversations */}
|
||||
<div
|
||||
className={classNames({
|
||||
'btn btn-ghost justify-start': true,
|
||||
'btn-active': !currConv,
|
||||
})}
|
||||
onClick={() => navigate('/')}
|
||||
>
|
||||
+ New conversation
|
||||
</div>
|
||||
{conversations.map((conv) => (
|
||||
<div
|
||||
key={conv.id}
|
||||
className={classNames({
|
||||
'btn btn-ghost justify-start font-normal': true,
|
||||
'btn-active': conv.id === currConv?.id,
|
||||
})}
|
||||
onClick={() => navigate(`/chat/${conv.id}`)}
|
||||
dir="auto"
|
||||
>
|
||||
<span className="truncate">{conv.name}</span>
|
||||
</div>
|
||||
))}
|
||||
<div className="text-center text-xs opacity-40 mt-auto mx-4">
|
||||
Conversations are saved to browser's IndexedDB
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</>
|
||||
);
|
||||
}
|
||||
@@ -107,6 +107,7 @@ message(DEBUG "INS_ENB : ${INS_ENB}")
|
||||
option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
|
||||
option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
|
||||
option(GGML_CPU_KLEIDIAI "ggml: use KleidiAI optimized kernels if applicable" OFF)
|
||||
option(GGML_SSE42 "ggml: enable SSE 4.2" ${INS_ENB})
|
||||
option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
|
||||
option(GGML_AVX_VNNI "ggml: enable AVX-VNNI" OFF)
|
||||
option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})
|
||||
@@ -359,3 +360,29 @@ write_basic_package_version_file(
|
||||
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake
|
||||
${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake
|
||||
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml)
|
||||
|
||||
if (MSVC)
|
||||
set(MSVC_WARNING_FLAGS
|
||||
/wd4005 # Macro redefinition
|
||||
/wd4244 # Conversion from one type to another type, possible loss of data
|
||||
/wd4267 # Conversion from 'size_t' to a smaller type, possible loss of data
|
||||
/wd4996 # Disable POSIX deprecation warnings
|
||||
/wd4702 # Unreachable code warnings
|
||||
)
|
||||
function(disable_msvc_warnings target_name)
|
||||
if(TARGET ${target_name})
|
||||
target_compile_options(${target_name} PRIVATE ${MSVC_WARNING_FLAGS})
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
disable_msvc_warnings(ggml-base)
|
||||
disable_msvc_warnings(ggml)
|
||||
disable_msvc_warnings(ggml-cpu)
|
||||
disable_msvc_warnings(ggml-cpu-x64)
|
||||
disable_msvc_warnings(ggml-cpu-sse42)
|
||||
disable_msvc_warnings(ggml-cpu-sandybridge)
|
||||
disable_msvc_warnings(ggml-cpu-haswell)
|
||||
disable_msvc_warnings(ggml-cpu-skylakex)
|
||||
disable_msvc_warnings(ggml-cpu-icelake)
|
||||
disable_msvc_warnings(ggml-cpu-alderlake)
|
||||
endif()
|
||||
|
||||
@@ -38,7 +38,7 @@ extern "C" {
|
||||
GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
|
||||
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
|
||||
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
|
||||
GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
|
||||
GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
|
||||
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
|
||||
GGML_API ggml_backend_dev_t ggml_backend_buft_get_device (ggml_backend_buffer_type_t buft);
|
||||
|
||||
@@ -59,7 +59,7 @@ extern "C" {
|
||||
GGML_API enum ggml_status ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
|
||||
GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
|
||||
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
|
||||
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
|
||||
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
|
||||
|
||||
@@ -24,7 +24,7 @@ typedef std::unique_ptr<gguf_context, gguf_context_deleter> gguf_context_ptr;
|
||||
|
||||
struct ggml_gallocr_deleter { void operator()(ggml_gallocr_t galloc) { ggml_gallocr_free(galloc); } };
|
||||
|
||||
typedef std::unique_ptr<ggml_gallocr_t, ggml_gallocr_deleter> ggml_gallocr_ptr;
|
||||
typedef std::unique_ptr<ggml_gallocr, ggml_gallocr_deleter> ggml_gallocr_ptr;
|
||||
|
||||
// ggml-backend
|
||||
|
||||
|
||||
@@ -133,6 +133,11 @@ extern "C" {
|
||||
|
||||
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
|
||||
|
||||
GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
|
||||
GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
|
||||
GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);
|
||||
GGML_BACKEND_API void ggml_cpu_bf16_to_fp32(const ggml_bf16_t *, float *, int64_t);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -7,6 +7,9 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define RPC_PROTO_MAJOR_VERSION 2
|
||||
#define RPC_PROTO_MINOR_VERSION 0
|
||||
#define RPC_PROTO_PATCH_VERSION 0
|
||||
#define GGML_RPC_MAX_SERVERS 16
|
||||
|
||||
// backend API
|
||||
|
||||
@@ -393,8 +393,8 @@ extern "C" {
|
||||
|
||||
// precision
|
||||
enum ggml_prec {
|
||||
GGML_PREC_DEFAULT,
|
||||
GGML_PREC_F32,
|
||||
GGML_PREC_DEFAULT = 0, // stored as ggml_tensor.op_params, 0 by default
|
||||
GGML_PREC_F32 = 10,
|
||||
};
|
||||
|
||||
// model file types
|
||||
@@ -481,6 +481,7 @@ extern "C" {
|
||||
GGML_OP_CONV_TRANSPOSE_1D,
|
||||
GGML_OP_IM2COL,
|
||||
GGML_OP_IM2COL_BACK,
|
||||
GGML_OP_CONV_2D_DW,
|
||||
GGML_OP_CONV_TRANSPOSE_2D,
|
||||
GGML_OP_POOL_1D,
|
||||
GGML_OP_POOL_2D,
|
||||
@@ -672,11 +673,18 @@ extern "C" {
|
||||
GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
|
||||
GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
|
||||
|
||||
// returns whether the tensor elements can be iterated over with a flattened index (no gaps, no permutation)
|
||||
GGML_API bool ggml_is_contiguous (const struct ggml_tensor * tensor);
|
||||
GGML_API bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
|
||||
GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
|
||||
GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
|
||||
|
||||
// returns whether the tensor elements are allocated as one contiguous block of memory (no gaps, but permutation ok)
|
||||
GGML_API bool ggml_is_contiguously_allocated(const struct ggml_tensor * tensor);
|
||||
|
||||
// true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
|
||||
GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
|
||||
GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
|
||||
|
||||
@@ -1660,7 +1668,7 @@ extern "C" {
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
// depthwise
|
||||
// depthwise (via im2col and mul_mat)
|
||||
GGML_API struct ggml_tensor * ggml_conv_2d_dw(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a, // convolution kernel
|
||||
@@ -1672,6 +1680,22 @@ extern "C" {
|
||||
int d0, // dilation dimension 0
|
||||
int d1); // dilation dimension 1
|
||||
|
||||
// Depthwise 2D convolution
|
||||
// may be faster than ggml_conv_2d_dw, but not available in all backends
|
||||
// a: KW KH 1 C convolution kernel
|
||||
// b: W H C N input data
|
||||
// res: W_out H_out C N
|
||||
GGML_API struct ggml_tensor * ggml_conv_2d_dw_direct(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
int stride0,
|
||||
int stride1,
|
||||
int pad0,
|
||||
int pad1,
|
||||
int dilation0,
|
||||
int dilation1);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
|
||||
@@ -214,7 +214,7 @@ add_library(ggml
|
||||
target_link_libraries(ggml PUBLIC ggml-base)
|
||||
|
||||
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
|
||||
target_link_libraries(ggml PRIVATE dl stdc++fs)
|
||||
target_link_libraries(ggml PRIVATE dl)
|
||||
endif()
|
||||
|
||||
function(ggml_add_backend_library backend)
|
||||
@@ -267,6 +267,7 @@ function(ggml_add_cpu_backend_variant tag_name)
|
||||
set(GGML_CPU_TAG_NAME ${tag_name})
|
||||
# other: OPENMP LLAMAFILE CPU_HBM
|
||||
foreach (feat NATIVE
|
||||
SSE42
|
||||
AVX AVX2 BMI2 AVX_VNNI FMA F16C
|
||||
AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
|
||||
AMX_TILE AMX_INT8 AMX_BF16)
|
||||
@@ -286,14 +287,16 @@ if (GGML_CPU_ALL_VARIANTS)
|
||||
if (NOT GGML_BACKEND_DL)
|
||||
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
|
||||
endif()
|
||||
ggml_add_cpu_backend_variant(sandybridge AVX)
|
||||
ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 BMI2 FMA)
|
||||
ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 BMI2 FMA AVX512)
|
||||
ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
|
||||
ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 BMI2 FMA AVX_VNNI)
|
||||
ggml_add_cpu_backend_variant(x64)
|
||||
ggml_add_cpu_backend_variant(sse42 SSE42)
|
||||
ggml_add_cpu_backend_variant(sandybridge SSE42 AVX)
|
||||
ggml_add_cpu_backend_variant(haswell SSE42 AVX F16C AVX2 BMI2 FMA)
|
||||
ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
|
||||
ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
|
||||
ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
|
||||
if (NOT MSVC)
|
||||
# MSVC doesn't support AMX
|
||||
ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
|
||||
ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
|
||||
endif()
|
||||
elseif (GGML_CPU)
|
||||
ggml_add_cpu_backend_variant_impl("")
|
||||
|
||||
@@ -816,7 +816,10 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
|
||||
static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
|
||||
size_t node_size = 0;
|
||||
if (!node->data && !node->view_src) {
|
||||
GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API
|
||||
// If we previously had data but don't now then reallocate
|
||||
if (talloc->buffer_id < 0) {
|
||||
return false;
|
||||
}
|
||||
node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
|
||||
}
|
||||
return talloc->size_max >= node_size;
|
||||
|
||||
@@ -56,7 +56,7 @@ size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
|
||||
return SIZE_MAX;
|
||||
}
|
||||
|
||||
size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
|
||||
size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) {
|
||||
// get_alloc_size is optional, defaults to ggml_nbytes
|
||||
if (buft->iface.get_alloc_size) {
|
||||
size_t size = buft->iface.get_alloc_size(buft, tensor);
|
||||
@@ -152,7 +152,7 @@ size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
|
||||
return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
|
||||
}
|
||||
|
||||
size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
||||
size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor) {
|
||||
return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
|
||||
}
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -23,6 +23,7 @@
|
||||
#ifndef CANN_ACLNN_OPS
|
||||
#define CANN_ACLNN_OPS
|
||||
|
||||
#include <functional>
|
||||
#include <aclnnop/aclnn_abs.h>
|
||||
#include <aclnnop/aclnn_neg.h>
|
||||
#include <aclnnop/aclnn_exp.h>
|
||||
@@ -713,6 +714,270 @@ void ggml_cann_count_equal(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
*/
|
||||
void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
/*
|
||||
* @brief A generic wrapper for ACL resources with custom deleter support.
|
||||
*/
|
||||
using any_acl_resource = std::unique_ptr<void, std::function<void(void*)>>;
|
||||
|
||||
/**
|
||||
* @brief Trait structure used to define how to destroy a given ACL resource type.
|
||||
*
|
||||
* @tparam T ACL resource type.
|
||||
*/
|
||||
template<typename T>
|
||||
struct acl_resource_traits;
|
||||
|
||||
/**
|
||||
* @brief Specialization for aclTensor, defines how to destroy an aclTensor resource.
|
||||
*/
|
||||
template<>
|
||||
struct acl_resource_traits<aclTensor> {
|
||||
static void destroy(void* p) {
|
||||
ACL_CHECK(aclDestroyTensor(static_cast<aclTensor*>(p)));
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Specialization for aclIntArray, defines how to destroy an aclIntArray resource.
|
||||
*/
|
||||
template<>
|
||||
struct acl_resource_traits<aclIntArray> {
|
||||
static void destroy(void* p) {
|
||||
ACL_CHECK(aclDestroyIntArray(static_cast<aclIntArray*>(p)));
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Specialization for aclScalar, defines how to destroy an aclScalar resource.
|
||||
*/
|
||||
template<>
|
||||
struct acl_resource_traits<aclScalar> {
|
||||
static void destroy(void* p) {
|
||||
ACL_CHECK(aclDestroyScalar(static_cast<aclScalar*>(p)));
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Specialization for aclTensorList, defines how to destroy an aclTensorList resource.
|
||||
*/
|
||||
template<>
|
||||
struct acl_resource_traits<aclTensorList> {
|
||||
static void destroy(void* p) {
|
||||
ACL_CHECK(aclDestroyTensorList(static_cast<aclTensorList*>(p)));
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Creates a generic ACL resource wrapper with proper destruction logic.
|
||||
*
|
||||
* @tparam T ACL resource type.
|
||||
* @param ptr Raw pointer to ACL resource.
|
||||
* @return any_acl_resource Smart pointer that handles destruction.
|
||||
*/
|
||||
template<typename T>
|
||||
any_acl_resource make_acl_resource(T* ptr) {
|
||||
return any_acl_resource(
|
||||
static_cast<void*>(ptr),
|
||||
[](void* p) {
|
||||
acl_resource_traits<T>::destroy(p);
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Registers multiple ACL resources into a vector for lifetime management.
|
||||
*
|
||||
* @tparam Args Variadic list of ACL resource types.
|
||||
* @param vec Target vector to hold ACL resources.
|
||||
* @param args Raw pointers to ACL resources.
|
||||
*/
|
||||
template<typename... Args>
|
||||
void register_acl_resources(std::vector<any_acl_resource>& vec, Args*... args) {
|
||||
(vec.emplace_back(make_acl_resource(args)), ...);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Task class that wraps the execution of an aclnn function call.
|
||||
*/
|
||||
class aclnn_task : public cann_task {
|
||||
public:
|
||||
aclnn_task(aclnn_func_t aclnn_func, void * workspace_addr,
|
||||
uint64_t workspace_size, aclOpExecutor * executor,
|
||||
aclrtStream stream) :
|
||||
aclnn_func_(aclnn_func),
|
||||
workspace_addr_(workspace_addr),
|
||||
workspace_size_(workspace_size),
|
||||
executor_(executor),
|
||||
stream_(stream) {}
|
||||
virtual void run_task() override {
|
||||
ACL_CHECK(aclnn_func_(workspace_addr_, workspace_size_, executor_, stream_));
|
||||
}
|
||||
private:
|
||||
aclnn_func_t aclnn_func_;
|
||||
void * workspace_addr_;
|
||||
uint64_t workspace_size_;
|
||||
aclOpExecutor * executor_;
|
||||
aclrtStream stream_;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Task class that releases ACL resources after usage.
|
||||
*/
|
||||
class release_resource_task : public cann_task {
|
||||
public:
|
||||
release_resource_task(std::vector<any_acl_resource>&& resources){
|
||||
resource_ = std::move(resources);
|
||||
}
|
||||
|
||||
virtual void run_task() override {
|
||||
resource_.clear();
|
||||
}
|
||||
private:
|
||||
std::vector<any_acl_resource> resource_;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Task class for performing asynchronous memory copy operations.
|
||||
*/
|
||||
class async_memcpy_task : public cann_task {
|
||||
public:
|
||||
async_memcpy_task(void* dst, const void* src, size_t size,
|
||||
aclrtMemcpyKind kind, aclrtStream stream)
|
||||
: dst_(dst), src_(src), size_(size), kind_(kind), stream_(stream) {}
|
||||
|
||||
virtual void run_task() override {
|
||||
ACL_CHECK(aclrtMemcpyAsync(dst_, size_, src_, size_, kind_, stream_));
|
||||
}
|
||||
private:
|
||||
void* dst_;
|
||||
const void* src_;
|
||||
size_t size_;
|
||||
aclrtMemcpyKind kind_;
|
||||
aclrtStream stream_;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Task class for performing asynchronous memory set operations.
|
||||
*/
|
||||
class async_memset_task : public cann_task {
|
||||
public:
|
||||
async_memset_task(void* buffer, size_t size, int32_t value, aclrtStream stream)
|
||||
: buffer_(buffer), size_(size), value_(value), stream_(stream) {}
|
||||
|
||||
virtual void run_task() override {
|
||||
ACL_CHECK(aclrtMemsetAsync(buffer_, size_, value_, size_, stream_));
|
||||
}
|
||||
private:
|
||||
void* buffer_;
|
||||
size_t size_;
|
||||
int32_t value_;
|
||||
aclrtStream stream_;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Launches an asynchronous task using the memory allocator.
|
||||
*
|
||||
* This macro submit an asynchronous task on the specified stream.
|
||||
* The task uses memory allocated by the allocator. It is guaranteed
|
||||
* that the memory will not be accessed by other tasks until this task
|
||||
* completes, due to the sequential execution order within the same stream.
|
||||
*
|
||||
* @param OP_NAME aclnn operator name.
|
||||
* @param args Additional arguments required by the task.
|
||||
*
|
||||
* @note
|
||||
* Memory from the allocator will be "freed" immediately and can be
|
||||
* reallocated to other pointers. However, it won't be accessed by any
|
||||
* other task before this asynchronous task ends, because all tasks in the
|
||||
* same stream are executed in queue order.
|
||||
*/
|
||||
|
||||
#define GGML_CANN_CALL_ACLNN_OP(CTX, OP_NAME, ...) \
|
||||
do { \
|
||||
uint64_t workspaceSize = 0; \
|
||||
aclOpExecutor * executor; \
|
||||
void * workspaceAddr = nullptr; \
|
||||
ACL_CHECK(aclnn##OP_NAME##GetWorkspaceSize(__VA_ARGS__, &workspaceSize, &executor));\
|
||||
/* workspace should alloced in main thread to keep malloc order when using vmm. */ \
|
||||
if (workspaceSize > 0) { \
|
||||
ggml_cann_pool_alloc workspace_allocator(CTX.pool(), workspaceSize); \
|
||||
workspaceAddr = workspace_allocator.get(); \
|
||||
} \
|
||||
if (CTX.async_mode) { \
|
||||
auto task = \
|
||||
std::make_unique<aclnn_task>(aclnn##OP_NAME, workspaceAddr, workspaceSize, \
|
||||
executor, CTX.stream()); \
|
||||
CTX.task_queue.submit_task(std::move(task)); \
|
||||
} else { \
|
||||
ACL_CHECK(aclnn##OP_NAME(workspaceAddr, workspaceSize, executor, CTX.stream()));\
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
/**
|
||||
* @brief Registers and releases multiple ACL resources, optionally deferring the release
|
||||
* using a task.
|
||||
*
|
||||
* @tparam Args Types of the ACL resources.
|
||||
* @param ctx Backend context which manages task submission and async mode.
|
||||
* @param args Pointers to ACL resources to be released.
|
||||
*/
|
||||
template <typename... Args>
|
||||
void ggml_cann_release_resources(ggml_backend_cann_context & ctx, Args &&... args) {
|
||||
std::vector<any_acl_resource> resources;
|
||||
register_acl_resources(resources, std::forward<Args>(args)...);
|
||||
if(ctx.async_mode) {
|
||||
auto task = std::make_unique<release_resource_task>(std::move(resources));
|
||||
ctx.task_queue.submit_task(std::move(task));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Performs an asynchronous memory copy operation, optionally deferred via task submission.
|
||||
*
|
||||
* @param ctx Backend context containing stream and async configuration.
|
||||
* @param dst Destination memory address.
|
||||
* @param src Source memory address.
|
||||
* @param len Size of memory to copy (in bytes).
|
||||
* @param kind Type of memory copy (host-to-device, device-to-host, etc).
|
||||
*/
|
||||
inline void ggml_cann_async_memcpy(ggml_backend_cann_context & ctx, void * dst,
|
||||
const void * src, size_t len, aclrtMemcpyKind kind) {
|
||||
if (ctx.async_mode) {
|
||||
auto task = std::make_unique<async_memcpy_task>(dst, const_cast<void *>(src), len, kind, ctx.stream());
|
||||
ctx.task_queue.submit_task(std::move(task));
|
||||
} else {
|
||||
ACL_CHECK(aclrtMemcpyAsync(dst, len, src, len, kind, ctx.stream()));
|
||||
}
|
||||
}
|
||||
|
||||
inline void ggml_cann_async_memcpy(ggml_backend_cann_context * ctx, void * dst,
|
||||
const void * src, size_t len, aclrtMemcpyKind kind) {
|
||||
if (ctx->async_mode) {
|
||||
auto task = std::make_unique<async_memcpy_task>(dst, const_cast<void *>(src), len, kind, ctx->stream());
|
||||
ctx->task_queue.submit_task(std::move(task));
|
||||
} else {
|
||||
ACL_CHECK(aclrtMemcpyAsync(dst, len, src, len, kind, ctx->stream()));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Performs an asynchronous memory set operation, optionally deferred via task submission.
|
||||
*
|
||||
* @param ctx Backend context containing stream and async configuration.
|
||||
* @param buffer Memory buffer to be set.
|
||||
* @param size Size of the memory buffer (in bytes).
|
||||
* @param value Value to set in the buffer.
|
||||
*/
|
||||
inline void ggml_cann_async_memset(ggml_backend_cann_context & ctx, void * buffer,
|
||||
size_t size, int value) {
|
||||
if (ctx.async_mode) {
|
||||
auto task = std::make_unique<async_memset_task>(buffer, size, value, ctx.stream());
|
||||
ctx.task_queue.submit_task(std::move(task));
|
||||
} else {
|
||||
ACL_CHECK(aclrtMemsetAsync(buffer, size, value, size, ctx.stream()));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Applies a element-wise operation to two input tensors using the CANN
|
||||
* backend.
|
||||
@@ -742,42 +1007,9 @@ void ggml_cann_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
bcast_shape(src0, src1, dst, &acl_src0, &acl_src1, &acl_dst);
|
||||
binary_op(ctx, acl_src0, acl_src1, acl_dst);
|
||||
|
||||
ACL_CHECK(aclDestroyTensor(acl_src0));
|
||||
ACL_CHECK(aclDestroyTensor(acl_src1));
|
||||
ACL_CHECK(aclDestroyTensor(acl_dst));
|
||||
ggml_cann_release_resources(ctx, acl_src0, acl_src1, acl_dst);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Launches an asynchronous task using the memory allocator.
|
||||
*
|
||||
* This macro submit an asynchronous task on the specified stream.
|
||||
* The task uses memory allocated by the allocator. It is guaranteed
|
||||
* that the memory will not be accessed by other tasks until this task
|
||||
* completes, due to the sequential execution order within the same stream.
|
||||
*
|
||||
* @param OP_NAME aclnn operator name.
|
||||
* @param args Additional arguments required by the task.
|
||||
*
|
||||
* @note
|
||||
* Memory from the allocator will be "freed" immediately and can be
|
||||
* reallocated to other pointers. However, it won't be accessed by any
|
||||
* other task before this asynchronous task ends, because all tasks in the
|
||||
* same stream are executed in queue order.
|
||||
*/
|
||||
#define GGML_CANN_CALL_ACLNN_OP(OP_NAME, ...) \
|
||||
do { \
|
||||
uint64_t workspaceSize = 0; \
|
||||
aclOpExecutor * executor; \
|
||||
void * workspaceAddr = nullptr; \
|
||||
\
|
||||
ACL_CHECK(aclnn##OP_NAME##GetWorkspaceSize(__VA_ARGS__, &workspaceSize, &executor)); \
|
||||
\
|
||||
if (workspaceSize > 0) { \
|
||||
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); \
|
||||
workspaceAddr = workspace_allocator.get(); \
|
||||
} \
|
||||
ACL_CHECK(aclnn##OP_NAME(workspaceAddr, workspaceSize, executor, ctx.stream())); \
|
||||
} while (0)
|
||||
|
||||
/**
|
||||
* @brief Applies a unary operation to an input tensor using the CANN backend.
|
||||
@@ -799,9 +1031,7 @@ template <void unary_op(ggml_backend_cann_context&, aclTensor*, aclTensor*)>
|
||||
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
||||
|
||||
unary_op(ctx, acl_src, acl_dst);
|
||||
|
||||
ACL_CHECK(aclDestroyTensor(acl_src));
|
||||
ACL_CHECK(aclDestroyTensor(acl_dst));
|
||||
ggml_cann_release_resources(ctx, acl_src, acl_dst);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -832,7 +1062,7 @@ void ggml_cann_unary_op(
|
||||
*
|
||||
* Internally, the lambda will call:
|
||||
* @code
|
||||
* GGML_CANN_CALL_ACLNN_OP(OP_NAME, acl_src, acl_dst);
|
||||
* GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);
|
||||
* @endcode
|
||||
*
|
||||
* @param OP_NAME The name of the ACL unary operator to invoke via GGML_CANN_CALL_ACLNN_OP.
|
||||
@@ -840,14 +1070,14 @@ void ggml_cann_unary_op(
|
||||
* @see ggml_cann_unary_op
|
||||
* @see GGML_CANN_CALL_ACLNN_OP
|
||||
*/
|
||||
#define GGML_CANN_CALL_UNARY_OP(OP_NAME) \
|
||||
do { \
|
||||
auto lambda = [](ggml_backend_cann_context& ctx, \
|
||||
aclTensor* acl_src, \
|
||||
aclTensor* acl_dst) { \
|
||||
GGML_CANN_CALL_ACLNN_OP(OP_NAME, acl_src, acl_dst); \
|
||||
}; \
|
||||
ggml_cann_unary_op(lambda, ctx, dst); \
|
||||
} \
|
||||
#define GGML_CANN_CALL_UNARY_OP(OP_NAME) \
|
||||
do { \
|
||||
auto lambda = [](ggml_backend_cann_context& ctx, \
|
||||
aclTensor* acl_src, \
|
||||
aclTensor* acl_dst) { \
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst); \
|
||||
}; \
|
||||
ggml_cann_unary_op(lambda, ctx, dst); \
|
||||
} \
|
||||
while (0)
|
||||
#endif // CANN_ACLNN_OPS
|
||||
|
||||
@@ -31,9 +31,16 @@
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <atomic>
|
||||
#include <condition_variable>
|
||||
#include <mutex>
|
||||
#include <thread>
|
||||
#include <unistd.h>
|
||||
#include <functional>
|
||||
|
||||
#include "../include/ggml-cann.h"
|
||||
#include "../include/ggml.h"
|
||||
#include "../ggml-impl.h"
|
||||
|
||||
#define MATRIX_ROW_PADDING 512
|
||||
#define GGML_CANN_MAX_STREAMS 8
|
||||
@@ -205,6 +212,127 @@ struct ggml_cann_pool_alloc {
|
||||
ggml_cann_pool_alloc& operator=(ggml_cann_pool_alloc&&) = delete;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Function pointer type for ACLNN operator calls.
|
||||
*/
|
||||
using aclnn_func_t = aclnnStatus (*)(void*, uint64_t, aclOpExecutor*, aclrtStream);
|
||||
|
||||
/**
|
||||
* @brief Base class for all CANN tasks to be submitted to the task queue.
|
||||
*
|
||||
* Users should override the run_task() method with actual task logic.
|
||||
*/
|
||||
class cann_task {
|
||||
public:
|
||||
virtual void run_task() {}
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief A lock-free ring-buffer based task queue for asynchronously executing cann_task instances.
|
||||
*/
|
||||
class cann_task_queue {
|
||||
public:
|
||||
/**
|
||||
* @brief Constructs a task queue with a fixed power-of-two capacity for a specific device.
|
||||
*
|
||||
* @param capacity Queue capacity. Must be a power of 2.
|
||||
* @param device Target device ID (used for context setting).
|
||||
*/
|
||||
explicit cann_task_queue(size_t capacity, int32_t device)
|
||||
: buffer_(capacity), capacity_(capacity), head_(0), tail_(0),
|
||||
running_(false), device_(device) {
|
||||
GGML_ASSERT((capacity & (capacity - 1)) == 0 && "capacity must be power of 2");
|
||||
mask_ = capacity_ - 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Attempts to enqueue a task into the queue.
|
||||
*
|
||||
* @param item Unique pointer to the task.
|
||||
* @return true if the task was successfully enqueued, false if the queue was full.
|
||||
*/
|
||||
bool enqueue(std::unique_ptr<cann_task>&& item) {
|
||||
size_t next_tail = (tail_ + 1) & mask_;
|
||||
|
||||
if (next_tail == head_) {
|
||||
return false;
|
||||
}
|
||||
|
||||
buffer_[tail_] = std::move(item);
|
||||
std::atomic_thread_fence(std::memory_order_release);
|
||||
tail_ = next_tail;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Submits a task to the queue, and starts the worker thread if not already running.
|
||||
*
|
||||
* @param task Task to be submitted.
|
||||
*/
|
||||
void submit_task(std::unique_ptr<cann_task>&& task) {
|
||||
while(!enqueue(std::move(task))) {
|
||||
std::this_thread::yield();
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!running_) {
|
||||
running_ = true;
|
||||
thread_ = std::thread(&cann_task_queue::execute, this);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Waits until the queue is completely empty and no tasks are being processed.
|
||||
*/
|
||||
void wait() {
|
||||
while (running_ && head_ != tail_) {
|
||||
std::this_thread::yield();
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Stops the task queue and joins the worker thread.
|
||||
*/
|
||||
void stop() {
|
||||
running_ = false;
|
||||
if (thread_.joinable()) {
|
||||
thread_.join();
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
/**
|
||||
* @brief Worker thread function that continuously dequeues and executes tasks.
|
||||
*/
|
||||
void execute() {
|
||||
ggml_cann_set_device(device_);
|
||||
|
||||
while (running_) {
|
||||
if(head_ == tail_) {
|
||||
std::this_thread::yield();
|
||||
continue;
|
||||
}
|
||||
|
||||
std::atomic_thread_fence(std::memory_order_acquire);
|
||||
buffer_[head_]->run_task();
|
||||
buffer_[head_].reset();
|
||||
head_ = (head_ + 1) & mask_;
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::unique_ptr<cann_task>> buffer_;
|
||||
const size_t capacity_;
|
||||
size_t mask_;
|
||||
size_t head_;
|
||||
size_t tail_;
|
||||
bool running_;
|
||||
std::thread thread_;
|
||||
int32_t device_;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Context for managing CANN backend operations.
|
||||
*/
|
||||
@@ -213,6 +341,8 @@ struct ggml_backend_cann_context {
|
||||
std::string name; /**< Name of the device. */
|
||||
std::string description; /**< Description of the device. */
|
||||
aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */
|
||||
cann_task_queue task_queue;
|
||||
bool async_mode;
|
||||
|
||||
aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */
|
||||
|
||||
@@ -221,9 +351,12 @@ struct ggml_backend_cann_context {
|
||||
* @param device Device ID.
|
||||
*/
|
||||
explicit ggml_backend_cann_context(int device)
|
||||
: device(device), name("CANN" + std::to_string(device)) {
|
||||
: device(device), name("CANN" + std::to_string(device)), task_queue(1024, device) {
|
||||
ggml_cann_set_device(device);
|
||||
description = aclrtGetSocName();
|
||||
async_mode = (getenv("GGML_CANN_ASYNC_MODE") != nullptr);
|
||||
GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__,
|
||||
device, async_mode ? "ON" : "OFF");
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -231,6 +364,7 @@ struct ggml_backend_cann_context {
|
||||
*/
|
||||
~ggml_backend_cann_context() {
|
||||
ggml_cann_set_device(device);
|
||||
task_queue.stop();
|
||||
if (copy_event != nullptr) {
|
||||
ACL_CHECK(aclrtDestroyEvent(copy_event));
|
||||
}
|
||||
|
||||
@@ -1606,7 +1606,7 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
|
||||
auto lambda = [](ggml_backend_cann_context& ctx,
|
||||
aclTensor* acl_src,
|
||||
aclTensor* acl_dst) {
|
||||
GGML_CANN_CALL_ACLNN_OP(GeluV2, acl_src, 0, acl_dst);
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
|
||||
};
|
||||
ggml_cann_unary_op(lambda, ctx, dst);
|
||||
} break;
|
||||
@@ -1789,12 +1789,11 @@ static void ggml_backend_cann_free(ggml_backend_t backend) {
|
||||
delete backend;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @brief Sets tensor data asynchronously in the CANN backend.
|
||||
*
|
||||
* This function asynchronously sets tensor data in the CANN backend. Depending
|
||||
* on the tensor type, it may perform data transformations before copying data
|
||||
* to the device.
|
||||
* This function asynchronously sets tensor data in the CANN backend.
|
||||
*
|
||||
* @param backend Pointer to the CANN backend structure.
|
||||
* @param tensor Pointer to the tensor structure to set data for.
|
||||
@@ -1809,23 +1808,28 @@ static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
|
||||
size_t size) {
|
||||
ggml_backend_cann_context *cann_ctx =
|
||||
(ggml_backend_cann_context *)backend->context;
|
||||
ggml_backend_buffer_t buf =
|
||||
tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
||||
|
||||
if (!need_transform(tensor->type)) {
|
||||
ACL_CHECK(aclrtMemcpyAsync((char *)tensor->data + offset, size, data,
|
||||
size, ACL_MEMCPY_HOST_TO_DEVICE,
|
||||
cann_ctx->stream()));
|
||||
} else {
|
||||
void *transform_buffer = malloc(size);
|
||||
ggml_backend_cann_transform(tensor, data, transform_buffer);
|
||||
GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) &&
|
||||
"unsupported buffer type");
|
||||
GGML_ASSERT(!ggml_is_quantized(tensor->type));
|
||||
|
||||
ACL_CHECK(aclrtMemcpyAsync(
|
||||
(char *)tensor->data + offset, size, transform_buffer, size,
|
||||
ACL_MEMCPY_HOST_TO_DEVICE, cann_ctx->stream()));
|
||||
ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
|
||||
free(transform_buffer);
|
||||
}
|
||||
ggml_cann_async_memcpy(cann_ctx, (char *)tensor->data + offset, data, size,
|
||||
ACL_MEMCPY_HOST_TO_DEVICE);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Gets tensor data asynchronously in the CANN backend.
|
||||
*
|
||||
* This function asynchronously gets tensor data in the CANN backend.
|
||||
*
|
||||
* @param backend Pointer to the CANN backend structure.
|
||||
* @param tensor Pointer to the tensor structure to get data from.
|
||||
* @param data Pointer to the host data to copy from the tensor.
|
||||
* @param offset Offset in bytes within the host data.
|
||||
* @param size Size of the data to copy in bytes.
|
||||
*/
|
||||
static void ggml_backend_cann_get_tensor_async(
|
||||
ggml_backend_t backend, const ggml_tensor *tensor, void *data,
|
||||
size_t offset, size_t size) {
|
||||
@@ -1836,20 +1840,11 @@ static void ggml_backend_cann_get_tensor_async(
|
||||
|
||||
GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) &&
|
||||
"unsupported buffer type");
|
||||
GGML_ASSERT(!ggml_is_quantized(tensor->type));
|
||||
|
||||
ggml_cann_async_memcpy(cann_ctx, data, (char *)tensor->data + offset, size,
|
||||
ACL_MEMCPY_DEVICE_TO_HOST);
|
||||
|
||||
if (!need_transform(tensor->type)) {
|
||||
ACL_CHECK(aclrtMemcpyAsync(data, size, (char *)tensor->data + offset,
|
||||
size, ACL_MEMCPY_DEVICE_TO_HOST,
|
||||
cann_ctx->stream()));
|
||||
} else {
|
||||
void *transform_buffer = malloc(size);
|
||||
ACL_CHECK(aclrtMemcpyAsync(
|
||||
transform_buffer, size, (char *)tensor->data + offset, size,
|
||||
ACL_MEMCPY_DEVICE_TO_HOST, cann_ctx->stream()));
|
||||
ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
|
||||
ggml_backend_cann_transform_back(tensor, transform_buffer, data);
|
||||
free(transform_buffer);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -1909,6 +1904,8 @@ static bool ggml_backend_cann_cpy_tensor_async(
|
||||
ggml_cann_set_device(cann_ctx_src->device);
|
||||
ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_dst->device, 0));
|
||||
|
||||
// wait for task_queue empty to keep task order.
|
||||
cann_ctx_src->task_queue.wait();
|
||||
ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
|
||||
ACL_MEMCPY_DEVICE_TO_DEVICE,
|
||||
cann_ctx_src->stream()));
|
||||
@@ -1936,9 +1933,8 @@ static bool ggml_backend_cann_cpy_tensor_async(
|
||||
static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
|
||||
ggml_backend_cann_context* cann_ctx =
|
||||
(ggml_backend_cann_context*)backend->context;
|
||||
|
||||
cann_ctx->task_queue.wait();
|
||||
ggml_cann_set_device(cann_ctx->device);
|
||||
|
||||
ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
|
||||
}
|
||||
|
||||
|
||||
@@ -222,7 +222,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
elseif (GGML_AVX)
|
||||
list(APPEND ARCH_FLAGS /arch:AVX)
|
||||
list(APPEND ARCH_DEFINITIONS GGML_AVX)
|
||||
else ()
|
||||
elseif (GGML_SSE42)
|
||||
list(APPEND ARCH_FLAGS /arch:SSE4.2)
|
||||
list(APPEND ARCH_DEFINITIONS GGML_SSE42)
|
||||
endif()
|
||||
@@ -237,8 +237,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
if (GGML_NATIVE)
|
||||
list(APPEND ARCH_FLAGS -march=native)
|
||||
else ()
|
||||
list(APPEND ARCH_FLAGS -msse4.2)
|
||||
list(APPEND ARCH_DEFINITIONS GGML_SSE42)
|
||||
if (GGML_SSE42)
|
||||
list(APPEND ARCH_FLAGS -msse4.2)
|
||||
list(APPEND ARCH_DEFINITIONS GGML_SSE42)
|
||||
endif()
|
||||
if (GGML_F16C)
|
||||
list(APPEND ARCH_FLAGS -mf16c)
|
||||
list(APPEND ARCH_DEFINITIONS GGML_F16C)
|
||||
@@ -350,10 +352,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
# TODO: Separation to determine activation of VX/VXE/VXE2
|
||||
if (${S390X_M} MATCHES "8561|8562")
|
||||
message(STATUS "z15 target")
|
||||
list(APPEND ARCH_FLAGS -march=z15 -mtune=z15)
|
||||
list(APPEND ARCH_FLAGS -march=z15)
|
||||
elseif (${S390X_M} MATCHES "3931")
|
||||
message(STATUS "z16 target")
|
||||
list(APPEND ARCH_FLAGS -march=z16 -mtune=z16)
|
||||
list(APPEND ARCH_FLAGS -march=z16)
|
||||
elseif (${S390X_M} MATCHES "9175|9176")
|
||||
# NOTE: Only available from GCC 15.1.0 onwards. Any z17 machine with compile issues must first verify their GCC version.
|
||||
message(STATUS "z17 target")
|
||||
list(APPEND ARCH_FLAGS -march=z17)
|
||||
else()
|
||||
message(STATUS "Unknown target")
|
||||
message(WARNING "Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF.")
|
||||
|
||||
@@ -263,7 +263,7 @@ void test_x86_is() {
|
||||
static int ggml_backend_cpu_x86_score() {
|
||||
// FIXME: this does not check for OS support
|
||||
|
||||
int score = 0;
|
||||
int score = 1;
|
||||
cpuid_x86 is;
|
||||
|
||||
#ifdef GGML_FMA
|
||||
|
||||
@@ -72,8 +72,6 @@ static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wro
|
||||
|
||||
#if defined(__GNUC__)
|
||||
#pragma GCC diagnostic ignored "-Woverlength-strings"
|
||||
#elif defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
#define UNUSED GGML_UNUSED
|
||||
|
||||
@@ -20,12 +20,6 @@
|
||||
#define GROUP_MAX_EPS_IQ1_M 1e-7f
|
||||
#define GROUP_MAX_EPS_IQ1_S 1e-12f
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
// disable "possible loss of data" to avoid warnings for hundreds of casts
|
||||
// we should just be careful :)
|
||||
#pragma warning(disable: 4244 4267)
|
||||
#endif
|
||||
|
||||
#define UNUSED GGML_UNUSED
|
||||
|
||||
// some compilers don't provide _mm256_set_m128i, e.g. gcc 7
|
||||
@@ -6596,7 +6590,118 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
}
|
||||
|
||||
*s = hsum_float_8(acc);
|
||||
#elif defined(__VXE__) || defined(__VXE2__)
|
||||
uint32_t aux[3];
|
||||
uint32_t utmp[4];
|
||||
|
||||
const int32x4_t v_z = vec_splat_s32(0);
|
||||
const uint8x16_t v_3m = vec_splat_u8(0x03);
|
||||
|
||||
const uint8x16_t v_0c = vec_splat_u8(1);
|
||||
const uint8x16_t v_1c = vec_sl(v_0c, 1);
|
||||
const uint8x16_t v_2c = vec_sl(v_0c, 2);
|
||||
const uint8x16_t v_3c = vec_sl(v_0c, 3);
|
||||
|
||||
uint8x16_t q3h[4];
|
||||
uint8x16_t q3b[2];
|
||||
int8x16_t q3bytes[4];
|
||||
int8x16_t q8bytes[4];
|
||||
uint8x16_t qhbits[2];
|
||||
|
||||
float sum = 0;
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||
|
||||
const uint8_t * restrict x0l = x[i].qs;
|
||||
const uint8_t * restrict x0h = x[i].hmask;
|
||||
const int8_t * restrict y0 = y[i].qs;
|
||||
|
||||
qhbits[0] = vec_xl(0 , x0h);
|
||||
qhbits[1] = vec_xl(16, x0h);
|
||||
|
||||
int32_t isum = 0;
|
||||
|
||||
memcpy(aux, x[i].scales, 12);
|
||||
utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
|
||||
utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
|
||||
utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
|
||||
utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
|
||||
|
||||
int8_t * scale = (int8_t *)utmp;
|
||||
for (int j = 0; j < 16; ++j) scale[j] -= 32;
|
||||
|
||||
for (int j = 0; j < QK_K/128; ++j) {
|
||||
int32x4_t isum0, isum1, isum2, isum3;
|
||||
|
||||
q3b[0] = vec_xl(0 , x0l);
|
||||
q3b[1] = vec_xl(16, x0l);
|
||||
x0l += 32;
|
||||
|
||||
q8bytes[0] = vec_xl(0 , y0);
|
||||
q8bytes[1] = vec_xl(16 , y0);
|
||||
q8bytes[2] = vec_xl(32 , y0);
|
||||
q8bytes[3] = vec_xl(48 , y0);
|
||||
q8bytes[4] = vec_xl(64 , y0);
|
||||
q8bytes[5] = vec_xl(80 , y0);
|
||||
q8bytes[6] = vec_xl(96 , y0);
|
||||
q8bytes[7] = vec_xl(112, y0);
|
||||
y0 += 128;
|
||||
|
||||
q3h[0] = vec_sl(vec_andc(v_0c, qhbits[0]), 2);
|
||||
q3h[1] = vec_sl(vec_andc(v_0c, qhbits[1]), 2);
|
||||
q3h[2] = vec_sl(vec_andc(v_1c, qhbits[0]), 1);
|
||||
q3h[3] = vec_sl(vec_andc(v_1c, qhbits[1]), 1);
|
||||
|
||||
q3bytes[0] = vec_sub((int8x16_t)vec_and(q3b[0], v_3m), (int8x16_t)q3h[0]);
|
||||
q3bytes[1] = vec_sub((int8x16_t)vec_and(q3b[1], v_3m), (int8x16_t)q3h[1]);
|
||||
q3bytes[2] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 2), v_3m), (int8x16_t)q3h[2]);
|
||||
q3bytes[3] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 2), v_3m), (int8x16_t)q3h[3]);
|
||||
|
||||
isum0 = ggml_vec_dot(v_z, q3bytes[0], q8bytes[0]);
|
||||
isum1 = ggml_vec_dot(v_z, q3bytes[1], q8bytes[1]);
|
||||
isum2 = ggml_vec_dot(v_z, q3bytes[2], q8bytes[2]);
|
||||
isum3 = ggml_vec_dot(v_z, q3bytes[3], q8bytes[3]);
|
||||
|
||||
isum += (isum0[0] + isum0[1] + isum0[2] + isum0[3]) * scale[0];
|
||||
isum += (isum1[0] + isum1[1] + isum1[2] + isum1[3]) * scale[1];
|
||||
isum += (isum2[0] + isum2[1] + isum2[2] + isum2[3]) * scale[2];
|
||||
isum += (isum3[0] + isum3[1] + isum3[2] + isum3[3]) * scale[3];
|
||||
|
||||
scale += 4;
|
||||
|
||||
q3h[0] = vec_andc(v_2c, qhbits[0]);
|
||||
q3h[1] = vec_andc(v_2c, qhbits[1]);
|
||||
q3h[2] = vec_sr(vec_andc(v_3c, qhbits[0]), 1);
|
||||
q3h[3] = vec_sr(vec_andc(v_3c, qhbits[1]), 1);
|
||||
|
||||
q3bytes[0] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 4), v_3m), (int8x16_t)q3h[0]);
|
||||
q3bytes[1] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 4), v_3m), (int8x16_t)q3h[1]);
|
||||
q3bytes[2] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 6), v_3m), (int8x16_t)q3h[2]);
|
||||
q3bytes[3] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 6), v_3m), (int8x16_t)q3h[3]);
|
||||
|
||||
isum0 = ggml_vec_dot(v_z, q3bytes[0], q8bytes[4]);
|
||||
isum1 = ggml_vec_dot(v_z, q3bytes[1], q8bytes[5]);
|
||||
isum2 = ggml_vec_dot(v_z, q3bytes[2], q8bytes[6]);
|
||||
isum3 = ggml_vec_dot(v_z, q3bytes[3], q8bytes[7]);
|
||||
|
||||
isum += (isum0[0] + isum0[1] + isum0[2] + isum0[3]) * scale[0];
|
||||
isum += (isum1[0] + isum1[1] + isum1[2] + isum1[3]) * scale[1];
|
||||
isum += (isum2[0] + isum2[1] + isum2[2] + isum2[3]) * scale[2];
|
||||
isum += (isum3[0] + isum3[1] + isum3[2] + isum3[3]) * scale[3];
|
||||
|
||||
scale += 4;
|
||||
|
||||
if (j == 0) {
|
||||
qhbits[0] = vec_sr(qhbits[0], 4);
|
||||
qhbits[1] = vec_sr(qhbits[1], 4);
|
||||
}
|
||||
}
|
||||
|
||||
sum += d * isum;
|
||||
}
|
||||
|
||||
*s = sum;
|
||||
#else
|
||||
// scalar version
|
||||
// This function is written like this so the compiler can manage to vectorize most of it
|
||||
|
||||
@@ -50,19 +50,6 @@
|
||||
#include "llamafile/sgemm.h"
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
// disable "possible loss of data" to avoid hundreds of casts
|
||||
// we should just be careful :)
|
||||
#pragma warning(disable: 4244 4267)
|
||||
|
||||
// disable POSIX deprecation warnings
|
||||
// these functions are never going away, anyway
|
||||
#pragma warning(disable: 4996)
|
||||
|
||||
// unreachable code because of multiple instances of code after GGML_ABORT
|
||||
#pragma warning(disable: 4702)
|
||||
#endif
|
||||
|
||||
// Note: once we move threading into a separate C++ file
|
||||
// will use std::hardware_destructive_interference_size instead of hardcoding it here
|
||||
// and we'll use C++ attribute syntax.
|
||||
@@ -215,7 +202,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_F16] = {
|
||||
.from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row,
|
||||
.from_float = (ggml_from_float_t) ggml_cpu_fp32_to_fp16,
|
||||
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
|
||||
.vec_dot_type = GGML_TYPE_F16,
|
||||
.nrows = 1,
|
||||
@@ -356,7 +343,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
|
||||
.from_float = quantize_row_q8_K,
|
||||
},
|
||||
[GGML_TYPE_BF16] = {
|
||||
.from_float = (ggml_from_float_t) ggml_fp32_to_bf16_row,
|
||||
.from_float = (ggml_from_float_t) ggml_cpu_fp32_to_bf16,
|
||||
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16,
|
||||
.vec_dot_type = GGML_TYPE_BF16,
|
||||
.nrows = 1,
|
||||
@@ -1932,6 +1919,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
||||
{
|
||||
ggml_compute_forward_im2col_back_f32(params, tensor);
|
||||
} break;
|
||||
case GGML_OP_CONV_2D_DW:
|
||||
{
|
||||
ggml_compute_forward_conv_2d_dw(params, tensor);
|
||||
} break;
|
||||
case GGML_OP_CONV_TRANSPOSE_2D:
|
||||
{
|
||||
ggml_compute_forward_conv_transpose_2d(params, tensor);
|
||||
@@ -2268,6 +2259,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
||||
} break;
|
||||
case GGML_OP_IM2COL:
|
||||
case GGML_OP_IM2COL_BACK:
|
||||
case GGML_OP_CONV_2D_DW:
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
case GGML_OP_CONV_TRANSPOSE_2D:
|
||||
{
|
||||
@@ -3161,6 +3153,93 @@ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct g
|
||||
return ggml_graph_compute(cgraph, &cplan);
|
||||
}
|
||||
|
||||
void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
|
||||
int64_t i = 0;
|
||||
#if defined(__F16C__)
|
||||
#if defined(__AVX512F__)
|
||||
for (; i + 15 < n; i += 16) {
|
||||
__m512 x_vec = _mm512_loadu_ps(x + i);
|
||||
__m256i y_vec = _mm512_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
|
||||
_mm256_storeu_si256((__m256i *)(y + i), y_vec);
|
||||
}
|
||||
#endif
|
||||
for (; i + 7 < n; i += 8) {
|
||||
__m256 x_vec = _mm256_loadu_ps(x + i);
|
||||
__m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
|
||||
_mm_storeu_si128((__m128i *)(y + i), y_vec);
|
||||
}
|
||||
for (; i + 3 < n; i += 4) {
|
||||
__m128 x_vec = _mm_loadu_ps(x + i);
|
||||
__m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
|
||||
_mm_storel_epi64((__m128i *)(y + i), y_vec);
|
||||
}
|
||||
#endif
|
||||
for (; i < n; ++i) {
|
||||
y[i] = GGML_FP32_TO_FP16(x[i]);
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
|
||||
int64_t i = 0;
|
||||
#if defined(__F16C__)
|
||||
#if defined(__AVX512F__)
|
||||
for (; i + 15 < n; i += 16) {
|
||||
__m256i x_vec = _mm256_loadu_si256((const __m256i *)(x + i));
|
||||
__m512 y_vec = _mm512_cvtph_ps(x_vec);
|
||||
_mm512_storeu_ps(y + i, y_vec);
|
||||
}
|
||||
#endif
|
||||
for (; i + 7 < n; i += 8) {
|
||||
__m128i x_vec = _mm_loadu_si128((const __m128i *)(x + i));
|
||||
__m256 y_vec = _mm256_cvtph_ps(x_vec);
|
||||
_mm256_storeu_ps(y + i, y_vec);
|
||||
}
|
||||
for (; i + 3 < n; i += 4) {
|
||||
__m128i x_vec = _mm_loadl_epi64((const __m128i *)(x + i));
|
||||
__m128 y_vec = _mm_cvtph_ps(x_vec);
|
||||
_mm_storeu_ps(y + i, y_vec);
|
||||
}
|
||||
#endif
|
||||
for (; i < n; ++i) {
|
||||
y[i] = GGML_FP16_TO_FP32(x[i]);
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cpu_fp32_to_bf16(const float * x, ggml_bf16_t * y, int64_t n) {
|
||||
int64_t i = 0;
|
||||
for (; i < n; ++i) {
|
||||
y[i] = GGML_FP32_TO_BF16(x[i]);
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cpu_bf16_to_fp32(const ggml_bf16_t * x, float * y, int64_t n) {
|
||||
int64_t i = 0;
|
||||
#if defined(__AVX2__)
|
||||
#if defined(__AVX512F__)
|
||||
for (; i + 15 < n; i += 16) {
|
||||
_mm512_storeu_ps(y + i,
|
||||
_mm512_castsi512_ps(
|
||||
_mm512_slli_epi32(
|
||||
_mm512_cvtepu16_epi32(
|
||||
_mm256_loadu_si256(
|
||||
(const __m256i *)(x + i))),
|
||||
16)));
|
||||
}
|
||||
#endif
|
||||
for (; i + 7 < n; i += 8) {
|
||||
_mm256_storeu_ps(y + i,
|
||||
_mm256_castsi256_ps(
|
||||
_mm256_slli_epi32(
|
||||
_mm256_cvtepu16_epi32(
|
||||
_mm_loadu_si128(
|
||||
(const __m128i *)(x + i))),
|
||||
16)));
|
||||
}
|
||||
#endif
|
||||
for (; i < n; i++) {
|
||||
y[i] = GGML_BF16_TO_FP32(x[i]);
|
||||
}
|
||||
}
|
||||
|
||||
int ggml_cpu_has_avx(void) {
|
||||
#if defined(__AVX__)
|
||||
|
||||
@@ -11,24 +11,26 @@
|
||||
#include <vector>
|
||||
|
||||
#ifdef GGML_USE_CPU_HBM
|
||||
#include "ggml-cpu-hbm.h"
|
||||
# include "ggml-cpu-hbm.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_CPU_KLEIDIAI
|
||||
#include "kleidiai/kleidiai.h"
|
||||
#endif
|
||||
|
||||
#if defined(__APPLE__)
|
||||
#include <sys/types.h>
|
||||
#include <sys/sysctl.h>
|
||||
# include "kleidiai/kleidiai.h"
|
||||
#endif
|
||||
|
||||
#if defined(_WIN32)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#ifndef NOMINMAX
|
||||
#define NOMINMAX
|
||||
# define WIN32_LEAN_AND_MEAN
|
||||
# ifndef NOMINMAX
|
||||
# define NOMINMAX
|
||||
# endif
|
||||
# include <windows.h>
|
||||
#else
|
||||
# include <unistd.h>
|
||||
#endif
|
||||
#include <windows.h>
|
||||
|
||||
#if defined(__APPLE__)
|
||||
# include <sys/sysctl.h>
|
||||
# include <sys/types.h>
|
||||
#endif
|
||||
|
||||
// ggml-backend interface
|
||||
@@ -70,8 +72,10 @@ static ggml_backend_buffer_type_t * ggml_backend_cpu_device_get_extra_buffers_ty
|
||||
}
|
||||
|
||||
static bool ggml_backend_cpu_is_extra_buffer_type(ggml_backend_buffer_type_t buft) {
|
||||
for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
|
||||
if (extra && extra == buft) return true;
|
||||
for (auto * extra : ggml_backend_cpu_get_extra_buffers_type()) {
|
||||
if (extra && extra == buft) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
@@ -330,9 +334,18 @@ static const char * ggml_backend_cpu_device_get_description(ggml_backend_dev_t d
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
|
||||
// TODO
|
||||
*free = 0;
|
||||
*total = 0;
|
||||
#ifdef _WIN32
|
||||
MEMORYSTATUSEX status;
|
||||
status.dwLength = sizeof(status);
|
||||
GlobalMemoryStatusEx(&status);
|
||||
*total = status.ullTotalPhys;
|
||||
*free = status.ullAvailPhys;
|
||||
#else
|
||||
long pages = sysconf(_SC_PHYS_PAGES);
|
||||
long page_size = sysconf(_SC_PAGE_SIZE);
|
||||
*total = pages * page_size;
|
||||
*free = *total;
|
||||
#endif
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
@@ -1054,6 +1054,493 @@ class tinyBLAS_Q0_AVX {
|
||||
} \
|
||||
} \
|
||||
|
||||
template <typename TA, typename TB, typename TC>
|
||||
class tinyBLAS_BF16_PPC {
|
||||
public:
|
||||
tinyBLAS_BF16_PPC(int64_t k,
|
||||
const TA *A, int64_t lda,
|
||||
const TB *B, int64_t ldb,
|
||||
TC *C, int64_t ldc,
|
||||
int ith, int nth)
|
||||
: A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
|
||||
}
|
||||
|
||||
void matmul(int64_t m, int64_t n) {
|
||||
mnpack(0, m, 0, n);
|
||||
}
|
||||
|
||||
private:
|
||||
void vector_permute_store(vec_t *c, int numVec, unsigned char *vecOffset) {
|
||||
vec_t t[8], s[8];
|
||||
vec_t swiz1 = {0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23};
|
||||
vec_t swiz2 = {8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31};
|
||||
vec_t swiz3 = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
|
||||
vec_t swiz4 = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
|
||||
|
||||
if (numVec == 2) {
|
||||
t[0] = vec_perm(c[0], c[1], swiz1);
|
||||
t[1] = vec_perm(c[2], c[3], swiz1);
|
||||
s[0] = vec_perm(t[0], t[1], swiz3);
|
||||
s[1] = vec_perm(t[0], t[1], swiz4);
|
||||
vec_xst(s[0], 0, (vec_t*)vecOffset);
|
||||
vec_xst(s[1], 0, (vec_t*)(vecOffset + 16));
|
||||
} else if (numVec == 4) {
|
||||
t[0] = vec_perm(c[0], c[1], swiz1);
|
||||
t[1] = vec_perm(c[0], c[1], swiz2);
|
||||
t[2] = vec_perm(c[2], c[3], swiz1);
|
||||
t[3] = vec_perm(c[2], c[3], swiz2);
|
||||
s[0] = vec_perm(t[0], t[2], swiz3);
|
||||
s[1] = vec_perm(t[0], t[2], swiz4);
|
||||
s[2] = vec_perm(t[1], t[3], swiz3);
|
||||
s[3] = vec_perm(t[1], t[3], swiz4);
|
||||
for (int i = 0; i < 4; ++i)
|
||||
vec_xst(s[i], 0, (vec_t*)(vecOffset + i * 16));
|
||||
} else if (numVec == 8) {
|
||||
for (int i = 0; i < 4; i += 2) {
|
||||
t[i+0] = vec_perm(c[i+0], c[i+1], swiz1);
|
||||
t[i+1] = vec_perm(c[i+0], c[i+1], swiz2);
|
||||
}
|
||||
for (int i = 4; i < 8; i += 2) {
|
||||
t[i+0] = vec_perm(c[i+0], c[i+1], swiz1);
|
||||
t[i+1] = vec_perm(c[i+0], c[i+1], swiz2);
|
||||
}
|
||||
s[0] = vec_perm(t[0], t[2], swiz3);
|
||||
s[1] = vec_perm(t[0], t[2], swiz4);
|
||||
s[2] = vec_perm(t[1], t[3], swiz3);
|
||||
s[3] = vec_perm(t[1], t[3], swiz4);
|
||||
s[4] = vec_perm(t[4], t[6], swiz3);
|
||||
s[5] = vec_perm(t[4], t[6], swiz4);
|
||||
s[6] = vec_perm(t[5], t[7], swiz3);
|
||||
s[7] = vec_perm(t[5], t[7], swiz4);
|
||||
for (int i = 0; i < 8; ++i)
|
||||
vec_xst(s[i], 0, (vec_t*)(vecOffset + i * 16));
|
||||
}
|
||||
}
|
||||
|
||||
void packNormal(const TA* a, int64_t lda, int rows, int cols, unsigned char* vec) {
|
||||
int64_t i, j;
|
||||
TA *aoffset = NULL;
|
||||
unsigned char *vecOffset = NULL;
|
||||
TA * aoffsets[8];
|
||||
vector unsigned char c_arr[8];
|
||||
aoffset = const_cast<TA*>(a);
|
||||
vecOffset = vec;
|
||||
j = (rows >> 3);
|
||||
if (j > 0) {
|
||||
do {
|
||||
if (cols == 4) {
|
||||
aoffsets[0] = aoffset;
|
||||
for (int it = 1; it < 4; ++it)
|
||||
aoffsets[it] = aoffsets[it-1] + lda;
|
||||
aoffset += 4 * lda;
|
||||
for (int i = 0; i < 4; ++i)
|
||||
c_arr[i] = vec_xl(0, (vector unsigned char*)aoffsets[i]);
|
||||
vector_permute_store(c_arr, 4, vecOffset);
|
||||
for (int i = 0; i<4; i++)
|
||||
aoffsets[i] = aoffsets[i]+lda;
|
||||
vecOffset +=64;
|
||||
}
|
||||
i = (cols >> 3);
|
||||
if (i > 0) {
|
||||
aoffsets[0] = aoffset;
|
||||
for (int it = 1; it < 8; ++it) {
|
||||
aoffsets[it] = aoffsets[it-1] + lda;
|
||||
}
|
||||
aoffset += 8 * lda;
|
||||
do {
|
||||
for (int it = 0; it < 8; ++it)
|
||||
c_arr[it] = vec_xl(0, (vector unsigned char*)aoffsets[it]);
|
||||
vector_permute_store(c_arr, 8, vecOffset);
|
||||
for (int it = 0; it < 8; ++it)
|
||||
aoffsets[it] = aoffsets[it] + 8*lda;
|
||||
vecOffset += 128;
|
||||
i--;
|
||||
} while(i > 0);
|
||||
}
|
||||
j--;
|
||||
} while(j > 0);
|
||||
}
|
||||
if (rows & 4) {
|
||||
aoffsets[0] = aoffset;
|
||||
for (int it = 1; it < 4; ++it)
|
||||
aoffsets[it] = aoffsets[it-1] + lda;
|
||||
aoffset += 4 * lda;
|
||||
if (cols == 4) {
|
||||
for (int it = 0; it < 4; ++it)
|
||||
c_arr[it] = vec_xl(0, (vector unsigned char*)aoffsets[it]);
|
||||
vector_permute_store(c_arr, 2, vecOffset);
|
||||
for (int it = 0; it< 4; it++)
|
||||
aoffsets[it] = aoffsets[it] + lda;
|
||||
vecOffset += 32;
|
||||
}
|
||||
i = (cols >> 3);
|
||||
if (i > 0) {
|
||||
do {
|
||||
for (int it = 0; it < 4; ++it)
|
||||
c_arr[it] = vec_xl(0, (vector unsigned char*)aoffsets[it]);
|
||||
vector_permute_store(c_arr, 4, vecOffset);
|
||||
for (int it = 0; it< 4; it++)
|
||||
aoffsets[it] = aoffsets[it] + 8*lda;
|
||||
vecOffset += 64;
|
||||
i--;
|
||||
} while(i > 0);
|
||||
}
|
||||
}
|
||||
if (rows & 3) {
|
||||
aoffsets[0] = aoffset;
|
||||
for (int it = 1; it < 4; ++it)
|
||||
aoffsets[it] = aoffsets[it-1] + lda;
|
||||
if (cols == 4) {
|
||||
switch(rows) {
|
||||
case 3: c_arr[2] = vec_xl(0, (vector unsigned char*)aoffsets[2]);
|
||||
case 2: c_arr[1] = vec_xl(0, (vector unsigned char*)aoffsets[1]);
|
||||
case 1: c_arr[0] = vec_xl(0, (vector unsigned char*)aoffsets[0]);
|
||||
break;
|
||||
}
|
||||
vector_permute_store(c_arr, 2, vecOffset);
|
||||
for (int it = 0; it< 4; it++)
|
||||
aoffsets[it] = aoffsets[it] + lda;
|
||||
vecOffset += 32;
|
||||
}
|
||||
i = (cols >> 3);
|
||||
if (i > 0) {
|
||||
do {
|
||||
switch(rows) {
|
||||
case 3: c_arr[2] = vec_xl(0, (vector unsigned char*)aoffsets[2]);
|
||||
case 2: c_arr[1] = vec_xl(0, (vector unsigned char*)aoffsets[1]);
|
||||
case 1: c_arr[0] = vec_xl(0, (vector unsigned char*)aoffsets[0]);
|
||||
break;
|
||||
}
|
||||
vector_permute_store(c_arr, 4, vecOffset);
|
||||
for (int it = 0; it <4; it++)
|
||||
aoffsets[it] = aoffsets[it] + 8* lda;
|
||||
vecOffset += 64;
|
||||
i--;
|
||||
} while(i > 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
|
||||
int64_t mc, nc, mp, np;
|
||||
int m_rem = MIN(m - m0, 8);
|
||||
int n_rem = MIN(n - n0, 8);
|
||||
|
||||
if (m_rem >= 8 && n_rem >= 8) {
|
||||
mc = 8;
|
||||
nc = 8;
|
||||
gemm<8,8>(m0, m, n0, n);
|
||||
} else if (m_rem >= 4 && n_rem >= 8) {
|
||||
mc = 4;
|
||||
nc = 8;
|
||||
gemm<4,8>(m0, m, n0, n);
|
||||
} else if (m_rem >=8 && n_rem >=4){
|
||||
mc = 8;
|
||||
nc = 4;
|
||||
gemm<8,4>(m0, m, n0, n);
|
||||
} else if ((m_rem < 4) && (n_rem >= 8)) {
|
||||
nc = 8;
|
||||
switch(m_rem) {
|
||||
case 1:
|
||||
mc = 1;
|
||||
gemm_Mx8<1>(m0, m, n0, n);
|
||||
break;
|
||||
case 2:
|
||||
mc = 2;
|
||||
gemm_Mx8<2>(m0, m, n0, n);
|
||||
break;
|
||||
case 3:
|
||||
mc = 3;
|
||||
gemm_Mx8<3>(m0, m, n0, n);
|
||||
break;
|
||||
default:
|
||||
return;
|
||||
}
|
||||
} else if (m_rem >= 4 && n_rem >= 4) {
|
||||
mc = 4;
|
||||
nc = 4;
|
||||
gemm_small<4, 4>(m0, m, n0, n);
|
||||
} else if ((m_rem > 4) && (n_rem < 4)) {
|
||||
mc = 4;
|
||||
switch(n_rem) {
|
||||
case 1:
|
||||
nc = 1;
|
||||
gemm_small<4, 1>(m0, m, n0, n);
|
||||
break;
|
||||
case 2:
|
||||
nc = 2;
|
||||
gemm_small<4, 2>(m0, m, n0, n);
|
||||
break;
|
||||
case 3:
|
||||
nc = 3;
|
||||
gemm_small<4, 3>(m0, m, n0, n);
|
||||
break;
|
||||
|
||||
default:
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
switch((m_rem << 4) | n_rem) {
|
||||
case 0x43:
|
||||
mc = 4;
|
||||
nc = 3;
|
||||
gemm_small<4, 3>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x42:
|
||||
mc = 4;
|
||||
nc = 2;
|
||||
gemm_small<4, 2>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x41:
|
||||
mc = 4;
|
||||
nc = 1;
|
||||
gemm_small<4, 1>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x34:
|
||||
mc = 3;
|
||||
nc = 4;
|
||||
gemm_small<3, 4>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x33:
|
||||
mc = 3;
|
||||
nc = 3;
|
||||
gemm_small<3, 3>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x32:
|
||||
mc = 3;
|
||||
nc = 2;
|
||||
gemm_small<3, 2>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x31:
|
||||
mc = 3;
|
||||
nc = 1;
|
||||
gemm_small<3, 1>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x24:
|
||||
mc = 2;
|
||||
nc = 4;
|
||||
gemm_small<2,4>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x23:
|
||||
mc = 2;
|
||||
nc = 3;
|
||||
gemm_small<2, 3>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x22:
|
||||
mc = 2;
|
||||
nc = 2;
|
||||
gemm_small<2, 2>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x21:
|
||||
mc = 2;
|
||||
nc = 1;
|
||||
gemm_small<2, 1>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x14:
|
||||
mc = 1;
|
||||
nc = 4;
|
||||
gemm_small<1, 4>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x13:
|
||||
mc = 1;
|
||||
nc = 3;
|
||||
gemm_small<1, 3>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x12:
|
||||
mc = 1;
|
||||
nc = 2;
|
||||
gemm_small<1, 2>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x11:
|
||||
mc = 1;
|
||||
nc = 1;
|
||||
gemm_small<1, 1>(m0, m, n0, n);
|
||||
break;
|
||||
default:
|
||||
return;
|
||||
}
|
||||
}
|
||||
mp = m0 + (m - m0) / mc * mc;
|
||||
np = n0 + (n - n0) / nc * nc;
|
||||
mnpack(mp, m, n0, np);
|
||||
mnpack(m0, m, np, n);
|
||||
}
|
||||
|
||||
void KERNEL_4x8(int64_t ii, int64_t jj) {
|
||||
vec_t vec_A[4], vec_B[8] , vec_C[4];
|
||||
acc_t acc_0, acc_1;
|
||||
__builtin_mma_xxsetaccz(&acc_0);
|
||||
__builtin_mma_xxsetaccz(&acc_1);
|
||||
for (int l = 0; l < k; l+=8) {
|
||||
packNormal((A+(ii*lda)+l), lda, 4, 8, (uint8_t*)vec_A);
|
||||
packNormal((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B);
|
||||
for (int x = 0; x < 4; x++) {
|
||||
__builtin_mma_xvbf16ger2pp(&acc_0, vec_A[x], vec_B[x]);
|
||||
__builtin_mma_xvbf16ger2pp(&acc_1, vec_A[x], vec_B[x+4]);
|
||||
}
|
||||
}
|
||||
SAVE_ACC(&acc_0, ii, jj);
|
||||
SAVE_ACC(&acc_1, ii, jj+4);
|
||||
}
|
||||
|
||||
void KERNEL_8x4(int64_t ii, int64_t jj) {
|
||||
vec_t vec_A[8], vec_B[4] , vec_C[4];
|
||||
acc_t acc_0, acc_1;
|
||||
__builtin_mma_xxsetaccz(&acc_0);
|
||||
__builtin_mma_xxsetaccz(&acc_1);
|
||||
for (int l = 0; l < k; l+=8) {
|
||||
packNormal((A+(ii*lda)+l), lda, 8, 8, (uint8_t*)vec_A);
|
||||
packNormal((B+(jj*ldb)+l), ldb, 8, 4, (uint8_t*)vec_B);
|
||||
for (int x = 0; x < 4; x++) {
|
||||
__builtin_mma_xvbf16ger2pp(&acc_0, vec_A[x], vec_B[x]);
|
||||
__builtin_mma_xvbf16ger2pp(&acc_1, vec_A[x+4], vec_B[x]);
|
||||
}
|
||||
}
|
||||
SAVE_ACC(&acc_0, ii, jj);
|
||||
SAVE_ACC(&acc_1, ii+4, jj);
|
||||
}
|
||||
|
||||
|
||||
void KERNEL_8x8(int64_t ii, int64_t jj) {
|
||||
vec_t vec_A[8], vec_B[8], vec_C[4];
|
||||
acc_t acc_0, acc_1, acc_2, acc_3;
|
||||
__builtin_mma_xxsetaccz(&acc_0);
|
||||
__builtin_mma_xxsetaccz(&acc_1);
|
||||
__builtin_mma_xxsetaccz(&acc_2);
|
||||
__builtin_mma_xxsetaccz(&acc_3);
|
||||
for (int l = 0; l < k; l+=8) {
|
||||
packNormal(A+(ii*lda)+l, lda, 8, 8, (uint8_t*)vec_A);
|
||||
packNormal(B+(jj*ldb)+l, ldb, 8, 8, (uint8_t*)vec_B);
|
||||
for (int x = 0; x < 4; x++) {
|
||||
__builtin_mma_xvbf16ger2pp(&acc_0, vec_A[x], vec_B[x]);
|
||||
__builtin_mma_xvbf16ger2pp(&acc_1, (vec_t)vec_A[x], (vec_t)vec_B[x+4]);
|
||||
__builtin_mma_xvbf16ger2pp(&acc_2, (vec_t)vec_A[x+4], (vec_t)vec_B[x]);
|
||||
__builtin_mma_xvbf16ger2pp(&acc_3, (vec_t)vec_A[x+4], (vec_t)vec_B[x+4]);
|
||||
}
|
||||
}
|
||||
|
||||
SAVE_ACC(&acc_0, ii, jj);
|
||||
SAVE_ACC(&acc_1, ii, jj+4);
|
||||
SAVE_ACC(&acc_2, ii+4, jj);
|
||||
SAVE_ACC(&acc_3, ii+4, jj+4);
|
||||
}
|
||||
|
||||
template<int RM, int RN>
|
||||
void gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n) {
|
||||
int64_t ytiles = (m - m0) / RM;
|
||||
int64_t xtiles = (n - n0) / RN;
|
||||
int64_t tiles = xtiles * ytiles;
|
||||
int64_t duty = (tiles + nth - 1) / nth;
|
||||
int64_t start = duty * ith;
|
||||
int64_t end = start + duty;
|
||||
if (end > tiles)
|
||||
end = tiles;
|
||||
for (int64_t job = start; job < end; ++job) {
|
||||
int64_t ii = m0 + job / xtiles * RM;
|
||||
int64_t jj = n0 + job % xtiles * RN;
|
||||
vec_t vec_C[4];
|
||||
acc_t acc_0;
|
||||
__builtin_mma_xxsetaccz(&acc_0);
|
||||
vec_t vec_A[2], vec_B[2];
|
||||
for (int l=0; l<k; l+=4) {
|
||||
packNormal(A+(ii*lda)+l, lda, RM, 4, (uint8_t*)vec_A);
|
||||
packNormal(B+(jj*ldb)+l, ldb, RN, 4, (uint8_t*)vec_B);
|
||||
for (int x = 0; x<2; x++) {
|
||||
__builtin_mma_xvbf16ger2pp(&acc_0, vec_A[x], vec_B[x]);
|
||||
}
|
||||
}
|
||||
__builtin_mma_disassemble_acc(vec_C, &acc_0);
|
||||
for (int I = 0; I < RM; I++) {
|
||||
for (int J = 0; J < RN; J++) {
|
||||
*((TC*)(C+ii+((jj+J)*ldc)+I)) = *((TC*)&vec_C[I]+J);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<int RM>
|
||||
void gemm_Mx8(int64_t m0, int64_t m, int64_t n0, int64_t n) {
|
||||
int RN = 8;
|
||||
int64_t ytiles = (m - m0) / RM;
|
||||
int64_t xtiles = (n - n0) / RN;
|
||||
int64_t tiles = xtiles * ytiles;
|
||||
int64_t duty = (tiles + nth - 1) / nth;
|
||||
int64_t start = duty * ith;
|
||||
int64_t end = start + duty;
|
||||
if (end > tiles)
|
||||
end = tiles;
|
||||
for (int64_t job = start; job < end; ++job) {
|
||||
int64_t ii = m0 + job / xtiles * RM;
|
||||
int64_t jj = n0 + job % xtiles * RN;
|
||||
vec_t vec_C[4];
|
||||
acc_t acc_0, acc_1;
|
||||
__builtin_mma_xxsetaccz(&acc_0);
|
||||
__builtin_mma_xxsetaccz(&acc_1);
|
||||
vec_t vec_A[4], vec_B[8];
|
||||
for (int l=0; l<k; l+=8) {
|
||||
packNormal(A+(ii*lda)+l, lda, RM, 8, (uint8_t*)vec_A);
|
||||
packNormal(B+(jj*ldb)+l, ldb, RN, 8, (uint8_t*)vec_B);
|
||||
for (int x = 0; x<4; x++) {
|
||||
__builtin_mma_xvbf16ger2pp(&acc_0, vec_A[x], vec_B[x]);
|
||||
__builtin_mma_xvbf16ger2pp(&acc_1, vec_A[x], vec_B[x+4]);
|
||||
}
|
||||
}
|
||||
__builtin_mma_disassemble_acc(vec_C, &acc_0);
|
||||
for (int I = 0; I < RM; I++) {
|
||||
for (int J = 0; J < 4; J++) {
|
||||
*((TC*)(C+ii+((jj+J)*ldc)+I)) = *((TC*)&vec_C[I]+J);
|
||||
}
|
||||
}
|
||||
__builtin_mma_disassemble_acc(vec_C, &acc_1);
|
||||
for (int I = 0; I < RM; I++) {
|
||||
for (int J = 0; J < 4; J++) {
|
||||
*((TC*)(C+ii+((jj+4+J)*ldc)+I)) = *((TC*)&vec_C[I]+J);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<int RM, int RN>
|
||||
inline void kernel(int64_t ii, int64_t jj) {
|
||||
if constexpr(RM == 4 && RN == 8) {
|
||||
KERNEL_4x8(ii,jj);
|
||||
} else if constexpr(RM == 8 && RN == 8) {
|
||||
KERNEL_8x8(ii,jj);
|
||||
} else if constexpr(RM == 8 && RN == 4) {
|
||||
KERNEL_8x4(ii,jj);
|
||||
} else {
|
||||
static_assert(false, "RN/RM values not supported");
|
||||
}
|
||||
}
|
||||
|
||||
template <int RM, int RN>
|
||||
NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
|
||||
int64_t ytiles = (m - m0) / RM;
|
||||
int64_t xtiles = (n - n0) / RN;
|
||||
int64_t tiles = xtiles * ytiles;
|
||||
int64_t duty = (tiles + nth - 1) / nth;
|
||||
int64_t start = duty * ith;
|
||||
int64_t end = start + duty;
|
||||
if (end > tiles)
|
||||
end = tiles;
|
||||
for (int64_t job = start; job < end; ++job) {
|
||||
int64_t ii = m0 + job / xtiles * RM;
|
||||
int64_t jj = n0 + job % xtiles * RN;
|
||||
kernel<RM, RN>(ii, jj);
|
||||
}
|
||||
}
|
||||
|
||||
const TA *const A;
|
||||
const TB *const B;
|
||||
TC *C;
|
||||
const int64_t k;
|
||||
const int64_t lda;
|
||||
const int64_t ldb;
|
||||
const int64_t ldc;
|
||||
const int ith;
|
||||
const int nth;
|
||||
};
|
||||
|
||||
template <typename TA, typename TB, typename TC>
|
||||
class tinyBLAS_Q0_PPC {
|
||||
public:
|
||||
@@ -2202,6 +2689,7 @@ class tinyBLAS_PPC {
|
||||
boffset = vec;
|
||||
j = (rows >> 3);
|
||||
if (j > 0) {
|
||||
|
||||
do {
|
||||
aoffset1 = aoffset;
|
||||
aoffset2 = aoffset1 + lda;
|
||||
@@ -2875,9 +3363,22 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
|
||||
(float *)C, ldc};
|
||||
return tb.matmul(m, n);
|
||||
}
|
||||
#elif defined(__MMA__)
|
||||
if ((k % 8))
|
||||
return false;
|
||||
if(Btype == GGML_TYPE_BF16) {
|
||||
tinyBLAS_BF16_PPC<ggml_bf16_t, ggml_bf16_t, float> tb{ k,
|
||||
(const ggml_bf16_t *)A, lda,
|
||||
(const ggml_bf16_t *)B, ldb,
|
||||
(float *)C, ldc,
|
||||
params->ith, params->nth};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
|
||||
case GGML_TYPE_F16: {
|
||||
#if defined(__AVX512F__)
|
||||
if (Btype == GGML_TYPE_F16) {
|
||||
|
||||
@@ -8,19 +8,6 @@
|
||||
|
||||
#include <float.h>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
// disable "possible loss of data" to avoid hundreds of casts
|
||||
// we should just be careful :)
|
||||
#pragma warning(disable: 4244 4267)
|
||||
|
||||
// disable POSIX deprecation warnings
|
||||
// these functions are never going away, anyway
|
||||
#pragma warning(disable: 4996)
|
||||
|
||||
// unreachable code because of multiple instances of code after GGML_ABORT
|
||||
#pragma warning(disable: 4702)
|
||||
#endif
|
||||
|
||||
// ggml_compute_forward_dup
|
||||
|
||||
static void ggml_compute_forward_dup_same_cont(
|
||||
@@ -4222,7 +4209,7 @@ static void ggml_compute_forward_get_rows_f16(
|
||||
|
||||
GGML_ASSERT(i01 >= 0 && i01 < ne01);
|
||||
|
||||
ggml_fp16_to_fp32_row(
|
||||
ggml_cpu_fp16_to_fp32(
|
||||
(const ggml_fp16_t*) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
|
||||
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
|
||||
}
|
||||
@@ -4263,7 +4250,7 @@ static void ggml_compute_forward_get_rows_bf16(
|
||||
|
||||
GGML_ASSERT(i01 >= 0 && i01 < ne01);
|
||||
|
||||
ggml_bf16_to_fp32_row(
|
||||
ggml_cpu_bf16_to_fp32(
|
||||
(const ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
|
||||
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
|
||||
}
|
||||
@@ -6064,6 +6051,178 @@ void ggml_compute_forward_conv_transpose_2d(
|
||||
}
|
||||
}
|
||||
|
||||
// ggml_compute_forward_conv_2d_dw
|
||||
|
||||
struct ggml_conv_2d_dw_params {
|
||||
int64_t channels;
|
||||
int64_t batch;
|
||||
int64_t src_w;
|
||||
int64_t src_h;
|
||||
int64_t dst_w;
|
||||
int64_t dst_h;
|
||||
int64_t knl_w;
|
||||
int64_t knl_h;
|
||||
int stride_x;
|
||||
int stride_y;
|
||||
int pad_x;
|
||||
int pad_y;
|
||||
int dilation_x;
|
||||
int dilation_y;
|
||||
};
|
||||
|
||||
static void ggml_compute_forward_conv_2d_dw_cwhn(
|
||||
const ggml_compute_params * params,
|
||||
const ggml_tensor * src,
|
||||
const ggml_tensor * kernel,
|
||||
ggml_tensor * dst,
|
||||
const ggml_conv_2d_dw_params & p) {
|
||||
|
||||
const int64_t c = p.channels;
|
||||
const float * knl_data = (const float *)kernel->data;
|
||||
|
||||
const int64_t rows_total = p.dst_h * p.batch;
|
||||
const int64_t rows_per_thread = (rows_total + params->nth - 1) / params->nth;
|
||||
const int64_t row_start = params->ith * rows_per_thread;
|
||||
const int64_t row_end = MIN(row_start + rows_per_thread, rows_total);
|
||||
|
||||
#ifdef GGML_SIMD
|
||||
const int64_t pkg_size = GGML_F32_EPR;
|
||||
const int64_t pkg_count = c / pkg_size;
|
||||
const int64_t c_pkg_end = pkg_count * pkg_size;
|
||||
#else
|
||||
const int64_t c_pkg_end = 0;
|
||||
#endif
|
||||
|
||||
for (int64_t row = row_start; row < row_end; ++row) {
|
||||
const int64_t dst_y = row % p.dst_h;
|
||||
const float * src_data = (const float *)src->data + (row / p.dst_h) * p.src_w * p.src_h * c;
|
||||
for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) {
|
||||
float * dst_data = (float *)dst->data + (row * p.dst_w + dst_x) * c;
|
||||
const int64_t src_y_base = dst_y * p.stride_y - p.pad_y;
|
||||
const int64_t src_x_base = dst_x * p.stride_x - p.pad_x;
|
||||
|
||||
#ifdef GGML_SIMD
|
||||
// Vectorized loop
|
||||
for (int64_t c_i = 0; c_i < c_pkg_end; c_i += pkg_size) {
|
||||
GGML_F32_VEC sum = GGML_F32_VEC_ZERO;
|
||||
for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
|
||||
const int64_t src_y = src_y_base + knl_y * p.dilation_y;
|
||||
if (src_y < 0 || src_y >= p.src_h) {
|
||||
continue;
|
||||
}
|
||||
for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
|
||||
const int64_t src_x = src_x_base + knl_x * p.dilation_x;
|
||||
if (src_x < 0 || src_x >= p.src_w) {
|
||||
continue;
|
||||
}
|
||||
GGML_F32_VEC k = GGML_F32_VEC_LOAD(knl_data + (knl_y * p.knl_w + knl_x) * c + c_i);
|
||||
GGML_F32_VEC s = GGML_F32_VEC_LOAD(src_data + (src_y * p.src_w + src_x) * c + c_i);
|
||||
sum = GGML_F32_VEC_FMA(sum, k, s);
|
||||
}
|
||||
}
|
||||
GGML_F32_VEC_STORE(dst_data + c_i, sum);
|
||||
}
|
||||
#endif
|
||||
// Scalar loop
|
||||
for (int64_t c_i = c_pkg_end; c_i < c; ++c_i) {
|
||||
float sum = 0.0f;
|
||||
for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
|
||||
const int64_t src_y = src_y_base + knl_y * p.dilation_y;
|
||||
if (src_y < 0 || src_y >= p.src_h) {
|
||||
continue;
|
||||
}
|
||||
for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
|
||||
const int64_t src_x = src_x_base + knl_x * p.dilation_x;
|
||||
if (src_x < 0 || src_x >= p.src_w) {
|
||||
continue;
|
||||
}
|
||||
sum += knl_data[(knl_y * p.knl_w + knl_x) * c + c_i]
|
||||
* src_data[(src_y * p.src_w + src_x) * c + c_i];
|
||||
}
|
||||
}
|
||||
dst_data[c_i] = sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_conv_2d_dw_whcn(
|
||||
const ggml_compute_params * params,
|
||||
const ggml_tensor * src,
|
||||
const ggml_tensor * kernel,
|
||||
ggml_tensor * dst,
|
||||
const ggml_conv_2d_dw_params & p) {
|
||||
|
||||
const int64_t n = p.channels * p.batch;
|
||||
const int64_t per_thread = (n + params->nth - 1) / params->nth;
|
||||
const int64_t start = params->ith * per_thread;
|
||||
const int64_t end = MIN(start + per_thread, n);
|
||||
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
const float * knl_data = (const float *)kernel->data + (i % p.channels) * p.knl_w * p.knl_h;
|
||||
const float * src_data = (const float *)src->data + i * p.src_w * p.src_h;
|
||||
float * dst_data = (float *)dst->data + i * p.dst_w * p.dst_h;
|
||||
|
||||
for (int64_t dst_y = 0; dst_y < p.dst_h; ++dst_y) {
|
||||
for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) {
|
||||
|
||||
float sum = 0.0f;
|
||||
for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
|
||||
const int64_t src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y;
|
||||
if (src_y < 0 || src_y >= p.src_h) {
|
||||
continue;
|
||||
}
|
||||
for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
|
||||
const int64_t src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x;
|
||||
if (src_x < 0 || src_x >= p.src_w) {
|
||||
continue;
|
||||
}
|
||||
sum += knl_data[knl_y * p.knl_w + knl_x]
|
||||
* src_data[src_y * p.src_w + src_x];
|
||||
}
|
||||
}
|
||||
dst_data[dst_y * p.dst_w + dst_x] = sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_compute_forward_conv_2d_dw(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
|
||||
const ggml_tensor * kernel = dst->src[0];
|
||||
const ggml_tensor * src = dst->src[1];
|
||||
ggml_conv_2d_dw_params p;
|
||||
p.channels = src->ne[2];
|
||||
p.batch = src->ne[3];
|
||||
p.src_w = src->ne[0];
|
||||
p.src_h = src->ne[1];
|
||||
p.dst_w = dst->ne[0];
|
||||
p.dst_h = dst->ne[1];
|
||||
p.knl_w = kernel->ne[0];
|
||||
p.knl_h = kernel->ne[1];
|
||||
p.stride_x = dst->op_params[0];
|
||||
p.stride_y = dst->op_params[1];
|
||||
p.pad_x = dst->op_params[2];
|
||||
p.pad_y = dst->op_params[3];
|
||||
p.dilation_x = dst->op_params[4];
|
||||
p.dilation_y = dst->op_params[5];
|
||||
|
||||
GGML_ASSERT(kernel->ne[3] == p.channels);
|
||||
GGML_ASSERT(dst->ne[3] == p.batch);
|
||||
|
||||
if (ggml_is_contiguous(src)) {
|
||||
ggml_compute_forward_conv_2d_dw_whcn(params, src, kernel, dst, p);
|
||||
} else if (ggml_is_contiguous_channels(src)) {
|
||||
// kernel should also have channels most contiguous in memory
|
||||
GGML_ASSERT(kernel->nb[0] >= kernel->nb[2] && kernel->nb[1] >= kernel->nb[0]);
|
||||
ggml_compute_forward_conv_2d_dw_cwhn(params, src, kernel, dst, p);
|
||||
} else {
|
||||
GGML_ABORT("non-contiguous memory layout not supported");
|
||||
}
|
||||
}
|
||||
|
||||
// ggml_compute_forward_pool_1d_sk_p0
|
||||
|
||||
static void ggml_compute_forward_pool_1d_sk_p0(
|
||||
|
||||
@@ -65,6 +65,7 @@ void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * p
|
||||
void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_conv_2d_dw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_pool_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
|
||||
@@ -341,7 +341,7 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
|
||||
#define GGML_F32_EPR 4
|
||||
|
||||
#define GGML_F32x4 vector float
|
||||
#define GGML_F32x4_ZERO 0.0f
|
||||
#define GGML_F32x4_ZERO {0.0f}
|
||||
#define GGML_F32x4_SET1 vec_splats
|
||||
#define GGML_F32x4_LOAD(p) vec_xl(0, p)
|
||||
#define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p)
|
||||
|
||||
@@ -2,12 +2,6 @@
|
||||
|
||||
#include <cassert>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
// disable "possible loss of data" to avoid hundreds of casts
|
||||
// we should just be careful :)
|
||||
#pragma warning(disable: 4244 4267)
|
||||
#endif
|
||||
|
||||
// precomputed gelu table for f16 (128 KB)
|
||||
ggml_fp16_t ggml_table_gelu_f16[1 << 16];
|
||||
|
||||
|
||||
@@ -12,12 +12,30 @@ if (CUDAToolkit_FOUND)
|
||||
# 61 == Pascal, __dp4a instruction (per-byte integer dot product)
|
||||
# 70 == V100, FP16 tensor cores
|
||||
# 75 == Turing, int8 tensor cores
|
||||
# 80 == Ampere, asynchronous data loading, faster tensor core instructions
|
||||
# 86 == RTX 3000, needs CUDA v11.1
|
||||
# 89 == RTX 4000, needs CUDA v11.8
|
||||
#
|
||||
# XX-virtual == compile CUDA code as PTX, do JIT compilation to binary code on first run
|
||||
# XX-real == compile CUDA code as device code for this specific architecture
|
||||
# no suffix == compile as both PTX and device code
|
||||
#
|
||||
# The default behavior for a non-native is to build virtual architectures as needed to cover all features needed
|
||||
# for best performance and to also build real architectures for the most commonly used GPUs.
|
||||
if (GGML_NATIVE AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.6" AND CMAKE_VERSION VERSION_GREATER_EQUAL "3.24")
|
||||
set(CMAKE_CUDA_ARCHITECTURES "native")
|
||||
elseif(GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
|
||||
set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75;80")
|
||||
if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.8")
|
||||
set(CMAKE_CUDA_ARCHITECTURES "60-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real;89-real")
|
||||
else()
|
||||
set(CMAKE_CUDA_ARCHITECTURES "60-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real")
|
||||
endif()
|
||||
else()
|
||||
set(CMAKE_CUDA_ARCHITECTURES "50;61;70;75;80")
|
||||
if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.8")
|
||||
set(CMAKE_CUDA_ARCHITECTURES "50-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real;89-real")
|
||||
else()
|
||||
set(CMAKE_CUDA_ARCHITECTURES "50-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real")
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
|
||||
@@ -100,7 +118,7 @@ if (CUDAToolkit_FOUND)
|
||||
|
||||
set(CUDA_CXX_FLAGS "")
|
||||
|
||||
set(CUDA_FLAGS -use_fast_math)
|
||||
set(CUDA_FLAGS -use_fast_math -extended-lambda)
|
||||
|
||||
if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8")
|
||||
# Options are:
|
||||
@@ -133,6 +151,7 @@ if (CUDAToolkit_FOUND)
|
||||
COMMAND ${NVCC_CMD} -Xcompiler "-dumpfullversion -dumpversion"
|
||||
OUTPUT_VARIABLE CUDA_CCVER
|
||||
ERROR_QUIET
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE
|
||||
)
|
||||
else()
|
||||
if (CUDA_CCFULLVER MATCHES Apple)
|
||||
@@ -143,7 +162,7 @@ if (CUDAToolkit_FOUND)
|
||||
string(REGEX REPLACE "^.* version ([0-9.]*).*$" "\\1" CUDA_CCVER ${CUDA_CCFULLVER})
|
||||
endif()
|
||||
|
||||
message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")
|
||||
message(STATUS "CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")
|
||||
|
||||
ggml_get_flags(${CUDA_CCID} ${CUDA_CCVER})
|
||||
list(APPEND CUDA_CXX_FLAGS ${CXX_FLAGS} ${GF_CXX_FLAGS}) # This is passed to -Xcompiler later
|
||||
|
||||
@@ -78,13 +78,13 @@
|
||||
// Moore Threads
|
||||
#define GGML_CUDA_MUSA_ARCH_IS_QY1 (__MUSA_ARCH__ <= 210)
|
||||
|
||||
#define GGML_CUDA_CC_QY1 (GGML_MUSA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
|
||||
#define GGML_CUDA_CC_QY2 (GGML_MUSA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
|
||||
#define GGML_CUDA_CC_NG (GGML_MUSA_CC_OFFSET_MTHREADS + 0x310) // TBD
|
||||
#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
|
||||
#define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
|
||||
#define GGML_CUDA_CC_NG (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD
|
||||
|
||||
#define GGML_CUDA_CC_IS_MTHREADS(cc) (cc >= GGML_CUDA_CC_OFFSET_MTHREADS && cc < GGML_CUDA_CC_OFFSET_AMD)
|
||||
#define GGML_CUDA_CC_IS_QY1(cc) (cc >= GGML_CUDA_CC_QY1 && cc < GGML_CUDA_CC_QY2)
|
||||
#define GGML_CUDA_CC_IS_QY2(cc) (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_NEXT)
|
||||
#define GGML_CUDA_CC_IS_QY2(cc) (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_NG)
|
||||
#define GGML_CUDA_CC_IS_NG(cc) (cc >= GGML_CUDA_CC_NG)
|
||||
|
||||
#ifdef __CUDA_ARCH_LIST__
|
||||
@@ -130,10 +130,6 @@ static int ggml_cuda_highest_compiled_arch(const int arch) {
|
||||
|
||||
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
#define GGML_CUDA_MAX_STREAMS 8
|
||||
|
||||
[[noreturn]]
|
||||
@@ -300,6 +296,25 @@ static __device__ void no_device_code(
|
||||
#define NO_DEVICE_CODE //GGML_ABORT("NO_DEVICE_CODE not valid in host code.")
|
||||
#endif // __CUDA_ARCH__
|
||||
|
||||
// The compiler is always able to unroll loops if they contain continue expressions.
|
||||
// In such cases loop unrolling can still be achieved via recursion:
|
||||
template <int n>
|
||||
struct ggml_cuda_unroll {
|
||||
template <typename Func, typename... Args>
|
||||
__device__ void operator()(const Func & f, Args... args) const {
|
||||
f(n - 1, args...);
|
||||
ggml_cuda_unroll<n - 1>{}(f, args...);
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct ggml_cuda_unroll<1> {
|
||||
template <typename Func, typename... Args>
|
||||
__device__ void operator()(const Func & f, Args... args) const {
|
||||
f(0, args...);
|
||||
}
|
||||
};
|
||||
|
||||
template<int width = WARP_SIZE>
|
||||
static __device__ __forceinline__ int warp_reduce_sum(int x) {
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
#include "convert.cuh"
|
||||
#include "dequantize.cuh"
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
#define CUDA_Q8_0_NE_ALIGN 2048
|
||||
|
||||
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
||||
@@ -570,30 +572,46 @@ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t
|
||||
}
|
||||
|
||||
template <typename src_t, typename dst_t>
|
||||
static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
|
||||
const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
|
||||
static __global__ void convert_unary(
|
||||
const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01, const int64_t ne02,
|
||||
const int64_t s01, const int64_t s02, const int64_t s03) {
|
||||
const int64_t i00 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= k) {
|
||||
if (i00 >= ne00) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int64_t i01 = blockIdx.y;
|
||||
const int64_t i02 = blockIdx.z % ne02;
|
||||
const int64_t i03 = blockIdx.z / ne02;
|
||||
|
||||
const src_t * x = (const src_t *) vx;
|
||||
|
||||
y[i] = float(x[i]);
|
||||
const int64_t ix = i03*s03 + i02*s02 + i01*s01 + i00;
|
||||
const int64_t iy = ((i03*ne02 + i02)*ne01 + i01)*ne00 + i00;
|
||||
y[iy] = float(x[ix]);
|
||||
}
|
||||
|
||||
template <typename src_t, typename dst_t>
|
||||
static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
|
||||
convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
||||
static void convert_unary_cuda(const void * vx, dst_t * y,
|
||||
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
|
||||
const int64_t s01, const int64_t s02, const int64_t s03, cudaStream_t stream) {
|
||||
const dim3 num_blocks((ne00 + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE, ne01, ne02*ne03);
|
||||
convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>
|
||||
(vx, y, ne00, ne01, ne02, s01, s02, s03);
|
||||
}
|
||||
|
||||
template <typename src_t, typename dst_t>
|
||||
static void convert_unary_cont_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
convert_unary_cuda<src_t>(vx, y, k, 1, 1, 1, k, k, k, stream);
|
||||
}
|
||||
|
||||
to_bf16_cuda_t ggml_get_to_bf16_cuda(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_F32:
|
||||
return convert_unary_cuda<float>;
|
||||
return convert_unary_cont_cuda<float>;
|
||||
case GGML_TYPE_F16:
|
||||
return convert_unary_cuda<half>;
|
||||
return convert_unary_cont_cuda<half>;
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
@@ -643,9 +661,9 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
|
||||
case GGML_TYPE_IQ3_S:
|
||||
return dequantize_row_iq3_s_cuda;
|
||||
case GGML_TYPE_F32:
|
||||
return convert_unary_cuda<float>;
|
||||
return convert_unary_cont_cuda<float>;
|
||||
case GGML_TYPE_BF16:
|
||||
return convert_unary_cuda<nv_bfloat16>;
|
||||
return convert_unary_cont_cuda<nv_bfloat16>;
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
@@ -692,7 +710,18 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
||||
case GGML_TYPE_IQ3_S:
|
||||
return dequantize_row_iq3_s_cuda;
|
||||
case GGML_TYPE_F16:
|
||||
return convert_unary_cuda<half>;
|
||||
return convert_unary_cont_cuda<half>;
|
||||
case GGML_TYPE_BF16:
|
||||
return convert_unary_cont_cuda<nv_bfloat16>;
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_F32:
|
||||
return convert_unary_cuda<float>;
|
||||
case GGML_TYPE_BF16:
|
||||
return convert_unary_cuda<nv_bfloat16>;
|
||||
default:
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
|
||||
|
||||
template<typename T>
|
||||
using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, cudaStream_t stream);
|
||||
using to_t_cuda_t = void (*)(const void * x, T * y, int64_t k, cudaStream_t stream);
|
||||
|
||||
typedef to_t_cuda_t<float> to_fp32_cuda_t;
|
||||
typedef to_t_cuda_t<half> to_fp16_cuda_t;
|
||||
@@ -14,3 +14,13 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type);
|
||||
to_bf16_cuda_t ggml_get_to_bf16_cuda(ggml_type type);
|
||||
|
||||
to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type);
|
||||
|
||||
// TODO more general support for non-contiguous inputs
|
||||
|
||||
template<typename T>
|
||||
using to_t_nc_cuda_t = void (*)(const void * x, T * y,
|
||||
int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03,
|
||||
int64_t s01, int64_t s02, int64_t s03, cudaStream_t stream);
|
||||
|
||||
typedef to_t_nc_cuda_t<half> to_fp16_nc_cuda_t;
|
||||
to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type);
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user