mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-04-23 16:37:33 +03:00
Compare commits
120 Commits
b1218
...
custom-att
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c5650ed470 | ||
|
|
ce2d995af2 | ||
|
|
2b8830af71 | ||
|
|
a207561503 | ||
|
|
d008733e6b | ||
|
|
4c72ab13b2 | ||
|
|
e9463792d3 | ||
|
|
4ad0676927 | ||
|
|
25856900db | ||
|
|
4aea3b846e | ||
|
|
da0400344b | ||
|
|
e519621010 | ||
|
|
ac43576124 | ||
|
|
20c7e1e804 | ||
|
|
dc6897404e | ||
|
|
527e57cfd8 | ||
|
|
ffe88a36a9 | ||
|
|
c1596f633f | ||
|
|
99115f3fa6 | ||
|
|
1726f9626f | ||
|
|
a98b1633d5 | ||
|
|
c091cdfb24 | ||
|
|
51a7cf5c6e | ||
|
|
bedb92b603 | ||
|
|
bc9d3e3971 | ||
|
|
36b904e200 | ||
|
|
8845160058 | ||
|
|
5a3369d8e8 | ||
|
|
324f3403d5 | ||
|
|
f56c418ab0 | ||
|
|
8185710a80 | ||
|
|
7eb41179ed | ||
|
|
b2debf65f2 | ||
|
|
a5661d7e71 | ||
|
|
ded9b43cad | ||
|
|
65c2c1c5ab | ||
|
|
ee1d670cc6 | ||
|
|
80834daecf | ||
|
|
1be2b8c19b | ||
|
|
a40f2b656f | ||
|
|
2f3a46fccf | ||
|
|
54206962c7 | ||
|
|
e04dc51988 | ||
|
|
db0fc2da06 | ||
|
|
b377bf2266 | ||
|
|
addae65fd4 | ||
|
|
d119c04c15 | ||
|
|
a1327c71c6 | ||
|
|
e1067efbfa | ||
|
|
7b7472ee26 | ||
|
|
6028879f56 | ||
|
|
eed3fd4234 | ||
|
|
8a9aca37c1 | ||
|
|
4b5f3cd6bf | ||
|
|
82e20e9ba0 | ||
|
|
d37081ae5d | ||
|
|
16090a5dde | ||
|
|
806d397c1a | ||
|
|
ddad227782 | ||
|
|
36714e16d0 | ||
|
|
467e307931 | ||
|
|
25bd254089 | ||
|
|
7e2b9974d1 | ||
|
|
daf4c6d360 | ||
|
|
fa0e677820 | ||
|
|
897caccdf4 | ||
|
|
466b513851 | ||
|
|
0161372b9a | ||
|
|
1f17ea631c | ||
|
|
7c1bdd0e8a | ||
|
|
0cbf3bfef8 | ||
|
|
86c90e34f5 | ||
|
|
f015b26689 | ||
|
|
8781013ef6 | ||
|
|
4d76d762ef | ||
|
|
6952a460b9 | ||
|
|
9f42e75489 | ||
|
|
58bb5110ca | ||
|
|
d29e76937c | ||
|
|
7ddf185537 | ||
|
|
ee66942d7e | ||
|
|
fad56936d4 | ||
|
|
1fb033fd85 | ||
|
|
3b4bab6a38 | ||
|
|
c5df72e848 | ||
|
|
111163e246 | ||
|
|
8b428c9bc8 | ||
|
|
578d8c8f5c | ||
|
|
b541b4f0b1 | ||
|
|
5dbc2b3213 | ||
|
|
b08e75baea | ||
|
|
e6616cf0db | ||
|
|
3aefaab9e5 | ||
|
|
69eb67e282 | ||
|
|
4fe09dfe66 | ||
|
|
80291a1d02 | ||
|
|
c6f1491da0 | ||
|
|
e3d87a6c36 | ||
|
|
8c00b7a6ff | ||
|
|
7e50d34be6 | ||
|
|
235f7c193b | ||
|
|
a51b687657 | ||
|
|
76164fe2e6 | ||
|
|
c2ab6fe661 | ||
|
|
2d770505a8 | ||
|
|
98311c4277 | ||
|
|
feea179e9f | ||
|
|
769266a543 | ||
|
|
cf8238e7f4 | ||
|
|
4b8560e72a | ||
|
|
83a53b753a | ||
|
|
5c872dbca2 | ||
|
|
990a5e226a | ||
|
|
980ab41afb | ||
|
|
e394084166 | ||
|
|
4c8643dd6e | ||
|
|
35f73049af | ||
|
|
71ca2fad7d | ||
|
|
1b6c650d16 | ||
|
|
0a5eebb45d |
22
.devops/cloud-v-pipeline
Normal file
22
.devops/cloud-v-pipeline
Normal file
@@ -0,0 +1,22 @@
|
||||
node('x86_runner1'){ // Running on x86 runner containing latest vector qemu, latest vector gcc and all the necessary libraries
|
||||
stage('Cleanup'){
|
||||
cleanWs() // Cleaning previous CI build in workspace
|
||||
}
|
||||
stage('checkout repo'){
|
||||
retry(5){ // Retry if the cloning fails due to some reason
|
||||
checkout scm // Clone the repo on Runner
|
||||
}
|
||||
}
|
||||
stage('Compiling llama.cpp'){
|
||||
sh'''#!/bin/bash
|
||||
make RISCV=1 RISCV_CROSS_COMPILE=1 # Compiling llama for RISC-V
|
||||
'''
|
||||
}
|
||||
stage('Running llama.cpp'){
|
||||
sh'''#!/bin/bash
|
||||
module load gnu-bin2/0.1 # loading latest versions of vector qemu and vector gcc
|
||||
qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./main -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
|
||||
cat llama_log.txt # Printing results
|
||||
'''
|
||||
}
|
||||
}
|
||||
95
.github/workflows/build.yml
vendored
95
.github/workflows/build.yml
vendored
@@ -27,7 +27,7 @@ jobs:
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v1
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
@@ -52,7 +52,7 @@ jobs:
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v1
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
@@ -87,7 +87,7 @@ jobs:
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v1
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
@@ -121,7 +121,7 @@ jobs:
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v1
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
@@ -149,7 +149,7 @@ jobs:
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v1
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
@@ -174,7 +174,7 @@ jobs:
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v1
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
@@ -265,22 +265,24 @@ jobs:
|
||||
matrix:
|
||||
include:
|
||||
- build: 'noavx'
|
||||
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF'
|
||||
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'avx2'
|
||||
defines: '-DLLAMA_BUILD_SERVER=ON'
|
||||
defines: '-DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'avx'
|
||||
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF'
|
||||
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'avx512'
|
||||
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'clblast'
|
||||
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
|
||||
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
|
||||
- build: 'openblas'
|
||||
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
|
||||
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v1
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Download OpenCL SDK
|
||||
id: get_opencl
|
||||
@@ -390,27 +392,29 @@ jobs:
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
cuda: ['12.1.0', '11.7.1']
|
||||
cuda: ['12.2.0', '11.7.1']
|
||||
build: ['cublas']
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v1
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- uses: Jimver/cuda-toolkit@v0.2.10
|
||||
- uses: Jimver/cuda-toolkit@v0.2.11
|
||||
id: cuda-toolkit
|
||||
with:
|
||||
cuda: ${{ matrix.cuda }}
|
||||
# TODO(green-sky): _dev seems to fail, and non dev are not enought
|
||||
#sub-packages: '["nvcc", "cudart", "cublas", "cudart_dev", "cublas_dev"]'
|
||||
method: 'network'
|
||||
sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON
|
||||
cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON
|
||||
cmake --build . --config Release
|
||||
|
||||
- name: Determine tag name
|
||||
@@ -440,27 +444,11 @@ jobs:
|
||||
llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
|
||||
|
||||
- name: Copy and pack Cuda runtime
|
||||
if: ${{ matrix.cuda == '12.1.0' }}
|
||||
# TODO(green-sky): paths are cuda 12 specific
|
||||
run: |
|
||||
echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
|
||||
mkdir '.\build\bin\cudart\'
|
||||
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cudart64_12.dll" '.\build\bin\cudart\'
|
||||
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublas64_12.dll" '.\build\bin\cudart\'
|
||||
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublasLt64_12.dll" '.\build\bin\cudart\'
|
||||
7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip .\build\bin\cudart\*
|
||||
|
||||
- name: Copy and pack Cuda runtime
|
||||
if: ${{ matrix.cuda == '11.7.1' }}
|
||||
# TODO(green-sky): paths are cuda 11 specific
|
||||
run: |
|
||||
echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
|
||||
mkdir '.\build\bin\cudart\'
|
||||
ls "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin"
|
||||
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cudart64_110.dll" '.\build\bin\cudart\'
|
||||
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublas64_11.dll" '.\build\bin\cudart\'
|
||||
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublasLt64_11.dll" '.\build\bin\cudart\'
|
||||
7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip .\build\bin\cudart\*
|
||||
$dst='.\build\bin\cudart\'
|
||||
robocopy "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
|
||||
7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\*
|
||||
|
||||
- name: Upload Cuda runtime
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
@@ -469,6 +457,23 @@ jobs:
|
||||
path: |
|
||||
cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
|
||||
|
||||
freeBSD-latest:
|
||||
runs-on: macos-12
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Build
|
||||
uses: cross-platform-actions/action@v0.19.0
|
||||
with:
|
||||
operating_system: freebsd
|
||||
version: '13.2'
|
||||
hypervisor: 'qemu'
|
||||
run: |
|
||||
sudo pkg update
|
||||
sudo pkg install -y gmake automake autoconf pkgconf llvm15 clinfo clover opencl clblast openblas
|
||||
gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15
|
||||
|
||||
release:
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
|
||||
@@ -485,7 +490,9 @@ jobs:
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v1
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
@@ -543,7 +550,7 @@ jobs:
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v1
|
||||
# uses: actions/checkout@v3
|
||||
#
|
||||
# - name: Dependencies
|
||||
# run: |
|
||||
@@ -567,7 +574,7 @@ jobs:
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v1
|
||||
# uses: actions/checkout@v3
|
||||
#
|
||||
# - name: Dependencies
|
||||
# run: |
|
||||
@@ -591,7 +598,7 @@ jobs:
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v1
|
||||
# uses: actions/checkout@v3
|
||||
#
|
||||
# - name: Dependencies
|
||||
# run: |
|
||||
@@ -621,7 +628,7 @@ jobs:
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v1
|
||||
# uses: actions/checkout@v3
|
||||
#
|
||||
# - name: Add msbuild to PATH
|
||||
# uses: microsoft/setup-msbuild@v1
|
||||
@@ -660,7 +667,7 @@ jobs:
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v1
|
||||
# uses: actions/checkout@v3
|
||||
#
|
||||
# - name: Add msbuild to PATH
|
||||
# uses: microsoft/setup-msbuild@v1
|
||||
@@ -706,7 +713,7 @@ jobs:
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v1
|
||||
# uses: actions/checkout@v3
|
||||
#
|
||||
# - name: Dependencies
|
||||
# run: |
|
||||
|
||||
15
.github/workflows/docker.yml
vendored
15
.github/workflows/docker.yml
vendored
@@ -26,8 +26,15 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
config:
|
||||
- { tag: "light", dockerfile: ".devops/main.Dockerfile" }
|
||||
- { tag: "full", dockerfile: ".devops/full.Dockerfile" }
|
||||
- { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
- { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
# NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
|
||||
# have disabled them for now until the reason why
|
||||
# is understood.
|
||||
- { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" }
|
||||
- { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
|
||||
- { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
steps:
|
||||
- name: Check out the repo
|
||||
uses: actions/checkout@v3
|
||||
@@ -51,7 +58,7 @@ jobs:
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
platforms: linux/amd64,linux/arm64
|
||||
platforms: ${{ matrix.config.platforms }}
|
||||
tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
|
||||
file: ${{ matrix.config.dockerfile }}
|
||||
|
||||
@@ -60,6 +67,6 @@ jobs:
|
||||
with:
|
||||
context: .
|
||||
push: ${{ github.event_name == 'push' }}
|
||||
platforms: linux/amd64,linux/arm64
|
||||
platforms: ${{ matrix.config.platforms }}
|
||||
tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}"
|
||||
file: ${{ matrix.config.dockerfile }}
|
||||
|
||||
2
.github/workflows/gguf-publish.yml
vendored
2
.github/workflows/gguf-publish.yml
vendored
@@ -24,7 +24,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- uses: actions/checkout@v3
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v2
|
||||
with:
|
||||
|
||||
2
.gitignore
vendored
2
.gitignore
vendored
@@ -51,7 +51,9 @@ models-mnt
|
||||
/save-load-state
|
||||
/server
|
||||
/simple
|
||||
/batched
|
||||
/speculative
|
||||
/parallel
|
||||
/train-text-from-scratch
|
||||
/vdot
|
||||
build-info.h
|
||||
|
||||
@@ -80,6 +80,8 @@ set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kern
|
||||
set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
|
||||
option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)
|
||||
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
|
||||
set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
|
||||
"llama: max. batch size for using peer access")
|
||||
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
|
||||
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
|
||||
option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
|
||||
@@ -116,7 +118,7 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git")
|
||||
add_custom_command(
|
||||
OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h"
|
||||
COMMENT "Generating build details from Git"
|
||||
COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake"
|
||||
COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION} -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake"
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
|
||||
DEPENDS "${GIT_DIR}/index"
|
||||
VERBATIM
|
||||
@@ -135,6 +137,7 @@ set(CMAKE_C_STANDARD 11)
|
||||
set(CMAKE_C_STANDARD_REQUIRED true)
|
||||
set(THREADS_PREFER_PTHREAD_FLAG ON)
|
||||
find_package(Threads REQUIRED)
|
||||
include(CheckCXXCompilerFlag)
|
||||
|
||||
if (NOT MSVC)
|
||||
if (LLAMA_SANITIZE_THREAD)
|
||||
@@ -159,6 +162,8 @@ if (APPLE AND LLAMA_ACCELERATE)
|
||||
message(STATUS "Accelerate framework found")
|
||||
|
||||
add_compile_definitions(GGML_USE_ACCELERATE)
|
||||
add_compile_definitions(ACCELERATE_NEW_LAPACK)
|
||||
add_compile_definitions(ACCELERATE_LAPACK_ILP64)
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK})
|
||||
else()
|
||||
message(WARNING "Accelerate framework not found")
|
||||
@@ -171,8 +176,8 @@ if (LLAMA_METAL)
|
||||
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
|
||||
|
||||
message(STATUS "Metal framework found")
|
||||
|
||||
set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)
|
||||
set(GGML_HEADERS_METAL ggml-metal.h)
|
||||
set(GGML_SOURCES_METAL ggml-metal.m)
|
||||
|
||||
add_compile_definitions(GGML_USE_METAL)
|
||||
if (LLAMA_METAL_NDEBUG)
|
||||
@@ -191,7 +196,6 @@ if (LLAMA_METAL)
|
||||
${METALKIT_FRAMEWORK}
|
||||
)
|
||||
endif()
|
||||
|
||||
if (LLAMA_BLAS)
|
||||
if (LLAMA_STATIC)
|
||||
set(BLA_STATIC ON)
|
||||
@@ -268,7 +272,8 @@ if (LLAMA_BLAS)
|
||||
endif()
|
||||
|
||||
if (LLAMA_K_QUANTS)
|
||||
set(GGML_SOURCES_EXTRA ${GGML_SOURCES_EXTRA} k_quants.c k_quants.h)
|
||||
set(GGML_HEADERS_EXTRA k_quants.h)
|
||||
set(GGML_SOURCES_EXTRA k_quants.c)
|
||||
add_compile_definitions(GGML_USE_K_QUANTS)
|
||||
if (LLAMA_QKK_64)
|
||||
add_compile_definitions(GGML_QKK_64)
|
||||
@@ -284,7 +289,8 @@ if (LLAMA_CUBLAS)
|
||||
|
||||
enable_language(CUDA)
|
||||
|
||||
set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)
|
||||
set(GGML_HEADERS_CUDA ggml-cuda.h)
|
||||
set(GGML_SOURCES_CUDA ggml-cuda.cu)
|
||||
|
||||
add_compile_definitions(GGML_USE_CUBLAS)
|
||||
# if (LLAMA_CUDA_CUBLAS)
|
||||
@@ -302,6 +308,7 @@ if (LLAMA_CUBLAS)
|
||||
add_compile_definitions(GGML_CUDA_F16)
|
||||
endif()
|
||||
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
|
||||
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})
|
||||
|
||||
if (LLAMA_STATIC)
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
|
||||
@@ -332,6 +339,7 @@ if (LLAMA_MPI)
|
||||
find_package(MPI)
|
||||
if (MPI_C_FOUND)
|
||||
message(STATUS "MPI found")
|
||||
set(GGML_HEADERS_MPI ggml-mpi.h)
|
||||
set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h)
|
||||
add_compile_definitions(GGML_USE_MPI)
|
||||
add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
|
||||
@@ -354,7 +362,8 @@ if (LLAMA_CLBLAST)
|
||||
if (CLBlast_FOUND)
|
||||
message(STATUS "CLBlast found")
|
||||
|
||||
set(GGML_SOURCES_OPENCL ggml-opencl.cpp ggml-opencl.h)
|
||||
set(GGML_HEADERS_OPENCL ggml-opencl.h)
|
||||
set(GGML_SOURCES_OPENCL ggml-opencl.cpp)
|
||||
|
||||
add_compile_definitions(GGML_USE_CLBLAST)
|
||||
|
||||
@@ -382,13 +391,15 @@ if (LLAMA_HIPBLAS)
|
||||
message(STATUS "HIP and hipBLAS found")
|
||||
add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
|
||||
add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)
|
||||
if (BUILD_SHARED_LIBS)
|
||||
set_target_properties(ggml-rocm PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
endif()
|
||||
if (LLAMA_CUDA_FORCE_DMMV)
|
||||
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV)
|
||||
endif()
|
||||
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
|
||||
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
|
||||
target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
|
||||
target_compile_definitions(ggml-rocm PRIVATE CC_TURING=1000000000)
|
||||
set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
|
||||
target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas)
|
||||
|
||||
@@ -421,6 +432,7 @@ if (LLAMA_ALL_WARNINGS)
|
||||
-Wextra
|
||||
-Wpedantic
|
||||
-Wcast-qual
|
||||
-Wmissing-declarations
|
||||
-Wno-unused-function
|
||||
-Wno-multichar
|
||||
)
|
||||
@@ -439,7 +451,7 @@ if (LLAMA_ALL_WARNINGS)
|
||||
|
||||
endif()
|
||||
|
||||
if (MSVC)
|
||||
if (WIN32)
|
||||
add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
|
||||
|
||||
if (BUILD_SHARED_LIBS)
|
||||
@@ -492,17 +504,21 @@ if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATC
|
||||
# add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) # MSVC doesn't support vdupq_n_f16, vld1q_f16, vst1q_f16
|
||||
add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
|
||||
else()
|
||||
check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
|
||||
if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
|
||||
add_compile_options(-mfp16-format=ieee)
|
||||
endif()
|
||||
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
|
||||
# Raspberry Pi 1, Zero
|
||||
add_compile_options(-mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access)
|
||||
add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access)
|
||||
endif()
|
||||
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
|
||||
# Raspberry Pi 2
|
||||
add_compile_options(-mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations)
|
||||
add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
|
||||
endif()
|
||||
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
|
||||
# Raspberry Pi 3, 4, Zero 2 (32-bit)
|
||||
add_compile_options(-mfp16-format=ieee -mno-unaligned-access)
|
||||
add_compile_options(-mno-unaligned-access)
|
||||
endif()
|
||||
endif()
|
||||
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "^(x86_64|i686|amd64|x64)$" )
|
||||
@@ -627,11 +643,11 @@ add_library(ggml OBJECT
|
||||
ggml.h
|
||||
ggml-alloc.c
|
||||
ggml-alloc.h
|
||||
${GGML_SOURCES_CUDA}
|
||||
${GGML_SOURCES_OPENCL}
|
||||
${GGML_SOURCES_METAL}
|
||||
${GGML_SOURCES_MPI}
|
||||
${GGML_SOURCES_EXTRA}
|
||||
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
|
||||
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
|
||||
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
|
||||
${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
|
||||
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
|
||||
)
|
||||
|
||||
target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
|
||||
@@ -669,14 +685,53 @@ if (BUILD_SHARED_LIBS)
|
||||
if (LLAMA_METAL)
|
||||
set_target_properties(llama PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
|
||||
endif()
|
||||
install(TARGETS llama LIBRARY)
|
||||
endif()
|
||||
|
||||
|
||||
#
|
||||
# install
|
||||
#
|
||||
|
||||
include(GNUInstallDirs)
|
||||
include(CMakePackageConfigHelpers)
|
||||
|
||||
set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR}
|
||||
CACHE PATH "Location of header files")
|
||||
set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR}
|
||||
CACHE PATH "Location of library files")
|
||||
set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR}
|
||||
CACHE PATH "Location of binary files")
|
||||
set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER})
|
||||
set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
|
||||
set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
|
||||
|
||||
configure_package_config_file(
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/scripts/LlamaConfig.cmake.in
|
||||
${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
|
||||
INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama
|
||||
PATH_VARS LLAMA_INCLUDE_INSTALL_DIR
|
||||
LLAMA_LIB_INSTALL_DIR
|
||||
LLAMA_BIN_INSTALL_DIR )
|
||||
|
||||
write_basic_package_version_file(
|
||||
${CMAKE_CURRENT_BINARY_DIR}/LlamaConfigVersion.cmake
|
||||
VERSION ${LLAMA_INSTALL_VERSION}
|
||||
COMPATIBILITY SameMajorVersion)
|
||||
|
||||
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
|
||||
${CMAKE_CURRENT_BINARY_DIR}/LlamaConfigVersion.cmake
|
||||
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama)
|
||||
|
||||
set(GGML_PUBLIC_HEADERS "ggml.h"
|
||||
"${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}"
|
||||
"${GGML_HEADERS_METAL}" "${GGML_HEADERS_MPI}" "${GGML_HEADERS_EXTRA}")
|
||||
|
||||
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
|
||||
install(TARGETS ggml PUBLIC_HEADER)
|
||||
|
||||
set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/llama.h)
|
||||
install(TARGETS llama LIBRARY PUBLIC_HEADER)
|
||||
|
||||
install(
|
||||
FILES convert.py
|
||||
PERMISSIONS
|
||||
|
||||
109
Makefile
109
Makefile
@@ -1,8 +1,8 @@
|
||||
# Define the default target now so that it is always the first target
|
||||
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative tests/test-c.o
|
||||
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple batched save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative parallel tests/test-c.o
|
||||
|
||||
# Binaries only useful for tests
|
||||
TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1
|
||||
TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama
|
||||
|
||||
# Code coverage output files
|
||||
COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
|
||||
@@ -49,7 +49,7 @@ test: $(TEST_TARGETS)
|
||||
./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
|
||||
elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
|
||||
continue; \
|
||||
elif [ "$$test_target" = "tests/test-tokenizer-1" ]; then \
|
||||
elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
|
||||
continue; \
|
||||
else \
|
||||
echo "Running test $$test_target..."; \
|
||||
@@ -95,65 +95,60 @@ CXXV := $(shell $(CXX) --version | head -n 1)
|
||||
#
|
||||
|
||||
# keep standard at C11 and C++11
|
||||
MK_CPPFLAGS = -I. -Icommon
|
||||
MK_CFLAGS = -std=c11 -fPIC
|
||||
MK_CXXFLAGS = -std=c++11 -fPIC
|
||||
|
||||
# -Ofast tends to produce faster code, but may not be available for some compilers.
|
||||
ifdef LLAMA_FAST
|
||||
OPT = -Ofast
|
||||
MK_CFLAGS += -Ofast
|
||||
MK_HOST_CXXFLAGS += -Ofast
|
||||
MK_CUDA_CXXFLAGS += -O3
|
||||
else
|
||||
OPT = -O3
|
||||
MK_CFLAGS += -O3
|
||||
MK_CXXFLAGS += -O3
|
||||
endif
|
||||
MK_CPPFLAGS = -I. -Icommon
|
||||
MK_CFLAGS = $(OPT) -std=c11 -fPIC
|
||||
MK_CXXFLAGS = $(OPT) -std=c++11 -fPIC
|
||||
MK_LDFLAGS =
|
||||
|
||||
# clock_gettime came in POSIX.1b (1993)
|
||||
# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
|
||||
# posix_memalign came in POSIX.1-2001 / SUSv3
|
||||
# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
|
||||
MK_CFLAGS += -D_XOPEN_SOURCE=600
|
||||
MK_CXXFLAGS += -D_XOPEN_SOURCE=600
|
||||
MK_CPPFLAGS += -D_XOPEN_SOURCE=600
|
||||
|
||||
# Somehow in OpenBSD whenever POSIX conformance is specified
|
||||
# some string functions rely on locale_t availability,
|
||||
# which was introduced in POSIX.1-2008, forcing us to go higher
|
||||
ifeq ($(UNAME_S),OpenBSD)
|
||||
MK_CFLAGS += -U_XOPEN_SOURCE -D_XOPEN_SOURCE=700
|
||||
MK_CXXFLAGS += -U_XOPEN_SOURCE -D_XOPEN_SOURCE=700
|
||||
MK_CPPFLAGS += -U_XOPEN_SOURCE -D_XOPEN_SOURCE=700
|
||||
endif
|
||||
|
||||
# Data types, macros and functions related to controlling CPU affinity and
|
||||
# some memory allocation are available on Linux through GNU extensions in libc
|
||||
ifeq ($(UNAME_S),Linux)
|
||||
MK_CFLAGS += -D_GNU_SOURCE
|
||||
MK_CXXFLAGS += -D_GNU_SOURCE
|
||||
MK_CPPFLAGS += -D_GNU_SOURCE
|
||||
endif
|
||||
|
||||
# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
|
||||
# and on macOS its availability depends on enabling Darwin extensions
|
||||
# similarly on DragonFly, enabling BSD extensions is necessary
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
MK_CFLAGS += -D_DARWIN_C_SOURCE
|
||||
MK_CXXFLAGS += -D_DARWIN_C_SOURCE
|
||||
MK_CPPFLAGS += -D_DARWIN_C_SOURCE
|
||||
endif
|
||||
ifeq ($(UNAME_S),DragonFly)
|
||||
MK_CFLAGS += -D__BSD_VISIBLE
|
||||
MK_CXXFLAGS += -D__BSD_VISIBLE
|
||||
MK_CPPFLAGS += -D__BSD_VISIBLE
|
||||
endif
|
||||
|
||||
# alloca is a non-standard interface that is not visible on BSDs when
|
||||
# POSIX conformance is specified, but not all of them provide a clean way
|
||||
# to enable it in such cases
|
||||
ifeq ($(UNAME_S),FreeBSD)
|
||||
MK_CFLAGS += -D__BSD_VISIBLE
|
||||
MK_CXXFLAGS += -D__BSD_VISIBLE
|
||||
MK_CPPFLAGS += -D__BSD_VISIBLE
|
||||
endif
|
||||
ifeq ($(UNAME_S),NetBSD)
|
||||
MK_CFLAGS += -D_NETBSD_SOURCE
|
||||
MK_CXXFLAGS += -D_NETBSD_SOURCE
|
||||
MK_CPPFLAGS += -D_NETBSD_SOURCE
|
||||
endif
|
||||
ifeq ($(UNAME_S),OpenBSD)
|
||||
MK_CFLAGS += -D_BSD_SOURCE
|
||||
MK_CXXFLAGS += -D_BSD_SOURCE
|
||||
MK_CPPFLAGS += -D_BSD_SOURCE
|
||||
endif
|
||||
|
||||
ifdef LLAMA_DEBUG
|
||||
@@ -180,9 +175,16 @@ endif # LLAMA_DISABLE_LOGS
|
||||
# warnings
|
||||
MK_CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \
|
||||
-Wmissing-prototypes -Werror=implicit-int -Wno-unused-function
|
||||
MK_CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar
|
||||
MK_CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wmissing-declarations -Wno-unused-function -Wno-multichar
|
||||
|
||||
ifeq '' '$(findstring clang++,$(CXX))'
|
||||
# TODO(cebtenzzre): remove this once PR #2632 gets merged
|
||||
TTFS_CXXFLAGS = $(CXXFLAGS) -Wno-missing-declarations
|
||||
|
||||
ifneq '' '$(findstring clang,$(shell $(CXX) --version))'
|
||||
# clang++ only
|
||||
MK_CXXFLAGS += -Wmissing-prototypes
|
||||
TTFS_CXXFLAGS += -Wno-missing-prototypes
|
||||
else
|
||||
# g++ only
|
||||
MK_CXXFLAGS += -Wno-format-truncation -Wno-array-bounds
|
||||
endif
|
||||
@@ -233,7 +235,7 @@ ifndef RISCV
|
||||
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
|
||||
# Use all CPU extensions that are available:
|
||||
MK_CFLAGS += -march=native -mtune=native
|
||||
MK_CXXFLAGS += -march=native -mtune=native
|
||||
MK_HOST_CXXFLAGS += -march=native -mtune=native
|
||||
|
||||
# Usage AVX-only
|
||||
#MK_CFLAGS += -mfma -mf16c -mavx
|
||||
@@ -303,6 +305,8 @@ ifndef LLAMA_NO_ACCELERATE
|
||||
# `-framework Accelerate` works both with Apple Silicon and Mac Intel
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
MK_CPPFLAGS += -DGGML_USE_ACCELERATE
|
||||
MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK
|
||||
MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64
|
||||
MK_LDFLAGS += -framework Accelerate
|
||||
endif
|
||||
endif # LLAMA_NO_ACCELERATE
|
||||
@@ -366,6 +370,11 @@ ifdef LLAMA_CUDA_KQUANTS_ITER
|
||||
else
|
||||
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
|
||||
endif
|
||||
ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE
|
||||
NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(LLAMA_CUDA_PEER_MAX_BATCH_SIZE)
|
||||
else
|
||||
NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
|
||||
endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
|
||||
#ifdef LLAMA_CUDA_CUBLAS
|
||||
# NVCCFLAGS += -DGGML_CUDA_CUBLAS
|
||||
#endif # LLAMA_CUDA_CUBLAS
|
||||
@@ -373,7 +382,7 @@ ifdef LLAMA_CUDA_CCBIN
|
||||
NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
|
||||
endif
|
||||
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
|
||||
$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) -Wno-pedantic -c $< -o $@
|
||||
$(NVCC) $(NVCCFLAGS) -Wno-pedantic -c $< -o $@
|
||||
endif # LLAMA_CUBLAS
|
||||
|
||||
ifdef LLAMA_CLBLAST
|
||||
@@ -408,7 +417,6 @@ ifdef LLAMA_HIPBLAS
|
||||
HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
|
||||
HIPFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
|
||||
HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
|
||||
HIPFLAGS += -DCC_TURING=1000000000
|
||||
ifdef LLAMA_CUDA_FORCE_DMMV
|
||||
HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
|
||||
endif # LLAMA_CUDA_FORCE_DMMV
|
||||
@@ -442,23 +450,30 @@ k_quants.o: k_quants.c k_quants.h
|
||||
endif # LLAMA_NO_K_QUANTS
|
||||
|
||||
# combine build flags with cmdline overrides
|
||||
override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS)
|
||||
override CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
|
||||
override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
|
||||
override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS)
|
||||
override CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
|
||||
override CUDA_CXXFLAGS := $(MK_CUDA_CXXFLAGS) $(CUDA_CXXFLAGS)
|
||||
override HOST_CXXFLAGS := $(MK_HOST_CXXFLAGS) $(HOST_CXXFLAGS)
|
||||
override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
|
||||
|
||||
# save CXXFLAGS before we add host-only options
|
||||
NVCCFLAGS := $(NVCCFLAGS) $(CXXFLAGS) $(CUDA_CXXFLAGS) -Wno-pedantic -Xcompiler "$(HOST_CXXFLAGS)"
|
||||
override CXXFLAGS += $(HOST_CXXFLAGS)
|
||||
|
||||
#
|
||||
# Print build information
|
||||
#
|
||||
|
||||
$(info I llama.cpp build info: )
|
||||
$(info I UNAME_S: $(UNAME_S))
|
||||
$(info I UNAME_P: $(UNAME_P))
|
||||
$(info I UNAME_M: $(UNAME_M))
|
||||
$(info I CFLAGS: $(CFLAGS))
|
||||
$(info I CXXFLAGS: $(CXXFLAGS))
|
||||
$(info I LDFLAGS: $(LDFLAGS))
|
||||
$(info I CC: $(CCV))
|
||||
$(info I CXX: $(CXXV))
|
||||
$(info I UNAME_S: $(UNAME_S))
|
||||
$(info I UNAME_P: $(UNAME_P))
|
||||
$(info I UNAME_M: $(UNAME_M))
|
||||
$(info I CFLAGS: $(CFLAGS))
|
||||
$(info I CXXFLAGS: $(CXXFLAGS))
|
||||
$(info I NVCCFLAGS: $(NVCCFLAGS))
|
||||
$(info I LDFLAGS: $(LDFLAGS))
|
||||
$(info I CC: $(CCV))
|
||||
$(info I CXX: $(CXXV))
|
||||
$(info )
|
||||
|
||||
#
|
||||
@@ -504,6 +519,9 @@ main: examples/main/main.cpp build-info.h ggml.
|
||||
simple: examples/simple/simple.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
batched: examples/batched/batched.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
@@ -533,7 +551,7 @@ gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
$(CXX) $(TTFS_CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
@@ -550,13 +568,16 @@ beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o co
|
||||
speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
parallel: examples/parallel/parallel.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
ifdef LLAMA_METAL
|
||||
metal: examples/metal/metal.cpp ggml.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
endif
|
||||
|
||||
build-info.h: $(wildcard .git/index) scripts/build-info.sh
|
||||
@sh scripts/build-info.sh > $@.tmp
|
||||
@sh scripts/build-info.sh $(CC) > $@.tmp
|
||||
@if ! cmp -s $@.tmp $@; then \
|
||||
mv $@.tmp $@; \
|
||||
else \
|
||||
@@ -606,7 +627,7 @@ tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h gg
|
||||
tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-tokenizer-1: tests/test-tokenizer-1.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-c.o: tests/test-c.c llama.h
|
||||
|
||||
@@ -45,6 +45,8 @@ let package = Package(
|
||||
.unsafeFlags(["-Wno-shorten-64-to-32"]),
|
||||
.define("GGML_USE_K_QUANTS"),
|
||||
.define("GGML_USE_ACCELERATE")
|
||||
.define("ACCELERATE_NEW_LAPACK")
|
||||
.define("ACCELERATE_LAPACK_ILP64")
|
||||
] + additionalSettings,
|
||||
linkerSettings: [
|
||||
.linkedFramework("Accelerate")
|
||||
|
||||
42
README.md
42
README.md
@@ -11,6 +11,8 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
|
||||
|
||||
### Hot topics
|
||||
|
||||
- Parallel decoding + continuous batching support incoming: [#3228](https://github.com/ggerganov/llama.cpp/pull/3228) \
|
||||
**Devs should become familiar with the new API**
|
||||
- Local Falcon 180B inference on Mac Studio
|
||||
|
||||
https://github.com/ggerganov/llama.cpp/assets/1991296/98abd4e8-7077-464c-ae89-aebabca7757e
|
||||
@@ -90,6 +92,7 @@ as the main playground for developing new features for the [ggml](https://github
|
||||
- [X] [WizardLM](https://github.com/nlpxucan/WizardLM)
|
||||
- [X] [Baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B) and its derivations (such as [baichuan-7b-sft](https://huggingface.co/hiyouga/baichuan-7b-sft))
|
||||
- [X] [Aquila-7B](https://huggingface.co/BAAI/Aquila-7B) / [AquilaChat-7B](https://huggingface.co/BAAI/AquilaChat-7B)
|
||||
- [X] Mistral AI v0.1
|
||||
|
||||
**Bindings:**
|
||||
|
||||
@@ -391,13 +394,14 @@ Building the program with BLAS support may lead to some performance improvements
|
||||
<!---
|
||||
| LLAMA_CUDA_CUBLAS | Boolean | false | Use cuBLAS instead of custom CUDA kernels for prompt processing. Faster for all quantization formats except for q4_0 and q8_0, especially for k-quants. Increases VRAM usage (700 MiB for 7b, 970 MiB for 13b, 1430 MiB for 33b). |
|
||||
--->
|
||||
| Option | Legal values | Default | Description |
|
||||
|-------------------------|------------------------|---------|-------------|
|
||||
| LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
|
||||
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
|
||||
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
|
||||
| LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
|
||||
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
|
||||
| Option | Legal values | Default | Description |
|
||||
|--------------------------------|------------------------|---------|-------------|
|
||||
| LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
|
||||
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
|
||||
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
|
||||
| LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
|
||||
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
|
||||
| LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
|
||||
|
||||
- #### hipBLAS
|
||||
|
||||
@@ -498,7 +502,7 @@ Building the program with BLAS support may lead to some performance improvements
|
||||
```sh
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -DLLAMA_CLBLAST=ON -DCLBlast_dir=/some/path
|
||||
cmake .. -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
|
||||
cmake --build . --config Release
|
||||
```
|
||||
- CMake (Windows):
|
||||
@@ -554,6 +558,10 @@ python3 convert.py models/7B/
|
||||
# quantize the model to 4-bits (using q4_0 method)
|
||||
./quantize ./models/7B/ggml-model-f16.gguf ./models/7B/ggml-model-q4_0.gguf q4_0
|
||||
|
||||
# update the gguf filetype to current if older version is unsupported by another application
|
||||
./quantize ./models/7B/ggml-model-q4_0.gguf ./models/7B/ggml-model-q4_0-v2.gguf COPY
|
||||
|
||||
|
||||
# run the inference
|
||||
./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
|
||||
```
|
||||
@@ -590,6 +598,11 @@ Several quantization methods are supported. They differ in the resulting model d
|
||||
| 13B | ms/tok @ 8th | - | 73 | 82 | 98 | 105 | 128 |
|
||||
| 13B | bits/weight | 16.0 | 4.5 | 5.0 | 5.5 | 6.0 | 8.5 |
|
||||
|
||||
- [k-quants](https://github.com/ggerganov/llama.cpp/pull/1684)
|
||||
- recent k-quants improvements
|
||||
- [#2707](https://github.com/ggerganov/llama.cpp/pull/2707)
|
||||
- [#2807](https://github.com/ggerganov/llama.cpp/pull/2807)
|
||||
|
||||
### Perplexity (measuring model quality)
|
||||
|
||||
You can use the `perplexity` example to measure perplexity over a given prompt (lower perplexity is better).
|
||||
@@ -844,8 +857,17 @@ Place your desired model into the `~/llama.cpp/models/` directory and execute th
|
||||
#### Images
|
||||
We have two Docker images available for this project:
|
||||
|
||||
1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
|
||||
2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file.
|
||||
1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
|
||||
Additionally, there the following images, similar to the above:
|
||||
|
||||
- `ghcr.io/ggerganov/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggerganov/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
|
||||
The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the Gitlab Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now).
|
||||
|
||||
#### Usage
|
||||
|
||||
|
||||
22
build.zig
22
build.zig
@@ -36,17 +36,20 @@ const Maker = struct {
|
||||
}
|
||||
|
||||
fn init(builder: *std.build.Builder) !Maker {
|
||||
const commit_hash = @embedFile(".git/refs/heads/master");
|
||||
// const commit_hash = @embedFile(".git/refs/heads/master");
|
||||
const target = builder.standardTargetOptions(.{});
|
||||
const config_header = builder.addConfigHeader(
|
||||
.{ .style = .blank, .include_path = "build-info.h" },
|
||||
.{
|
||||
.BUILD_NUMBER = 0,
|
||||
.BUILD_COMMIT = commit_hash[0 .. commit_hash.len - 1], // omit newline
|
||||
.BUILD_COMMIT = "12345", // omit newline
|
||||
.BUILD_COMPILER = "Zig 0.11.0",
|
||||
.BUILD_TARGET = try target.allocDescription(builder.allocator),
|
||||
},
|
||||
);
|
||||
var m = Maker{
|
||||
.builder = builder,
|
||||
.target = builder.standardTargetOptions(.{}),
|
||||
.target = target,
|
||||
.optimize = builder.standardOptimizeOption(.{}),
|
||||
.config_header = config_header,
|
||||
.enable_lto = false,
|
||||
@@ -58,7 +61,7 @@ const Maker = struct {
|
||||
try m.addCFlag("-std=c11");
|
||||
try m.addCxxFlag("-std=c++11");
|
||||
try m.addProjectInclude(&.{});
|
||||
try m.addProjectInclude(&.{"examples"});
|
||||
try m.addProjectInclude(&.{"common"});
|
||||
return m;
|
||||
}
|
||||
|
||||
@@ -71,6 +74,7 @@ const Maker = struct {
|
||||
o.addCSourceFiles(&.{src}, m.cxxflags.items);
|
||||
o.linkLibCpp();
|
||||
}
|
||||
o.addConfigHeader(m.config_header);
|
||||
for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
|
||||
o.want_lto = m.enable_lto;
|
||||
return o;
|
||||
@@ -104,15 +108,15 @@ pub fn build(b: *std.build.Builder) !void {
|
||||
const ggml = make.obj("ggml", "ggml.c");
|
||||
const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
|
||||
const llama = make.obj("llama", "llama.cpp");
|
||||
const common = make.obj("common", "examples/common.cpp");
|
||||
const console = make.obj("common", "examples/console.cpp");
|
||||
const grammar_parser = make.obj("grammar-parser", "examples/grammar-parser.cpp");
|
||||
const common = make.obj("common", "common/common.cpp");
|
||||
const console = make.obj("common", "common/console.cpp");
|
||||
const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
|
||||
|
||||
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, llama, common, console, grammar_parser });
|
||||
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, llama });
|
||||
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, llama, common });
|
||||
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, llama, common });
|
||||
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, llama, common });
|
||||
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, llama });
|
||||
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, llama, common });
|
||||
|
||||
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, llama, common, grammar_parser });
|
||||
if (server.target.isWindows()) {
|
||||
|
||||
@@ -78,7 +78,7 @@ int32_t get_num_physical_cores() {
|
||||
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
|
||||
}
|
||||
|
||||
void process_escapes(std::string& input) {
|
||||
static void process_escapes(std::string& input) {
|
||||
std::size_t input_len = input.length();
|
||||
std::size_t output_idx = 0;
|
||||
|
||||
@@ -317,6 +317,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
break;
|
||||
}
|
||||
params.n_chunks = std::stoi(argv[i]);
|
||||
} else if (arg == "-np" || arg == "--parallel") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_parallel = std::stoi(argv[i]);
|
||||
} else if (arg == "-ns" || arg == "--sequences") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_sequences = std::stoi(argv[i]);
|
||||
} else if (arg == "-m" || arg == "--model") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -360,6 +372,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
params.multiline_input = true;
|
||||
} else if (arg == "--simple-io") {
|
||||
params.simple_io = true;
|
||||
} else if (arg == "-cb" || arg == "--cont-batching") {
|
||||
params.cont_batching = true;
|
||||
} else if (arg == "--color") {
|
||||
params.use_color = true;
|
||||
} else if (arg == "--mlock") {
|
||||
@@ -434,12 +448,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
#endif // GGML_USE_CUBLAS
|
||||
} else if (arg == "--no-mmap") {
|
||||
params.use_mmap = false;
|
||||
} else if (arg == "--mtest") {
|
||||
params.mem_test = true;
|
||||
} else if (arg == "--numa") {
|
||||
params.numa = true;
|
||||
} else if (arg == "--export") {
|
||||
params.export_cgraph = true;
|
||||
} else if (arg == "--verbose-prompt") {
|
||||
params.verbose_prompt = true;
|
||||
} else if (arg == "-r" || arg == "--reverse-prompt") {
|
||||
@@ -458,8 +468,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
if (params.logdir.back() != DIRECTORY_SEPARATOR) {
|
||||
params.logdir += DIRECTORY_SEPARATOR;
|
||||
}
|
||||
} else if (arg == "--perplexity") {
|
||||
params.perplexity = true;
|
||||
} else if (arg == "--perplexity" || arg == "--all-logits") {
|
||||
params.logits_all = true;
|
||||
} else if (arg == "--ppl-stride") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -649,20 +659,23 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
printf(" --cfg-negative-prompt-file FNAME\n");
|
||||
printf(" negative prompt file to use for guidance. (default: empty)\n");
|
||||
printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
|
||||
printf(" --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale);
|
||||
printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base);
|
||||
printf(" --rope-freq-scale N RoPE frequency linear scaling factor, inverse of --rope-scale (default: %g)\n", params.rope_freq_scale);
|
||||
printf(" --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale\n");
|
||||
printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n");
|
||||
printf(" --rope-freq-scale N RoPE frequency linear scaling factor (default: loaded from model)\n");
|
||||
printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
|
||||
printf(" --no-penalize-nl do not penalize newline token\n");
|
||||
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
||||
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
|
||||
printf(" --temp N temperature (default: %.1f)\n", (double)params.temp);
|
||||
printf(" --perplexity compute perplexity over each ctx window of the prompt\n");
|
||||
printf(" --logits-all return logits for all tokens in the batch (default: disabled)\n");
|
||||
printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
|
||||
printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
|
||||
printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
|
||||
printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
|
||||
printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
|
||||
printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel);
|
||||
printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences);
|
||||
printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
|
||||
if (llama_mlock_supported()) {
|
||||
printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
|
||||
}
|
||||
@@ -687,8 +700,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
printf(" Not recommended since this is both slower and uses more VRAM.\n");
|
||||
#endif // GGML_USE_CUBLAS
|
||||
#endif
|
||||
printf(" --mtest compute maximum memory usage\n");
|
||||
printf(" --export export the computation graph to 'llama.ggml'\n");
|
||||
printf(" --verbose-prompt print prompt before generation\n");
|
||||
fprintf(stderr, " --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
|
||||
printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
|
||||
@@ -741,7 +752,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
|
||||
lparams.f16_kv = params.memory_f16;
|
||||
lparams.use_mmap = params.use_mmap;
|
||||
lparams.use_mlock = params.use_mlock;
|
||||
lparams.logits_all = params.perplexity;
|
||||
lparams.logits_all = params.logits_all;
|
||||
lparams.embedding = params.embedding;
|
||||
lparams.rope_freq_base = params.rope_freq_base;
|
||||
lparams.rope_freq_scale = params.rope_freq_scale;
|
||||
@@ -785,8 +796,9 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
|
||||
{
|
||||
LOG("warming up the model with an empty run\n");
|
||||
|
||||
const std::vector<llama_token> tmp = { llama_token_bos(lctx), llama_token_eos(lctx), };
|
||||
llama_eval(lctx, tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, params.n_threads);
|
||||
std::vector<llama_token> tmp = { llama_token_bos(lctx), llama_token_eos(lctx), };
|
||||
llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0), params.n_threads);
|
||||
llama_kv_cache_tokens_rm(lctx, -1, -1);
|
||||
llama_reset_timings(lctx);
|
||||
}
|
||||
|
||||
@@ -804,10 +816,10 @@ std::vector<llama_token> llama_tokenize(
|
||||
// upper limit for the number of tokens
|
||||
int n_tokens = text.length() + add_bos;
|
||||
std::vector<llama_token> result(n_tokens);
|
||||
n_tokens = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
|
||||
n_tokens = llama_tokenize(ctx, text.data(), text.length(), result.data(), result.size(), add_bos);
|
||||
if (n_tokens < 0) {
|
||||
result.resize(-n_tokens);
|
||||
int check = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
|
||||
int check = llama_tokenize(ctx, text.data(), text.length(), result.data(), result.size(), add_bos);
|
||||
GGML_ASSERT(check == -n_tokens);
|
||||
} else {
|
||||
result.resize(n_tokens);
|
||||
@@ -893,7 +905,7 @@ llama_token llama_sample_token(
|
||||
|
||||
llama_token id = 0;
|
||||
|
||||
float * logits = llama_get_logits(ctx) + idx * n_vocab;
|
||||
float * logits = llama_get_logits_ith(ctx, idx);
|
||||
|
||||
// Apply params.logit_bias map
|
||||
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
|
||||
@@ -944,11 +956,11 @@ llama_token llama_sample_token(
|
||||
if (mirostat == 1) {
|
||||
static float mirostat_mu = 2.0f * mirostat_tau;
|
||||
const int mirostat_m = 100;
|
||||
llama_sample_temperature(ctx, &cur_p, temp);
|
||||
llama_sample_temp(ctx, &cur_p, temp);
|
||||
id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
|
||||
} else if (mirostat == 2) {
|
||||
static float mirostat_mu = 2.0f * mirostat_tau;
|
||||
llama_sample_temperature(ctx, &cur_p, temp);
|
||||
llama_sample_temp(ctx, &cur_p, temp);
|
||||
id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu);
|
||||
} else {
|
||||
// Temperature sampling
|
||||
@@ -956,7 +968,7 @@ llama_token llama_sample_token(
|
||||
llama_sample_tail_free (ctx, &cur_p, tfs_z, 1);
|
||||
llama_sample_typical (ctx, &cur_p, typical_p, 1);
|
||||
llama_sample_top_p (ctx, &cur_p, top_p, 1);
|
||||
llama_sample_temperature(ctx, &cur_p, temp);
|
||||
llama_sample_temp(ctx, &cur_p, temp);
|
||||
|
||||
{
|
||||
const int n_top = 10;
|
||||
@@ -1185,7 +1197,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
||||
fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
|
||||
fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
|
||||
fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
|
||||
fprintf(stream, "export: %s # default: false\n", params.export_cgraph ? "true" : "false");
|
||||
fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
|
||||
fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", params.frequency_penalty);
|
||||
dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str());
|
||||
@@ -1225,7 +1236,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
||||
fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
|
||||
fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str());
|
||||
fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
|
||||
fprintf(stream, "mtest: %s # default: false\n", params.mem_test ? "true" : "false");
|
||||
fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
|
||||
fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
|
||||
fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
|
||||
@@ -1260,6 +1270,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
||||
fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
|
||||
fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed);
|
||||
fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
|
||||
fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
|
||||
fprintf(stream, "temp: %f # default: 0.8\n", params.temp);
|
||||
|
||||
const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES);
|
||||
|
||||
@@ -20,8 +20,13 @@
|
||||
#define DIRECTORY_SEPARATOR '/'
|
||||
#endif // _WIN32
|
||||
|
||||
#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
|
||||
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", ##__VA_ARGS__); exit(1); } while (0)
|
||||
#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
|
||||
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
|
||||
|
||||
#define print_build_info() do { \
|
||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); \
|
||||
fprintf(stderr, "%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET); \
|
||||
} while(0)
|
||||
|
||||
//
|
||||
// CLI argument parsing
|
||||
@@ -37,14 +42,16 @@ struct gpt_params {
|
||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||
int32_t n_draft = 16; // number of tokens to draft during speculative decoding
|
||||
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
|
||||
int32_t n_parallel = 1; // number of parallel sequences to decode
|
||||
int32_t n_sequences = 1; // number of sequences to decode
|
||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
|
||||
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
|
||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
|
||||
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
||||
int32_t n_beams = 0; // if non-zero then use beam search of given width.
|
||||
float rope_freq_base = 10000.0f; // RoPE base frequency
|
||||
float rope_freq_scale = 1.0f; // RoPE frequency scaling factor
|
||||
float rope_freq_base = 0.0f; // RoPE base frequency
|
||||
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
|
||||
|
||||
// sampling parameters
|
||||
int32_t top_k = 40; // <= 0 to use vocab size
|
||||
@@ -102,17 +109,16 @@ struct gpt_params {
|
||||
bool interactive_first = false; // wait for user input immediately
|
||||
bool multiline_input = false; // reverse the usage of `\`
|
||||
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
|
||||
bool cont_batching = false; // insert new sequences for decoding on-the-fly
|
||||
|
||||
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
|
||||
bool ignore_eos = false; // ignore generated EOS tokens
|
||||
bool instruct = false; // instruction mode (used for Alpaca models)
|
||||
bool penalize_nl = true; // consider newlines as a repeatable token
|
||||
bool perplexity = false; // compute perplexity over the prompt
|
||||
bool logits_all = false; // return logits for all tokens in the batch
|
||||
bool use_mmap = true; // use mmap for faster loads
|
||||
bool use_mlock = false; // use mlock to keep model in memory
|
||||
bool mem_test = false; // compute maximum memory usage
|
||||
bool numa = false; // attempt optimizations that help on some NUMA systems
|
||||
bool export_cgraph = false; // export the computation graph
|
||||
bool verbose_prompt = false; // print prompt tokens before generation
|
||||
};
|
||||
|
||||
@@ -177,7 +183,7 @@ std::string llama_detokenize_bpe(
|
||||
// - ctx_guidance: context to use for classifier-free guidance, ignore if NULL
|
||||
// - grammar: grammar to use for sampling, ignore if NULL
|
||||
// - last_tokens: needed for repetition penalty, ignore if empty
|
||||
// - idx: sample from llama_get_logits(ctx) + idx * n_vocab
|
||||
// - idx: sample from llama_get_logits_ith(ctx, idx)
|
||||
//
|
||||
// returns:
|
||||
// - token: sampled token
|
||||
|
||||
@@ -158,7 +158,7 @@ namespace console {
|
||||
}
|
||||
}
|
||||
|
||||
char32_t getchar32() {
|
||||
static char32_t getchar32() {
|
||||
#if defined(_WIN32)
|
||||
HANDLE hConsole = GetStdHandle(STD_INPUT_HANDLE);
|
||||
wchar_t high_surrogate = 0;
|
||||
@@ -212,7 +212,7 @@ namespace console {
|
||||
#endif
|
||||
}
|
||||
|
||||
void pop_cursor() {
|
||||
static void pop_cursor() {
|
||||
#if defined(_WIN32)
|
||||
if (hConsole != NULL) {
|
||||
CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
|
||||
@@ -233,7 +233,7 @@ namespace console {
|
||||
putc('\b', out);
|
||||
}
|
||||
|
||||
int estimateWidth(char32_t codepoint) {
|
||||
static int estimateWidth(char32_t codepoint) {
|
||||
#if defined(_WIN32)
|
||||
(void)codepoint;
|
||||
return 1;
|
||||
@@ -242,7 +242,7 @@ namespace console {
|
||||
#endif
|
||||
}
|
||||
|
||||
int put_codepoint(const char* utf8_codepoint, size_t length, int expectedWidth) {
|
||||
static int put_codepoint(const char* utf8_codepoint, size_t length, int expectedWidth) {
|
||||
#if defined(_WIN32)
|
||||
CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
|
||||
if (!GetConsoleScreenBufferInfo(hConsole, &bufferInfo)) {
|
||||
@@ -303,7 +303,7 @@ namespace console {
|
||||
#endif
|
||||
}
|
||||
|
||||
void replace_last(char ch) {
|
||||
static void replace_last(char ch) {
|
||||
#if defined(_WIN32)
|
||||
pop_cursor();
|
||||
put_codepoint(&ch, 1, 1);
|
||||
@@ -312,7 +312,7 @@ namespace console {
|
||||
#endif
|
||||
}
|
||||
|
||||
void append_utf8(char32_t ch, std::string & out) {
|
||||
static void append_utf8(char32_t ch, std::string & out) {
|
||||
if (ch <= 0x7F) {
|
||||
out.push_back(static_cast<unsigned char>(ch));
|
||||
} else if (ch <= 0x7FF) {
|
||||
@@ -333,7 +333,7 @@ namespace console {
|
||||
}
|
||||
|
||||
// Helper function to remove the last UTF-8 character from a string
|
||||
void pop_back_utf8_char(std::string & line) {
|
||||
static void pop_back_utf8_char(std::string & line) {
|
||||
if (line.empty()) {
|
||||
return;
|
||||
}
|
||||
@@ -349,7 +349,7 @@ namespace console {
|
||||
line.erase(pos);
|
||||
}
|
||||
|
||||
bool readline_advanced(std::string & line, bool multiline_input) {
|
||||
static bool readline_advanced(std::string & line, bool multiline_input) {
|
||||
if (out != stdout) {
|
||||
fflush(stdout);
|
||||
}
|
||||
@@ -452,7 +452,7 @@ namespace console {
|
||||
return has_more;
|
||||
}
|
||||
|
||||
bool readline_simple(std::string & line, bool multiline_input) {
|
||||
static bool readline_simple(std::string & line, bool multiline_input) {
|
||||
#if defined(_WIN32)
|
||||
std::wstring wline;
|
||||
if (!std::getline(std::wcin, wline)) {
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
namespace grammar_parser {
|
||||
// NOTE: assumes valid utf8 (but checks for overrun)
|
||||
// copied from llama.cpp
|
||||
std::pair<uint32_t, const char *> decode_utf8(const char * src) {
|
||||
static std::pair<uint32_t, const char *> decode_utf8(const char * src) {
|
||||
static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
|
||||
uint8_t first_byte = static_cast<uint8_t>(*src);
|
||||
uint8_t highbits = first_byte >> 4;
|
||||
@@ -24,19 +24,19 @@ namespace grammar_parser {
|
||||
return std::make_pair(value, pos);
|
||||
}
|
||||
|
||||
uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
|
||||
static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
|
||||
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
|
||||
auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id));
|
||||
return result.first->second;
|
||||
}
|
||||
|
||||
uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) {
|
||||
static uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) {
|
||||
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
|
||||
state.symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id;
|
||||
return next_id;
|
||||
}
|
||||
|
||||
void add_rule(
|
||||
static void add_rule(
|
||||
parse_state & state,
|
||||
uint32_t rule_id,
|
||||
const std::vector<llama_grammar_element> & rule) {
|
||||
@@ -46,11 +46,11 @@ namespace grammar_parser {
|
||||
state.rules[rule_id] = rule;
|
||||
}
|
||||
|
||||
bool is_word_char(char c) {
|
||||
static bool is_word_char(char c) {
|
||||
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9');
|
||||
}
|
||||
|
||||
std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
|
||||
static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
|
||||
const char * pos = src;
|
||||
const char * end = src + size;
|
||||
uint32_t value = 0;
|
||||
@@ -73,7 +73,7 @@ namespace grammar_parser {
|
||||
return std::make_pair(value, pos);
|
||||
}
|
||||
|
||||
const char * parse_space(const char * src, bool newline_ok) {
|
||||
static const char * parse_space(const char * src, bool newline_ok) {
|
||||
const char * pos = src;
|
||||
while (*pos == ' ' || *pos == '\t' || *pos == '#' ||
|
||||
(newline_ok && (*pos == '\r' || *pos == '\n'))) {
|
||||
@@ -88,7 +88,7 @@ namespace grammar_parser {
|
||||
return pos;
|
||||
}
|
||||
|
||||
const char * parse_name(const char * src) {
|
||||
static const char * parse_name(const char * src) {
|
||||
const char * pos = src;
|
||||
while (is_word_char(*pos)) {
|
||||
pos++;
|
||||
@@ -99,7 +99,7 @@ namespace grammar_parser {
|
||||
return pos;
|
||||
}
|
||||
|
||||
std::pair<uint32_t, const char *> parse_char(const char * src) {
|
||||
static std::pair<uint32_t, const char *> parse_char(const char * src) {
|
||||
if (*src == '\\') {
|
||||
switch (src[1]) {
|
||||
case 'x': return parse_hex(src + 2, 2);
|
||||
@@ -129,7 +129,7 @@ namespace grammar_parser {
|
||||
uint32_t rule_id,
|
||||
bool is_nested);
|
||||
|
||||
const char * parse_sequence(
|
||||
static const char * parse_sequence(
|
||||
parse_state & state,
|
||||
const char * src,
|
||||
const std::string & rule_name,
|
||||
@@ -247,7 +247,7 @@ namespace grammar_parser {
|
||||
return pos;
|
||||
}
|
||||
|
||||
const char * parse_rule(parse_state & state, const char * src) {
|
||||
static const char * parse_rule(parse_state & state, const char * src) {
|
||||
const char * name_end = parse_name(src);
|
||||
const char * pos = parse_space(name_end, false);
|
||||
size_t name_len = name_end - src;
|
||||
@@ -285,7 +285,7 @@ namespace grammar_parser {
|
||||
}
|
||||
}
|
||||
|
||||
void print_grammar_char(FILE * file, uint32_t c) {
|
||||
static void print_grammar_char(FILE * file, uint32_t c) {
|
||||
if (0x20 <= c && c <= 0x7f) {
|
||||
fprintf(file, "%c", static_cast<char>(c));
|
||||
} else {
|
||||
@@ -294,7 +294,7 @@ namespace grammar_parser {
|
||||
}
|
||||
}
|
||||
|
||||
bool is_char_element(llama_grammar_element elem) {
|
||||
static bool is_char_element(llama_grammar_element elem) {
|
||||
switch (elem.type) {
|
||||
case LLAMA_GRETYPE_CHAR: return true;
|
||||
case LLAMA_GRETYPE_CHAR_NOT: return true;
|
||||
@@ -304,7 +304,7 @@ namespace grammar_parser {
|
||||
}
|
||||
}
|
||||
|
||||
void print_rule_binary(FILE * file, const std::vector<llama_grammar_element> & rule) {
|
||||
static void print_rule_binary(FILE * file, const std::vector<llama_grammar_element> & rule) {
|
||||
for (auto elem : rule) {
|
||||
switch (elem.type) {
|
||||
case LLAMA_GRETYPE_END: fprintf(file, "END"); break;
|
||||
@@ -334,7 +334,7 @@ namespace grammar_parser {
|
||||
fprintf(file, "\n");
|
||||
}
|
||||
|
||||
void print_rule(
|
||||
static void print_rule(
|
||||
FILE * file,
|
||||
uint32_t rule_id,
|
||||
const std::vector<llama_grammar_element> & rule,
|
||||
|
||||
304
convert-baichuan-hf-to-gguf.py
Executable file
304
convert-baichuan-hf-to-gguf.py
Executable file
@@ -0,0 +1,304 @@
|
||||
#!/usr/bin/env python3
|
||||
# HF baichuan --> gguf conversion
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import struct
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any
|
||||
import itertools
|
||||
import gguf
|
||||
import numpy as np
|
||||
import torch
|
||||
from sentencepiece import SentencePieceProcessor # type: ignore[import]
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from typing import TypeAlias
|
||||
|
||||
NDArray: TypeAlias = 'np.ndarray[Any, Any]'
|
||||
|
||||
# reverse HF permute back to original pth layout
|
||||
|
||||
|
||||
def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: int | None = None) -> NDArray:
|
||||
if n_kv_head is not None and n_head != n_kv_head:
|
||||
n_head //= n_kv_head
|
||||
|
||||
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
|
||||
.swapaxes(1, 2)
|
||||
.reshape(weights.shape))
|
||||
|
||||
def reverse_hf_permute_part(weights: NDArray, n_part: int, n_head: int, n_head_kv: int| None = None) -> NDArray:
|
||||
r = weights.shape[0] // 3
|
||||
return (reverse_hf_permute(weights[r * n_part : r * n_part + r, ...], n_head, n_head_kv))
|
||||
|
||||
def reverse_hf_part(weights: NDArray, n_part: int) -> NDArray:
|
||||
r = weights.shape[0] // 3
|
||||
return weights[r * n_part : r * n_part + r, ...]
|
||||
|
||||
def count_model_parts(dir_model: str) -> int:
|
||||
num_parts = 0
|
||||
|
||||
for filename in os.listdir(dir_model):
|
||||
if filename.startswith("pytorch_model-"):
|
||||
num_parts += 1
|
||||
|
||||
if num_parts > 0:
|
||||
print("gguf: found " + str(num_parts) + " model parts")
|
||||
|
||||
return num_parts
|
||||
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(description="Convert a HuggingFace LLaMA model to a GGML compatible file")
|
||||
parser.add_argument(
|
||||
"--vocab-only", action="store_true",
|
||||
help="extract only the vocab",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--outfile", type=Path,
|
||||
help="path to write to; default: based on input",
|
||||
)
|
||||
parser.add_argument(
|
||||
"model", type=Path,
|
||||
help="directory containing model file, or model file itself (*.bin)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"ftype", type=int, choices=[0, 1], default=1, nargs='?',
|
||||
help="output format - use 0 for float32, 1 for float16",
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
args = parse_args()
|
||||
|
||||
dir_model = args.model
|
||||
ftype = args.ftype
|
||||
if not dir_model.is_dir():
|
||||
print(f'Error: {args.model} is not a directory', file = sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# possible tensor data types
|
||||
# ftype == 0 -> float32
|
||||
# ftype == 1 -> float16
|
||||
|
||||
# map from ftype to string
|
||||
ftype_str = ["f32", "f16"]
|
||||
|
||||
if args.outfile is not None:
|
||||
fname_out = args.outfile
|
||||
else:
|
||||
# output in the same directory as the model by default
|
||||
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
|
||||
|
||||
print("gguf: loading model "+dir_model.name)
|
||||
|
||||
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
||||
hparams = json.load(f)
|
||||
print("hello print: ",hparams["architectures"][0])
|
||||
if hparams["architectures"][0] != "BaichuanForCausalLM":
|
||||
print("Model architecture not supported: " + hparams["architectures"][0])
|
||||
|
||||
sys.exit()
|
||||
|
||||
# get number of model parts
|
||||
num_parts = count_model_parts(dir_model)
|
||||
print(f"num_parts:{num_parts}\n")
|
||||
ARCH=gguf.MODEL_ARCH.BAICHUAN
|
||||
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
|
||||
|
||||
print("gguf: get model metadata")
|
||||
|
||||
block_count = hparams["num_hidden_layers"]
|
||||
head_count = hparams["num_attention_heads"]
|
||||
|
||||
if "num_key_value_heads" in hparams:
|
||||
head_count_kv = hparams["num_key_value_heads"]
|
||||
else:
|
||||
head_count_kv = head_count
|
||||
|
||||
if "_name_or_path" in hparams:
|
||||
hf_repo = hparams["_name_or_path"]
|
||||
else:
|
||||
hf_repo = ""
|
||||
|
||||
if "max_sequence_length" in hparams:
|
||||
ctx_length = hparams["max_sequence_length"]
|
||||
elif "max_position_embeddings" in hparams:
|
||||
ctx_length = hparams["max_position_embeddings"]
|
||||
elif "model_max_length" in hparams:
|
||||
ctx_length = hparams["model_max_length"]
|
||||
else:
|
||||
print("gguf: can not find ctx length parameter.")
|
||||
|
||||
sys.exit()
|
||||
|
||||
|
||||
gguf_writer.add_name(dir_model.name)
|
||||
gguf_writer.add_source_hf_repo(hf_repo)
|
||||
gguf_writer.add_tensor_data_layout("Meta AI original pth")
|
||||
gguf_writer.add_context_length(ctx_length)
|
||||
gguf_writer.add_embedding_length(hparams["hidden_size"])
|
||||
gguf_writer.add_block_count(block_count)
|
||||
gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
|
||||
gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
|
||||
gguf_writer.add_head_count(head_count)
|
||||
gguf_writer.add_head_count_kv(head_count_kv)
|
||||
gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
|
||||
|
||||
if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
|
||||
if "type" in hparams["rope_scaling"]:
|
||||
if hparams["rope_scaling"]["type"] == "linear":
|
||||
gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
|
||||
|
||||
|
||||
# TOKENIZATION
|
||||
|
||||
print("gguf: get tokenizer metadata")
|
||||
|
||||
tokens: list[bytes] = []
|
||||
scores: list[float] = []
|
||||
toktypes: list[int] = []
|
||||
|
||||
tokenizer_model_file = dir_model / 'tokenizer.model'
|
||||
if not tokenizer_model_file.is_file():
|
||||
print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# vocab type sentencepiece
|
||||
print("gguf: get sentencepiece tokenizer vocab, scores and token types")
|
||||
|
||||
tokenizer = SentencePieceProcessor(str(tokenizer_model_file))
|
||||
|
||||
for i in range(tokenizer.vocab_size()):
|
||||
text: bytes
|
||||
score: float
|
||||
|
||||
piece = tokenizer.id_to_piece(i)
|
||||
text = piece.encode("utf-8")
|
||||
score = tokenizer.get_score(i)
|
||||
|
||||
toktype = 1 # defualt to normal token type
|
||||
if tokenizer.is_unknown(i):
|
||||
toktype = 2
|
||||
if tokenizer.is_control(i):
|
||||
toktype = 3
|
||||
|
||||
# toktype = 4 is user-defined = tokens from added_tokens.json
|
||||
|
||||
if tokenizer.is_unused(i):
|
||||
toktype = 5
|
||||
if tokenizer.is_byte(i):
|
||||
toktype = 6
|
||||
|
||||
tokens.append(text)
|
||||
scores.append(score)
|
||||
toktypes.append(toktype)
|
||||
|
||||
added_tokens_file = dir_model / 'added_tokens.json'
|
||||
if added_tokens_file.is_file():
|
||||
with open(added_tokens_file, "r", encoding="utf-8") as f:
|
||||
addtokens_json = json.load(f)
|
||||
|
||||
print("gguf: get added tokens")
|
||||
|
||||
for key in addtokens_json:
|
||||
tokens.append( key.encode("utf-8") )
|
||||
scores.append(-1000.0)
|
||||
toktypes.append(4) # user-defined token type
|
||||
|
||||
|
||||
gguf_writer.add_tokenizer_model("llama")
|
||||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(dir_model)
|
||||
special_vocab.add_to_gguf(gguf_writer)
|
||||
|
||||
# TENSORS
|
||||
|
||||
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
|
||||
|
||||
# tensor info
|
||||
print("gguf: get tensor metadata")
|
||||
|
||||
if num_parts == 0:
|
||||
part_names = iter(("pytorch_model.bin",))
|
||||
else:
|
||||
part_names = (
|
||||
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
|
||||
)
|
||||
|
||||
|
||||
for part_name in part_names:
|
||||
if args.vocab_only:
|
||||
break
|
||||
print("gguf: loading model part '" + part_name + "'")
|
||||
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
|
||||
|
||||
tmp=model_part
|
||||
for i in range(block_count):
|
||||
if f"model.layers.{i}.self_attn.W_pack.weight" in model_part:
|
||||
print(f"Unpacking and permuting layer {i}")
|
||||
tmp[f"model.layers.{i}.self_attn.q_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],0,head_count,head_count)
|
||||
tmp[f"model.layers.{i}.self_attn.k_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],1,head_count,head_count_kv)
|
||||
tmp[f"model.layers.{i}.self_attn.v_proj.weight"]=reverse_hf_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],2)
|
||||
del tmp[f"model.layers.{i}.self_attn.W_pack.weight"]
|
||||
|
||||
for name in model_part.keys():
|
||||
data = model_part[name]
|
||||
# we don't need these
|
||||
if name.endswith(".rotary_emb.inv_freq"):
|
||||
continue
|
||||
|
||||
old_dtype = data.dtype
|
||||
|
||||
# convert any unsupported data types to float32
|
||||
if data.dtype != torch.float16 and data.dtype != torch.float32:
|
||||
data = data.to(torch.float32)
|
||||
|
||||
data = data.squeeze().numpy()
|
||||
|
||||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print("Can not map tensor '" + name + "'")
|
||||
sys.exit()
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
||||
# if f32 desired, convert any float16 to float32
|
||||
if ftype == 0 and data_dtype == np.float16:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
|
||||
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(name + " -> " + new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
|
||||
gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
|
||||
print("gguf: write header")
|
||||
gguf_writer.write_header_to_file()
|
||||
print("gguf: write metadata")
|
||||
gguf_writer.write_kv_data_to_file()
|
||||
if not args.vocab_only:
|
||||
print("gguf: write tensors")
|
||||
gguf_writer.write_tensors_to_file()
|
||||
|
||||
gguf_writer.close()
|
||||
|
||||
print(f"gguf: model successfully exported to '{fname_out}'")
|
||||
print("")
|
||||
@@ -55,10 +55,22 @@ def count_model_parts(dir_model: Path) -> int:
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(description="Convert a Falcon model to a GGML compatible file")
|
||||
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
|
||||
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
|
||||
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
|
||||
parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1)
|
||||
parser.add_argument(
|
||||
"--vocab-only", action="store_true",
|
||||
help="extract only the vocab",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--outfile", type=Path,
|
||||
help="path to write to; default: based on input",
|
||||
)
|
||||
parser.add_argument(
|
||||
"model", type=Path,
|
||||
help="directory containing model file, or model file itself (*.bin)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"ftype", type=int, choices=[0, 1], default=1, nargs='?',
|
||||
help="output format - use 0 for float32, 1 for float16",
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
args = parse_args()
|
||||
@@ -137,7 +149,9 @@ with open(tokenizer_json_file, "r", encoding="utf-8") as f:
|
||||
|
||||
print("gguf: get gpt2 tokenizer vocab")
|
||||
|
||||
vocab_size = len(tokenizer_json["model"]["vocab"])
|
||||
# The number of tokens in tokenizer.json can differ from the expected vocab size.
|
||||
# This causes downstream issues with mismatched tensor sizes when running the inference
|
||||
vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])
|
||||
|
||||
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
|
||||
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
||||
|
||||
@@ -56,10 +56,22 @@ def count_model_parts(dir_model: Path) -> int:
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(description="Convert a GPT-NeoX model to a GGML compatible file")
|
||||
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
|
||||
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
|
||||
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
|
||||
parser.add_argument("ftype", type=int, choices=[0, 1], help="output format - use 0 for float32, 1 for float16", default = 1)
|
||||
parser.add_argument(
|
||||
"--vocab-only", action="store_true",
|
||||
help="extract only the vocab",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--outfile", type=Path,
|
||||
help="path to write to; default: based on input",
|
||||
)
|
||||
parser.add_argument(
|
||||
"model", type=Path,
|
||||
help="directory containing model file, or model file itself (*.bin)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"ftype", type=int, choices=[0, 1], default=1, nargs='?',
|
||||
help="output format - use 0 for float32, 1 for float16",
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
args = parse_args()
|
||||
|
||||
248
convert-starcoder-hf-to-gguf.py
Executable file
248
convert-starcoder-hf-to-gguf.py
Executable file
@@ -0,0 +1,248 @@
|
||||
#!/usr/bin/env python3
|
||||
# HF starcoder --> gguf conversion
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import struct
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer # type: ignore[import]
|
||||
|
||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
|
||||
import gguf
|
||||
|
||||
|
||||
def bytes_to_unicode():
|
||||
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
|
||||
"""
|
||||
Returns list of utf-8 byte and a corresponding list of unicode strings.
|
||||
The reversible bpe codes work on unicode strings.
|
||||
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
|
||||
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
|
||||
This is a significant percentage of your normal, say, 32K bpe vocab.
|
||||
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
|
||||
And avoids mapping to whitespace/control characters the bpe code barfs on.
|
||||
"""
|
||||
bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
|
||||
cs = bs[:]
|
||||
n = 0
|
||||
for b in range(2**8):
|
||||
if b not in bs:
|
||||
bs.append(b)
|
||||
cs.append(2**8+n)
|
||||
n += 1
|
||||
return dict(zip(bs, (chr(n) for n in cs)))
|
||||
|
||||
|
||||
def count_model_parts(dir_model: Path) -> int:
|
||||
num_parts = 0
|
||||
for filename in os.listdir(dir_model):
|
||||
if filename.startswith("pytorch_model-"):
|
||||
num_parts += 1
|
||||
|
||||
if num_parts > 0:
|
||||
print("gguf: found " + str(num_parts) + " model parts")
|
||||
return num_parts
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(description="Convert a StarCoder model to a GGML compatible file")
|
||||
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
|
||||
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
|
||||
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
|
||||
parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1)
|
||||
return parser.parse_args()
|
||||
|
||||
args = parse_args()
|
||||
|
||||
dir_model = args.model
|
||||
ftype = args.ftype
|
||||
if not dir_model.is_dir():
|
||||
print(f'Error: {args.model} is not a directory', file = sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# possible tensor data types
|
||||
# ftype == 0 -> float32
|
||||
# ftype == 1 -> float16
|
||||
|
||||
# map from ftype to string
|
||||
ftype_str = ["f32", "f16"]
|
||||
|
||||
if args.outfile is not None:
|
||||
fname_out = args.outfile
|
||||
else:
|
||||
# output in the same directory as the model by default
|
||||
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
|
||||
|
||||
print("gguf: loading model "+dir_model.name)
|
||||
|
||||
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
||||
hparams = json.load(f)
|
||||
|
||||
if hparams["architectures"][0] != "GPTBigCodeForCausalLM":
|
||||
print("Model architecture not supported: " + hparams["architectures"][0])
|
||||
|
||||
sys.exit(1)
|
||||
|
||||
# get number of model parts
|
||||
num_parts = count_model_parts(dir_model)
|
||||
|
||||
ARCH=gguf.MODEL_ARCH.STARCODER
|
||||
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
|
||||
|
||||
print("gguf: get model metadata")
|
||||
|
||||
block_count = hparams["n_layer"]
|
||||
|
||||
gguf_writer.add_name("StarCoder")
|
||||
gguf_writer.add_context_length(hparams["n_positions"])
|
||||
gguf_writer.add_embedding_length(hparams["n_embd"])
|
||||
gguf_writer.add_feed_forward_length(4 * hparams["n_embd"])
|
||||
gguf_writer.add_block_count(block_count)
|
||||
gguf_writer.add_head_count(hparams["n_head"])
|
||||
gguf_writer.add_head_count_kv(1)
|
||||
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
|
||||
gguf_writer.add_file_type(ftype)
|
||||
|
||||
# TOKENIZATION
|
||||
|
||||
print("gguf: get tokenizer metadata")
|
||||
|
||||
tokens: list[bytearray] = []
|
||||
scores: list[float] = []
|
||||
toktypes: list[int] = []
|
||||
|
||||
tokenizer_json_file = dir_model / 'tokenizer.json'
|
||||
if not tokenizer_json_file.is_file():
|
||||
print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# gpt2 tokenizer
|
||||
gguf_writer.add_tokenizer_model("gpt2")
|
||||
|
||||
with open(tokenizer_json_file, "r", encoding="utf-8") as f:
|
||||
tokenizer_json = json.load(f)
|
||||
|
||||
print("gguf: get gpt2 tokenizer vocab")
|
||||
|
||||
# The number of tokens in tokenizer.json can differ from the expected vocab size.
|
||||
# This causes downstream issues with mismatched tensor sizes when running the inference
|
||||
vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])
|
||||
|
||||
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
|
||||
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
||||
|
||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
||||
byte_encoder = bytes_to_unicode()
|
||||
byte_decoder = {v: k for k, v in byte_encoder.items()}
|
||||
|
||||
for i in range(vocab_size):
|
||||
if i in reverse_vocab:
|
||||
try:
|
||||
text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
|
||||
except KeyError:
|
||||
text = bytearray()
|
||||
for c in reverse_vocab[i]:
|
||||
if ord(c) < 256: # single byte character
|
||||
text.append(byte_decoder[ord(c)])
|
||||
else: # multibyte special token character
|
||||
text.extend(c.encode('utf-8'))
|
||||
else:
|
||||
print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
|
||||
pad_token = f"[PAD{i}]".encode("utf8")
|
||||
text = bytearray(pad_token)
|
||||
|
||||
tokens.append(text)
|
||||
scores.append(0.0) # dymmy
|
||||
toktypes.append(gguf.TokenType.NORMAL) # dummy
|
||||
|
||||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
|
||||
special_vocab.add_to_gguf(gguf_writer)
|
||||
|
||||
# TENSORS
|
||||
|
||||
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
|
||||
|
||||
# params for qkv transform
|
||||
n_head = hparams["n_head"]
|
||||
n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
|
||||
|
||||
head_dim = hparams["n_embd"] // n_head
|
||||
|
||||
# tensor info
|
||||
print("gguf: get tensor metadata")
|
||||
|
||||
if num_parts == 0:
|
||||
part_names = iter(("pytorch_model.bin",))
|
||||
else:
|
||||
part_names = (
|
||||
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
|
||||
)
|
||||
|
||||
for part_name in part_names:
|
||||
if args.vocab_only:
|
||||
break
|
||||
print("gguf: loading model part '" + part_name + "'")
|
||||
model_part = torch.load(dir_model / part_name, map_location="cpu")
|
||||
|
||||
for name in model_part.keys():
|
||||
data = model_part[name]
|
||||
|
||||
old_dtype = data.dtype
|
||||
|
||||
# convert any unsupported data types to float32
|
||||
if data.dtype != torch.float16 and data.dtype != torch.float32:
|
||||
data = data.to(torch.float32)
|
||||
|
||||
data = data.squeeze().numpy()
|
||||
|
||||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print("Can not map tensor '" + name + "'")
|
||||
sys.exit()
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
||||
# if f32 desired, convert any float16 to float32
|
||||
if ftype == 0 and data_dtype == np.float16:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
|
||||
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))
|
||||
|
||||
gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
|
||||
print("gguf: write header")
|
||||
gguf_writer.write_header_to_file()
|
||||
print("gguf: write metadata")
|
||||
gguf_writer.write_kv_data_to_file()
|
||||
if not args.vocab_only:
|
||||
print("gguf: write tensors")
|
||||
gguf_writer.write_tensors_to_file()
|
||||
|
||||
gguf_writer.close()
|
||||
|
||||
print(f"gguf: model successfully exported to '{fname_out}'")
|
||||
print("")
|
||||
@@ -439,7 +439,7 @@ Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab'
|
||||
def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
|
||||
#print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
|
||||
if n_head_kv is not None and n_head != n_head_kv:
|
||||
n_head //= n_head_kv
|
||||
n_head = n_head_kv
|
||||
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
|
||||
.swapaxes(1, 2)
|
||||
.reshape(weights.shape))
|
||||
|
||||
@@ -23,7 +23,9 @@ else()
|
||||
add_subdirectory(train-text-from-scratch)
|
||||
add_subdirectory(convert-llama2c-to-ggml)
|
||||
add_subdirectory(simple)
|
||||
add_subdirectory(batched)
|
||||
add_subdirectory(speculative)
|
||||
add_subdirectory(parallel)
|
||||
add_subdirectory(embd-input)
|
||||
add_subdirectory(llama-bench)
|
||||
add_subdirectory(beam-search)
|
||||
|
||||
@@ -9,12 +9,12 @@
|
||||
#endif
|
||||
|
||||
#ifdef LLAMA_DEFAULT_RMS_EPS
|
||||
static const float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
|
||||
constexpr float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
|
||||
#else
|
||||
static const float rms_norm_eps = 5e-6f;
|
||||
constexpr float rms_norm_eps = 5e-6f;
|
||||
#endif
|
||||
|
||||
float frand() {
|
||||
static float frand() {
|
||||
return (float)rand()/(float)RAND_MAX;
|
||||
}
|
||||
|
||||
@@ -25,19 +25,21 @@ struct random_normal_distribution {
|
||||
float max;
|
||||
};
|
||||
|
||||
void init_random_normal_distribution(struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max) {
|
||||
static void init_random_normal_distribution(
|
||||
struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max
|
||||
) {
|
||||
rnd->gen = std::mt19937(seed);
|
||||
rnd->nd = std::normal_distribution<float>{mean, std};
|
||||
rnd->min = min;
|
||||
rnd->max = max;
|
||||
}
|
||||
|
||||
float frand_normal(struct random_normal_distribution * rnd) {
|
||||
static float frand_normal(struct random_normal_distribution * rnd) {
|
||||
const float r = rnd->nd(rnd->gen);
|
||||
return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r);
|
||||
}
|
||||
|
||||
void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
|
||||
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
|
||||
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
|
||||
|
||||
if (plan.work_size > 0) {
|
||||
@@ -48,13 +50,9 @@ void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph,
|
||||
ggml_graph_compute(graph, &plan);
|
||||
}
|
||||
|
||||
struct ggml_tensor * randomize_tensor(
|
||||
struct ggml_tensor * tensor,
|
||||
int ndims,
|
||||
const int64_t ne[],
|
||||
float fmin,
|
||||
float fmax) {
|
||||
|
||||
static struct ggml_tensor * randomize_tensor(
|
||||
struct ggml_tensor * tensor, int ndims, const int64_t ne[], float fmin, float fmax
|
||||
) {
|
||||
switch (ndims) {
|
||||
case 1:
|
||||
for (int i0 = 0; i0 < ne[0]; i0++) {
|
||||
@@ -95,11 +93,9 @@ struct ggml_tensor * randomize_tensor(
|
||||
return tensor;
|
||||
}
|
||||
|
||||
struct ggml_tensor * randomize_tensor_normal(
|
||||
struct ggml_tensor * tensor,
|
||||
int ndims,
|
||||
const int64_t ne[],
|
||||
struct random_normal_distribution * rnd) {
|
||||
static struct ggml_tensor * randomize_tensor_normal(
|
||||
struct ggml_tensor * tensor, int ndims, const int64_t ne[], struct random_normal_distribution * rnd
|
||||
) {
|
||||
float scale = 1.0; // xavier
|
||||
switch (ndims) {
|
||||
case 1:
|
||||
@@ -159,7 +155,7 @@ struct llama_hparams {
|
||||
}
|
||||
};
|
||||
|
||||
uint32_t get_n_ff(const struct llama_hparams* hparams) {
|
||||
static uint32_t get_n_ff(const struct llama_hparams* hparams) {
|
||||
const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult;
|
||||
return n_ff;
|
||||
}
|
||||
@@ -260,7 +256,7 @@ struct llama_model_lora {
|
||||
std::vector<llama_layer_lora> layers;
|
||||
};
|
||||
|
||||
void init_model(struct llama_model * model) {
|
||||
static void init_model(struct llama_model * model) {
|
||||
const auto & hparams = model->hparams;
|
||||
|
||||
const uint32_t n_embd = hparams.n_embd;
|
||||
@@ -297,7 +293,7 @@ void init_model(struct llama_model * model) {
|
||||
}
|
||||
|
||||
|
||||
void init_model_lora(struct llama_model_lora * model) {
|
||||
static void init_model_lora(struct llama_model_lora * model) {
|
||||
const auto & hparams = model->hparams;
|
||||
|
||||
const uint32_t n_embd = hparams.n_embd;
|
||||
@@ -340,7 +336,7 @@ void init_model_lora(struct llama_model_lora * model) {
|
||||
}
|
||||
}
|
||||
|
||||
void set_param_model(struct llama_model * model) {
|
||||
static void set_param_model(struct llama_model * model) {
|
||||
const auto& hparams = model->hparams;
|
||||
|
||||
const uint32_t n_layer = hparams.n_layer;
|
||||
@@ -366,7 +362,7 @@ void set_param_model(struct llama_model * model) {
|
||||
}
|
||||
}
|
||||
|
||||
void set_param_model_lora(struct llama_model_lora * model) {
|
||||
static void set_param_model_lora(struct llama_model_lora * model) {
|
||||
const auto& hparams = model->hparams;
|
||||
|
||||
const uint32_t n_layer = hparams.n_layer;
|
||||
@@ -397,7 +393,7 @@ void set_param_model_lora(struct llama_model_lora * model) {
|
||||
}
|
||||
}
|
||||
|
||||
void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) {
|
||||
static void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) {
|
||||
const auto & hparams = model->hparams;
|
||||
|
||||
const uint32_t n_layer = hparams.n_layer;
|
||||
@@ -426,7 +422,9 @@ void randomize_model(struct llama_model * model, int seed, float mean, float std
|
||||
}
|
||||
|
||||
|
||||
void randomize_model_lora(struct llama_model_lora * model, int seed, float mean, float std, float min, float max) {
|
||||
static void randomize_model_lora(
|
||||
struct llama_model_lora * model, int seed, float mean, float std, float min, float max
|
||||
) {
|
||||
const auto & hparams = model->hparams;
|
||||
|
||||
const uint32_t n_layer = hparams.n_layer;
|
||||
@@ -459,7 +457,7 @@ void randomize_model_lora(struct llama_model_lora * model, int seed, float mean,
|
||||
}
|
||||
}
|
||||
|
||||
bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
|
||||
static bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
|
||||
const auto & hparams = model->hparams;
|
||||
|
||||
const uint32_t n_ctx = hparams.n_ctx;
|
||||
@@ -495,7 +493,7 @@ bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int
|
||||
return true;
|
||||
}
|
||||
|
||||
bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) {
|
||||
static bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) {
|
||||
const auto & hparams = model->hparams;
|
||||
|
||||
const uint32_t n_ctx = hparams.n_ctx;
|
||||
@@ -531,15 +529,15 @@ bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora *
|
||||
return true;
|
||||
}
|
||||
|
||||
struct ggml_tensor * forward(
|
||||
struct llama_model * model,
|
||||
struct llama_kv_cache * cache,
|
||||
struct ggml_context * ctx0,
|
||||
struct ggml_cgraph * gf,
|
||||
struct ggml_tensor * tokens_input,
|
||||
const int n_tokens,
|
||||
const int n_past) {
|
||||
|
||||
static struct ggml_tensor * forward(
|
||||
struct llama_model * model,
|
||||
struct llama_kv_cache * cache,
|
||||
struct ggml_context * ctx0,
|
||||
struct ggml_cgraph * gf,
|
||||
struct ggml_tensor * tokens_input,
|
||||
const int n_tokens,
|
||||
const int n_past
|
||||
) {
|
||||
const int N = n_tokens;
|
||||
|
||||
struct llama_kv_cache& kv_self = *cache;
|
||||
@@ -556,6 +554,14 @@ struct ggml_tensor * forward(
|
||||
struct ggml_tensor * kc = kv_self.k;
|
||||
struct ggml_tensor * vc = kv_self.v;
|
||||
|
||||
struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
||||
{
|
||||
int * data = (int *) KQ_pos->data;
|
||||
for (int i = 0; i < N; ++i) {
|
||||
data[i] = n_past + i;
|
||||
}
|
||||
}
|
||||
|
||||
// inpL shape [n_embd,N,1,1]
|
||||
struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
@@ -583,8 +589,8 @@ struct ggml_tensor * forward(
|
||||
// wk shape [n_embd, n_embd, 1, 1]
|
||||
// Qcur shape [n_embd/n_head, n_head, N, 1]
|
||||
// Kcur shape [n_embd/n_head, n_head, N, 1]
|
||||
struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
|
||||
struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
|
||||
struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0, 0);
|
||||
struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0, 0);
|
||||
|
||||
// store key and value to memory
|
||||
{
|
||||
@@ -756,25 +762,25 @@ struct ggml_tensor * forward(
|
||||
return inpL;
|
||||
}
|
||||
|
||||
void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
|
||||
static void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
|
||||
GGML_ASSERT(tensor->n_dims == 1);
|
||||
GGML_ASSERT(tensor->ne[0] == ne0);
|
||||
}
|
||||
|
||||
void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
|
||||
static void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
|
||||
GGML_ASSERT(tensor->n_dims == 2);
|
||||
GGML_ASSERT(tensor->ne[0] == ne0);
|
||||
GGML_ASSERT(tensor->ne[1] == ne1);
|
||||
}
|
||||
|
||||
void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
|
||||
static void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
|
||||
GGML_ASSERT(tensor->n_dims == 3);
|
||||
GGML_ASSERT(tensor->ne[0] == ne0);
|
||||
GGML_ASSERT(tensor->ne[1] == ne1);
|
||||
GGML_ASSERT(tensor->ne[2] == ne2);
|
||||
}
|
||||
|
||||
void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
|
||||
static void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
|
||||
GGML_ASSERT(tensor->n_dims == 4);
|
||||
GGML_ASSERT(tensor->ne[0] == ne0);
|
||||
GGML_ASSERT(tensor->ne[1] == ne1);
|
||||
@@ -782,16 +788,16 @@ void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int6
|
||||
GGML_ASSERT(tensor->ne[3] == ne3);
|
||||
}
|
||||
|
||||
struct ggml_tensor * forward_batch(
|
||||
struct llama_model * model,
|
||||
struct llama_kv_cache * cache,
|
||||
struct ggml_context * ctx0,
|
||||
struct ggml_cgraph * gf,
|
||||
struct ggml_tensor * tokens_input,
|
||||
const int n_tokens,
|
||||
const int n_past,
|
||||
const int n_batch) {
|
||||
|
||||
static struct ggml_tensor * forward_batch(
|
||||
struct llama_model * model,
|
||||
struct llama_kv_cache * cache,
|
||||
struct ggml_context * ctx0,
|
||||
struct ggml_cgraph * gf,
|
||||
struct ggml_tensor * tokens_input,
|
||||
const int n_tokens,
|
||||
const int n_past,
|
||||
const int n_batch
|
||||
) {
|
||||
const int N = n_tokens;
|
||||
|
||||
struct llama_kv_cache& kv_self = *cache;
|
||||
@@ -810,9 +816,18 @@ struct ggml_tensor * forward_batch(
|
||||
struct ggml_tensor * kc = kv_self.k;
|
||||
struct ggml_tensor * vc = kv_self.v;
|
||||
|
||||
struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
||||
{
|
||||
int * data = (int *) KQ_pos->data;
|
||||
for (int i = 0; i < N; ++i) {
|
||||
data[i] = n_past + i;
|
||||
}
|
||||
}
|
||||
|
||||
// inpL shape [n_embd,N*n_batch,1]
|
||||
struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
|
||||
assert_shape_2d(inpL, n_embd, N*n_batch);
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
struct ggml_tensor * inpSA = inpL;
|
||||
|
||||
@@ -840,8 +855,8 @@ struct ggml_tensor * forward_batch(
|
||||
// wk shape [n_embd, n_embd, 1, 1]
|
||||
// Qcur shape [n_embd/n_head, n_head, N, n_batch]
|
||||
// Kcur shape [n_embd/n_head, n_head, N, n_batch]
|
||||
struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
|
||||
struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
|
||||
struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0, 0);
|
||||
struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0, 0);
|
||||
assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
|
||||
assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);
|
||||
|
||||
@@ -1073,16 +1088,15 @@ struct ggml_tensor * forward_batch(
|
||||
return inpL;
|
||||
}
|
||||
|
||||
|
||||
struct ggml_tensor * forward_lora(
|
||||
struct llama_model_lora * model,
|
||||
struct llama_kv_cache * cache,
|
||||
struct ggml_context * ctx0,
|
||||
struct ggml_cgraph * gf,
|
||||
struct ggml_tensor * tokens_input,
|
||||
const int n_tokens,
|
||||
const int n_past) {
|
||||
|
||||
static struct ggml_tensor * forward_lora(
|
||||
struct llama_model_lora * model,
|
||||
struct llama_kv_cache * cache,
|
||||
struct ggml_context * ctx0,
|
||||
struct ggml_cgraph * gf,
|
||||
struct ggml_tensor * tokens_input,
|
||||
const int n_tokens,
|
||||
const int n_past
|
||||
) {
|
||||
const int N = n_tokens;
|
||||
|
||||
struct llama_kv_cache& kv_self = *cache;
|
||||
@@ -1100,6 +1114,14 @@ struct ggml_tensor * forward_lora(
|
||||
struct ggml_tensor * kc = kv_self.k;
|
||||
struct ggml_tensor * vc = kv_self.v;
|
||||
|
||||
struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
||||
{
|
||||
int * data = (int *) KQ_pos->data;
|
||||
for (int i = 0; i < N; ++i) {
|
||||
data[i] = n_past + i;
|
||||
}
|
||||
}
|
||||
|
||||
// inpL shape [n_embd,N,1,1]
|
||||
struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
@@ -1133,7 +1155,7 @@ struct ggml_tensor * forward_lora(
|
||||
model->layers[il].wqb,
|
||||
cur)),
|
||||
n_embd/n_head, n_head, N),
|
||||
n_past, n_rot, 0, 0);
|
||||
KQ_pos, n_rot, 0, 0);
|
||||
struct ggml_tensor * Kcur = ggml_rope(ctx0,
|
||||
ggml_reshape_3d(ctx0,
|
||||
ggml_mul_mat(ctx0,
|
||||
@@ -1142,7 +1164,7 @@ struct ggml_tensor * forward_lora(
|
||||
model->layers[il].wkb,
|
||||
cur)),
|
||||
n_embd/n_head, n_head, N),
|
||||
n_past, n_rot, 0, 0);
|
||||
KQ_pos, n_rot, 0, 0);
|
||||
|
||||
// store key and value to memory
|
||||
{
|
||||
@@ -1328,7 +1350,7 @@ struct ggml_tensor * forward_lora(
|
||||
return inpL;
|
||||
}
|
||||
|
||||
void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
|
||||
static void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
|
||||
assert(logits->n_dims == 2);
|
||||
assert(probs->n_dims == 2);
|
||||
assert(best_samples->n_dims == 1);
|
||||
@@ -1359,7 +1381,10 @@ void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, str
|
||||
}
|
||||
}
|
||||
|
||||
void sample_softmax_batch(struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
|
||||
static void sample_softmax_batch(
|
||||
struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs,
|
||||
struct ggml_tensor * best_samples
|
||||
) {
|
||||
GGML_ASSERT(best_samples->n_dims == 2);
|
||||
GGML_ASSERT(logits->n_dims == 3);
|
||||
GGML_ASSERT(probs->n_dims == 3);
|
||||
@@ -1393,7 +1418,7 @@ void sample_softmax_batch(struct ggml_context * ctx, struct ggml_tensor * logits
|
||||
}
|
||||
}
|
||||
|
||||
void print_row(struct ggml_tensor * probs, int i) {
|
||||
static void print_row(struct ggml_tensor * probs, int i) {
|
||||
for (int k = 0; k < probs->ne[0]; ++k) {
|
||||
float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
|
||||
printf(" %.2f", p);
|
||||
@@ -1401,7 +1426,7 @@ void print_row(struct ggml_tensor * probs, int i) {
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
void print_matrix(struct ggml_tensor * probs) {
|
||||
static void print_matrix(struct ggml_tensor * probs) {
|
||||
assert(probs->n_dims == 2);
|
||||
for (int i = 0; i < probs->ne[1]; ++i) {
|
||||
for (int k = 0; k < probs->ne[0]; ++k) {
|
||||
@@ -1412,7 +1437,7 @@ void print_matrix(struct ggml_tensor * probs) {
|
||||
}
|
||||
}
|
||||
|
||||
void print_token(int token, int n_vocab) {
|
||||
static void print_token(int token, int n_vocab) {
|
||||
for (int k = 0; k < token; ++k) {
|
||||
printf(" ");
|
||||
}
|
||||
@@ -1423,14 +1448,14 @@ void print_token(int token, int n_vocab) {
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
void print_tokens(struct ggml_tensor * tokens, int n_vocab) {
|
||||
static void print_tokens(struct ggml_tensor * tokens, int n_vocab) {
|
||||
for (int i=0; i<tokens->ne[0]; ++i) {
|
||||
int token = ggml_get_i32_1d(tokens, i);
|
||||
print_token(token, n_vocab);
|
||||
}
|
||||
}
|
||||
|
||||
void get_example_targets(int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
|
||||
static void get_example_targets(int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
|
||||
int n_tokens = tokens_input->ne[0];
|
||||
int n_vocab = targets->ne[0];
|
||||
float randomness = 0.0f;
|
||||
@@ -1451,7 +1476,9 @@ void get_example_targets(int example_id, struct ggml_tensor * tokens_input, stru
|
||||
}
|
||||
}
|
||||
|
||||
void get_example_targets_batch(struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
|
||||
static void get_example_targets_batch(
|
||||
struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets
|
||||
) {
|
||||
GGML_ASSERT(tokens_input->n_dims == 2);
|
||||
GGML_ASSERT( targets->n_dims == 3);
|
||||
int n_tokens = tokens_input->ne[0];
|
||||
@@ -1474,7 +1501,7 @@ void get_example_targets_batch(struct ggml_context * ctx, int example_id, struct
|
||||
}
|
||||
}
|
||||
|
||||
void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) {
|
||||
static void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) {
|
||||
int n_tokens = tokens_input->ne[0];
|
||||
int n_vocab = targets->ne[0];
|
||||
for (int i=0; i<n_tokens-n_shift; ++i) {
|
||||
@@ -1485,12 +1512,16 @@ void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * tar
|
||||
}
|
||||
}
|
||||
|
||||
struct ggml_tensor * square_error_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
|
||||
static struct ggml_tensor * square_error_loss(
|
||||
struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b
|
||||
) {
|
||||
// todo: instead of a-b: a[1:]-b[:-1]
|
||||
return ggml_sum(ctx, ggml_sqr(ctx, ggml_sub(ctx, a, b)));
|
||||
}
|
||||
|
||||
struct ggml_tensor * cross_entropy_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
|
||||
static struct ggml_tensor * cross_entropy_loss(
|
||||
struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b
|
||||
) {
|
||||
const float eps = 1e-3f;
|
||||
return
|
||||
ggml_sum(ctx,
|
||||
|
||||
5
examples/batched/CMakeLists.txt
Normal file
5
examples/batched/CMakeLists.txt
Normal file
@@ -0,0 +1,5 @@
|
||||
set(TARGET batched)
|
||||
add_executable(${TARGET} batched.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
44
examples/batched/README.md
Normal file
44
examples/batched/README.md
Normal file
@@ -0,0 +1,44 @@
|
||||
# llama.cpp/example/batched
|
||||
|
||||
The example demonstrates batched generation from a given prompt
|
||||
|
||||
```bash
|
||||
./batched ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is" 4
|
||||
|
||||
...
|
||||
|
||||
main: n_len = 32, n_ctx = 2048, n_parallel = 4, n_kv_req = 113
|
||||
|
||||
Hello my name is
|
||||
|
||||
main: generating 4 sequences ...
|
||||
|
||||
main: stream 0 finished
|
||||
main: stream 1 finished
|
||||
main: stream 2 finished
|
||||
main: stream 3 finished
|
||||
|
||||
sequence 0:
|
||||
|
||||
Hello my name is Shirley. I am a 25-year-old female who has been working for over 5 years as a b
|
||||
|
||||
sequence 1:
|
||||
|
||||
Hello my name is Renee and I'm a 32 year old female from the United States. I'm looking for a man between
|
||||
|
||||
sequence 2:
|
||||
|
||||
Hello my name is Diana. I am looking for a housekeeping job. I have experience with children and have my own transportation. I am
|
||||
|
||||
sequence 3:
|
||||
|
||||
Hello my name is Cody. I am a 3 year old neutered male. I am a very friendly cat. I am very playful and
|
||||
|
||||
main: decoded 108 tokens in 3.57 s, speed: 30.26 t/s
|
||||
|
||||
llama_print_timings: load time = 587.00 ms
|
||||
llama_print_timings: sample time = 2.56 ms / 112 runs ( 0.02 ms per token, 43664.72 tokens per second)
|
||||
llama_print_timings: prompt eval time = 4089.11 ms / 118 tokens ( 34.65 ms per token, 28.86 tokens per second)
|
||||
llama_print_timings: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
|
||||
llama_print_timings: total time = 4156.04 ms
|
||||
```
|
||||
246
examples/batched/batched.cpp
Normal file
246
examples/batched/batched.cpp
Normal file
@@ -0,0 +1,246 @@
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
|
||||
if (argc == 1 || argv[1][0] == '-') {
|
||||
printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL]\n" , argv[0]);
|
||||
return 1 ;
|
||||
}
|
||||
|
||||
int n_parallel = 1;
|
||||
|
||||
if (argc >= 2) {
|
||||
params.model = argv[1];
|
||||
}
|
||||
|
||||
if (argc >= 3) {
|
||||
params.prompt = argv[2];
|
||||
}
|
||||
|
||||
if (argc >= 4) {
|
||||
n_parallel = std::atoi(argv[3]);
|
||||
}
|
||||
|
||||
if (params.prompt.empty()) {
|
||||
params.prompt = "Hello my name is";
|
||||
}
|
||||
|
||||
// total length of the sequences including the prompt
|
||||
const int n_len = 32;
|
||||
|
||||
// init LLM
|
||||
|
||||
llama_backend_init(params.numa);
|
||||
|
||||
llama_context_params ctx_params = llama_context_default_params();
|
||||
|
||||
ctx_params.seed = 1234;
|
||||
ctx_params.n_ctx = n_len*n_parallel; // FIXME: use n_kv_req instead (tokenize with model after #3301)
|
||||
ctx_params.n_batch = std::max(n_len, n_parallel);
|
||||
// ctx_params.n_gpu_layers = 99; // offload all layers to the GPU
|
||||
|
||||
llama_model * model = llama_load_model_from_file(params.model.c_str(), ctx_params);
|
||||
|
||||
if (model == NULL) {
|
||||
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
||||
|
||||
if (ctx == NULL) {
|
||||
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// tokenize the prompt
|
||||
|
||||
std::vector<llama_token> tokens_list;
|
||||
tokens_list = ::llama_tokenize(ctx, params.prompt, true);
|
||||
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size())*n_parallel;
|
||||
|
||||
LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %d, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
|
||||
|
||||
// make sure the KV cache is big enough to hold all the prompt and generated tokens
|
||||
if (n_kv_req > n_ctx) {
|
||||
LOG_TEE("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req);
|
||||
LOG_TEE("%s: either reduce n_parallel or increase n_ctx\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// print the prompt token-by-token
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
for (auto id : tokens_list) {
|
||||
fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
|
||||
}
|
||||
|
||||
fflush(stderr);
|
||||
|
||||
// create a llama_batch with size 512
|
||||
// we use this object to submit token data for decoding
|
||||
|
||||
llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t)n_parallel), 0);
|
||||
|
||||
// evaluate the initial prompt
|
||||
batch.n_tokens = tokens_list.size();
|
||||
|
||||
for (int32_t i = 0; i < batch.n_tokens; i++) {
|
||||
batch.token[i] = tokens_list[i];
|
||||
batch.pos[i] = i;
|
||||
batch.seq_id[i] = 0;
|
||||
batch.logits[i] = false;
|
||||
}
|
||||
|
||||
// llama_decode will output logits only for the last token of the prompt
|
||||
batch.logits[batch.n_tokens - 1] = true;
|
||||
|
||||
if (llama_decode(ctx, batch, params.n_threads) != 0) {
|
||||
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// assign the system KV cache to all parallel sequences
|
||||
// this way, the parallel sequences will "reuse" the prompt tokens without having to copy them
|
||||
for (int32_t i = 1; i < n_parallel; ++i) {
|
||||
llama_kv_cache_seq_cp(ctx, 0, i, 0, batch.n_tokens);
|
||||
}
|
||||
|
||||
if (n_parallel > 1) {
|
||||
LOG_TEE("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
|
||||
}
|
||||
|
||||
// main loop
|
||||
|
||||
// we will store the parallel decoded sequences in this vector
|
||||
std::vector<std::string> streams(n_parallel);
|
||||
|
||||
// remember the batch index of the last token for each parallel sequence
|
||||
// we need this to determine which logits to sample from
|
||||
std::vector<int32_t> i_batch(n_parallel, batch.n_tokens - 1);
|
||||
|
||||
int n_cur = batch.n_tokens;
|
||||
int n_decode = 0;
|
||||
|
||||
const auto t_main_start = ggml_time_us();
|
||||
|
||||
while (n_cur <= n_len) {
|
||||
// prepare the next batch
|
||||
batch.n_tokens = 0;
|
||||
|
||||
// sample the next token for each parallel sequence / stream
|
||||
for (int32_t i = 0; i < n_parallel; ++i) {
|
||||
if (i_batch[i] < 0) {
|
||||
// the stream has already finished
|
||||
continue;
|
||||
}
|
||||
|
||||
auto n_vocab = llama_n_vocab(ctx);
|
||||
auto * logits = llama_get_logits_ith(ctx, i_batch[i]);
|
||||
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||
candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
|
||||
}
|
||||
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
|
||||
const int top_k = 40;
|
||||
const float top_p = 0.9f;
|
||||
const float temp = 0.4f;
|
||||
|
||||
llama_sample_top_k(ctx, &candidates_p, top_k, 1);
|
||||
llama_sample_top_p(ctx, &candidates_p, top_p, 1);
|
||||
llama_sample_temp (ctx, &candidates_p, temp);
|
||||
|
||||
const llama_token new_token_id = llama_sample_token(ctx, &candidates_p);
|
||||
|
||||
//const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
|
||||
|
||||
// is it an end of stream? -> mark the stream as finished
|
||||
if (new_token_id == llama_token_eos(ctx) || n_cur == n_len) {
|
||||
i_batch[i] = -1;
|
||||
LOG_TEE("\n");
|
||||
if (n_parallel > 1) {
|
||||
LOG_TEE("%s: stream %d finished at n_cur = %d", __func__, i, n_cur);
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// if there is only one stream, we print immediately to stdout
|
||||
if (n_parallel == 1) {
|
||||
LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
streams[i] += llama_token_to_piece(ctx, new_token_id);
|
||||
|
||||
// push this new token for next evaluation
|
||||
batch.token [batch.n_tokens] = new_token_id;
|
||||
batch.pos [batch.n_tokens] = n_cur;
|
||||
batch.seq_id[batch.n_tokens] = i;
|
||||
batch.logits[batch.n_tokens] = true;
|
||||
|
||||
i_batch[i] = batch.n_tokens;
|
||||
|
||||
batch.n_tokens += 1;
|
||||
|
||||
n_decode += 1;
|
||||
}
|
||||
|
||||
// all streams are finished
|
||||
if (batch.n_tokens == 0) {
|
||||
break;
|
||||
}
|
||||
|
||||
n_cur += 1;
|
||||
|
||||
// evaluate the current batch with the transformer model
|
||||
if (llama_decode(ctx, batch, params.n_threads)) {
|
||||
fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
LOG_TEE("\n");
|
||||
|
||||
if (n_parallel > 1) {
|
||||
LOG_TEE("\n");
|
||||
|
||||
for (int32_t i = 0; i < n_parallel; ++i) {
|
||||
LOG_TEE("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str());
|
||||
}
|
||||
}
|
||||
|
||||
const auto t_main_end = ggml_time_us();
|
||||
|
||||
LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
|
||||
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
|
||||
|
||||
llama_print_timings(ctx);
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
llama_batch_free(batch);
|
||||
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -3,6 +3,3 @@ add_executable(${TARGET} beam-search.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
endif()
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include "build-info.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cinttypes>
|
||||
@@ -30,7 +29,8 @@ struct ostream_beam_view {
|
||||
llama_context * ctx;
|
||||
llama_beam_view beam_view;
|
||||
};
|
||||
std::ostream& operator<<(std::ostream& os, const ostream_beam_view & obv) {
|
||||
|
||||
static std::ostream & operator<<(std::ostream & os, const ostream_beam_view & obv) {
|
||||
os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
|
||||
for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
|
||||
os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]);
|
||||
@@ -46,7 +46,7 @@ struct beam_search_callback_data {
|
||||
|
||||
// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
|
||||
// For example, eob can be flagged due to maximum token length, stop words, etc.
|
||||
bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, const size_t n_tokens) {
|
||||
static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
|
||||
return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx);
|
||||
}
|
||||
|
||||
@@ -56,7 +56,7 @@ bool is_at_eob(const beam_search_callback_data & callback_data, const llama_toke
|
||||
// * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
|
||||
// This is also called when the stop condition is met.
|
||||
// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
|
||||
void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
|
||||
static void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
|
||||
auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr);
|
||||
// Mark beams as EOS as needed.
|
||||
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
|
||||
@@ -158,8 +158,9 @@ int main(int argc, char ** argv)
|
||||
}
|
||||
std::cout << std::flush;
|
||||
|
||||
int n_past = llama_get_kv_cache_token_count(ctx);
|
||||
if (llama_eval(ctx, tokens_list.data(), tokens_list.size(), n_past, params.n_threads))
|
||||
int n_past = 0;
|
||||
|
||||
if (llama_decode(ctx, llama_batch_get_one(tokens_list.data(), tokens_list.size(), n_past, 0), params.n_threads))
|
||||
{
|
||||
fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );
|
||||
return 1;
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
set(TARGET benchmark)
|
||||
add_executable(${TARGET} benchmark-matmult.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_include_directories(${TARGET} PRIVATE ../../common)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
#include "ggml.h"
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#include <locale.h>
|
||||
#include <assert.h>
|
||||
@@ -20,7 +21,7 @@
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
|
||||
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
|
||||
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
|
||||
|
||||
if (plan.work_size > 0) {
|
||||
@@ -31,19 +32,19 @@ void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph,
|
||||
ggml_graph_compute(graph, &plan);
|
||||
}
|
||||
|
||||
float tensor_sum_elements(const ggml_tensor * tensor) {
|
||||
float sum = 0;
|
||||
if (tensor->type==GGML_TYPE_F32) {
|
||||
static float tensor_sum_elements(const ggml_tensor * tensor) {
|
||||
double sum = 0;
|
||||
if (tensor->type == GGML_TYPE_F32) {
|
||||
for (int j = 0; j < tensor->ne[1]; j++) {
|
||||
for (int k = 0; k < tensor->ne[0]; k++) {
|
||||
sum += ((float *) tensor->data)[j*tensor->ne[0]+k];
|
||||
sum += ((float *) tensor->data)[j*tensor->ne[0] + k];
|
||||
}
|
||||
}
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
void tensor_dump(const ggml_tensor * tensor, const char * name) {
|
||||
static void tensor_dump(const ggml_tensor * tensor, const char * name) {
|
||||
printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi) - ", name,
|
||||
tensor->type, ggml_type_name(tensor->type),
|
||||
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]);
|
||||
@@ -58,7 +59,7 @@ struct benchmark_params_struct {
|
||||
int32_t n_iterations = 10;
|
||||
};
|
||||
|
||||
void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
|
||||
static void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
|
||||
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "options:\n");
|
||||
@@ -99,7 +100,7 @@ int main(int argc, char ** argv) {
|
||||
exit(1);
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
|
||||
print_build_info();
|
||||
printf("Starting Test\n");
|
||||
|
||||
// create the ggml context
|
||||
@@ -125,12 +126,15 @@ int main(int argc, char ** argv) {
|
||||
|
||||
//printf("Memsize required = %i\n", sizex*sizex);
|
||||
|
||||
// TODO: perform the bench for all types or for a user specified type
|
||||
const ggml_type qtype = GGML_TYPE_Q4_1;
|
||||
|
||||
size_t ctx_size = 0;
|
||||
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
|
||||
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
|
||||
ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32);
|
||||
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
|
||||
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
|
||||
ctx_size += sizex*sizey*ggml_type_sizef(qtype);
|
||||
ctx_size += sizex*sizey*ggml_type_sizef(qtype);
|
||||
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
|
||||
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
|
||||
ctx_size += 1024*1024*16;
|
||||
@@ -163,7 +167,7 @@ int main(int argc, char ** argv) {
|
||||
struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
|
||||
ggml_set_f32(m2, 2.0f);
|
||||
|
||||
printf("\n------ Test 1 - Matrix Mult via F32 code ------------------------------------------------------------------------------\n");
|
||||
printf("\n------ Test 1 - Matrix Mult via F32 code\n");
|
||||
// printf("Creating new tensor m11xm2\n");
|
||||
struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
|
||||
|
||||
@@ -181,17 +185,16 @@ int main(int argc, char ** argv) {
|
||||
|
||||
TENSOR_DUMP(gf.nodes[0]);
|
||||
|
||||
printf("\n------ Test 2 - Matrix Mult via Q4_0 code ------------------------------------------------------------------------------\n");
|
||||
printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));
|
||||
|
||||
int32_t nelements = sizex*sizey;
|
||||
int32_t ne[2] = { sizex, sizey };
|
||||
|
||||
std::vector<int64_t> hist_cur(1 << 4, 0);
|
||||
|
||||
// Set up a the benchmark matrices
|
||||
// printf("Creating new tensor q11 & Running quantize\n");
|
||||
struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
|
||||
ggml_quantize_q4_0((const float *) m11->data, q11->data, nelements, ne[0], hist_cur.data());
|
||||
struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
|
||||
ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements, hist_cur.data());
|
||||
|
||||
// Set up a the compute graph
|
||||
// printf("Creating new tensor q31\n");
|
||||
@@ -202,8 +205,8 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// Set up a second graph computation to make sure we override the CPU cache lines
|
||||
// printf("Creating new tensor q12 & Running quantize\n");
|
||||
struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
|
||||
ggml_quantize_q4_0((const float *) m12->data, q12->data, nelements, ne[0], hist_cur.data());
|
||||
struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
|
||||
ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements, hist_cur.data());
|
||||
|
||||
// printf("Creating new tensor q32\n");
|
||||
struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
|
||||
@@ -220,7 +223,7 @@ int main(int argc, char ** argv) {
|
||||
printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
|
||||
|
||||
|
||||
// Let's use the F32 result from above as a reference for the q4_0 multiplication
|
||||
// Let's use the F32 result from above as a reference for the quantized multiplication
|
||||
float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);
|
||||
|
||||
printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
|
||||
@@ -250,7 +253,7 @@ int main(int argc, char ** argv) {
|
||||
// Check that the matrix multiplication result is in the right ballpark
|
||||
// We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
|
||||
float sum_of_Q4_result = tensor_sum_elements(gf31.nodes[0]);
|
||||
float delta = abs(sum_of_Q4_result - sum_of_F32_reference);
|
||||
float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
|
||||
float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6
|
||||
|
||||
if (delta > allowed_delta) {
|
||||
|
||||
@@ -115,7 +115,7 @@ struct TransformerWeights {
|
||||
}
|
||||
};
|
||||
|
||||
void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
|
||||
static void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
|
||||
// we calloc instead of malloc to keep valgrind happy
|
||||
w->token_embedding_table = new float[p->vocab_size * p->dim]();
|
||||
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
|
||||
@@ -158,7 +158,7 @@ void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
|
||||
}
|
||||
}
|
||||
|
||||
int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) {
|
||||
static int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) {
|
||||
if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
|
||||
if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
|
||||
if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
|
||||
@@ -189,7 +189,7 @@ int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shar
|
||||
return 0;
|
||||
}
|
||||
|
||||
void print_sample_weights(TransformerWeights *w){
|
||||
static void print_sample_weights(TransformerWeights *w){
|
||||
printf("----- Quick print of first of the weight vales of all the variables\n");
|
||||
printf("%f\n", w->token_embedding_table[0]);
|
||||
printf("%f\n", w->rms_att_weight[0]);
|
||||
@@ -324,7 +324,7 @@ struct train_params {
|
||||
int mem_compute1_gb;
|
||||
};
|
||||
|
||||
void print_params(struct my_llama_hparams * params) {
|
||||
static void print_params(struct my_llama_hparams * params) {
|
||||
printf("%s: n_vocab: %d\n", __func__, params->n_vocab);
|
||||
printf("%s: n_ctx: %d\n", __func__, params->n_ctx);
|
||||
printf("%s: n_embd: %d\n", __func__, params->n_embd);
|
||||
@@ -335,7 +335,7 @@ void print_params(struct my_llama_hparams * params) {
|
||||
printf("%s: n_rot: %d\n", __func__, params->n_rot);
|
||||
}
|
||||
|
||||
void init_model(struct my_llama_model * model) {
|
||||
static void init_model(struct my_llama_model * model) {
|
||||
const auto & hparams = model->hparams;
|
||||
|
||||
const uint32_t n_embd = hparams.n_embd;
|
||||
@@ -408,17 +408,17 @@ void init_model(struct my_llama_model * model) {
|
||||
}
|
||||
}
|
||||
|
||||
float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
|
||||
static float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
|
||||
float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
|
||||
return *ptr;
|
||||
}
|
||||
|
||||
int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
|
||||
static int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
|
||||
int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
|
||||
return *ptr;
|
||||
}
|
||||
|
||||
void print_row(struct ggml_tensor * probs, int i) {
|
||||
static void print_row(struct ggml_tensor * probs, int i) {
|
||||
for (int k = 0; k < probs->ne[0]; ++k) {
|
||||
float p = get_f32_2d(probs, k, i);
|
||||
printf(" %f", p);
|
||||
@@ -426,7 +426,7 @@ void print_row(struct ggml_tensor * probs, int i) {
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
void print_matrix(struct ggml_tensor * probs) {
|
||||
static void print_matrix(struct ggml_tensor * probs) {
|
||||
assert(probs->n_dims == 2);
|
||||
for (int i = 0; i < probs->ne[1]; ++i) {
|
||||
for (int k = 0; k < probs->ne[0]; ++k) {
|
||||
@@ -531,7 +531,7 @@ struct llama_file {
|
||||
}
|
||||
};
|
||||
|
||||
bool is_ggml_file(const char *filename) {
|
||||
static bool is_ggml_file(const char * filename) {
|
||||
llama_file file(filename, "rb");
|
||||
if (file.size < 4) {
|
||||
return false;
|
||||
@@ -540,7 +540,7 @@ bool is_ggml_file(const char *filename) {
|
||||
return magic == GGUF_MAGIC;
|
||||
}
|
||||
|
||||
static std::string llama_escape_whitespaces(const std::string& text) {
|
||||
static std::string llama_escape_whitespaces(const std::string & text) {
|
||||
std::ostringstream out;
|
||||
for (char c : text) {
|
||||
if (c == ' ') out << "\xe2\x96\x81";
|
||||
@@ -549,7 +549,7 @@ static std::string llama_escape_whitespaces(const std::string& text) {
|
||||
return out.str();
|
||||
}
|
||||
|
||||
void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
|
||||
static void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
|
||||
if (is_ggml_file(filename)) {
|
||||
struct ggml_context * ctx_data = NULL;
|
||||
|
||||
@@ -637,7 +637,7 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
|
||||
}
|
||||
}
|
||||
|
||||
void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
|
||||
static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
|
||||
int ct;
|
||||
switch (gg_weights->n_dims){
|
||||
case 1:
|
||||
@@ -673,7 +673,9 @@ void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * kar
|
||||
}
|
||||
}
|
||||
|
||||
void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename) {
|
||||
static void save_as_llama_model(
|
||||
struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename
|
||||
) {
|
||||
// convert AK weights into GG weights one by one.
|
||||
// w->token_embedding_table -> model->tok_embeddings
|
||||
// float* -> struct ggml_tensor
|
||||
@@ -785,7 +787,7 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
|
||||
gguf_free(ctx);
|
||||
}
|
||||
|
||||
struct train_params get_default_train_params() {
|
||||
static struct train_params get_default_train_params() {
|
||||
struct train_params params;
|
||||
params.fn_vocab_model = "models/7B/ggml-model-f16.gguf";
|
||||
params.fn_llama2c_output_model = "ak_llama_model.bin";
|
||||
@@ -835,7 +837,7 @@ struct train_params get_default_train_params() {
|
||||
return params;
|
||||
}
|
||||
|
||||
void print_usage(int /*argc*/, char ** argv, const struct train_params * params) {
|
||||
static void print_usage(int /*argc*/, char ** argv, const struct train_params * params) {
|
||||
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "options:\n");
|
||||
@@ -846,7 +848,7 @@ void print_usage(int /*argc*/, char ** argv, const struct train_params * params)
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
bool params_parse(int argc, char ** argv, struct train_params * params) {
|
||||
static bool params_parse(int argc, char ** argv, struct train_params * params) {
|
||||
bool invalid_param = false;
|
||||
bool reqd_param_found = false;
|
||||
std::string arg;
|
||||
@@ -901,7 +903,7 @@ bool params_parse(int argc, char ** argv, struct train_params * params) {
|
||||
return true;
|
||||
}
|
||||
|
||||
std::string basename(const std::string &path) {
|
||||
static std::string basename(const std::string &path) {
|
||||
size_t pos = path.find_last_of("/\\");
|
||||
if (pos == std::string::npos) {
|
||||
return path;
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
#include "embd-input.h"
|
||||
|
||||
#include <cassert>
|
||||
@@ -22,7 +24,7 @@ struct MyModel* create_mymodel(int argc, char ** argv) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
|
||||
print_build_info();
|
||||
|
||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
||||
params.seed = uint32_t(time(NULL));
|
||||
@@ -78,7 +80,8 @@ bool eval_float(void * model, float * input, int N){
|
||||
if (n_eval > n_batch) {
|
||||
n_eval = n_batch;
|
||||
}
|
||||
if (llama_eval_embd(ctx, (input+i*n_emb), n_eval, n_past, params.n_threads)) {
|
||||
llama_batch batch = { int32_t(n_eval), nullptr, (input+i*n_emb), nullptr, nullptr, nullptr, n_past, 1, 0, };
|
||||
if (llama_decode(ctx, batch, params.n_threads)) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return false;
|
||||
}
|
||||
@@ -99,7 +102,7 @@ bool eval_tokens(void * model, std::vector<llama_token> tokens) {
|
||||
if (n_eval > params.n_batch) {
|
||||
n_eval = params.n_batch;
|
||||
}
|
||||
if (llama_eval(ctx, &tokens[i], n_eval, n_past, params.n_threads)) {
|
||||
if (llama_decode(ctx, llama_batch_get_one(&tokens[i], n_eval, n_past, 0), params.n_threads)) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return false;
|
||||
}
|
||||
@@ -181,11 +184,11 @@ llama_token sampling_id(struct MyModel* mymodel) {
|
||||
if (mirostat == 1) {
|
||||
static float mirostat_mu = 2.0f * mirostat_tau;
|
||||
const int mirostat_m = 100;
|
||||
llama_sample_temperature(ctx, &candidates_p, temp);
|
||||
llama_sample_temp(ctx, &candidates_p, temp);
|
||||
id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
|
||||
} else if (mirostat == 2) {
|
||||
static float mirostat_mu = 2.0f * mirostat_tau;
|
||||
llama_sample_temperature(ctx, &candidates_p, temp);
|
||||
llama_sample_temp(ctx, &candidates_p, temp);
|
||||
id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
|
||||
} else {
|
||||
// Temperature sampling
|
||||
@@ -193,7 +196,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
|
||||
llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
|
||||
llama_sample_typical(ctx, &candidates_p, typical_p, 1);
|
||||
llama_sample_top_p(ctx, &candidates_p, top_p, 1);
|
||||
llama_sample_temperature(ctx, &candidates_p, temp);
|
||||
llama_sample_temp(ctx, &candidates_p, temp);
|
||||
id = llama_sample_token(ctx, &candidates_p);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include "build-info.h"
|
||||
|
||||
extern "C" {
|
||||
|
||||
|
||||
@@ -1,3 +1,21 @@
|
||||
# embedding
|
||||
# llama.cpp/example/embedding
|
||||
|
||||
TODO
|
||||
This example demonstrates generate high-dimensional embedding vector of a given text with llama.cpp.
|
||||
|
||||
## Quick Start
|
||||
|
||||
To get started right away, run the following command, making sure to use the correct path for the model you have:
|
||||
|
||||
### Unix-based systems (Linux, macOS, etc.):
|
||||
|
||||
```bash
|
||||
./embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null
|
||||
```
|
||||
|
||||
### Windows:
|
||||
|
||||
```powershell
|
||||
embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null
|
||||
```
|
||||
|
||||
The above command will output space-separated float values.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include "build-info.h"
|
||||
|
||||
#include <ctime>
|
||||
|
||||
@@ -17,7 +17,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
params.embedding = true;
|
||||
|
||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
|
||||
print_build_info();
|
||||
|
||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
||||
params.seed = time(NULL);
|
||||
@@ -78,7 +78,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
while (!embd_inp.empty()) {
|
||||
int n_tokens = std::min(params.n_batch, (int) embd_inp.size());
|
||||
if (llama_eval(ctx, embd_inp.data(), n_tokens, n_past, params.n_threads)) {
|
||||
if (llama_decode(ctx, llama_batch_get_one(embd_inp.data(), n_tokens, n_past, 0), params.n_threads)) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -13,14 +13,14 @@
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
|
||||
template<typename T>
|
||||
template <typename T>
|
||||
static std::string to_string(const T & val) {
|
||||
std::stringstream ss;
|
||||
ss << val;
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
bool gguf_ex_write(const std::string & fname) {
|
||||
static bool gguf_ex_write(const std::string & fname) {
|
||||
struct gguf_context * ctx = gguf_init_empty();
|
||||
|
||||
gguf_set_val_u8 (ctx, "some.parameter.uint8", 0x12);
|
||||
@@ -85,7 +85,7 @@ bool gguf_ex_write(const std::string & fname) {
|
||||
}
|
||||
|
||||
// just read tensor info
|
||||
bool gguf_ex_read_0(const std::string & fname) {
|
||||
static bool gguf_ex_read_0(const std::string & fname) {
|
||||
struct gguf_init_params params = {
|
||||
/*.no_alloc = */ false,
|
||||
/*.ctx = */ NULL,
|
||||
@@ -143,7 +143,7 @@ bool gguf_ex_read_0(const std::string & fname) {
|
||||
}
|
||||
|
||||
// read and create ggml_context containing the tensors and their data
|
||||
bool gguf_ex_read_1(const std::string & fname) {
|
||||
static bool gguf_ex_read_1(const std::string & fname) {
|
||||
struct ggml_context * ctx_data = NULL;
|
||||
|
||||
struct gguf_init_params params = {
|
||||
|
||||
@@ -367,10 +367,10 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
|
||||
keyidx = gguf_find_key(ggufctx, "general.architecture");
|
||||
if (keyidx != -1) { printf("%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||
keyidx = gguf_find_key(ggufctx, "general.file_type");
|
||||
if (keyidx != -1) { printf("%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||
if (keyidx != -1) { printf("%s: model file type = %" PRIu32 "\n", __func__, gguf_get_val_u32(ggufctx, keyidx)); }
|
||||
keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout");
|
||||
if (keyidx != -1) { printf("%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||
keyidx = gguf_find_key(ggufctx, "general.source.hugginface.repository");
|
||||
keyidx = gguf_find_key(ggufctx, "general.source.huggingface.repository");
|
||||
if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||
}
|
||||
|
||||
|
||||
@@ -380,10 +380,10 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
|
||||
keyidx = gguf_find_key(ggufctx, "general.architecture");
|
||||
if (keyidx != -1) { printf("%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||
keyidx = gguf_find_key(ggufctx, "general.file_type");
|
||||
if (keyidx != -1) { printf("%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||
if (keyidx != -1) { printf("%s: model file type = %" PRIu32 "\n", __func__, gguf_get_val_u32(ggufctx, keyidx)); }
|
||||
keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout");
|
||||
if (keyidx != -1) { printf("%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||
keyidx = gguf_find_key(ggufctx, "general.source.hugginface.repository");
|
||||
keyidx = gguf_find_key(ggufctx, "general.source.huggingface.repository");
|
||||
if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||
}
|
||||
|
||||
|
||||
271
examples/llama-bench/README.md
Normal file
271
examples/llama-bench/README.md
Normal file
@@ -0,0 +1,271 @@
|
||||
# llama.cpp/example/llama-bench
|
||||
|
||||
Performance testing tool for llama.cpp.
|
||||
|
||||
## Table of contents
|
||||
|
||||
1. [Syntax](#syntax)
|
||||
2. [Examples](#examples)
|
||||
1. [Text generation with different models](#text-generation-with-different-models)
|
||||
2. [Prompt processing with different batch sizes](#prompt-processing-with-different-batch-sizes)
|
||||
3. [Different numbers of threads](#different-numbers-of-threads)
|
||||
4. [Different numbers of layers offloaded to the GPU](#different-numbers-of-layers-offloaded-to-the-gpu)
|
||||
3. [Output formats](#output-formats)
|
||||
1. [Markdown](#markdown)
|
||||
2. [CSV](#csv)
|
||||
3. [JSON](#json)
|
||||
4. [SQL](#sql)
|
||||
|
||||
## Syntax
|
||||
|
||||
```
|
||||
usage: ./llama-bench [options]
|
||||
|
||||
options:
|
||||
-h, --help
|
||||
-m, --model <filename> (default: models/7B/ggml-model-q4_0.gguf)
|
||||
-p, --n-prompt <n> (default: 512)
|
||||
-n, --n-gen <n> (default: 128)
|
||||
-b, --batch-size <n> (default: 512)
|
||||
--memory-f32 <0|1> (default: 0)
|
||||
-t, --threads <n> (default: 16)
|
||||
-ngl N, --n-gpu-layers <n> (default: 99)
|
||||
-mg i, --main-gpu <i> (default: 0)
|
||||
-mmq, --mul-mat-q <0|1> (default: 1)
|
||||
-ts, --tensor_split <ts0/ts1/..>
|
||||
-r, --repetitions <n> (default: 5)
|
||||
-o, --output <csv|json|md|sql> (default: md)
|
||||
-v, --verbose (default: 0)
|
||||
|
||||
Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.
|
||||
```
|
||||
|
||||
llama-bench can perform two types of tests:
|
||||
|
||||
- Prompt processing (pp): processing a prompt in batches (`-p`)
|
||||
- Text generation (tg): generating a sequence of tokens (`-n`)
|
||||
|
||||
With the exception of `-r`, `-o` and `-v`, all options can be specified multiple times to run multiple tests. Each pp and tg test is run with all combinations of the specified options. To specify multiple values for an option, the values can be separated by commas (e.g. `-n 16,32`), or the option can be specified multiple times (e.g. `-n 16 -n 32`).
|
||||
|
||||
Each test is repeated the number of times given by `-r`, and the results are averaged. The results are given in average tokens per second (t/s) and standard deviation. Some output formats (e.g. json) also include the individual results of each repetition.
|
||||
|
||||
For a description of the other options, see the [main example](../main/README.md).
|
||||
|
||||
## Examples
|
||||
|
||||
### Text generation with different models
|
||||
|
||||
```sh
|
||||
$ ./llama-bench -m models/7B/ggml-model-q4_0.gguf -m models/13B/ggml-model-q4_0.gguf -p 0 -n 128,256,512
|
||||
```
|
||||
|
||||
| model | size | params | backend | ngl | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 132.19 ± 0.55 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 256 | 129.37 ± 0.54 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 512 | 123.83 ± 0.25 |
|
||||
| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 128 | 82.17 ± 0.31 |
|
||||
| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 256 | 80.74 ± 0.23 |
|
||||
| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 512 | 78.08 ± 0.07 |
|
||||
|
||||
### Prompt processing with different batch sizes
|
||||
|
||||
```sh
|
||||
$ ./llama-bench -n 0 -p 1024 -b 128,256,512,1024
|
||||
```
|
||||
|
||||
| model | size | params | backend | ngl | n_batch | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------: | ---------- | ---------------: |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 128 | pp 1024 | 1436.51 ± 3.66 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 256 | pp 1024 | 1932.43 ± 23.48 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 512 | pp 1024 | 2254.45 ± 15.59 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 1024 | pp 1024 | 2498.61 ± 13.58 |
|
||||
|
||||
### Different numbers of threads
|
||||
|
||||
```sh
|
||||
$ ./llama-bench -n 0 -n 16 -p 64 -t 1,2,4,8,16,32
|
||||
```
|
||||
|
||||
| model | size | params | backend | threads | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | ---------: | ---------- | ---------------: |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 1 | pp 64 | 6.17 ± 0.07 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 1 | tg 16 | 4.05 ± 0.02 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 2 | pp 64 | 12.31 ± 0.13 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 2 | tg 16 | 7.80 ± 0.07 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 4 | pp 64 | 23.18 ± 0.06 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 4 | tg 16 | 12.22 ± 0.07 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 8 | pp 64 | 32.29 ± 1.21 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 8 | tg 16 | 16.71 ± 0.66 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 16 | pp 64 | 33.52 ± 0.03 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 16 | tg 16 | 15.32 ± 0.05 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 32 | pp 64 | 59.00 ± 1.11 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 32 | tg 16 | 16.41 ± 0.79 ||
|
||||
|
||||
### Different numbers of layers offloaded to the GPU
|
||||
|
||||
```sh
|
||||
$ ./llama-bench -ngl 10,20,30,31,32,33,34,35
|
||||
```
|
||||
|
||||
| model | size | params | backend | ngl | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 10 | pp 512 | 373.36 ± 2.25 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 10 | tg 128 | 13.45 ± 0.93 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 20 | pp 512 | 472.65 ± 1.25 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 20 | tg 128 | 21.36 ± 1.94 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 30 | pp 512 | 631.87 ± 11.25 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 30 | tg 128 | 40.04 ± 1.82 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 31 | pp 512 | 657.89 ± 5.08 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 31 | tg 128 | 48.19 ± 0.81 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 32 | pp 512 | 688.26 ± 3.29 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 32 | tg 128 | 54.78 ± 0.65 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 33 | pp 512 | 704.27 ± 2.24 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 33 | tg 128 | 60.62 ± 1.76 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 34 | pp 512 | 881.34 ± 5.40 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 34 | tg 128 | 71.76 ± 0.23 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | pp 512 | 2400.01 ± 7.72 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | tg 128 | 131.66 ± 0.49 |
|
||||
|
||||
## Output formats
|
||||
|
||||
By default, llama-bench outputs the results in markdown format. The results can be output in other formats by using the `-o` option.
|
||||
|
||||
### Markdown
|
||||
|
||||
```sh
|
||||
$ ./llama-bench -o md
|
||||
```
|
||||
|
||||
| model | size | params | backend | ngl | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | pp 512 | 2368.80 ± 93.24 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 131.42 ± 0.59 |
|
||||
|
||||
### CSV
|
||||
|
||||
```sh
|
||||
$ ./llama-bench -o csv
|
||||
```
|
||||
|
||||
```csv
|
||||
build_commit,build_number,cuda,opencl,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
|
||||
"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","512","0","2023-09-23T12:09:01Z","212155977","732372","2413.341687","8.305961"
|
||||
"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","0","128","2023-09-23T12:09:02Z","969320879","2728399","132.052051","0.371342"
|
||||
```
|
||||
|
||||
### JSON
|
||||
|
||||
```sh
|
||||
$ ./llama-bench -o json
|
||||
```
|
||||
|
||||
```json
|
||||
[
|
||||
{
|
||||
"build_commit": "3469684",
|
||||
"build_number": 1275,
|
||||
"cuda": true,
|
||||
"opencl": false,
|
||||
"metal": false,
|
||||
"gpu_blas": true,
|
||||
"blas": true,
|
||||
"cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K",
|
||||
"gpu_info": "NVIDIA GeForce RTX 3090 Ti",
|
||||
"model_filename": "models/7B/ggml-model-q4_0.gguf",
|
||||
"model_type": "llama 7B mostly Q4_0",
|
||||
"model_size": 3825065984,
|
||||
"model_n_params": 6738415616,
|
||||
"n_batch": 512,
|
||||
"n_threads": 16,
|
||||
"f16_kv": true,
|
||||
"n_gpu_layers": 99,
|
||||
"main_gpu": 0,
|
||||
"mul_mat_q": true,
|
||||
"tensor_split": "0.00",
|
||||
"n_prompt": 512,
|
||||
"n_gen": 0,
|
||||
"test_time": "2023-09-23T12:09:57Z",
|
||||
"avg_ns": 212365953,
|
||||
"stddev_ns": 985423,
|
||||
"avg_ts": 2410.974041,
|
||||
"stddev_ts": 11.163766,
|
||||
"samples_ns": [ 213837238, 211635853, 212328053, 211329715, 212698907 ],
|
||||
"samples_ts": [ 2394.34, 2419.25, 2411.36, 2422.75, 2407.16 ]
|
||||
},
|
||||
{
|
||||
"build_commit": "3469684",
|
||||
"build_number": 1275,
|
||||
"cuda": true,
|
||||
"opencl": false,
|
||||
"metal": false,
|
||||
"gpu_blas": true,
|
||||
"blas": true,
|
||||
"cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K",
|
||||
"gpu_info": "NVIDIA GeForce RTX 3090 Ti",
|
||||
"model_filename": "models/7B/ggml-model-q4_0.gguf",
|
||||
"model_type": "llama 7B mostly Q4_0",
|
||||
"model_size": 3825065984,
|
||||
"model_n_params": 6738415616,
|
||||
"n_batch": 512,
|
||||
"n_threads": 16,
|
||||
"f16_kv": true,
|
||||
"n_gpu_layers": 99,
|
||||
"main_gpu": 0,
|
||||
"mul_mat_q": true,
|
||||
"tensor_split": "0.00",
|
||||
"n_prompt": 0,
|
||||
"n_gen": 128,
|
||||
"test_time": "2023-09-23T12:09:59Z",
|
||||
"avg_ns": 977425219,
|
||||
"stddev_ns": 9268593,
|
||||
"avg_ts": 130.965708,
|
||||
"stddev_ts": 1.238924,
|
||||
"samples_ns": [ 984472709, 974901233, 989474741, 970729355, 967548060 ],
|
||||
"samples_ts": [ 130.019, 131.295, 129.362, 131.86, 132.293 ]
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
### SQL
|
||||
|
||||
SQL output is suitable for importing into a SQLite database. The output can be piped into the `sqlite3` command line tool to add the results to a database.
|
||||
|
||||
```sh
|
||||
$ ./llama-bench -o sql
|
||||
```
|
||||
|
||||
```sql
|
||||
CREATE TABLE IF NOT EXISTS test (
|
||||
build_commit TEXT,
|
||||
build_number INTEGER,
|
||||
cuda INTEGER,
|
||||
opencl INTEGER,
|
||||
metal INTEGER,
|
||||
gpu_blas INTEGER,
|
||||
blas INTEGER,
|
||||
cpu_info TEXT,
|
||||
gpu_info TEXT,
|
||||
model_filename TEXT,
|
||||
model_type TEXT,
|
||||
model_size INTEGER,
|
||||
model_n_params INTEGER,
|
||||
n_batch INTEGER,
|
||||
n_threads INTEGER,
|
||||
f16_kv INTEGER,
|
||||
n_gpu_layers INTEGER,
|
||||
main_gpu INTEGER,
|
||||
mul_mat_q INTEGER,
|
||||
tensor_split TEXT,
|
||||
n_prompt INTEGER,
|
||||
n_gen INTEGER,
|
||||
test_time TEXT,
|
||||
avg_ns INTEGER,
|
||||
stddev_ns INTEGER,
|
||||
avg_ts REAL,
|
||||
stddev_ts REAL
|
||||
);
|
||||
|
||||
INSERT INTO test (build_commit, build_number, cuda, opencl, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634');
|
||||
INSERT INTO test (build_commit, build_number, cuda, opencl, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692');
|
||||
```
|
||||
@@ -74,14 +74,6 @@ static T stdev(const std::vector<T> & v) {
|
||||
return stdev;
|
||||
}
|
||||
|
||||
static bool ggml_cpu_has_metal() {
|
||||
#if defined(GGML_USE_METAL)
|
||||
return true;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
static std::string get_cpu_info() {
|
||||
std::string id;
|
||||
#ifdef __linux__
|
||||
@@ -899,7 +891,7 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat
|
||||
int n_processed = 0;
|
||||
while (n_processed < n_prompt) {
|
||||
int n_tokens = std::min(n_prompt - n_processed, n_batch);
|
||||
llama_eval(ctx, tokens.data(), n_tokens, n_past + n_processed, n_threads);
|
||||
llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens, n_past + n_processed, 0), n_threads);
|
||||
n_processed += n_tokens;
|
||||
}
|
||||
}
|
||||
@@ -907,11 +899,11 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat
|
||||
static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
|
||||
llama_token token = llama_token_bos(ctx);
|
||||
for (int i = 0; i < n_gen; i++) {
|
||||
llama_eval(ctx, &token, 1, n_past + i, n_threads);
|
||||
llama_decode(ctx, llama_batch_get_one(&token, 1, n_past + i, 0), n_threads);
|
||||
}
|
||||
}
|
||||
|
||||
static void llama_null_log_callback(enum llama_log_level level, const char * text, void * user_data) {
|
||||
static void llama_null_log_callback(enum ggml_log_level level, const char * text, void * user_data) {
|
||||
(void) level;
|
||||
(void) text;
|
||||
(void) user_data;
|
||||
@@ -985,6 +977,8 @@ int main(int argc, char ** argv) {
|
||||
|
||||
test t(inst, lmodel, ctx);
|
||||
|
||||
llama_kv_cache_tokens_rm(ctx, -1, -1);
|
||||
|
||||
// warmup run
|
||||
if (t.n_prompt > 0) {
|
||||
test_prompt(ctx, std::min(2, t.n_batch), 0, t.n_batch, t.n_threads);
|
||||
@@ -994,6 +988,8 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
for (int i = 0; i < params.reps; i++) {
|
||||
llama_kv_cache_tokens_rm(ctx, -1, -1);
|
||||
|
||||
uint64_t t_start = get_time_ns();
|
||||
if (t.n_prompt > 0) {
|
||||
test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
|
||||
|
||||
51
examples/main-cmake-pkg/.gitignore
vendored
Normal file
51
examples/main-cmake-pkg/.gitignore
vendored
Normal file
@@ -0,0 +1,51 @@
|
||||
# Prerequisites
|
||||
*.d
|
||||
|
||||
# Compiled Object files
|
||||
*.slo
|
||||
*.lo
|
||||
*.o
|
||||
*.obj
|
||||
|
||||
# Precompiled Headers
|
||||
*.gch
|
||||
*.pch
|
||||
|
||||
# Compiled Dynamic libraries
|
||||
*.so
|
||||
*.dylib
|
||||
*.dll
|
||||
|
||||
# Fortran module files
|
||||
*.mod
|
||||
*.smod
|
||||
|
||||
# Compiled Static libraries
|
||||
*.lai
|
||||
*.la
|
||||
*.a
|
||||
*.lib
|
||||
|
||||
# Executables
|
||||
*.exe
|
||||
*.out
|
||||
*.app
|
||||
|
||||
*.gguf
|
||||
|
||||
*.log
|
||||
.DS_Store
|
||||
.build/
|
||||
.cache/
|
||||
.direnv/
|
||||
.envrc
|
||||
.swiftpm
|
||||
.venv
|
||||
.clang-tidy
|
||||
.vs/
|
||||
.vscode/
|
||||
|
||||
build*/
|
||||
out/
|
||||
tmp/
|
||||
|
||||
36
examples/main-cmake-pkg/CMakeLists.txt
Normal file
36
examples/main-cmake-pkg/CMakeLists.txt
Normal file
@@ -0,0 +1,36 @@
|
||||
cmake_minimum_required(VERSION 3.12)
|
||||
project("main-cmake-pkg" C CXX)
|
||||
set(TARGET main-cmake-pkg)
|
||||
|
||||
find_package(Llama 0.0.1 REQUIRED)
|
||||
|
||||
# Bake common functionality in with target. Because applications
|
||||
# using the relocatable Llama package should be outside of the
|
||||
# source tree, main-cmake-pkg pretends the dependencies are built-in.
|
||||
|
||||
set(_common_path "${CMAKE_CURRENT_LIST_DIR}/../../common")
|
||||
add_library(common OBJECT
|
||||
${_common_path}/common.h
|
||||
${_common_path}/common.cpp
|
||||
${_common_path}/console.h
|
||||
${_common_path}/console.cpp
|
||||
${_common_path}/grammar-parser.h
|
||||
${_common_path}/grammar-parser.cpp
|
||||
)
|
||||
|
||||
# WARNING: because build-info.h is auto-generated, it will only
|
||||
# be available after the user has built the llama.cpp sources.
|
||||
#
|
||||
configure_file(${_common_path}/../build-info.h
|
||||
${CMAKE_CURRENT_BINARY_DIR}/build-info.h
|
||||
COPYONLY)
|
||||
|
||||
target_include_directories(common PUBLIC ${LLAMA_INCLUDE_DIR}
|
||||
${CMAKE_CURRENT_BINARY_DIR})
|
||||
|
||||
add_executable(${TARGET} ${CMAKE_CURRENT_LIST_DIR}/../main/main.cpp)
|
||||
target_include_directories(${TARGET} PRIVATE ${_common_path})
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
||||
37
examples/main-cmake-pkg/README.md
Normal file
37
examples/main-cmake-pkg/README.md
Normal file
@@ -0,0 +1,37 @@
|
||||
# llama.cpp/example/main-cmake-pkg
|
||||
|
||||
This program builds the [main](../main) application using a relocatable CMake package. It serves as an example of using the `find_package()` CMake command to conveniently include [llama.cpp](https://github.com/ggerganov/llama.cpp) in projects which live outside of the source tree.
|
||||
|
||||
## Building
|
||||
|
||||
Because this example is "outside of the source tree", it is important to first build/install llama.cpp using CMake. An example is provided here, but please see the [llama.cpp build instructions](../..) for more detailed build instructions.
|
||||
|
||||
### Considerations
|
||||
|
||||
When hardware acceleration libraries are used (e.g. CUBlas, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_.
|
||||
|
||||
### Build llama.cpp and install to C:\LlamaCPP directory
|
||||
|
||||
In this case, CLBlast was already installed so the CMake package is referenced in `CMAKE_PREFIX_PATH`.
|
||||
|
||||
```cmd
|
||||
git clone https://github.com/ggerganov/llama.cpp
|
||||
cd llama.cpp
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=C:/CLBlast/lib/cmake/CLBlast -G "Visual Studio 17 2022" -A x64
|
||||
cmake --build . --config Release
|
||||
cmake --install . --prefix C:/LlamaCPP
|
||||
```
|
||||
|
||||
### Build main-cmake-pkg
|
||||
|
||||
|
||||
```cmd
|
||||
cd ..\examples\main-cmake-pkg
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/CLBlast/lib/cmake/CLBlast;C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64
|
||||
cmake --build . --config Release
|
||||
cmake --install . --prefix C:/MyLlamaApp
|
||||
```
|
||||
@@ -144,7 +144,7 @@ The `--ctx-size` option allows you to set the size of the prompt context used by
|
||||
|
||||
Some fine-tuned models have extened the context length by scaling RoPE. For example, if the original pretrained model have a context length (max sequence length) of 4096 (4k) and the fine-tuned model have 32k. That is a scaling factor of 8, and should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8.
|
||||
|
||||
- `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model.
|
||||
- `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model.
|
||||
|
||||
### Keep Prompt
|
||||
|
||||
@@ -274,7 +274,7 @@ These options help improve the performance and memory usage of the LLaMA models.
|
||||
|
||||
### NUMA support
|
||||
|
||||
- `--numa`: Attempt optimizations that help on some systems with non-uniform memory access. This currently consists of pinning an equal proportion of the threads to the cores on each NUMA node, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop\_caches' as root.
|
||||
- `--numa`: Attempt optimizations that help on some systems with non-uniform memory access. This currently consists of pinning an equal proportion of the threads to the cores on each NUMA node, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop_caches' as root.
|
||||
|
||||
### Memory Float 32
|
||||
|
||||
@@ -302,7 +302,6 @@ These options provide extra functionality and customization when running the LLa
|
||||
|
||||
- `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
|
||||
- `--verbose-prompt`: Print the prompt before generating text.
|
||||
- `--mtest`: Test the model's functionality by running a series of tests to ensure it's working properly.
|
||||
- `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
|
||||
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
|
||||
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
|
||||
|
||||
@@ -41,7 +41,8 @@ static std::ostringstream * g_output_ss;
|
||||
static std::vector<llama_token> * g_output_tokens;
|
||||
static bool is_interacting = false;
|
||||
|
||||
void write_logfile(
|
||||
|
||||
static void write_logfile(
|
||||
const llama_context * ctx, const gpt_params & params, const llama_model * model,
|
||||
const std::vector<llama_token> & input_tokens, const std::string & output,
|
||||
const std::vector<llama_token> & output_tokens
|
||||
@@ -86,7 +87,7 @@ void write_logfile(
|
||||
}
|
||||
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
||||
void sigint_handler(int signo) {
|
||||
static void sigint_handler(int signo) {
|
||||
if (signo == SIGINT) {
|
||||
if (!is_interacting) {
|
||||
is_interacting = true;
|
||||
@@ -123,7 +124,7 @@ int main(int argc, char ** argv) {
|
||||
console::init(params.simple_io, params.use_color);
|
||||
atexit([]() { console::cleanup(); });
|
||||
|
||||
if (params.perplexity) {
|
||||
if (params.logits_all) {
|
||||
printf("\n************\n");
|
||||
printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
|
||||
printf("************\n\n");
|
||||
@@ -148,6 +149,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
|
||||
LOG_TEE("%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET);
|
||||
|
||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
||||
params.seed = time(NULL);
|
||||
@@ -198,32 +200,6 @@ int main(int argc, char ** argv) {
|
||||
params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
|
||||
}
|
||||
|
||||
// determine the maximum memory usage needed to do inference for the given n_batch and n_ctx parameters
|
||||
// uncomment the "used_mem" line in llama.cpp to see the results
|
||||
if (params.mem_test) {
|
||||
{
|
||||
LOG_TEE("%s: testing memory usage for n_batch = %d, n_ctx = %d\n", __func__, params.n_batch, params.n_ctx);
|
||||
|
||||
const std::vector<llama_token> tmp(params.n_batch, llama_token_bos(ctx));
|
||||
llama_eval(ctx, tmp.data(), tmp.size(), params.n_ctx, params.n_threads);
|
||||
}
|
||||
|
||||
llama_print_timings(ctx);
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// export the cgraph and exit
|
||||
if (params.export_cgraph) {
|
||||
llama_eval_export(ctx, "llama.ggml");
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
std::string path_session = params.path_prompt_cache;
|
||||
std::vector<llama_token> session_tokens;
|
||||
|
||||
@@ -523,18 +499,23 @@ int main(int argc, char ** argv) {
|
||||
break;
|
||||
}
|
||||
|
||||
const int n_left = n_past - params.n_keep;
|
||||
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d\n", n_past, n_left, n_ctx, params.n_keep);
|
||||
const int n_left = n_past - params.n_keep - 1;
|
||||
const int n_discard = n_left/2;
|
||||
|
||||
// always keep the first token - BOS
|
||||
n_past = std::max(1, params.n_keep);
|
||||
n_past_guidance = std::max(1, params.n_keep + guidance_offset);
|
||||
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
|
||||
n_past, n_left, n_ctx, params.n_keep, n_discard);
|
||||
|
||||
llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
|
||||
llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
|
||||
|
||||
n_past -= n_discard;
|
||||
|
||||
if (ctx_guidance) {
|
||||
n_past_guidance -= n_discard;
|
||||
}
|
||||
|
||||
LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
|
||||
|
||||
// insert n_left/2 tokens at the start of embd from last_tokens
|
||||
embd.insert(embd.begin(), last_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_tokens.end() - embd.size());
|
||||
|
||||
LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));
|
||||
|
||||
LOG("clear session path\n");
|
||||
@@ -595,7 +576,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
for (int i = 0; i < input_size; i += params.n_batch) {
|
||||
int n_eval = std::min(input_size - i, params.n_batch);
|
||||
if (llama_eval(ctx_guidance, input_buf + i, n_eval, n_past_guidance, params.n_threads)) {
|
||||
if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0), params.n_threads)) {
|
||||
LOG_TEE("%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
@@ -612,7 +593,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));
|
||||
|
||||
if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) {
|
||||
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0), params.n_threads)) {
|
||||
LOG_TEE("%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -1,22 +1,25 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
This script converts Hugging Face llama models to GGML and quantizes them.
|
||||
This script converts Hugging Face Llama, StarCoder, Falcon, Baichuan, and GPT-NeoX models to GGUF and quantizes them.
|
||||
|
||||
Usage:
|
||||
python make-ggml.py --model {model_dir_or_hf_repo_name} [--outname {output_name} (Optional)] [--outdir {output_directory} (Optional)] [--quants {quant_types} (Optional)] [--keep_fp16 (Optional)]
|
||||
python make-ggml.py {model_dir_or_hf_repo_name} --model_type {model_type} [--outname {output_name} (Optional)] [--outdir {output_directory} (Optional)] [--quants {quant_types} (Optional)] [--keep_fp16 (Optional)]
|
||||
|
||||
Arguments:
|
||||
- --model: (Required) The directory of the downloaded Hugging Face model or the name of the Hugging Face model repository. If the model directory does not exist, it will be downloaded from the Hugging Face model hub.
|
||||
- model: (Required) The directory of the downloaded Hugging Face model or the name of the Hugging Face model repository. If the model directory does not exist, it will be downloaded from the Hugging Face model hub.
|
||||
- --model_type: (Required) The type of the model to be converted. Choose from llama, starcoder, falcon, baichuan, or gptneox.
|
||||
- --outname: (Optional) The name of the output model. If not specified, the last part of the model directory path or the Hugging Face model repo name will be used.
|
||||
- --outdir: (Optional) The directory where the output model(s) will be stored. If not specified, '../models/{outname}' will be used.
|
||||
- --quants: (Optional) The types of quantization to apply. This should be a space-separated list. The default is 'Q4_K_M Q5_K_S'.
|
||||
- --keep_fp16: (Optional) If specified, the FP16 model will not be deleted after the quantized models are created.
|
||||
|
||||
Quant types:
|
||||
Old quant types (some base model types require these):
|
||||
- Q4_0: small, very high quality loss - legacy, prefer using Q3_K_M
|
||||
- Q4_1: small, substantial quality loss - legacy, prefer using Q3_K_L
|
||||
- Q5_0: medium, balanced quality - legacy, prefer using Q4_K_M
|
||||
- Q5_1: medium, low quality loss - legacy, prefer using Q5_K_M
|
||||
|
||||
New quant types (recommended):
|
||||
- Q2_K: smallest, extreme quality loss - not recommended
|
||||
- Q3_K: alias for Q3_K_M
|
||||
- Q3_K_S: very small, very high quality loss
|
||||
@@ -40,9 +43,7 @@ import argparse
|
||||
import os
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
def main(model, outname, outdir, quants, keep_fp16):
|
||||
ggml_version = "v3"
|
||||
|
||||
def main(model, model_type, outname, outdir, quants, keep_fp16):
|
||||
if not os.path.isdir(model):
|
||||
print(f"Model not found at {model}. Downloading...")
|
||||
try:
|
||||
@@ -63,17 +64,20 @@ def main(model, outname, outdir, quants, keep_fp16):
|
||||
print("Building llama.cpp")
|
||||
subprocess.run(f"cd .. && make quantize", shell=True, check=True)
|
||||
|
||||
fp16 = f"{outdir}/{outname}.ggml{ggml_version}.fp16.bin"
|
||||
fp16 = f"{outdir}/{outname}.gguf.fp16.bin"
|
||||
|
||||
print(f"Making unquantised GGML at {fp16}")
|
||||
print(f"Making unquantised GGUF at {fp16}")
|
||||
if not os.path.isfile(fp16):
|
||||
subprocess.run(f"python3 ../convert.py {model} --outtype f16 --outfile {fp16}", shell=True, check=True)
|
||||
if model_type != "llama":
|
||||
subprocess.run(f"python3 ../convert-{model_type}-hf-to-gguf.py {model} 1 --outfile {fp16}", shell=True, check=True)
|
||||
else:
|
||||
subprocess.run(f"python3 ../convert.py {model} --outtype f16 --outfile {fp16}", shell=True, check=True)
|
||||
else:
|
||||
print(f"Unquantised GGML already exists at: {fp16}")
|
||||
|
||||
print("Making quants")
|
||||
for type in quants:
|
||||
outfile = f"{outdir}/{outname}.ggml{ggml_version}.{type}.bin"
|
||||
outfile = f"{outdir}/{outname}.gguf.{type}.bin"
|
||||
print(f"Making {type} : {outfile}")
|
||||
subprocess.run(f"../quantize {fp16} {outfile} {type}", shell=True, check=True)
|
||||
|
||||
@@ -81,8 +85,9 @@ def main(model, outname, outdir, quants, keep_fp16):
|
||||
os.remove(fp16)
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description='Convert/Quantize HF to GGML. If you have the HF model downloaded already, pass the path to the model dir. Otherwise, pass the Hugging Face model repo name. You need to be in the /examples folder for it to work.')
|
||||
parser.add_argument('--model', required=True, help='Downloaded model dir or Hugging Face model repo name')
|
||||
parser = argparse.ArgumentParser(description='Convert/Quantize HF models to GGUF. If you have the HF model downloaded already, pass the path to the model dir. Otherwise, pass the Hugging Face model repo name. You need to be in the /examples folder for it to work.')
|
||||
parser.add_argument('model', help='Downloaded model dir or Hugging Face model repo name')
|
||||
parser.add_argument('--model_type', required=True, choices=['llama', 'starcoder', 'falcon', 'baichuan', 'gptneox'], help='Type of the model to be converted. Choose from llama, starcoder, falcon, baichuan, or gptneox.')
|
||||
parser.add_argument('--outname', default=None, help='Output model(s) name')
|
||||
parser.add_argument('--outdir', default=None, help='Output directory')
|
||||
parser.add_argument('--quants', nargs='*', default=["Q4_K_M", "Q5_K_S"], help='Quant types')
|
||||
@@ -90,4 +95,4 @@ if __name__ == "__main__":
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
main(args.model, args.outname, args.outdir, args.quants, args.keep_fp16)
|
||||
main(args.model, args.model_type, args.outname, args.outdir, args.quants, args.keep_fp16)
|
||||
|
||||
8
examples/parallel/CMakeLists.txt
Normal file
8
examples/parallel/CMakeLists.txt
Normal file
@@ -0,0 +1,8 @@
|
||||
set(TARGET parallel)
|
||||
add_executable(${TARGET} parallel.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
endif()
|
||||
3
examples/parallel/README.md
Normal file
3
examples/parallel/README.md
Normal file
@@ -0,0 +1,3 @@
|
||||
# llama.cpp/example/parallel
|
||||
|
||||
Simplified simluation for serving incoming requests in parallel
|
||||
380
examples/parallel/parallel.cpp
Normal file
380
examples/parallel/parallel.cpp
Normal file
@@ -0,0 +1,380 @@
|
||||
// A basic application simulating a server with multiple clients.
|
||||
// The clients submite requests to the server and they are processed in parallel.
|
||||
|
||||
#include "build-info.h"
|
||||
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
// trim whitespace from the beginning and end of a string
|
||||
static std::string trim(const std::string & str) {
|
||||
size_t start = 0;
|
||||
size_t end = str.size();
|
||||
|
||||
while (start < end && isspace(str[start])) {
|
||||
start += 1;
|
||||
}
|
||||
|
||||
while (end > start && isspace(str[end - 1])) {
|
||||
end -= 1;
|
||||
}
|
||||
|
||||
return str.substr(start, end - start);
|
||||
}
|
||||
|
||||
static std::string k_system =
|
||||
R"(Transcript of a never ending dialog, where the User interacts with an Assistant.
|
||||
The Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
|
||||
|
||||
User: Recommend a nice restaurant in the area.
|
||||
Assistant: I recommend the restaurant "The Golden Duck". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.
|
||||
User: Who is Richard Feynman?
|
||||
Assistant: Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including "Surely You're Joking, Mr. Feynman!" and "What Do You Care What Other People Think?".
|
||||
User:)";
|
||||
|
||||
static std::vector<std::string> k_prompts = {
|
||||
"What is the meaning of life?",
|
||||
"Tell me an interesting fact about llamas.",
|
||||
"What is the best way to cook a steak?",
|
||||
"Are you familiar with the Special Theory of Relativity and can you explain it to me?",
|
||||
"Recommend some interesting books to read.",
|
||||
"What is the best way to learn a new language?",
|
||||
"How to get a job at Google?",
|
||||
"If you could have any superpower, what would it be?",
|
||||
"I want to learn how to play the piano.",
|
||||
};
|
||||
|
||||
struct client {
|
||||
int32_t id = 0;
|
||||
|
||||
llama_seq_id seq_id = -1;
|
||||
|
||||
llama_token sampled;
|
||||
|
||||
int64_t t_start_prompt;
|
||||
int64_t t_start_gen;
|
||||
|
||||
int32_t n_prompt = 0;
|
||||
int32_t n_decoded = 0;
|
||||
int32_t i_batch = -1;
|
||||
|
||||
std::string input;
|
||||
std::string prompt;
|
||||
std::string response;
|
||||
|
||||
std::vector<llama_token> tokens_prev;
|
||||
};
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
srand(1234);
|
||||
|
||||
gpt_params params;
|
||||
|
||||
if (gpt_params_parse(argc, argv, params) == false) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
// number of simultaneous "clients" to simulate
|
||||
const int32_t n_clients = params.n_parallel;
|
||||
|
||||
// requests to simulate
|
||||
const int32_t n_seq = params.n_sequences;
|
||||
|
||||
// insert new requests as soon as the previous one is done
|
||||
const bool cont_batching = params.cont_batching;
|
||||
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
log_set_target(log_filename_generator("parallel", "log"));
|
||||
LOG_TEE("Log start\n");
|
||||
log_dump_cmdline(argc, argv);
|
||||
#endif // LOG_DISABLE_LOGS
|
||||
|
||||
// init llama.cpp
|
||||
llama_backend_init(params.numa);
|
||||
|
||||
llama_model * model = NULL;
|
||||
llama_context * ctx = NULL;
|
||||
|
||||
// load the target model
|
||||
params.logits_all = true;
|
||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||
|
||||
fprintf(stderr, "\n\n");
|
||||
fflush(stderr);
|
||||
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
const int n_vocab = llama_n_vocab(ctx);
|
||||
|
||||
std::vector<client> clients(n_clients);
|
||||
for (size_t i = 0; i < clients.size(); ++i) {
|
||||
auto & client = clients[i];
|
||||
client.id = i;
|
||||
client.tokens_prev.resize(std::max(256, params.n_predict));
|
||||
std::fill(client.tokens_prev.begin(), client.tokens_prev.end(), 0);
|
||||
}
|
||||
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
|
||||
std::vector<llama_token> tokens_system;
|
||||
tokens_system = ::llama_tokenize(ctx, k_system, true);
|
||||
const int32_t n_tokens_system = tokens_system.size();
|
||||
|
||||
llama_seq_id g_seq_id = 0;
|
||||
|
||||
// the max batch size is as large as the context to handle cases where we get very long input prompt from multiple
|
||||
// users. regardless of the size, the main loop will chunk the batch into a maximum of params.n_batch tokens at a time
|
||||
llama_batch batch = llama_batch_init(params.n_ctx, 0);
|
||||
|
||||
int32_t n_total_prompt = 0;
|
||||
int32_t n_total_gen = 0;
|
||||
int32_t n_cache_miss = 0;
|
||||
|
||||
const auto t_main_start = ggml_time_us();
|
||||
|
||||
LOG_TEE("%s: Simulating parallel requests from clients:\n", __func__);
|
||||
LOG_TEE("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
|
||||
LOG_TEE("\n");
|
||||
|
||||
{
|
||||
LOG_TEE("%s: Evaluating the system prompt ...\n", __func__);
|
||||
|
||||
batch.n_tokens = n_tokens_system;
|
||||
|
||||
for (int32_t i = 0; i < batch.n_tokens; ++i) {
|
||||
batch.token[i] = tokens_system[i];
|
||||
batch.pos[i] = i;
|
||||
batch.seq_id[i] = 0;
|
||||
batch.logits[i] = false;
|
||||
}
|
||||
|
||||
if (llama_decode(ctx, batch, params.n_threads) != 0) {
|
||||
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// assign the system KV cache to all parallel sequences
|
||||
for (int32_t i = 1; i < n_clients; ++i) {
|
||||
llama_kv_cache_seq_cp(ctx, 0, i, 0, n_tokens_system);
|
||||
}
|
||||
|
||||
LOG_TEE("\n");
|
||||
}
|
||||
|
||||
LOG_TEE("Processing requests ...\n\n");
|
||||
|
||||
while (true) {
|
||||
batch.n_tokens = 0;
|
||||
|
||||
// decode any currently ongoing sequences
|
||||
for (auto & client : clients) {
|
||||
if (client.seq_id == -1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
batch.token [batch.n_tokens] = client.sampled;
|
||||
batch.pos [batch.n_tokens] = n_tokens_system + client.n_prompt + client.n_decoded;
|
||||
batch.seq_id[batch.n_tokens] = client.id;
|
||||
batch.logits[batch.n_tokens] = true;
|
||||
|
||||
client.n_decoded += 1;
|
||||
client.i_batch = batch.n_tokens;
|
||||
|
||||
batch.n_tokens += 1;
|
||||
}
|
||||
|
||||
if (batch.n_tokens == 0) {
|
||||
// all sequences have ended - clear the entire KV cache
|
||||
for (int i = 0; i < n_clients; ++i) {
|
||||
llama_kv_cache_seq_rm(ctx, i, n_tokens_system, -1);
|
||||
}
|
||||
|
||||
LOG_TEE("%s: clearing the KV cache\n", __func__);
|
||||
}
|
||||
|
||||
// insert new sequences for decoding
|
||||
if (cont_batching || batch.n_tokens == 0) {
|
||||
for (auto & client : clients) {
|
||||
if (client.seq_id == -1 && g_seq_id < n_seq) {
|
||||
client.seq_id = g_seq_id;
|
||||
|
||||
client.t_start_prompt = ggml_time_us();
|
||||
client.t_start_gen = 0;
|
||||
|
||||
client.input = k_prompts[rand() % k_prompts.size()];
|
||||
client.prompt = client.input + "\nAssistant:";
|
||||
client.response = "";
|
||||
|
||||
std::fill(client.tokens_prev.begin(), client.tokens_prev.end(), 0);
|
||||
|
||||
// do not prepend BOS because we have a system prompt!
|
||||
std::vector<llama_token> tokens_prompt;
|
||||
tokens_prompt = ::llama_tokenize(ctx, client.prompt, false);
|
||||
|
||||
for (size_t i = 0; i < tokens_prompt.size(); ++i) {
|
||||
batch.token [batch.n_tokens] = tokens_prompt[i];
|
||||
batch.pos [batch.n_tokens] = i + n_tokens_system;
|
||||
batch.seq_id[batch.n_tokens] = client.id;
|
||||
batch.logits[batch.n_tokens] = false;
|
||||
batch.n_tokens += 1;
|
||||
}
|
||||
|
||||
// extract the logits only for the last token
|
||||
if (batch.n_tokens > 0) {
|
||||
batch.logits[batch.n_tokens - 1] = true;
|
||||
}
|
||||
|
||||
client.n_prompt = tokens_prompt.size();
|
||||
client.n_decoded = 0;
|
||||
client.i_batch = batch.n_tokens - 1;
|
||||
|
||||
LOG_TEE("\033[1mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
|
||||
|
||||
g_seq_id += 1;
|
||||
|
||||
// insert new requests one-by-one
|
||||
//if (cont_batching) {
|
||||
// break;
|
||||
//}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (batch.n_tokens == 0) {
|
||||
break;
|
||||
}
|
||||
|
||||
// process in chunks of params.n_batch
|
||||
int32_t n_batch = params.n_batch;
|
||||
|
||||
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
|
||||
// experiment: process in powers of 2
|
||||
//if (i + n_batch > (int32_t) batch.n_tokens && n_batch > 32) {
|
||||
// n_batch /= 2;
|
||||
// i -= n_batch;
|
||||
// continue;
|
||||
//}
|
||||
|
||||
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
|
||||
|
||||
llama_batch batch_view = {
|
||||
n_tokens,
|
||||
batch.token + i,
|
||||
nullptr,
|
||||
batch.pos + i,
|
||||
batch.seq_id + i,
|
||||
batch.logits + i,
|
||||
0, 0, 0, // unused
|
||||
};
|
||||
|
||||
const int ret = llama_decode(ctx, batch_view, params.n_threads);
|
||||
if (ret != 0) {
|
||||
if (n_batch == 1 || ret < 0) {
|
||||
// if you get here, it means the KV cache is full - try increasing it via the context size
|
||||
LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
|
||||
return 1;
|
||||
}
|
||||
|
||||
LOG("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
|
||||
|
||||
n_cache_miss += 1;
|
||||
|
||||
// retry with half the batch size to try to find a free slot in the KV cache
|
||||
n_batch /= 2;
|
||||
i -= n_batch;
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
LOG("%s : decoded batch of %d tokens\n", __func__, n_tokens);
|
||||
|
||||
for (auto & client : clients) {
|
||||
if (client.i_batch < (int) i || client.i_batch >= (int) (i + n_tokens)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
//printf("client %d, seq %d, token %d, pos %d, batch %d\n",
|
||||
// client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);
|
||||
|
||||
const llama_token id = llama_sample_token(ctx, NULL, NULL, params, client.tokens_prev, candidates, client.i_batch - i);
|
||||
|
||||
if (client.n_decoded == 1) {
|
||||
// start measuring generation time after the first token to make sure all concurrent clients
|
||||
// have their prompt already processed
|
||||
client.t_start_gen = ggml_time_us();
|
||||
}
|
||||
|
||||
// remember which tokens were sampled - used for repetition penalties during sampling
|
||||
client.tokens_prev.erase(client.tokens_prev.begin());
|
||||
client.tokens_prev.push_back(id);
|
||||
|
||||
const std::string token_str = llama_token_to_piece(ctx, id);
|
||||
client.response += token_str;
|
||||
client.sampled = id;
|
||||
|
||||
//printf("client %d, seq %d, token %d, pos %d, batch %d: %s\n",
|
||||
// client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str());
|
||||
|
||||
if (client.n_decoded > 2 &&
|
||||
(id == llama_token_eos(ctx) ||
|
||||
(params.n_predict > 0 && client.n_decoded + client.n_prompt >= params.n_predict) ||
|
||||
client.response.find("User:") != std::string::npos ||
|
||||
client.response.find('\n') != std::string::npos)) {
|
||||
// basic reverse prompt
|
||||
const size_t pos = client.response.find("User:");
|
||||
if (pos != std::string::npos) {
|
||||
client.response = client.response.substr(0, pos);
|
||||
}
|
||||
|
||||
// delete only the generated part of the sequence, i.e. keep the system prompt in the cache
|
||||
llama_kv_cache_seq_rm(ctx, client.id, n_tokens_system, n_ctx);
|
||||
|
||||
const auto t_main_end = ggml_time_us();
|
||||
|
||||
LOG_TEE("\033[1mClient %3d, seq %4d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \n\nInput: %s\nResponse: %s\n\n",
|
||||
client.id, client.seq_id, client.n_prompt, client.n_decoded,
|
||||
(t_main_end - client.t_start_prompt) / 1e6,
|
||||
(double) (client.n_prompt + client.n_decoded) / (t_main_end - client.t_start_prompt) * 1e6,
|
||||
n_cache_miss,
|
||||
::trim(client.input).c_str(),
|
||||
::trim(client.response).c_str());
|
||||
|
||||
n_total_prompt += client.n_prompt;
|
||||
n_total_gen += client.n_decoded;
|
||||
|
||||
client.seq_id = -1;
|
||||
}
|
||||
|
||||
client.i_batch = -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const auto t_main_end = ggml_time_us();
|
||||
|
||||
LOG_TEE("\n\n");
|
||||
LOG_TEE("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6);
|
||||
LOG_TEE("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6);
|
||||
LOG_TEE("Total speed (AVG): %6s speed: %5.2f t/s\n", "", (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6);
|
||||
LOG_TEE("Cache misses: %6d\n", n_cache_miss);
|
||||
|
||||
LOG_TEE("\n\n");
|
||||
|
||||
llama_print_timings(ctx);
|
||||
|
||||
llama_batch_free(batch);
|
||||
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
fprintf(stderr, "\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -1,3 +1,21 @@
|
||||
# perplexity
|
||||
|
||||
TODO
|
||||
|
||||
## Llama 2 70B Scorechart
|
||||
Quantization | Model size (GiB) | Perplexity | Delta to fp16
|
||||
-- | -- | -- | --
|
||||
Q4_0 | 36.20 | 3.5550 | 3.61%
|
||||
Q4_1 | 40.20 | 3.5125 | 2.37%
|
||||
Q5_0 | 44.20 | 3.4744 | 1.26%
|
||||
Q2_K | 27.27 | 3.7339 | 8.82%
|
||||
Q3_K_S | 27.86 | 3.7019 | 7.89%
|
||||
Q3_K_M | 30.83 | 3.5932 | 4.72%
|
||||
Q3_K_L | 33.67 | 3.5617 | 3.80%
|
||||
Q4_K_S | 36.39 | 3.4852 | 1.57%
|
||||
Q4_K_M | 38.54 | 3.4725 | 1.20%
|
||||
Q5_K_S | 44.20 | 3.4483 | 0.50%
|
||||
Q5_K_M | 45.41 | 3.4451 | 0.40%
|
||||
Q6_K | 52.70 | 3.4367 | 0.16%
|
||||
fp16 | 128.5 | 3.4313 | -
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include "build-info.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
@@ -28,9 +28,10 @@ struct results_log_softmax {
|
||||
float prob;
|
||||
};
|
||||
|
||||
void write_logfile(const llama_context * ctx, const gpt_params & params,
|
||||
const llama_model * model, const struct results_perplexity & results) {
|
||||
|
||||
static void write_logfile(
|
||||
const llama_context * ctx, const gpt_params & params, const llama_model * model,
|
||||
const struct results_perplexity & results
|
||||
) {
|
||||
if (params.logdir.empty()) {
|
||||
return;
|
||||
}
|
||||
@@ -76,10 +77,12 @@ void write_logfile(const llama_context * ctx, const gpt_params & params,
|
||||
fclose(logfile);
|
||||
}
|
||||
|
||||
std::vector<float> softmax(const std::vector<float>& logits) {
|
||||
static std::vector<float> softmax(const std::vector<float>& logits) {
|
||||
std::vector<float> probs(logits.size());
|
||||
float max_logit = logits[0];
|
||||
for (float v : logits) max_logit = std::max(max_logit, v);
|
||||
for (float v : logits) {
|
||||
max_logit = std::max(max_logit, v);
|
||||
}
|
||||
double sum_exp = 0.0;
|
||||
for (size_t i = 0; i < logits.size(); i++) {
|
||||
// Subtract the maximum logit value from the current logit value for numerical stability
|
||||
@@ -88,25 +91,33 @@ std::vector<float> softmax(const std::vector<float>& logits) {
|
||||
sum_exp += exp_logit;
|
||||
probs[i] = exp_logit;
|
||||
}
|
||||
for (size_t i = 0; i < probs.size(); i++) probs[i] /= sum_exp;
|
||||
for (size_t i = 0; i < probs.size(); i++) {
|
||||
probs[i] /= sum_exp;
|
||||
}
|
||||
return probs;
|
||||
}
|
||||
|
||||
results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
|
||||
static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
|
||||
float max_logit = logits[0];
|
||||
for (int i = 1; i < n_vocab; ++i) max_logit = std::max(max_logit, logits[i]);
|
||||
for (int i = 1; i < n_vocab; ++i) {
|
||||
max_logit = std::max(max_logit, logits[i]);
|
||||
}
|
||||
double sum_exp = 0.0;
|
||||
for (int i = 0; i < n_vocab; ++i) sum_exp += expf(logits[i] - max_logit);
|
||||
for (int i = 0; i < n_vocab; ++i) {
|
||||
sum_exp += expf(logits[i] - max_logit);
|
||||
}
|
||||
return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp};
|
||||
}
|
||||
|
||||
void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
|
||||
double & nll, double & nll2, float * logit_history, float * prob_history) {
|
||||
|
||||
static void process_logits(
|
||||
int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
|
||||
double & nll, double & nll2, float * logit_history, float * prob_history
|
||||
) {
|
||||
std::mutex mutex;
|
||||
int counter = 0;
|
||||
auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
|
||||
double local_nll = 0, local_nll2 = 0;
|
||||
double local_nll = 0;
|
||||
double local_nll2 = 0;
|
||||
while (true) {
|
||||
std::unique_lock<std::mutex> lock(mutex);
|
||||
int i = counter++;
|
||||
@@ -124,13 +135,16 @@ void process_logits(int n_vocab, const float * logits, const int * tokens, int n
|
||||
prob_history[i] = results.prob;
|
||||
}
|
||||
};
|
||||
for (auto & w : workers) w = std::thread(compute);
|
||||
for (auto & w : workers) {
|
||||
w = std::thread(compute);
|
||||
}
|
||||
compute();
|
||||
for (auto & w : workers) w.join();
|
||||
|
||||
for (auto & w : workers) {
|
||||
w.join();
|
||||
}
|
||||
}
|
||||
|
||||
results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
|
||||
static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
|
||||
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
|
||||
// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
|
||||
// Output: `perplexity: 13.5106 [114/114]`
|
||||
@@ -150,8 +164,8 @@ results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params)
|
||||
return {std::move(tokens), 0., {}, {}};
|
||||
}
|
||||
|
||||
std::vector<float> logit_history;
|
||||
std::vector<float> prob_history;
|
||||
std::vector<float> logit_history;
|
||||
std::vector<float> prob_history;
|
||||
|
||||
logit_history.resize(tokens.size());
|
||||
prob_history.resize(tokens.size());
|
||||
@@ -193,12 +207,15 @@ results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params)
|
||||
|
||||
const auto t_start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// clear the KV cache
|
||||
llama_kv_cache_tokens_rm(ctx, -1, -1);
|
||||
|
||||
for (int j = 0; j < num_batches; ++j) {
|
||||
const int batch_start = start + j * n_batch;
|
||||
const int batch_size = std::min(end - batch_start, n_batch);
|
||||
|
||||
//fprintf(stderr, " Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
|
||||
if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) {
|
||||
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0), params.n_threads)) {
|
||||
//fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return {tokens, -1, logit_history, prob_history};
|
||||
}
|
||||
@@ -260,8 +277,7 @@ results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params)
|
||||
return {tokens, std::exp(nll / count), logit_history, prob_history};
|
||||
}
|
||||
|
||||
results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
|
||||
|
||||
static results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
|
||||
if (params.ppl_stride > 0) {
|
||||
return perplexity_v2(ctx, params);
|
||||
}
|
||||
@@ -319,6 +335,9 @@ results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
|
||||
|
||||
const auto t_start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// clear the KV cache
|
||||
llama_kv_cache_tokens_rm(ctx, -1, -1);
|
||||
|
||||
for (int j = 0; j < num_batches; ++j) {
|
||||
const int batch_start = start + j * n_batch;
|
||||
const int batch_size = std::min(end - batch_start, n_batch);
|
||||
@@ -331,7 +350,7 @@ results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
|
||||
tokens[batch_start] = llama_token_bos(ctx);
|
||||
}
|
||||
|
||||
if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) {
|
||||
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0), params.n_threads)) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return {tokens, -1, logit_history, prob_history};
|
||||
}
|
||||
@@ -400,15 +419,16 @@ results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
|
||||
return {tokens, ppl, logit_history, prob_history};
|
||||
}
|
||||
|
||||
std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch,
|
||||
int n_vocab, int n_thread) {
|
||||
static std::vector<float> hellaswag_evaluate_tokens(
|
||||
llama_context * ctx, std::vector<int> & tokens, int n_past, int n_batch, int n_vocab, int n_thread
|
||||
) {
|
||||
std::vector<float> result;
|
||||
result.reserve(tokens.size() * n_vocab);
|
||||
size_t n_chunk = (tokens.size() + n_batch - 1)/n_batch;
|
||||
for (size_t i_chunk = 0; i_chunk < n_chunk; ++i_chunk) {
|
||||
size_t n_tokens = tokens.size() - i_chunk * n_batch;
|
||||
n_tokens = std::min(n_tokens, size_t(n_batch));
|
||||
if (llama_eval(ctx, tokens.data() + i_chunk * n_batch, n_tokens, n_past, n_thread)) {
|
||||
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + i_chunk * n_batch, n_tokens, n_past, 0), n_thread)) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return {};
|
||||
}
|
||||
@@ -421,7 +441,7 @@ std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vec
|
||||
return result;
|
||||
}
|
||||
|
||||
void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||
static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||
// Calculates hellaswag score (acc_norm) from prompt
|
||||
//
|
||||
// Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl
|
||||
@@ -548,6 +568,9 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||
query_embd.resize(32);
|
||||
}
|
||||
|
||||
// clear the KV cache
|
||||
llama_kv_cache_tokens_rm(ctx, -1, -1);
|
||||
|
||||
auto logits = hellaswag_evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab, params.n_threads);
|
||||
if (logits.empty()) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
@@ -659,7 +682,7 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
params.perplexity = true;
|
||||
params.logits_all = true;
|
||||
params.n_batch = std::min(params.n_batch, params.n_ctx);
|
||||
|
||||
if (params.ppl_stride > 0) {
|
||||
@@ -668,7 +691,7 @@ int main(int argc, char ** argv) {
|
||||
params.n_ctx += params.ppl_stride/2;
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
|
||||
print_build_info();
|
||||
|
||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
||||
params.seed = time(NULL);
|
||||
|
||||
@@ -2,4 +2,5 @@ set(TARGET quantize-stats)
|
||||
add_executable(${TARGET} quantize-stats.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_include_directories(${TARGET} PRIVATE ../../common)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
#include "ggml.h"
|
||||
#include "build-info.h"
|
||||
|
||||
#define LLAMA_API_INTERNAL
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <algorithm>
|
||||
@@ -34,8 +34,8 @@ struct quantize_stats_params {
|
||||
std::vector<enum ggml_type> include_types;
|
||||
};
|
||||
|
||||
const size_t HISTOGRAM_BUCKETS = 150;
|
||||
const double HISTOGRAM_RANGE = 0.03;
|
||||
constexpr size_t HISTOGRAM_BUCKETS = 150;
|
||||
constexpr double HISTOGRAM_RANGE = 0.03;
|
||||
|
||||
struct error_stats {
|
||||
size_t num_samples;
|
||||
@@ -44,8 +44,7 @@ struct error_stats {
|
||||
uint64_t error_histogram[HISTOGRAM_BUCKETS];
|
||||
};
|
||||
|
||||
|
||||
void quantize_stats_print_usage(int /*argc*/, char ** argv) {
|
||||
static void quantize_stats_print_usage(int /*argc*/, char ** argv) {
|
||||
quantize_stats_params params;
|
||||
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
||||
fprintf(stderr, "\n");
|
||||
@@ -71,7 +70,7 @@ void quantize_stats_print_usage(int /*argc*/, char ** argv) {
|
||||
}
|
||||
|
||||
// Check if a layer is included/excluded by command line
|
||||
bool layer_included(const quantize_stats_params & params, const std::string & layer) {
|
||||
static bool layer_included(const quantize_stats_params & params, const std::string & layer) {
|
||||
for (const auto& excluded : params.exclude_layers) {
|
||||
if (std::regex_search(layer, std::regex(excluded))) {
|
||||
return false;
|
||||
@@ -86,7 +85,7 @@ bool layer_included(const quantize_stats_params & params, const std::string & la
|
||||
}
|
||||
|
||||
// Update error statistics given vectors with the before/after result of quantization
|
||||
void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) {
|
||||
static void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) {
|
||||
for (int64_t i = 0; i < nelements; i++) {
|
||||
double diff = input[i] - output[i];
|
||||
stats.total_error += diff * diff;
|
||||
@@ -96,14 +95,14 @@ void update_error_stats(int64_t nelements, const float * input, const float * ou
|
||||
stats.num_samples += nelements;
|
||||
}
|
||||
|
||||
void combine_error_stats(error_stats & into, const error_stats & from) {
|
||||
static void combine_error_stats(error_stats & into, const error_stats & from) {
|
||||
into.num_samples += from.num_samples;
|
||||
into.total_error += from.total_error;
|
||||
if (from.max_error > into.max_error) into.max_error = from.max_error;
|
||||
for (size_t i=0; i<HISTOGRAM_BUCKETS; ++i) into.error_histogram[i] += from.error_histogram[i];
|
||||
}
|
||||
|
||||
double find_quantile(const error_stats & stats, double quantile) {
|
||||
static double find_quantile(const error_stats & stats, double quantile) {
|
||||
double sum = std::accumulate(std::begin(stats.error_histogram), std::end(stats.error_histogram), 0.0);
|
||||
|
||||
double accum = 0;
|
||||
@@ -116,7 +115,7 @@ double find_quantile(const error_stats & stats, double quantile) {
|
||||
return INFINITY;
|
||||
}
|
||||
|
||||
void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
|
||||
static void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
|
||||
double rmse = sqrt(stats.total_error / (double) stats.num_samples);
|
||||
double median = find_quantile(stats, .5);
|
||||
double pct95 = find_quantile(stats, .95);
|
||||
@@ -143,17 +142,10 @@ static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
|
||||
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
|
||||
}
|
||||
|
||||
void test_roundtrip_on_chunk(
|
||||
const ggml_tensor * layer,
|
||||
int64_t offset,
|
||||
int64_t chunk_size,
|
||||
const ggml_type_traits_t & qfns,
|
||||
bool use_reference,
|
||||
float * input_scratch,
|
||||
char * quantized_scratch,
|
||||
float * output_scratch,
|
||||
error_stats & stats) {
|
||||
|
||||
static void test_roundtrip_on_chunk(
|
||||
const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits_t & qfns, bool use_reference,
|
||||
float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats
|
||||
) {
|
||||
if (layer->type == GGML_TYPE_F16) {
|
||||
for (int i = 0; i < chunk_size; i++) {
|
||||
input_scratch[i] = ggml_get_f32_1d(layer, i + offset);
|
||||
@@ -174,18 +166,11 @@ void test_roundtrip_on_chunk(
|
||||
|
||||
|
||||
// Run quantization function for a single layer and update error stats
|
||||
void test_roundtrip_on_layer(
|
||||
std::string & name,
|
||||
bool print_layer_stats,
|
||||
const ggml_type_traits_t & qfns,
|
||||
bool use_reference,
|
||||
const ggml_tensor * layer,
|
||||
std::vector<float> & input_scratch,
|
||||
std::vector<char> & quantized_scratch,
|
||||
std::vector<float> & output_scratch,
|
||||
error_stats & total_error,
|
||||
int max_thread = 0) {
|
||||
|
||||
static void test_roundtrip_on_layer(
|
||||
std::string & name, bool print_layer_stats, const ggml_type_traits_t & qfns, bool use_reference,
|
||||
const ggml_tensor * layer, std::vector<float> & input_scratch, std::vector<char> & quantized_scratch,
|
||||
std::vector<float> & output_scratch, error_stats & total_error, int max_thread = 0
|
||||
) {
|
||||
assert(tensor_is_contiguous(layer));
|
||||
error_stats layer_error {};
|
||||
uint64_t nelements = ggml_nelements(layer);
|
||||
@@ -314,7 +299,7 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
|
||||
print_build_info();
|
||||
|
||||
// load the model
|
||||
fprintf(stderr, "Loading model\n");
|
||||
|
||||
@@ -2,6 +2,7 @@ set(TARGET quantize)
|
||||
add_executable(${TARGET} quantize.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_include_directories(${TARGET} PRIVATE ../../common)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
|
||||
@@ -1,3 +1,44 @@
|
||||
# quantize
|
||||
|
||||
TODO
|
||||
|
||||
## Llama 2 7B
|
||||
|
||||
Quantization | Bits per Weight (BPW)
|
||||
-- | --
|
||||
Q2_K | 3.35
|
||||
Q3_K_S | 3.50
|
||||
Q3_K_M | 3.91
|
||||
Q3_K_L | 4.27
|
||||
Q4_K_S | 4.58
|
||||
Q4_K_M | 4.84
|
||||
Q5_K_S | 5.52
|
||||
Q5_K_M | 5.68
|
||||
Q6_K | 6.56
|
||||
|
||||
## Llama 2 13B
|
||||
Quantization | Bits per Weight (BPW)
|
||||
-- | --
|
||||
Q2_K | 3.34
|
||||
Q3_K_S | 3.48
|
||||
Q3_K_M | 3.89
|
||||
Q3_K_L | 4.26
|
||||
Q4_K_S | 4.56
|
||||
Q4_K_M | 4.83
|
||||
Q5_K_S | 5.51
|
||||
Q5_K_M | 5.67
|
||||
Q6_K | 6.56
|
||||
|
||||
# Llama 2 70B
|
||||
|
||||
Quantization | Bits per Weight (BPW)
|
||||
-- | --
|
||||
Q2_K | 3.40
|
||||
Q3_K_S | 3.47
|
||||
Q3_K_M | 3.85
|
||||
Q3_K_L | 4.19
|
||||
Q4_K_S | 4.53
|
||||
Q4_K_M | 4.80
|
||||
Q5_K_S | 5.50
|
||||
Q5_K_M | 5.65
|
||||
Q6_K | 6.56
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#include "build-info.h"
|
||||
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <cstdio>
|
||||
@@ -40,7 +40,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
|
||||
};
|
||||
|
||||
|
||||
bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
|
||||
static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
|
||||
std::string ftype_str;
|
||||
|
||||
for (auto ch : ftype_str_in) {
|
||||
@@ -72,7 +72,7 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std:
|
||||
// usage:
|
||||
// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
|
||||
//
|
||||
void usage(const char * executable) {
|
||||
static void usage(const char * executable) {
|
||||
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
|
||||
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
|
||||
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
|
||||
@@ -161,7 +161,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
|
||||
print_build_info();
|
||||
|
||||
fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());
|
||||
if (params.nthread > 0) {
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include "build-info.h"
|
||||
|
||||
#include <vector>
|
||||
#include <cstdio>
|
||||
@@ -17,7 +17,7 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
|
||||
print_build_info();
|
||||
|
||||
if (params.n_predict < 0) {
|
||||
params.n_predict = 16;
|
||||
@@ -35,11 +35,11 @@ int main(int argc, char ** argv) {
|
||||
auto last_n_tokens_data = std::vector<llama_token>(params.repeat_last_n, 0);
|
||||
|
||||
// init
|
||||
auto model = llama_load_model_from_file(params.model.c_str(), lparams);
|
||||
auto * model = llama_load_model_from_file(params.model.c_str(), lparams);
|
||||
if (model == nullptr) {
|
||||
return 1;
|
||||
}
|
||||
auto ctx = llama_new_context_with_model(model, lparams);
|
||||
auto * ctx = llama_new_context_with_model(model, lparams);
|
||||
if (ctx == nullptr) {
|
||||
llama_free_model(model);
|
||||
return 1;
|
||||
@@ -54,7 +54,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// evaluate prompt
|
||||
llama_eval(ctx, tokens.data(), n_prompt_tokens, n_past, params.n_threads);
|
||||
llama_decode(ctx, llama_batch_get_one(tokens.data(), n_prompt_tokens, n_past, 0), params.n_threads);
|
||||
|
||||
last_n_tokens_data.insert(last_n_tokens_data.end(), tokens.data(), tokens.data() + n_prompt_tokens);
|
||||
n_past += n_prompt_tokens;
|
||||
@@ -78,7 +78,7 @@ int main(int argc, char ** argv) {
|
||||
printf("\n%s", params.prompt.c_str());
|
||||
|
||||
for (auto i = 0; i < params.n_predict; i++) {
|
||||
auto logits = llama_get_logits(ctx);
|
||||
auto * logits = llama_get_logits(ctx);
|
||||
auto n_vocab = llama_n_vocab(ctx);
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
@@ -91,7 +91,7 @@ int main(int argc, char ** argv) {
|
||||
last_n_tokens_data.push_back(next_token);
|
||||
|
||||
printf("%s", next_token_str.c_str());
|
||||
if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
|
||||
if (llama_decode(ctx, llama_batch_get_one(&next_token, 1, n_past, 0), params.n_threads)) {
|
||||
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
@@ -106,7 +106,7 @@ int main(int argc, char ** argv) {
|
||||
llama_free(ctx);
|
||||
|
||||
// make new context
|
||||
auto ctx2 = llama_new_context_with_model(model, lparams);
|
||||
auto * ctx2 = llama_new_context_with_model(model, lparams);
|
||||
|
||||
// Load state (rng, logits, embedding and kv_cache) from file
|
||||
{
|
||||
@@ -138,7 +138,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// second run
|
||||
for (auto i = 0; i < params.n_predict; i++) {
|
||||
auto logits = llama_get_logits(ctx2);
|
||||
auto * logits = llama_get_logits(ctx2);
|
||||
auto n_vocab = llama_n_vocab(ctx2);
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
@@ -151,7 +151,7 @@ int main(int argc, char ** argv) {
|
||||
last_n_tokens_data.push_back(next_token);
|
||||
|
||||
printf("%s", next_token_str.c_str());
|
||||
if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
|
||||
if (llama_decode(ctx, llama_batch_get_one(&next_token, 1, n_past, 0), params.n_threads)) {
|
||||
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
|
||||
llama_free(ctx2);
|
||||
llama_free_model(model);
|
||||
|
||||
@@ -381,6 +381,10 @@ struct llama_server_context
|
||||
|
||||
// compare the evaluated prompt with the new prompt
|
||||
n_past = common_part(embd, prompt_tokens);
|
||||
|
||||
// since #3228 we now have to manually manage the KV cache
|
||||
llama_kv_cache_seq_rm(ctx, 0, n_past, params.n_ctx);
|
||||
|
||||
embd = prompt_tokens;
|
||||
if (n_past == num_prompt_tokens)
|
||||
{
|
||||
@@ -411,19 +415,27 @@ struct llama_server_context
|
||||
|
||||
if (embd.size() >= (size_t)params.n_ctx)
|
||||
{
|
||||
// Reset context
|
||||
const int n_left = (params.n_ctx - params.n_keep) / 2;
|
||||
// Shift context
|
||||
|
||||
const int n_left = n_past - params.n_keep - 1;
|
||||
const int n_discard = n_left/2;
|
||||
|
||||
llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
|
||||
llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
|
||||
|
||||
for (size_t i = params.n_keep + 1 + n_discard; i < embd.size(); i++)
|
||||
{
|
||||
embd[i - n_discard] = embd[i];
|
||||
}
|
||||
embd.resize(embd.size() - n_discard);
|
||||
|
||||
n_past -= n_discard;
|
||||
|
||||
std::vector<llama_token> new_tokens(embd.begin(), embd.begin() + params.n_keep);
|
||||
new_tokens.insert(new_tokens.end(), embd.end() - n_left, embd.end());
|
||||
embd = new_tokens;
|
||||
n_past = params.n_keep;
|
||||
truncated = true;
|
||||
LOG_VERBOSE("input truncated", {
|
||||
{"n_ctx", params.n_ctx},
|
||||
{"n_keep", params.n_keep},
|
||||
{"n_left", n_left},
|
||||
{"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
|
||||
});
|
||||
}
|
||||
|
||||
@@ -434,7 +446,8 @@ struct llama_server_context
|
||||
{
|
||||
n_eval = params.n_batch;
|
||||
}
|
||||
if (llama_eval(ctx, &embd[n_past], n_eval, n_past, params.n_threads))
|
||||
|
||||
if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval, n_past, 0), params.n_threads))
|
||||
{
|
||||
LOG_ERROR("failed to eval", {
|
||||
{"n_eval", n_eval},
|
||||
@@ -523,13 +536,13 @@ struct llama_server_context
|
||||
{
|
||||
static float mirostat_mu = 2.0f * mirostat_tau;
|
||||
const int mirostat_m = 100;
|
||||
llama_sample_temperature(ctx, &candidates_p, temp);
|
||||
llama_sample_temp(ctx, &candidates_p, temp);
|
||||
result.tok = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
|
||||
}
|
||||
else if (mirostat == 2)
|
||||
{
|
||||
static float mirostat_mu = 2.0f * mirostat_tau;
|
||||
llama_sample_temperature(ctx, &candidates_p, temp);
|
||||
llama_sample_temp(ctx, &candidates_p, temp);
|
||||
result.tok = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
|
||||
}
|
||||
else
|
||||
@@ -540,7 +553,7 @@ struct llama_server_context
|
||||
llama_sample_tail_free(ctx, &candidates_p, tfs_z, min_keep);
|
||||
llama_sample_typical(ctx, &candidates_p, typical_p, min_keep);
|
||||
llama_sample_top_p(ctx, &candidates_p, top_p, min_keep);
|
||||
llama_sample_temperature(ctx, &candidates_p, temp);
|
||||
llama_sample_temp(ctx, &candidates_p, temp);
|
||||
result.tok = llama_sample_token(ctx, &candidates_p);
|
||||
}
|
||||
}
|
||||
@@ -701,8 +714,8 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
||||
printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
|
||||
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
||||
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
|
||||
printf(" --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
|
||||
printf(" --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
|
||||
printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
|
||||
printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n");
|
||||
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
||||
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
|
||||
@@ -1083,8 +1096,9 @@ static json format_final_response(llama_server_context &llama, const std::string
|
||||
return res;
|
||||
}
|
||||
|
||||
static json format_partial_response(llama_server_context &llama, const std::string &content, const std::vector<completion_token_output> &probs)
|
||||
{
|
||||
static json format_partial_response(
|
||||
llama_server_context &llama, const std::string &content, const std::vector<completion_token_output> &probs
|
||||
) {
|
||||
json res = json{
|
||||
{"content", content},
|
||||
{"stop", false},
|
||||
@@ -1215,7 +1229,7 @@ static void log_server_request(const Request &req, const Response &res)
|
||||
});
|
||||
}
|
||||
|
||||
bool is_at_eob(llama_server_context & server_context, const llama_token * tokens, const size_t n_tokens) {
|
||||
static bool is_at_eob(llama_server_context &server_context, const llama_token *tokens, const size_t n_tokens) {
|
||||
return n_tokens && tokens[n_tokens-1] == llama_token_eos(server_context.ctx);
|
||||
}
|
||||
|
||||
@@ -1225,7 +1239,7 @@ bool is_at_eob(llama_server_context & server_context, const llama_token * tokens
|
||||
// * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
|
||||
// This is also called when the stop condition is met.
|
||||
// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
|
||||
void beam_search_callback(void * callback_data, llama_beams_state beams_state) {
|
||||
static void beam_search_callback(void *callback_data, llama_beams_state beams_state) {
|
||||
auto & llama = *static_cast<llama_server_context*>(callback_data);
|
||||
// Mark beams as EOS as needed.
|
||||
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
|
||||
@@ -1258,7 +1272,8 @@ struct token_translator {
|
||||
std::string operator()(const completion_token_output & cto) const { return (*this)(cto.tok); }
|
||||
};
|
||||
|
||||
void append_to_generated_text_from_generated_token_probs(llama_server_context & llama) {
|
||||
static void append_to_generated_text_from_generated_token_probs(llama_server_context &llama)
|
||||
{
|
||||
auto & gtps = llama.generated_token_probs;
|
||||
auto translator = token_translator{llama.ctx};
|
||||
auto add_strlen = [=](size_t sum, const completion_token_output & cto) { return sum + translator(cto).size(); };
|
||||
|
||||
@@ -3,6 +3,3 @@ add_executable(${TARGET} simple.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
endif()
|
||||
|
||||
21
examples/simple/README.md
Normal file
21
examples/simple/README.md
Normal file
@@ -0,0 +1,21 @@
|
||||
# llama.cpp/example/simple
|
||||
|
||||
The purpose of this example is to demonstrate a minimal usage of llama.cpp for generating text with a given prompt.
|
||||
|
||||
```bash
|
||||
./simple ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is"
|
||||
|
||||
...
|
||||
|
||||
main: n_len = 32, n_ctx = 2048, n_parallel = 1, n_kv_req = 32
|
||||
|
||||
Hello my name is Shawn and I'm a 20 year old male from the United States. I'm a 20 year old
|
||||
|
||||
main: decoded 27 tokens in 2.31 s, speed: 11.68 t/s
|
||||
|
||||
llama_print_timings: load time = 579.15 ms
|
||||
llama_print_timings: sample time = 0.72 ms / 28 runs ( 0.03 ms per token, 38888.89 tokens per second)
|
||||
llama_print_timings: prompt eval time = 655.63 ms / 10 tokens ( 65.56 ms per token, 15.25 tokens per second)
|
||||
llama_print_timings: eval time = 2180.97 ms / 27 runs ( 80.78 ms per token, 12.38 tokens per second)
|
||||
llama_print_timings: total time = 2891.13 ms
|
||||
```
|
||||
@@ -1,5 +1,3 @@
|
||||
#include "build-info.h"
|
||||
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
@@ -28,12 +26,18 @@ int main(int argc, char ** argv) {
|
||||
params.prompt = "Hello my name is";
|
||||
}
|
||||
|
||||
// total length of the sequence including the prompt
|
||||
const int n_len = 32;
|
||||
|
||||
// init LLM
|
||||
|
||||
llama_backend_init(params.numa);
|
||||
|
||||
llama_context_params ctx_params = llama_context_default_params();
|
||||
|
||||
ctx_params.seed = 1234;
|
||||
ctx_params.n_ctx = 2048;
|
||||
|
||||
llama_model * model = llama_load_model_from_file(params.model.c_str(), ctx_params);
|
||||
|
||||
if (model == NULL) {
|
||||
@@ -43,20 +47,31 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
||||
|
||||
if (ctx == NULL) {
|
||||
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// tokenize the prompt
|
||||
|
||||
std::vector<llama_token> tokens_list;
|
||||
tokens_list = ::llama_tokenize(ctx, params.prompt, true);
|
||||
|
||||
const int max_context_size = llama_n_ctx(ctx);
|
||||
const int max_tokens_list_size = max_context_size - 4;
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size());
|
||||
|
||||
if ((int) tokens_list.size() > max_tokens_list_size) {
|
||||
fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) tokens_list.size(), max_tokens_list_size);
|
||||
LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, n_kv_req);
|
||||
|
||||
// make sure the KV cache is big enough to hold all the prompt and generated tokens
|
||||
if (n_kv_req > n_ctx) {
|
||||
LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
|
||||
LOG_TEE("%s: either reduce n_parallel or increase n_ctx\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
fprintf(stderr, "\n\n");
|
||||
// print the prompt token-by-token
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
for (auto id : tokens_list) {
|
||||
fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
|
||||
@@ -64,63 +79,104 @@ int main(int argc, char ** argv) {
|
||||
|
||||
fflush(stderr);
|
||||
|
||||
// create a llama_batch with size 512
|
||||
// we use this object to submit token data for decoding
|
||||
|
||||
llama_batch batch = llama_batch_init(512, 0);
|
||||
|
||||
// evaluate the initial prompt
|
||||
batch.n_tokens = tokens_list.size();
|
||||
|
||||
for (int32_t i = 0; i < batch.n_tokens; i++) {
|
||||
batch.token[i] = tokens_list[i];
|
||||
batch.pos[i] = i;
|
||||
batch.seq_id[i] = 0;
|
||||
batch.logits[i] = false;
|
||||
}
|
||||
|
||||
// llama_decode will output logits only for the last token of the prompt
|
||||
batch.logits[batch.n_tokens - 1] = true;
|
||||
|
||||
if (llama_decode(ctx, batch, params.n_threads) != 0) {
|
||||
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// main loop
|
||||
|
||||
// The LLM keeps a contextual cache memory of previous token evaluation.
|
||||
// Usually, once this cache is full, it is required to recompute a compressed context based on previous
|
||||
// tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
|
||||
// example, we will just stop the loop once this cache is full or once an end of stream is detected.
|
||||
int n_cur = batch.n_tokens;
|
||||
int n_decode = 0;
|
||||
|
||||
const int n_gen = std::min(32, max_context_size);
|
||||
const auto t_main_start = ggml_time_us();
|
||||
|
||||
while (llama_get_kv_cache_token_count(ctx) < n_gen) {
|
||||
// evaluate the transformer
|
||||
while (n_cur <= n_len) {
|
||||
// sample the next token
|
||||
{
|
||||
auto n_vocab = llama_n_vocab(ctx);
|
||||
auto * logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);
|
||||
|
||||
if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), params.n_threads)) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||
candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
|
||||
}
|
||||
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
|
||||
// sample the most likely token
|
||||
const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
|
||||
|
||||
// is it an end of stream?
|
||||
if (new_token_id == llama_token_eos(ctx) || n_cur == n_len) {
|
||||
LOG_TEE("\n");
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
|
||||
fflush(stdout);
|
||||
|
||||
// prepare the next batch
|
||||
batch.n_tokens = 0;
|
||||
|
||||
// push this new token for next evaluation
|
||||
batch.token [batch.n_tokens] = new_token_id;
|
||||
batch.pos [batch.n_tokens] = n_cur;
|
||||
batch.seq_id[batch.n_tokens] = 0;
|
||||
batch.logits[batch.n_tokens] = true;
|
||||
|
||||
batch.n_tokens += 1;
|
||||
|
||||
n_decode += 1;
|
||||
}
|
||||
|
||||
n_cur += 1;
|
||||
|
||||
// evaluate the current batch with the transformer model
|
||||
if (llama_decode(ctx, batch, params.n_threads)) {
|
||||
fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
|
||||
return 1;
|
||||
}
|
||||
|
||||
tokens_list.clear();
|
||||
|
||||
// sample the next token
|
||||
|
||||
llama_token new_token_id = 0;
|
||||
|
||||
auto logits = llama_get_logits(ctx);
|
||||
auto n_vocab = llama_n_vocab(ctx);
|
||||
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||
candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
|
||||
}
|
||||
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
|
||||
new_token_id = llama_sample_token_greedy(ctx , &candidates_p);
|
||||
|
||||
// is it an end of stream ?
|
||||
if (new_token_id == llama_token_eos(ctx)) {
|
||||
fprintf(stderr, " [end of text]\n");
|
||||
break;
|
||||
}
|
||||
|
||||
// print the new token :
|
||||
printf("%s", llama_token_to_piece(ctx, new_token_id).c_str());
|
||||
fflush(stdout);
|
||||
|
||||
// push this new token for next evaluation
|
||||
tokens_list.push_back(new_token_id);
|
||||
}
|
||||
|
||||
LOG_TEE("\n");
|
||||
|
||||
const auto t_main_end = ggml_time_us();
|
||||
|
||||
LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
|
||||
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
|
||||
|
||||
llama_print_timings(ctx);
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
llama_batch_free(batch);
|
||||
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
fprintf(stderr, "\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -37,7 +37,7 @@ int main(int argc, char ** argv) {
|
||||
llama_context * ctx_dft = NULL;
|
||||
|
||||
// load the target model
|
||||
params.perplexity = true; // HACK: enable logits_all = true
|
||||
params.logits_all = true;
|
||||
std::tie(model_tgt, ctx_tgt) = llama_init_from_gpt_params(params);
|
||||
|
||||
// load the draft model
|
||||
@@ -70,9 +70,9 @@ int main(int argc, char ** argv) {
|
||||
const auto t_enc_start = ggml_time_us();
|
||||
|
||||
// eval the prompt with both models
|
||||
llama_eval(ctx_tgt, inp.data(), int(inp.size() - 1), 0, params.n_threads);
|
||||
llama_eval(ctx_tgt, &inp.back(), 1, inp.size() - 1, params.n_threads);
|
||||
llama_eval(ctx_dft, inp.data(), int(inp.size()), 0, params.n_threads);
|
||||
llama_decode(ctx_tgt, llama_batch_get_one( inp.data(), n_input - 1, 0, 0), params.n_threads);
|
||||
llama_decode(ctx_tgt, llama_batch_get_one(&inp.back(), 1, n_input - 1, 0), params.n_threads);
|
||||
llama_decode(ctx_dft, llama_batch_get_one( inp.data(), n_input, 0, 0), params.n_threads);
|
||||
|
||||
const auto t_enc_end = ggml_time_us();
|
||||
|
||||
@@ -82,7 +82,7 @@ int main(int argc, char ** argv) {
|
||||
//GGML_ASSERT(n_vocab == llama_n_vocab(ctx_dft));
|
||||
|
||||
// how many tokens to draft each time
|
||||
const int n_draft = params.n_draft;
|
||||
int n_draft = params.n_draft;
|
||||
|
||||
int n_predict = 0;
|
||||
int n_drafted = 0;
|
||||
@@ -131,9 +131,10 @@ int main(int argc, char ** argv) {
|
||||
LOG("drafted: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_dft, drafted));
|
||||
|
||||
int i_dft = 0;
|
||||
|
||||
while (true) {
|
||||
// sample from the target model
|
||||
const llama_token id = llama_sample_token(ctx_tgt, NULL, grammar_tgt, params, last_tokens, candidates, i_dft);
|
||||
llama_token id = llama_sample_token(ctx_tgt, NULL, grammar_tgt, params, last_tokens, candidates, i_dft);
|
||||
|
||||
// remember which tokens were sampled - used for repetition penalties during sampling
|
||||
last_tokens.erase(last_tokens.begin());
|
||||
@@ -171,9 +172,31 @@ int main(int argc, char ** argv) {
|
||||
LOG("out of drafted tokens\n");
|
||||
}
|
||||
|
||||
llama_eval(ctx_dft, &id, 1, n_past_dft, params.n_threads);
|
||||
llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, n_ctx);
|
||||
llama_decode(ctx_dft, llama_batch_get_one(&id, 1, n_past_dft, 0), params.n_threads);
|
||||
++n_past_dft;
|
||||
|
||||
// heuristic for n_draft
|
||||
{
|
||||
const int n_draft_cur = (int) drafted.size();
|
||||
const bool all_accepted = i_dft == n_draft_cur;
|
||||
|
||||
LOG("n_draft = %d\n", n_draft);
|
||||
LOG("n_draft_cur = %d\n", n_draft_cur);
|
||||
LOG("i_dft = %d\n", i_dft);
|
||||
LOG("all_accepted = %d\n", all_accepted);
|
||||
|
||||
if (all_accepted && n_draft == n_draft_cur) {
|
||||
LOG(" - max drafted tokens accepted - n_draft += 8\n");
|
||||
n_draft = std::min(30, n_draft + 8);
|
||||
} else if (all_accepted) {
|
||||
LOG(" - partially drafted tokens accepted - no change\n");
|
||||
} else {
|
||||
LOG(" - drafted token rejected - n_draft -= 1\n");
|
||||
n_draft = std::max(2, n_draft - 1);
|
||||
}
|
||||
}
|
||||
|
||||
drafted.clear();
|
||||
drafted.push_back(id);
|
||||
|
||||
@@ -234,7 +257,8 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// evaluate the drafted token on the draft model
|
||||
llama_eval(ctx_dft, &drafted.back(), 1, n_past_cur, params.n_threads);
|
||||
llama_kv_cache_seq_rm(ctx_dft, 0, n_past_cur, n_ctx);
|
||||
llama_decode(ctx_dft, llama_batch_get_one(&drafted.back(), 1, n_past_cur, 0), params.n_threads);
|
||||
++n_past_cur;
|
||||
|
||||
if (grammar_dft != NULL) {
|
||||
@@ -243,7 +267,8 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// evaluate the target model on the drafted tokens
|
||||
llama_eval(ctx_tgt, drafted.data(), drafted.size(), n_past_tgt, params.n_threads);
|
||||
llama_kv_cache_seq_rm(ctx_tgt, 0, n_past_tgt, n_ctx);
|
||||
llama_decode(ctx_tgt, llama_batch_get_one(drafted.data(), drafted.size(), n_past_tgt, 0), params.n_threads);
|
||||
++n_past_tgt;
|
||||
|
||||
// the first token is always proposed by the traget model before the speculation loop
|
||||
|
||||
@@ -679,15 +679,23 @@ struct ggml_tensor * llama_build_train_graphs(
|
||||
}
|
||||
};
|
||||
|
||||
// KQ_pos - contains the positions
|
||||
struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
|
||||
{
|
||||
int * data = (int *) KQ_pos->data;
|
||||
for (int i = 0; i < N; ++i) {
|
||||
data[i] = n_past + i;
|
||||
}
|
||||
}
|
||||
|
||||
// rope has so much parameters that we make a custom function for it
|
||||
auto rope = [ctx, n_rot, n_ctx, rope_freq_base, rope_freq_scale]
|
||||
auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale]
|
||||
(struct ggml_tensor * t) -> struct ggml_tensor * {
|
||||
// not capturing these, to silcence warnings
|
||||
const int n_past = 0;
|
||||
const int rope_mode = 0;
|
||||
|
||||
return ggml_rope_custom(ctx,
|
||||
t, n_past, n_rot, rope_mode, n_ctx,
|
||||
t, KQ_pos, n_rot, rope_mode, n_ctx,
|
||||
rope_freq_base, rope_freq_scale);
|
||||
};
|
||||
|
||||
@@ -787,6 +795,8 @@ struct ggml_tensor * llama_build_train_graphs(
|
||||
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, one));
|
||||
// input gradient
|
||||
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, one));
|
||||
// KQ_pos
|
||||
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, one));
|
||||
GGML_ASSERT(t36->grad->data == NULL && !ggml_is_view(t36->grad));
|
||||
ggml_allocr_alloc(alloc, t36->grad);
|
||||
// gradient tensors (will be set to zero by ggml_graph_reset)
|
||||
@@ -965,10 +975,10 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto
|
||||
|
||||
buf[size] = '\0';
|
||||
|
||||
int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), out.size(), false);
|
||||
int n_tokens = llama_tokenize(lctx, buf.data(), buf.size(), out.data(), out.size(), false);
|
||||
if (n_tokens < 0) {
|
||||
out.resize(-n_tokens);
|
||||
n_tokens = llama_tokenize(lctx, buf.data(), out.data(), out.size(), false);
|
||||
n_tokens = llama_tokenize(lctx, buf.data(), buf.size(), out.data(), out.size(), false);
|
||||
}
|
||||
GGML_ASSERT(n_tokens >= 0);
|
||||
out.resize(n_tokens);
|
||||
|
||||
28
flake.nix
28
flake.nix
@@ -34,7 +34,21 @@
|
||||
with pkgs; [ openblas ]
|
||||
);
|
||||
pkgs = import nixpkgs { inherit system; };
|
||||
nativeBuildInputs = with pkgs; [ cmake ninja pkgconfig ];
|
||||
nativeBuildInputs = with pkgs; [ cmake ninja pkg-config ];
|
||||
cudatoolkit_joined = with pkgs; symlinkJoin {
|
||||
# HACK(Green-Sky): nix currently has issues with cmake findcudatoolkit
|
||||
# see https://github.com/NixOS/nixpkgs/issues/224291
|
||||
# copied from jaxlib
|
||||
name = "${cudaPackages.cudatoolkit.name}-merged";
|
||||
paths = [
|
||||
cudaPackages.cudatoolkit.lib
|
||||
cudaPackages.cudatoolkit.out
|
||||
] ++ lib.optionals (lib.versionOlder cudaPackages.cudatoolkit.version "11") [
|
||||
# for some reason some of the required libs are in the targets/x86_64-linux
|
||||
# directory; not sure why but this works around it
|
||||
"${cudaPackages.cudatoolkit}/targets/${system}"
|
||||
];
|
||||
};
|
||||
llama-python =
|
||||
pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]);
|
||||
postPatch = ''
|
||||
@@ -45,12 +59,15 @@
|
||||
postInstall = ''
|
||||
mv $out/bin/main $out/bin/llama
|
||||
mv $out/bin/server $out/bin/llama-server
|
||||
mkdir -p $out/include
|
||||
cp ${src}/llama.h $out/include/
|
||||
'';
|
||||
cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" "-DLLAMA_MPI=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ];
|
||||
in
|
||||
{
|
||||
packages.default = pkgs.stdenv.mkDerivation {
|
||||
inherit name src meta postPatch nativeBuildInputs buildInputs postInstall;
|
||||
inherit name src meta postPatch nativeBuildInputs postInstall;
|
||||
buildInputs = osSpecific;
|
||||
cmakeFlags = cmakeFlags
|
||||
++ (if isAarch64 && isDarwin then [
|
||||
"-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
|
||||
@@ -67,6 +84,13 @@
|
||||
"-DLLAMA_CLBLAST=ON"
|
||||
];
|
||||
};
|
||||
packages.cuda = pkgs.stdenv.mkDerivation {
|
||||
inherit name src meta postPatch nativeBuildInputs postInstall;
|
||||
buildInputs = with pkgs; buildInputs ++ [ cudatoolkit_joined ];
|
||||
cmakeFlags = cmakeFlags ++ [
|
||||
"-DLLAMA_CUBLAS=ON"
|
||||
];
|
||||
};
|
||||
packages.rocm = pkgs.stdenv.mkDerivation {
|
||||
inherit name src meta postPatch nativeBuildInputs postInstall;
|
||||
buildInputs = with pkgs; buildInputs ++ [ hip hipblas rocblas ];
|
||||
|
||||
12
ggml-alloc.c
12
ggml-alloc.c
@@ -131,6 +131,10 @@ static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_ten
|
||||
return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
|
||||
}
|
||||
|
||||
static bool ggml_is_view(struct ggml_tensor * t) {
|
||||
return t->view_src != NULL;
|
||||
}
|
||||
|
||||
void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
||||
#ifdef GGML_ALLOCATOR_DEBUG
|
||||
GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
|
||||
@@ -338,8 +342,8 @@ static void free_vmem(void * base_addr, size_t size) {
|
||||
|
||||
// allocate uncommitted virtual memory to measure the size of the graph
|
||||
static void alloc_measure_vmem(void ** base_addr, size_t * size) {
|
||||
// 1TB for 64-bit, 1GB for 32-bit
|
||||
*size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<40;
|
||||
// 128GB for 64-bit, 1GB for 32-bit
|
||||
*size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<37;
|
||||
do {
|
||||
*base_addr = alloc_vmem(*size);
|
||||
if (*base_addr != NULL) {
|
||||
@@ -399,10 +403,6 @@ bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
|
||||
|
||||
//////////// compute graph allocator
|
||||
|
||||
static bool ggml_is_view(struct ggml_tensor * t) {
|
||||
return t->view_src != NULL;
|
||||
}
|
||||
|
||||
static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
|
||||
if (a->type != b->type) {
|
||||
return false;
|
||||
|
||||
879
ggml-cuda.cu
879
ggml-cuda.cu
File diff suppressed because it is too large
Load Diff
@@ -31,6 +31,7 @@ GGML_API void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tens
|
||||
|
||||
GGML_API void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset);
|
||||
GGML_API void ggml_cuda_copy_to_device(struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API void ggml_cuda_set_main_device(int main_device);
|
||||
GGML_API void ggml_cuda_set_mul_mat_q(bool mul_mat_q);
|
||||
|
||||
@@ -19,6 +19,8 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
@@ -33,6 +35,8 @@ struct ggml_cgraph;
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
|
||||
|
||||
struct ggml_metal_context;
|
||||
|
||||
// number of command buffers to use
|
||||
|
||||
294
ggml-metal.m
294
ggml-metal.m
@@ -11,11 +11,14 @@
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
|
||||
// TODO: temporary - reuse llama.cpp logging
|
||||
#ifdef GGML_METAL_NDEBUG
|
||||
#define metal_printf(...)
|
||||
#define GGML_METAL_LOG_INFO(...)
|
||||
#define GGML_METAL_LOG_WARN(...)
|
||||
#define GGML_METAL_LOG_ERROR(...)
|
||||
#else
|
||||
#define metal_printf(...) fprintf(stderr, __VA_ARGS__)
|
||||
#define GGML_METAL_LOG_INFO(...) ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
|
||||
#define GGML_METAL_LOG_WARN(...) ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
|
||||
#define GGML_METAL_LOG_ERROR(...) ggml_metal_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
|
||||
#endif
|
||||
|
||||
#define UNUSED(x) (void)(x)
|
||||
@@ -66,6 +69,7 @@ struct ggml_metal_context {
|
||||
GGML_METAL_DECL_KERNEL(soft_max_4);
|
||||
GGML_METAL_DECL_KERNEL(diag_mask_inf);
|
||||
GGML_METAL_DECL_KERNEL(diag_mask_inf_8);
|
||||
GGML_METAL_DECL_KERNEL(get_rows_f32);
|
||||
GGML_METAL_DECL_KERNEL(get_rows_f16);
|
||||
GGML_METAL_DECL_KERNEL(get_rows_q4_0);
|
||||
GGML_METAL_DECL_KERNEL(get_rows_q4_1);
|
||||
@@ -77,6 +81,7 @@ struct ggml_metal_context {
|
||||
GGML_METAL_DECL_KERNEL(get_rows_q6_K);
|
||||
GGML_METAL_DECL_KERNEL(rms_norm);
|
||||
GGML_METAL_DECL_KERNEL(norm);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_f32_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_l4);
|
||||
@@ -88,6 +93,7 @@ struct ggml_metal_context {
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_f32_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_f16_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_q4_1_f32);
|
||||
@@ -97,7 +103,8 @@ struct ggml_metal_context {
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_q4_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_q5_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_q6_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(rope);
|
||||
GGML_METAL_DECL_KERNEL(rope_f32);
|
||||
GGML_METAL_DECL_KERNEL(rope_f16);
|
||||
GGML_METAL_DECL_KERNEL(alibi_f32);
|
||||
GGML_METAL_DECL_KERNEL(cpy_f32_f16);
|
||||
GGML_METAL_DECL_KERNEL(cpy_f32_f32);
|
||||
@@ -117,8 +124,37 @@ static NSString * const msl_library_source = @"see metal.metal";
|
||||
@implementation GGMLMetalClass
|
||||
@end
|
||||
|
||||
ggml_log_callback ggml_metal_log_callback = NULL;
|
||||
void * ggml_metal_log_user_data = NULL;
|
||||
|
||||
void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {
|
||||
ggml_metal_log_callback = log_callback;
|
||||
ggml_metal_log_user_data = user_data;
|
||||
}
|
||||
|
||||
static void ggml_metal_log(enum ggml_log_level level, const char* format, ...){
|
||||
if (ggml_metal_log_callback != NULL) {
|
||||
va_list args;
|
||||
va_start(args, format);
|
||||
char buffer[128];
|
||||
int len = vsnprintf(buffer, 128, format, args);
|
||||
if (len < 128) {
|
||||
ggml_metal_log_callback(level, buffer, ggml_metal_log_user_data);
|
||||
} else {
|
||||
char* buffer2 = malloc(len+1);
|
||||
vsnprintf(buffer2, len+1, format, args);
|
||||
buffer2[len] = 0;
|
||||
ggml_metal_log_callback(level, buffer2, ggml_metal_log_user_data);
|
||||
free(buffer2);
|
||||
}
|
||||
va_end(args);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
metal_printf("%s: allocating\n", __func__);
|
||||
GGML_METAL_LOG_INFO("%s: allocating\n", __func__);
|
||||
|
||||
id <MTLDevice> device;
|
||||
NSString * s;
|
||||
@@ -128,14 +164,14 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
NSArray * devices = MTLCopyAllDevices();
|
||||
for (device in devices) {
|
||||
s = [device name];
|
||||
metal_printf("%s: found device: %s\n", __func__, [s UTF8String]);
|
||||
GGML_METAL_LOG_INFO("%s: found device: %s\n", __func__, [s UTF8String]);
|
||||
}
|
||||
#endif
|
||||
|
||||
// Pick and show default Metal device
|
||||
device = MTLCreateSystemDefaultDevice();
|
||||
s = [device name];
|
||||
metal_printf("%s: picking default device: %s\n", __func__, [s UTF8String]);
|
||||
GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [s UTF8String]);
|
||||
|
||||
// Configure context
|
||||
struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
|
||||
@@ -145,7 +181,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
ctx->n_buffers = 0;
|
||||
ctx->concur_list_len = 0;
|
||||
|
||||
ctx->d_queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);
|
||||
ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
|
||||
|
||||
#ifdef GGML_SWIFT
|
||||
// load the default.metallib file
|
||||
@@ -162,7 +198,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
|
||||
|
||||
if (error) {
|
||||
metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
@@ -175,12 +211,12 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
|
||||
//NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"];
|
||||
NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
|
||||
NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
|
||||
metal_printf("%s: loading '%s'\n", __func__, [path UTF8String]);
|
||||
NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
|
||||
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path UTF8String]);
|
||||
|
||||
NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error];
|
||||
if (error) {
|
||||
metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@@ -192,7 +228,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error];
|
||||
#endif
|
||||
if (error) {
|
||||
metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
@@ -204,11 +240,11 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
#define GGML_METAL_ADD_KERNEL(name) \
|
||||
ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
|
||||
ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
|
||||
metal_printf("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
|
||||
GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
|
||||
(int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \
|
||||
(int) ctx->pipeline_##name.threadExecutionWidth); \
|
||||
if (error) { \
|
||||
metal_printf("%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
|
||||
GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
|
||||
return NULL; \
|
||||
}
|
||||
|
||||
@@ -224,6 +260,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(soft_max_4);
|
||||
GGML_METAL_ADD_KERNEL(diag_mask_inf);
|
||||
GGML_METAL_ADD_KERNEL(diag_mask_inf_8);
|
||||
GGML_METAL_ADD_KERNEL(get_rows_f32);
|
||||
GGML_METAL_ADD_KERNEL(get_rows_f16);
|
||||
GGML_METAL_ADD_KERNEL(get_rows_q4_0);
|
||||
GGML_METAL_ADD_KERNEL(get_rows_q4_1);
|
||||
@@ -235,6 +272,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(get_rows_q6_K);
|
||||
GGML_METAL_ADD_KERNEL(rms_norm);
|
||||
GGML_METAL_ADD_KERNEL(norm);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_f32_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_l4);
|
||||
@@ -246,6 +284,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_f32_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32);
|
||||
@@ -255,7 +294,8 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(rope);
|
||||
GGML_METAL_ADD_KERNEL(rope_f32);
|
||||
GGML_METAL_ADD_KERNEL(rope_f16);
|
||||
GGML_METAL_ADD_KERNEL(alibi_f32);
|
||||
GGML_METAL_ADD_KERNEL(cpy_f32_f16);
|
||||
GGML_METAL_ADD_KERNEL(cpy_f32_f32);
|
||||
@@ -264,13 +304,13 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
#undef GGML_METAL_ADD_KERNEL
|
||||
}
|
||||
|
||||
metal_printf("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
|
||||
GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
|
||||
#if TARGET_OS_OSX
|
||||
metal_printf("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
||||
GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
||||
if (ctx->device.maxTransferRate != 0) {
|
||||
metal_printf("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
|
||||
GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
|
||||
} else {
|
||||
metal_printf("%s: maxTransferRate = built-in GPU\n", __func__);
|
||||
GGML_METAL_LOG_INFO("%s: maxTransferRate = built-in GPU\n", __func__);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -278,7 +318,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
}
|
||||
|
||||
void ggml_metal_free(struct ggml_metal_context * ctx) {
|
||||
metal_printf("%s: deallocating\n", __func__);
|
||||
GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);
|
||||
#define GGML_METAL_DEL_KERNEL(name) \
|
||||
[ctx->function_##name release]; \
|
||||
[ctx->pipeline_##name release];
|
||||
@@ -293,7 +333,9 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
||||
GGML_METAL_DEL_KERNEL(gelu);
|
||||
GGML_METAL_DEL_KERNEL(soft_max);
|
||||
GGML_METAL_DEL_KERNEL(soft_max_4);
|
||||
GGML_METAL_DEL_KERNEL(diag_mask_inf);
|
||||
GGML_METAL_DEL_KERNEL(diag_mask_inf_8);
|
||||
GGML_METAL_DEL_KERNEL(get_rows_f32);
|
||||
GGML_METAL_DEL_KERNEL(get_rows_f16);
|
||||
GGML_METAL_DEL_KERNEL(get_rows_q4_0);
|
||||
GGML_METAL_DEL_KERNEL(get_rows_q4_1);
|
||||
@@ -305,6 +347,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
||||
GGML_METAL_DEL_KERNEL(get_rows_q6_K);
|
||||
GGML_METAL_DEL_KERNEL(rms_norm);
|
||||
GGML_METAL_DEL_KERNEL(norm);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_f32_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_l4);
|
||||
@@ -316,6 +359,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q4_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q5_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q6_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_f32_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32);
|
||||
@@ -325,7 +369,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(rope);
|
||||
GGML_METAL_DEL_KERNEL(rope_f32);
|
||||
GGML_METAL_DEL_KERNEL(rope_f16);
|
||||
GGML_METAL_DEL_KERNEL(alibi_f32);
|
||||
GGML_METAL_DEL_KERNEL(cpy_f32_f16);
|
||||
GGML_METAL_DEL_KERNEL(cpy_f32_f32);
|
||||
@@ -350,7 +395,7 @@ void * ggml_metal_host_malloc(size_t n) {
|
||||
void * data = NULL;
|
||||
const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
|
||||
if (result != 0) {
|
||||
metal_printf("%s: error: posix_memalign failed\n", __func__);
|
||||
GGML_METAL_LOG_ERROR("%s: error: posix_memalign failed\n", __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@@ -378,7 +423,7 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) {
|
||||
// Metal buffer based on the host memory pointer
|
||||
//
|
||||
static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) {
|
||||
//metal_printf("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
|
||||
//GGML_METAL_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
|
||||
|
||||
const int64_t tsize = ggml_nbytes(t);
|
||||
|
||||
@@ -386,16 +431,17 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru
|
||||
for (int i = 0; i < ctx->n_buffers; ++i) {
|
||||
const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;
|
||||
|
||||
//metal_printf("ioffs = %10ld, tsize = %10ld, sum = %10ld, ctx->buffers[%d].size = %10ld, name = %s\n", ioffs, tsize, ioffs + tsize, i, ctx->buffers[i].size, ctx->buffers[i].name);
|
||||
if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
|
||||
*offs = (size_t) ioffs;
|
||||
|
||||
//metal_printf("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
|
||||
//GGML_METAL_LOG_INFO("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
|
||||
|
||||
return ctx->buffers[i].metal;
|
||||
}
|
||||
}
|
||||
|
||||
metal_printf("%s: error: buffer is nil\n", __func__);
|
||||
GGML_METAL_LOG_ERROR("%s: error: buffer is nil\n", __func__);
|
||||
|
||||
return nil;
|
||||
}
|
||||
@@ -407,7 +453,7 @@ bool ggml_metal_add_buffer(
|
||||
size_t size,
|
||||
size_t max_size) {
|
||||
if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) {
|
||||
metal_printf("%s: too many buffers\n", __func__);
|
||||
GGML_METAL_LOG_ERROR("%s: error: too many buffers\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -417,7 +463,7 @@ bool ggml_metal_add_buffer(
|
||||
const int64_t ioffs = (int64_t) data - (int64_t) ctx->buffers[i].data;
|
||||
|
||||
if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) {
|
||||
metal_printf("%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name);
|
||||
GGML_METAL_LOG_ERROR("%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@@ -438,11 +484,11 @@ bool ggml_metal_add_buffer(
|
||||
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
|
||||
|
||||
if (ctx->buffers[ctx->n_buffers].metal == nil) {
|
||||
metal_printf("%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
|
||||
GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
|
||||
return false;
|
||||
}
|
||||
|
||||
metal_printf("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
|
||||
GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
|
||||
|
||||
++ctx->n_buffers;
|
||||
} else {
|
||||
@@ -462,13 +508,13 @@ bool ggml_metal_add_buffer(
|
||||
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
|
||||
|
||||
if (ctx->buffers[ctx->n_buffers].metal == nil) {
|
||||
metal_printf("%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
|
||||
GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
|
||||
return false;
|
||||
}
|
||||
|
||||
metal_printf("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
|
||||
GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
|
||||
if (i + size_step < size) {
|
||||
metal_printf("\n");
|
||||
GGML_METAL_LOG_INFO("\n");
|
||||
}
|
||||
|
||||
++ctx->n_buffers;
|
||||
@@ -476,17 +522,17 @@ bool ggml_metal_add_buffer(
|
||||
}
|
||||
|
||||
#if TARGET_OS_OSX
|
||||
metal_printf(", (%8.2f / %8.2f)",
|
||||
GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
|
||||
ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
|
||||
ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
||||
|
||||
if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
|
||||
metal_printf(", warning: current allocated size is greater than the recommended max working set size\n");
|
||||
GGML_METAL_LOG_WARN(", warning: current allocated size is greater than the recommended max working set size\n", __func__);
|
||||
} else {
|
||||
metal_printf("\n");
|
||||
GGML_METAL_LOG_INFO("\n");
|
||||
}
|
||||
#else
|
||||
metal_printf(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0);
|
||||
GGML_METAL_LOG_INFO(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0);
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -599,7 +645,7 @@ void ggml_metal_graph_find_concurrency(
|
||||
}
|
||||
|
||||
if (ctx->concur_list_len > GGML_MAX_CONCUR) {
|
||||
metal_printf("%s: too many elements for metal ctx->concur_list!\n", __func__);
|
||||
GGML_METAL_LOG_WARN("%s: too many elements for metal ctx->concur_list!\n", __func__);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -653,7 +699,7 @@ void ggml_metal_graph_compute(
|
||||
continue;
|
||||
}
|
||||
|
||||
//metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
|
||||
//GGML_METAL_LOG_INFO("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
|
||||
|
||||
struct ggml_tensor * src0 = gf->nodes[i]->src[0];
|
||||
struct ggml_tensor * src1 = gf->nodes[i]->src[1];
|
||||
@@ -697,17 +743,17 @@ void ggml_metal_graph_compute(
|
||||
id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil;
|
||||
id<MTLBuffer> id_dst = dst ? ggml_metal_get_buffer(ctx, dst, &offs_dst) : nil;
|
||||
|
||||
//metal_printf("%s: op - %s\n", __func__, ggml_op_name(dst->op));
|
||||
//GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op));
|
||||
//if (src0) {
|
||||
// metal_printf("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02,
|
||||
// GGML_METAL_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02,
|
||||
// ggml_is_contiguous(src0), src0->name);
|
||||
//}
|
||||
//if (src1) {
|
||||
// metal_printf("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12,
|
||||
// GGML_METAL_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12,
|
||||
// ggml_is_contiguous(src1), src1->name);
|
||||
//}
|
||||
//if (dst) {
|
||||
// metal_printf("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2,
|
||||
// GGML_METAL_LOG_INFO("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2,
|
||||
// dst->name);
|
||||
//}
|
||||
|
||||
@@ -723,29 +769,66 @@ void ggml_metal_graph_compute(
|
||||
case GGML_OP_ADD:
|
||||
{
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
GGML_ASSERT(ggml_is_contiguous(src1));
|
||||
|
||||
// utilize float4
|
||||
GGML_ASSERT(ne00 % 4 == 0);
|
||||
const int64_t nb = ne00/4;
|
||||
bool bcast_row = false;
|
||||
|
||||
if (ggml_nelements(src1) == ne10) {
|
||||
int64_t nb = ne00;
|
||||
|
||||
if (ggml_nelements(src1) == ne10 && ne00 % 4 == 0) {
|
||||
// src1 is a row
|
||||
GGML_ASSERT(ne11 == 1);
|
||||
|
||||
nb = ne00 / 4;
|
||||
[encoder setComputePipelineState:ctx->pipeline_add_row];
|
||||
|
||||
bcast_row = true;
|
||||
} else {
|
||||
[encoder setComputePipelineState:ctx->pipeline_add];
|
||||
}
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
|
||||
[encoder setBytes:&nb length:sizeof(nb) atIndex:3];
|
||||
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
|
||||
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
|
||||
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
|
||||
[encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6];
|
||||
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
|
||||
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8];
|
||||
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9];
|
||||
[encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10];
|
||||
[encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11];
|
||||
[encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12];
|
||||
[encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13];
|
||||
[encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14];
|
||||
[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15];
|
||||
[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16];
|
||||
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17];
|
||||
[encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18];
|
||||
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:19];
|
||||
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:20];
|
||||
[encoder setBytes:&ne2 length:sizeof(ne2) atIndex:21];
|
||||
[encoder setBytes:&ne3 length:sizeof(ne3) atIndex:22];
|
||||
[encoder setBytes:&nb0 length:sizeof(nb0) atIndex:23];
|
||||
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:24];
|
||||
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:25];
|
||||
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:26];
|
||||
[encoder setBytes:&nb length:sizeof(nb) atIndex:27];
|
||||
|
||||
const int64_t n = ggml_nelements(dst)/4;
|
||||
if (bcast_row) {
|
||||
const int64_t n = ggml_nelements(dst)/4;
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} else {
|
||||
const int nth = MIN(1024, ne0);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_MUL:
|
||||
{
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
GGML_ASSERT(ggml_is_contiguous(src1));
|
||||
|
||||
// utilize float4
|
||||
GGML_ASSERT(ne00 % 4 == 0);
|
||||
@@ -753,6 +836,7 @@ void ggml_metal_graph_compute(
|
||||
|
||||
if (ggml_nelements(src1) == ne10) {
|
||||
// src1 is a row
|
||||
GGML_ASSERT(ne11 == 1);
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_row];
|
||||
} else {
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul];
|
||||
@@ -768,6 +852,8 @@ void ggml_metal_graph_compute(
|
||||
} break;
|
||||
case GGML_OP_SCALE:
|
||||
{
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
const float scale = *(const float *) src1->data;
|
||||
|
||||
[encoder setComputePipelineState:ctx->pipeline_scale];
|
||||
@@ -813,13 +899,13 @@ void ggml_metal_graph_compute(
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
metal_printf("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
|
||||
GGML_METAL_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_SOFT_MAX:
|
||||
{
|
||||
const int nth = 32;
|
||||
const int nth = MIN(32, ne00);
|
||||
|
||||
if (ne00%4 == 0) {
|
||||
[encoder setComputePipelineState:ctx->pipeline_soft_max_4];
|
||||
@@ -867,13 +953,14 @@ void ggml_metal_graph_compute(
|
||||
|
||||
// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
|
||||
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
|
||||
if (ggml_is_contiguous(src0) &&
|
||||
ggml_is_contiguous(src1) &&
|
||||
if (!ggml_is_transposed(src0) &&
|
||||
!ggml_is_transposed(src1) &&
|
||||
src1t == GGML_TYPE_F32 &&
|
||||
[ctx->device supportsFamily:MTLGPUFamilyApple7] &&
|
||||
ne00%32 == 0 &&
|
||||
ne11 > 1) {
|
||||
ne11 > 2) {
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f32_f32]; break;
|
||||
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32]; break;
|
||||
case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_0_f32]; break;
|
||||
case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_1_f32]; break;
|
||||
@@ -893,9 +980,12 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:5];
|
||||
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:6];
|
||||
[encoder setBytes:&ne12 length:sizeof(ne12) atIndex:7];
|
||||
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:8];
|
||||
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:9];
|
||||
[encoder setBytes:&gqa length:sizeof(gqa) atIndex:10];
|
||||
[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:8];
|
||||
[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:9];
|
||||
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:10];
|
||||
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:11];
|
||||
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:12];
|
||||
[encoder setBytes:&gqa length:sizeof(gqa) atIndex:13];
|
||||
[encoder setThreadgroupMemoryLength:8192 atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake( (ne11+31)/32, (ne01+63) / 64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
|
||||
} else {
|
||||
@@ -905,6 +995,11 @@ void ggml_metal_graph_compute(
|
||||
|
||||
// use custom matrix x vector kernel
|
||||
switch (src0t) {
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f32_f32];
|
||||
nrows = 4;
|
||||
} break;
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
nth0 = 32;
|
||||
@@ -993,7 +1088,7 @@ void ggml_metal_graph_compute(
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
metal_printf("Asserting on type %d\n",(int)src0t);
|
||||
GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t);
|
||||
GGML_ASSERT(false && "not implemented");
|
||||
}
|
||||
};
|
||||
@@ -1045,6 +1140,7 @@ void ggml_metal_graph_compute(
|
||||
case GGML_OP_GET_ROWS:
|
||||
{
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_get_rows_f32]; break;
|
||||
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break;
|
||||
case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
|
||||
case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break;
|
||||
@@ -1060,9 +1156,9 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
|
||||
[encoder setBytes:&(src0->ne[0]) length:sizeof( int64_t) atIndex:3];
|
||||
[encoder setBytes:&(src0->nb[1]) length:sizeof(uint64_t) atIndex:4];
|
||||
[encoder setBytes:&(dst->nb[1]) length:sizeof(uint64_t) atIndex:5];
|
||||
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3];
|
||||
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:4];
|
||||
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:5];
|
||||
|
||||
const int64_t n = ggml_nelements(src1);
|
||||
|
||||
@@ -1073,7 +1169,7 @@ void ggml_metal_graph_compute(
|
||||
float eps;
|
||||
memcpy(&eps, dst->op_params, sizeof(float));
|
||||
|
||||
const int nth = 512;
|
||||
const int nth = MIN(512, ne00);
|
||||
|
||||
[encoder setComputePipelineState:ctx->pipeline_rms_norm];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
@@ -1092,7 +1188,7 @@ void ggml_metal_graph_compute(
|
||||
float eps;
|
||||
memcpy(&eps, dst->op_params, sizeof(float));
|
||||
|
||||
const int nth = 256;
|
||||
const int nth = MIN(256, ne00);
|
||||
|
||||
[encoder setComputePipelineState:ctx->pipeline_norm];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
@@ -1110,6 +1206,8 @@ void ggml_metal_graph_compute(
|
||||
{
|
||||
GGML_ASSERT((src0t == GGML_TYPE_F32));
|
||||
|
||||
const int nth = MIN(1024, ne00);
|
||||
|
||||
const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past);
|
||||
const int n_head = ((int32_t *) dst->op_params)[1];
|
||||
float max_bias;
|
||||
@@ -1143,12 +1241,14 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
|
||||
[encoder setBytes:&m0 length:sizeof( float) atIndex:18];
|
||||
|
||||
const int nth = 32;
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||
} break;
|
||||
case GGML_OP_ROPE:
|
||||
{
|
||||
GGML_ASSERT(ne10 == ne02);
|
||||
|
||||
const int nth = MIN(1024, ne00);
|
||||
|
||||
const int n_past = ((int32_t *) dst->op_params)[0];
|
||||
const int n_dims = ((int32_t *) dst->op_params)[1];
|
||||
const int mode = ((int32_t *) dst->op_params)[2];
|
||||
@@ -1158,38 +1258,44 @@ void ggml_metal_graph_compute(
|
||||
memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
|
||||
memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
|
||||
|
||||
[encoder setComputePipelineState:ctx->pipeline_rope];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
|
||||
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
|
||||
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
|
||||
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
|
||||
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
|
||||
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
|
||||
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
|
||||
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
|
||||
[encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10];
|
||||
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11];
|
||||
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12];
|
||||
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13];
|
||||
[encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14];
|
||||
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15];
|
||||
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
|
||||
[encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
|
||||
[encoder setBytes:&n_past length:sizeof( int) atIndex:18];
|
||||
[encoder setBytes:&n_dims length:sizeof( int) atIndex:19];
|
||||
[encoder setBytes:&mode length:sizeof( int) atIndex:20];
|
||||
[encoder setBytes:&freq_base length:sizeof(float) atIndex:21];
|
||||
[encoder setBytes:&freq_scale length:sizeof(float) atIndex:22];
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_rope_f32]; break;
|
||||
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_rope_f16]; break;
|
||||
default: GGML_ASSERT(false);
|
||||
};
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
|
||||
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3];
|
||||
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:4];
|
||||
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:5];
|
||||
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:6];
|
||||
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:7];
|
||||
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:8];
|
||||
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:9];
|
||||
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:10];
|
||||
[encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:11];
|
||||
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:12];
|
||||
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:13];
|
||||
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:14];
|
||||
[encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:15];
|
||||
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:16];
|
||||
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:17];
|
||||
[encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:18];
|
||||
[encoder setBytes:&n_past length:sizeof( int) atIndex:19];
|
||||
[encoder setBytes:&n_dims length:sizeof( int) atIndex:20];
|
||||
[encoder setBytes:&mode length:sizeof( int) atIndex:21];
|
||||
[encoder setBytes:&freq_base length:sizeof(float) atIndex:22];
|
||||
[encoder setBytes:&freq_scale length:sizeof(float) atIndex:23];
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||
} break;
|
||||
case GGML_OP_DUP:
|
||||
case GGML_OP_CPY:
|
||||
case GGML_OP_CONT:
|
||||
{
|
||||
const int nth = 32;
|
||||
const int nth = MIN(1024, ne00);
|
||||
|
||||
switch (src0t) {
|
||||
case GGML_TYPE_F32:
|
||||
@@ -1234,7 +1340,7 @@ void ggml_metal_graph_compute(
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
metal_printf("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
|
||||
GGML_METAL_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
}
|
||||
@@ -1259,7 +1365,7 @@ void ggml_metal_graph_compute(
|
||||
|
||||
MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status];
|
||||
if (status != MTLCommandBufferStatusCompleted) {
|
||||
metal_printf("%s: command buffer %d failed with status %lu\n", __func__, i, status);
|
||||
GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
}
|
||||
|
||||
342
ggml-metal.metal
342
ggml-metal.metal
@@ -24,12 +24,59 @@ typedef struct {
|
||||
int8_t qs[QK8_0]; // quants
|
||||
} block_q8_0;
|
||||
|
||||
// general-purpose kernel for addition of two tensors
|
||||
// pros: works for non-contiguous tensors, supports broadcast across dims 1, 2 and 3
|
||||
// cons: not very efficient
|
||||
kernel void kernel_add(
|
||||
device const float4 * src0,
|
||||
device const float4 * src1,
|
||||
device float4 * dst,
|
||||
uint tpig[[thread_position_in_grid]]) {
|
||||
dst[tpig] = src0[tpig] + src1[tpig];
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device char * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant int64_t & ne03,
|
||||
constant int64_t & nb00,
|
||||
constant int64_t & nb01,
|
||||
constant int64_t & nb02,
|
||||
constant int64_t & nb03,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant int64_t & ne13,
|
||||
constant int64_t & nb10,
|
||||
constant int64_t & nb11,
|
||||
constant int64_t & nb12,
|
||||
constant int64_t & nb13,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant int64_t & ne2,
|
||||
constant int64_t & ne3,
|
||||
constant int64_t & nb0,
|
||||
constant int64_t & nb1,
|
||||
constant int64_t & nb2,
|
||||
constant int64_t & nb3,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
const int64_t i03 = tgpig.z;
|
||||
const int64_t i02 = tgpig.y;
|
||||
const int64_t i01 = tgpig.x;
|
||||
|
||||
const int64_t i13 = i03 % ne13;
|
||||
const int64_t i12 = i02 % ne12;
|
||||
const int64_t i11 = i01 % ne11;
|
||||
|
||||
device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01 + tpitg.x*nb00;
|
||||
device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11 + tpitg.x*nb10;
|
||||
device char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1 + tpitg.x*nb0;
|
||||
|
||||
for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
|
||||
((device float *)dst_ptr)[0] = ((device float *)src0_ptr)[0] + ((device float *)src1_ptr)[0];
|
||||
|
||||
src0_ptr += ntg.x*nb00;
|
||||
src1_ptr += ntg.x*nb10;
|
||||
dst_ptr += ntg.x*nb0;
|
||||
}
|
||||
}
|
||||
|
||||
// assumption: src1 is a row
|
||||
@@ -38,7 +85,7 @@ kernel void kernel_add_row(
|
||||
device const float4 * src0,
|
||||
device const float4 * src1,
|
||||
device float4 * dst,
|
||||
constant int64_t & nb,
|
||||
constant int64_t & nb [[buffer(27)]],
|
||||
uint tpig[[thread_position_in_grid]]) {
|
||||
dst[tpig] = src0[tpig] + src1[tpig % nb];
|
||||
}
|
||||
@@ -118,7 +165,7 @@ kernel void kernel_soft_max(
|
||||
device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
|
||||
|
||||
// parallel max
|
||||
float lmax = psrc0[tpitg[0]];
|
||||
float lmax = tpitg[0] < ne00 ? psrc0[tpitg[0]] : -INFINITY;
|
||||
for (int i00 = tpitg[0] + ntg[0]; i00 < ne00; i00 += ntg[0]) {
|
||||
lmax = MAX(lmax, psrc0[i00]);
|
||||
}
|
||||
@@ -158,7 +205,7 @@ kernel void kernel_soft_max_4(
|
||||
device float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
|
||||
|
||||
// parallel max
|
||||
float4 lmax4 = psrc4[tpitg[0]];
|
||||
float4 lmax4 = tpitg[0] < ne00/4 ? psrc4[tpitg[0]] : -INFINITY;
|
||||
for (int i00 = tpitg[0] + ntg[0]; i00 < ne00/4; i00 += ntg[0]) {
|
||||
lmax4 = fmax(lmax4, psrc4[i00]);
|
||||
}
|
||||
@@ -523,6 +570,79 @@ kernel void kernel_mul_mat_q8_0_f32(
|
||||
}
|
||||
}
|
||||
|
||||
#define N_F32_F32 4
|
||||
|
||||
kernel void kernel_mul_mat_f32_f32(
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]]) {
|
||||
|
||||
const int64_t r0 = tgpig.x;
|
||||
const int64_t rb = tgpig.y*N_F32_F32;
|
||||
const int64_t im = tgpig.z;
|
||||
|
||||
device const float * x = (device const float *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
|
||||
|
||||
if (ne00 < 128) {
|
||||
for (int row = 0; row < N_F32_F32; ++row) {
|
||||
int r1 = rb + row;
|
||||
if (r1 >= ne11) {
|
||||
break;
|
||||
}
|
||||
|
||||
device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = tiisg; i < ne00; i += 32) {
|
||||
sumf += (float) x[i] * (float) y[i];
|
||||
}
|
||||
|
||||
float all_sum = simd_sum(sumf);
|
||||
if (tiisg == 0) {
|
||||
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
device const float4 * x4 = (device const float4 *)x;
|
||||
for (int row = 0; row < N_F32_F32; ++row) {
|
||||
int r1 = rb + row;
|
||||
if (r1 >= ne11) {
|
||||
break;
|
||||
}
|
||||
|
||||
device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
|
||||
device const float4 * y4 = (device const float4 *) y;
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = tiisg; i < ne00/4; i += 32) {
|
||||
for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k];
|
||||
}
|
||||
|
||||
float all_sum = simd_sum(sumf);
|
||||
if (tiisg == 0) {
|
||||
for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
|
||||
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_mul_mat_f16_f32_1row(
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
@@ -733,30 +853,61 @@ kernel void kernel_alibi_f32(
|
||||
}
|
||||
}
|
||||
|
||||
typedef void (rope_t)(
|
||||
device const void * src0,
|
||||
device const int32_t * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant int64_t & ne03,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant uint64_t & nb03,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant int64_t & ne2,
|
||||
constant int64_t & ne3,
|
||||
constant uint64_t & nb0,
|
||||
constant uint64_t & nb1,
|
||||
constant uint64_t & nb2,
|
||||
constant uint64_t & nb3,
|
||||
constant int & n_past,
|
||||
constant int & n_dims,
|
||||
constant int & mode,
|
||||
constant float & freq_base,
|
||||
constant float & freq_scale,
|
||||
uint tiitg[[thread_index_in_threadgroup]],
|
||||
uint3 tptg[[threads_per_threadgroup]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]]);
|
||||
|
||||
template<typename T>
|
||||
kernel void kernel_rope(
|
||||
device const void * src0,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant int64_t & ne03,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant uint64_t & nb03,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant int64_t & ne2,
|
||||
constant int64_t & ne3,
|
||||
constant uint64_t & nb0,
|
||||
constant uint64_t & nb1,
|
||||
constant uint64_t & nb2,
|
||||
constant uint64_t & nb3,
|
||||
constant int & n_past,
|
||||
constant int & n_dims,
|
||||
constant int & mode,
|
||||
constant float & freq_base,
|
||||
constant float & freq_scale,
|
||||
device const void * src0,
|
||||
device const int32_t * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant int64_t & ne03,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant uint64_t & nb03,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant int64_t & ne2,
|
||||
constant int64_t & ne3,
|
||||
constant uint64_t & nb0,
|
||||
constant uint64_t & nb1,
|
||||
constant uint64_t & nb2,
|
||||
constant uint64_t & nb3,
|
||||
constant int & n_past,
|
||||
constant int & n_dims,
|
||||
constant int & mode,
|
||||
constant float & freq_base,
|
||||
constant float & freq_scale,
|
||||
uint tiitg[[thread_index_in_threadgroup]],
|
||||
uint3 tptg[[threads_per_threadgroup]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]]) {
|
||||
@@ -766,7 +917,9 @@ kernel void kernel_rope(
|
||||
|
||||
const bool is_neox = mode & 2;
|
||||
|
||||
const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);
|
||||
device const int32_t * pos = src1;
|
||||
|
||||
const int64_t p = pos[i2];
|
||||
|
||||
const float theta_0 = freq_scale * (float)p;
|
||||
const float inv_ndims = -1.f/n_dims;
|
||||
@@ -778,11 +931,11 @@ kernel void kernel_rope(
|
||||
const float cos_theta = cos(theta);
|
||||
const float sin_theta = sin(theta);
|
||||
|
||||
device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
||||
device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
||||
device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
|
||||
const float x0 = src[0];
|
||||
const float x1 = src[1];
|
||||
const T x0 = src[0];
|
||||
const T x1 = src[1];
|
||||
|
||||
dst_data[0] = x0*cos_theta - x1*sin_theta;
|
||||
dst_data[1] = x0*sin_theta + x1*cos_theta;
|
||||
@@ -797,8 +950,8 @@ kernel void kernel_rope(
|
||||
|
||||
const int64_t i0 = ib*n_dims + ic/2;
|
||||
|
||||
device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
||||
device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
||||
device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
|
||||
const float x0 = src[0];
|
||||
const float x1 = src[n_dims/2];
|
||||
@@ -810,6 +963,9 @@ kernel void kernel_rope(
|
||||
}
|
||||
}
|
||||
|
||||
template [[host_name("kernel_rope_f32")]] kernel rope_t kernel_rope<float>;
|
||||
template [[host_name("kernel_rope_f16")]] kernel rope_t kernel_rope<half>;
|
||||
|
||||
kernel void kernel_cpy_f16_f16(
|
||||
device const half * src0,
|
||||
device half * dst,
|
||||
@@ -1200,8 +1356,8 @@ kernel void kernel_mul_mat_q3_K_f32(
|
||||
|
||||
float yl[32];
|
||||
|
||||
const uint16_t kmask1 = 0x3030;
|
||||
const uint16_t kmask2 = 0x0f0f;
|
||||
//const uint16_t kmask1 = 0x3030;
|
||||
//const uint16_t kmask2 = 0x0f0f;
|
||||
|
||||
const int tid = tiisg/4;
|
||||
const int ix = tiisg%4;
|
||||
@@ -1321,7 +1477,6 @@ kernel void kernel_mul_mat_q3_K_f32(
|
||||
dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = sumf1[row];
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
#else
|
||||
kernel void kernel_mul_mat_q3_K_f32(
|
||||
@@ -1400,13 +1555,13 @@ kernel void kernel_mul_mat_q4_K_f32(
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01[[buffer(4)]],
|
||||
constant int64_t & ne02[[buffer(5)]],
|
||||
constant int64_t & ne10[[buffer(9)]],
|
||||
constant int64_t & ne12[[buffer(11)]],
|
||||
constant int64_t & ne0[[buffer(15)]],
|
||||
constant int64_t & ne1[[buffer(16)]],
|
||||
constant uint & gqa[[buffer(17)]],
|
||||
constant int64_t & ne01 [[buffer(4)]],
|
||||
constant int64_t & ne02 [[buffer(5)]],
|
||||
constant int64_t & ne10 [[buffer(9)]],
|
||||
constant int64_t & ne12 [[buffer(11)]],
|
||||
constant int64_t & ne0 [[buffer(15)]],
|
||||
constant int64_t & ne1 [[buffer(16)]],
|
||||
constant uint & gqa [[buffer(17)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
@@ -1865,6 +2020,15 @@ kernel void kernel_mul_mat_q6_K_f32(
|
||||
|
||||
//============================= templates and their specializations =============================
|
||||
|
||||
// NOTE: this is not dequantizing - we are simply fitting the template
|
||||
template <typename type4x4>
|
||||
void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) {
|
||||
float4x4 temp = *(((device float4x4 *)src));
|
||||
for (int i = 0; i < 16; i++){
|
||||
reg[i/4][i%4] = temp[i/4][i%4];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) {
|
||||
half4x4 temp = *(((device half4x4 *)src));
|
||||
@@ -1875,7 +2039,6 @@ void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg)
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) {
|
||||
|
||||
device const uint16_t * qs = ((device const uint16_t *)xb + 1);
|
||||
const float d1 = il ? (xb->d / 16.h) : xb->d;
|
||||
const float d2 = d1 / 256.f;
|
||||
@@ -1887,12 +2050,10 @@ void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg
|
||||
reg[i/2][2*(i%2)+0] = d1 * (qs[i] & mask0) + md;
|
||||
reg[i/2][2*(i%2)+1] = d2 * (qs[i] & mask1) + md;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg) {
|
||||
|
||||
device const uint16_t * qs = ((device const uint16_t *)xb + 2);
|
||||
const float d1 = il ? (xb->d / 16.h) : xb->d;
|
||||
const float d2 = d1 / 256.f;
|
||||
@@ -1964,7 +2125,6 @@ void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
reg[i/4][i%4] = dl * (q[i] & mask) - (h[i] & m ? 0 : ml);
|
||||
}
|
||||
|
||||
#else
|
||||
float kcoef = il&1 ? 1.f/16.f : 1.f;
|
||||
uint16_t kmask = il&1 ? 0xF0 : 0x0F;
|
||||
@@ -2008,7 +2168,6 @@ void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
reg[i/4][i%4] = dl * (q[i] & mask) - ml;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
@@ -2110,22 +2269,25 @@ kernel void kernel_get_rows(
|
||||
// each block_q contains 16*nl weights
|
||||
template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread half4x4 &)>
|
||||
kernel void kernel_mul_mm(device const uchar * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne02,
|
||||
constant int64_t & nb01,
|
||||
constant int64_t & nb02,
|
||||
constant int64_t & ne12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & gqa,
|
||||
threadgroup uchar * shared_memory [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiitg[[thread_index_in_threadgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
device const uchar * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne02,
|
||||
constant int64_t & nb01,
|
||||
constant int64_t & nb02,
|
||||
constant int64_t & ne12,
|
||||
constant int64_t & nb10,
|
||||
constant int64_t & nb11,
|
||||
constant int64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & gqa,
|
||||
threadgroup uchar * shared_memory [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiitg[[thread_index_in_threadgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
|
||||
threadgroup half * sa = ((threadgroup half *)shared_memory);
|
||||
threadgroup half * sa = (threadgroup half *)(shared_memory);
|
||||
threadgroup float * sb = (threadgroup float *)(shared_memory + 4096);
|
||||
|
||||
const uint r0 = tgpig.y;
|
||||
@@ -2138,7 +2300,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
|
||||
short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
|
||||
short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;
|
||||
|
||||
simdgroup_half8x8 ma[4];
|
||||
simdgroup_half8x8 ma[4];
|
||||
simdgroup_float8x8 mb[2];
|
||||
simdgroup_float8x8 c_res[8];
|
||||
for (int i = 0; i < 8; i++){
|
||||
@@ -2146,10 +2308,15 @@ kernel void kernel_mul_mm(device const uchar * src0,
|
||||
}
|
||||
|
||||
short il = (tiitg % THREAD_PER_ROW);
|
||||
uint offset0 = im/gqa*nb02; ushort offset1 = il/nl;
|
||||
device const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01 + offset0) + offset1;
|
||||
device const float * y = src1 + (r1 * BLOCK_SIZE_N + thread_col) * ne00 \
|
||||
+ BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL) + im * ne00 * ne1;
|
||||
|
||||
uint offset0 = im/gqa*nb02;
|
||||
ushort offset1 = il/nl;
|
||||
|
||||
device const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01 + offset0) + offset1;
|
||||
device const float * y = (device const float *)(src1
|
||||
+ nb12 * im
|
||||
+ nb11 * (r1 * BLOCK_SIZE_N + thread_col)
|
||||
+ nb10 * (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL)));
|
||||
|
||||
for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) {
|
||||
//load data and store to threadgroup memory
|
||||
@@ -2229,6 +2396,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
|
||||
typedef void (get_rows_t)(device const void *, device const int *, device float *, constant int64_t &, \
|
||||
constant uint64_t &, constant uint64_t &, uint, uint, uint);
|
||||
|
||||
template [[host_name("kernel_get_rows_f32")]] kernel get_rows_t kernel_get_rows<float4x4, 1, dequantize_f32>;
|
||||
template [[host_name("kernel_get_rows_f16")]] kernel get_rows_t kernel_get_rows<half4x4, 1, dequantize_f16>;
|
||||
template [[host_name("kernel_get_rows_q4_0")]] kernel get_rows_t kernel_get_rows<block_q4_0, 2, dequantize_q4_0>;
|
||||
template [[host_name("kernel_get_rows_q4_1")]] kernel get_rows_t kernel_get_rows<block_q4_1, 2, dequantize_q4_1>;
|
||||
@@ -2239,14 +2407,28 @@ template [[host_name("kernel_get_rows_q4_K")]] kernel get_rows_t kernel_get_rows
|
||||
template [[host_name("kernel_get_rows_q5_K")]] kernel get_rows_t kernel_get_rows<block_q5_K, QK_NL, dequantize_q5_K>;
|
||||
template [[host_name("kernel_get_rows_q6_K")]] kernel get_rows_t kernel_get_rows<block_q6_K, QK_NL, dequantize_q6_K>;
|
||||
|
||||
typedef void (mat_mm_t)(device const uchar *, device const float *, device float *, constant int64_t &,\
|
||||
constant int64_t &, constant int64_t &, constant int64_t &, constant int64_t &, \
|
||||
constant int64_t &, constant int64_t &, constant uint &, threadgroup uchar *, uint3, uint, uint);
|
||||
typedef void (mat_mm_t)(
|
||||
device const uchar * src0,
|
||||
device const uchar * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne02,
|
||||
constant int64_t & nb01,
|
||||
constant int64_t & nb02,
|
||||
constant int64_t & ne12,
|
||||
constant int64_t & nb10,
|
||||
constant int64_t & nb11,
|
||||
constant int64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & gqa,
|
||||
threadgroup uchar *, uint3, uint, uint);
|
||||
|
||||
template [[host_name("kernel_mul_mm_f16_f32")]] kernel mat_mm_t kernel_mul_mm<half4x4, 1, dequantize_f16>;
|
||||
template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_0, 2, dequantize_q4_0>;
|
||||
template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_1, 2, dequantize_q4_1>;
|
||||
template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q8_0, 2, dequantize_q8_0>;
|
||||
template [[host_name("kernel_mul_mm_f32_f32")]] kernel mat_mm_t kernel_mul_mm<float4x4, 1, dequantize_f32>;
|
||||
template [[host_name("kernel_mul_mm_f16_f32")]] kernel mat_mm_t kernel_mul_mm<half4x4, 1, dequantize_f16>;
|
||||
template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_0, 2, dequantize_q4_0>;
|
||||
template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_1, 2, dequantize_q4_1>;
|
||||
template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q8_0, 2, dequantize_q8_0>;
|
||||
template [[host_name("kernel_mul_mm_q2_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q2_K, QK_NL, dequantize_q2_K>;
|
||||
template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q3_K, QK_NL, dequantize_q3_K>;
|
||||
template [[host_name("kernel_mul_mm_q4_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_K, QK_NL, dequantize_q4_K>;
|
||||
|
||||
@@ -847,7 +847,7 @@ std::array<std::string, 2> mul_str_values = {
|
||||
"mul_f32", "float"
|
||||
};
|
||||
|
||||
std::string& replace(std::string& s, const std::string& from, const std::string& to) {
|
||||
static std::string& replace(std::string& s, const std::string& from, const std::string& to) {
|
||||
size_t pos = 0;
|
||||
while ((pos = s.find(from, pos)) != std::string::npos) {
|
||||
s.replace(pos, from.length(), to);
|
||||
@@ -856,7 +856,7 @@ std::string& replace(std::string& s, const std::string& from, const std::string&
|
||||
return s;
|
||||
}
|
||||
|
||||
std::string generate_kernels() {
|
||||
static std::string generate_kernels() {
|
||||
std::stringstream src;
|
||||
src << program_source << '\n';
|
||||
src << k_quants_source << '\n';
|
||||
@@ -1788,7 +1788,7 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
|
||||
return false;
|
||||
}
|
||||
|
||||
bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) {
|
||||
static bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) {
|
||||
// If device doesn't support FP16
|
||||
if (!fp16_support) {
|
||||
return false;
|
||||
|
||||
278
ggml.c
278
ggml.c
@@ -4303,10 +4303,21 @@ int64_t ggml_nrows(const struct ggml_tensor * tensor) {
|
||||
}
|
||||
|
||||
size_t ggml_nbytes(const struct ggml_tensor * tensor) {
|
||||
size_t nbytes = tensor->ne[0]*tensor->nb[0]/ggml_blck_size(tensor->type);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; ++i) {
|
||||
nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
|
||||
size_t nbytes;
|
||||
size_t blck_size = ggml_blck_size(tensor->type);
|
||||
if (blck_size == 1) {
|
||||
nbytes = ggml_type_size(tensor->type);
|
||||
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
|
||||
nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
|
||||
}
|
||||
}
|
||||
else {
|
||||
nbytes = tensor->ne[0]*tensor->nb[0]/blck_size;
|
||||
for (int i = 1; i < GGML_MAX_DIMS; ++i) {
|
||||
nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
|
||||
}
|
||||
}
|
||||
|
||||
return nbytes;
|
||||
}
|
||||
|
||||
@@ -6395,6 +6406,54 @@ struct ggml_tensor * ggml_cont_inplace(
|
||||
return ggml_cont_impl(ctx, a, true);
|
||||
}
|
||||
|
||||
|
||||
// make contiguous, with new shape
|
||||
GGML_API struct ggml_tensor * ggml_cont_1d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int64_t ne0) {
|
||||
return ggml_cont_4d(ctx, a, ne0, 1, 1, 1);
|
||||
}
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_cont_2d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int64_t ne0,
|
||||
int64_t ne1) {
|
||||
return ggml_cont_4d(ctx, a, ne0, ne1, 1, 1);
|
||||
}
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_cont_3d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int64_t ne0,
|
||||
int64_t ne1,
|
||||
int64_t ne2) {
|
||||
return ggml_cont_4d(ctx, a, ne0, ne1, ne2, 1);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_cont_4d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int64_t ne0,
|
||||
int64_t ne1,
|
||||
int64_t ne2,
|
||||
int64_t ne3) {
|
||||
GGML_ASSERT(ggml_nelements(a) == (ne0*ne1*ne2*ne3));
|
||||
|
||||
bool is_node = false;
|
||||
|
||||
struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3);
|
||||
ggml_format_name(result, "%s (cont)", a->name);
|
||||
|
||||
result->op = GGML_OP_CONT;
|
||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
result->src[0] = a;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
// ggml_reshape
|
||||
|
||||
struct ggml_tensor * ggml_reshape(
|
||||
@@ -6957,7 +7016,7 @@ struct ggml_tensor * ggml_soft_max_back_inplace(
|
||||
static struct ggml_tensor * ggml_rope_impl(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int n_past,
|
||||
struct ggml_tensor * b,
|
||||
int n_dims,
|
||||
int mode,
|
||||
int n_ctx,
|
||||
@@ -6966,7 +7025,10 @@ static struct ggml_tensor * ggml_rope_impl(
|
||||
float xpos_base,
|
||||
bool xpos_down,
|
||||
bool inplace) {
|
||||
GGML_ASSERT(n_past >= 0);
|
||||
GGML_ASSERT(ggml_is_vector(b));
|
||||
GGML_ASSERT(b->type == GGML_TYPE_I32);
|
||||
GGML_ASSERT(a->ne[2] == b->ne[0]);
|
||||
|
||||
bool is_node = false;
|
||||
|
||||
if (a->grad) {
|
||||
@@ -6975,7 +7037,7 @@ static struct ggml_tensor * ggml_rope_impl(
|
||||
|
||||
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
||||
|
||||
int32_t params[8] = { n_past, n_dims, mode, n_ctx };
|
||||
int32_t params[8] = { /*n_past*/ 0, n_dims, mode, n_ctx };
|
||||
memcpy(params + 4, &freq_base, sizeof(float));
|
||||
memcpy(params + 5, &freq_scale, sizeof(float));
|
||||
memcpy(params + 6, &xpos_base, sizeof(float));
|
||||
@@ -6985,6 +7047,7 @@ static struct ggml_tensor * ggml_rope_impl(
|
||||
result->op = GGML_OP_ROPE;
|
||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
result->src[0] = a;
|
||||
result->src[1] = b;
|
||||
|
||||
return result;
|
||||
}
|
||||
@@ -6992,55 +7055,55 @@ static struct ggml_tensor * ggml_rope_impl(
|
||||
struct ggml_tensor * ggml_rope(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int n_past,
|
||||
struct ggml_tensor * b,
|
||||
int n_dims,
|
||||
int mode,
|
||||
int n_ctx) {
|
||||
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, false);
|
||||
return ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, false);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_rope_inplace(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int n_past,
|
||||
struct ggml_tensor * b,
|
||||
int n_dims,
|
||||
int mode,
|
||||
int n_ctx) {
|
||||
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, true);
|
||||
return ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, true);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_rope_custom(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int n_past,
|
||||
struct ggml_tensor * b,
|
||||
int n_dims,
|
||||
int mode,
|
||||
int n_ctx,
|
||||
float freq_base,
|
||||
float freq_scale) {
|
||||
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, false);
|
||||
return ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, false);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_rope_custom_inplace(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int n_past,
|
||||
struct ggml_tensor * b,
|
||||
int n_dims,
|
||||
int mode,
|
||||
int n_ctx,
|
||||
float freq_base,
|
||||
float freq_scale) {
|
||||
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, true);
|
||||
return ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, true);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_rope_xpos_inplace(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int n_past,
|
||||
struct ggml_tensor * b,
|
||||
int n_dims,
|
||||
float base,
|
||||
bool down) {
|
||||
return ggml_rope_impl(ctx, a, n_past, n_dims, 0, 0, 10000.0f, 1.0f, base, down, true);
|
||||
return ggml_rope_impl(ctx, a, b, n_dims, 0, 0, 10000.0f, 1.0f, base, down, true);
|
||||
}
|
||||
|
||||
// ggml_rope_back
|
||||
@@ -7048,7 +7111,7 @@ struct ggml_tensor * ggml_rope_xpos_inplace(
|
||||
struct ggml_tensor * ggml_rope_back(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int n_past,
|
||||
struct ggml_tensor * b,
|
||||
int n_dims,
|
||||
int mode,
|
||||
int n_ctx,
|
||||
@@ -7056,7 +7119,10 @@ struct ggml_tensor * ggml_rope_back(
|
||||
float freq_scale,
|
||||
float xpos_base,
|
||||
bool xpos_down) {
|
||||
GGML_ASSERT(n_past >= 0);
|
||||
GGML_ASSERT(ggml_is_vector(b));
|
||||
GGML_ASSERT(b->type == GGML_TYPE_I32);
|
||||
GGML_ASSERT(a->ne[2] == b->ne[0]);
|
||||
|
||||
GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet");
|
||||
|
||||
bool is_node = false;
|
||||
@@ -7067,7 +7133,7 @@ struct ggml_tensor * ggml_rope_back(
|
||||
|
||||
struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
|
||||
|
||||
int32_t params[8] = { n_past, n_dims, mode, n_ctx };
|
||||
int32_t params[8] = { /*n_past*/ 0, n_dims, mode, n_ctx };
|
||||
memcpy(params + 4, &freq_base, sizeof(float));
|
||||
memcpy(params + 5, &freq_scale, sizeof(float));
|
||||
memcpy(params + 6, &xpos_base, sizeof(float));
|
||||
@@ -7077,6 +7143,7 @@ struct ggml_tensor * ggml_rope_back(
|
||||
result->op = GGML_OP_ROPE_BACK;
|
||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
result->src[0] = a;
|
||||
result->src[1] = b;
|
||||
|
||||
return result;
|
||||
}
|
||||
@@ -8787,8 +8854,6 @@ static void ggml_compute_forward_add_f32(
|
||||
#else
|
||||
ggml_vec_add_f32(ne00, dst_ptr, src0_ptr, src1_ptr);
|
||||
#endif
|
||||
// }
|
||||
// }
|
||||
}
|
||||
} else {
|
||||
// src1 is not contiguous
|
||||
@@ -12445,13 +12510,11 @@ static void ggml_compute_forward_alibi_f16(
|
||||
return;
|
||||
}
|
||||
|
||||
const int n_past = ((int32_t *) dst->op_params)[0];
|
||||
//const int n_past = ((int32_t *) dst->op_params)[0];
|
||||
const int n_head = ((int32_t *) dst->op_params)[1];
|
||||
float max_bias;
|
||||
memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
|
||||
|
||||
assert(n_past >= 0);
|
||||
|
||||
const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
|
||||
const int ne1 = src0->ne[1]; // seq_len_without_past
|
||||
const int ne2 = src0->ne[2]; // n_head -> this is k
|
||||
@@ -12466,7 +12529,7 @@ static void ggml_compute_forward_alibi_f16(
|
||||
//const int nb3 = src0->nb[3];
|
||||
|
||||
GGML_ASSERT(nb0 == sizeof(ggml_fp16_t));
|
||||
GGML_ASSERT(ne1 + n_past == ne0); (void) n_past;
|
||||
//GGML_ASSERT(ne1 + n_past == ne0); (void) n_past;
|
||||
GGML_ASSERT(n_head == ne2);
|
||||
|
||||
// add alibi to src0 (KQ_scaled)
|
||||
@@ -12612,8 +12675,8 @@ static void ggml_compute_forward_clamp(
|
||||
static void ggml_compute_forward_rope_f32(
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * src0,
|
||||
const struct ggml_tensor * src1,
|
||||
struct ggml_tensor * dst) {
|
||||
|
||||
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
||||
return;
|
||||
}
|
||||
@@ -12623,9 +12686,9 @@ static void ggml_compute_forward_rope_f32(
|
||||
|
||||
// these two only relevant for xPos RoPE:
|
||||
float xpos_base;
|
||||
bool xpos_down;
|
||||
bool xpos_down;
|
||||
|
||||
const int n_past = ((int32_t *) dst->op_params)[0];
|
||||
//const int n_past = ((int32_t *) dst->op_params)[0];
|
||||
const int n_dims = ((int32_t *) dst->op_params)[1];
|
||||
const int mode = ((int32_t *) dst->op_params)[2];
|
||||
const int n_ctx = ((int32_t *) dst->op_params)[3];
|
||||
@@ -12634,8 +12697,6 @@ static void ggml_compute_forward_rope_f32(
|
||||
memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float));
|
||||
memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool));
|
||||
|
||||
assert(n_past >= 0);
|
||||
|
||||
GGML_TENSOR_UNARY_OP_LOCALS;
|
||||
|
||||
//printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
|
||||
@@ -12666,9 +12727,11 @@ static void ggml_compute_forward_rope_f32(
|
||||
const bool is_neox = mode & 2;
|
||||
const bool is_glm = mode & 4;
|
||||
|
||||
const int32_t * pos = (const int32_t *) src1->data;
|
||||
|
||||
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
||||
for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) {
|
||||
const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);
|
||||
for (int64_t i2 = 0; i2 < ne2; i2++) {
|
||||
const int64_t p = pos[i2];
|
||||
for (int64_t i1 = 0; i1 < ne1; i1++) {
|
||||
if (ir++ < ir0) continue;
|
||||
if (ir > ir1) break;
|
||||
@@ -12705,7 +12768,7 @@ static void ggml_compute_forward_rope_f32(
|
||||
const float cos_theta = cosf(theta);
|
||||
const float sin_theta = sinf(theta);
|
||||
// zeta scaling for xPos only:
|
||||
float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), (n_past + i2) / xpos_base) : 1.0f;
|
||||
float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
|
||||
if (xpos_down) zeta = 1.0f / zeta;
|
||||
|
||||
theta *= theta_scale;
|
||||
@@ -12750,8 +12813,8 @@ static void ggml_compute_forward_rope_f32(
|
||||
static void ggml_compute_forward_rope_f16(
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * src0,
|
||||
const struct ggml_tensor * src1,
|
||||
struct ggml_tensor * dst) {
|
||||
|
||||
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
||||
return;
|
||||
}
|
||||
@@ -12759,15 +12822,13 @@ static void ggml_compute_forward_rope_f16(
|
||||
float freq_base;
|
||||
float freq_scale;
|
||||
|
||||
const int n_past = ((int32_t *) dst->op_params)[0];
|
||||
//const int n_past = ((int32_t *) dst->op_params)[0];
|
||||
const int n_dims = ((int32_t *) dst->op_params)[1];
|
||||
const int mode = ((int32_t *) dst->op_params)[2];
|
||||
const int n_ctx = ((int32_t *) dst->op_params)[3];
|
||||
memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
|
||||
memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
|
||||
|
||||
assert(n_past >= 0);
|
||||
|
||||
GGML_TENSOR_UNARY_OP_LOCALS;
|
||||
|
||||
//printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
|
||||
@@ -12798,9 +12859,11 @@ static void ggml_compute_forward_rope_f16(
|
||||
const bool is_neox = mode & 2;
|
||||
const bool is_glm = mode & 4;
|
||||
|
||||
const int32_t * pos = (const int32_t *) src1->data;
|
||||
|
||||
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
||||
for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) {
|
||||
const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);
|
||||
for (int64_t i2 = 0; i2 < ne2; i2++) {
|
||||
const int64_t p = pos[i2];
|
||||
for (int64_t i1 = 0; i1 < ne1; i1++) {
|
||||
if (ir++ < ir0) continue;
|
||||
if (ir > ir1) break;
|
||||
@@ -12879,15 +12942,16 @@ static void ggml_compute_forward_rope_f16(
|
||||
static void ggml_compute_forward_rope(
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * src0,
|
||||
const struct ggml_tensor * src1,
|
||||
struct ggml_tensor * dst) {
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
ggml_compute_forward_rope_f16(params, src0, dst);
|
||||
ggml_compute_forward_rope_f16(params, src0, src1, dst);
|
||||
} break;
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
ggml_compute_forward_rope_f32(params, src0, dst);
|
||||
ggml_compute_forward_rope_f32(params, src0, src1, dst);
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
@@ -12901,6 +12965,7 @@ static void ggml_compute_forward_rope(
|
||||
static void ggml_compute_forward_rope_back_f32(
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * src0,
|
||||
const struct ggml_tensor * src1,
|
||||
struct ggml_tensor * dst) {
|
||||
|
||||
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
||||
@@ -12918,7 +12983,7 @@ static void ggml_compute_forward_rope_back_f32(
|
||||
float xpos_base;
|
||||
bool xpos_down;
|
||||
|
||||
const int n_past = ((int32_t *) dst->op_params)[0];
|
||||
//const int n_past = ((int32_t *) dst->op_params)[0];
|
||||
const int n_dims = ((int32_t *) dst->op_params)[1];
|
||||
const int mode = ((int32_t *) dst->op_params)[2];
|
||||
const int n_ctx = ((int32_t *) dst->op_params)[3]; UNUSED(n_ctx);
|
||||
@@ -12927,8 +12992,6 @@ static void ggml_compute_forward_rope_back_f32(
|
||||
memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float));
|
||||
memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool));
|
||||
|
||||
assert(n_past >= 0);
|
||||
|
||||
GGML_TENSOR_UNARY_OP_LOCALS;
|
||||
|
||||
//printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
|
||||
@@ -12955,9 +13018,11 @@ static void ggml_compute_forward_rope_back_f32(
|
||||
|
||||
const bool is_neox = mode & 2;
|
||||
|
||||
const int32_t * pos = (const int32_t *) src1->data;
|
||||
|
||||
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
||||
for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) {
|
||||
const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);
|
||||
for (int64_t i2 = 0; i2 < ne2; i2++) {
|
||||
const int64_t p = pos[i2];
|
||||
for (int64_t i1 = 0; i1 < ne1; i1++) {
|
||||
if (ir++ < ir0) continue;
|
||||
if (ir > ir1) break;
|
||||
@@ -12969,7 +13034,7 @@ static void ggml_compute_forward_rope_back_f32(
|
||||
const float cos_theta = cosf(theta);
|
||||
const float sin_theta = sinf(theta);
|
||||
// zeta scaling for xPos only:
|
||||
float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), (n_past + i2) / xpos_base) : 1.0f;
|
||||
float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
|
||||
if (xpos_down) zeta = 1.0f / zeta;
|
||||
|
||||
theta *= theta_scale;
|
||||
@@ -13012,6 +13077,7 @@ static void ggml_compute_forward_rope_back_f32(
|
||||
static void ggml_compute_forward_rope_back_f16(
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * src0,
|
||||
const struct ggml_tensor * src1,
|
||||
struct ggml_tensor * dst) {
|
||||
|
||||
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
||||
@@ -13022,12 +13088,10 @@ static void ggml_compute_forward_rope_back_f16(
|
||||
// dx = rope_back(dy, src1)
|
||||
// src0 is dy, src1 contains options
|
||||
|
||||
const int n_past = ((int32_t *) dst->op_params)[0];
|
||||
//const int n_past = ((int32_t *) dst->op_params)[0];
|
||||
const int n_dims = ((int32_t *) dst->op_params)[1];
|
||||
const int mode = ((int32_t *) dst->op_params)[2];
|
||||
|
||||
assert(n_past >= 0);
|
||||
|
||||
GGML_TENSOR_UNARY_OP_LOCALS;
|
||||
|
||||
//printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
|
||||
@@ -13054,9 +13118,11 @@ static void ggml_compute_forward_rope_back_f16(
|
||||
|
||||
const bool is_neox = mode & 2;
|
||||
|
||||
const int32_t * pos = (const int32_t *) src1->data;
|
||||
|
||||
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
||||
for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) {
|
||||
const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);
|
||||
for (int64_t i2 = 0; i2 < ne2; i2++) {
|
||||
const int64_t p = pos[i2];
|
||||
for (int64_t i1 = 0; i1 < ne1; i1++) {
|
||||
if (ir++ < ir0) continue;
|
||||
if (ir > ir1) break;
|
||||
@@ -13108,15 +13174,16 @@ static void ggml_compute_forward_rope_back_f16(
|
||||
static void ggml_compute_forward_rope_back(
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * src0,
|
||||
const struct ggml_tensor * src1,
|
||||
struct ggml_tensor * dst) {
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
ggml_compute_forward_rope_back_f16(params, src0, dst);
|
||||
ggml_compute_forward_rope_back_f16(params, src0, src1, dst);
|
||||
} break;
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
ggml_compute_forward_rope_back_f32(params, src0, dst);
|
||||
ggml_compute_forward_rope_back_f32(params, src0, src1, dst);
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
@@ -15853,11 +15920,11 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
||||
} break;
|
||||
case GGML_OP_ROPE:
|
||||
{
|
||||
ggml_compute_forward_rope(params, tensor->src[0], tensor);
|
||||
ggml_compute_forward_rope(params, tensor->src[0], tensor->src[1], tensor);
|
||||
} break;
|
||||
case GGML_OP_ROPE_BACK:
|
||||
{
|
||||
ggml_compute_forward_rope_back(params, tensor->src[0], tensor);
|
||||
ggml_compute_forward_rope_back(params, tensor->src[0], tensor->src[1], tensor);
|
||||
} break;
|
||||
case GGML_OP_ALIBI:
|
||||
{
|
||||
@@ -16495,7 +16562,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
||||
{
|
||||
// necessary for llama
|
||||
if (src0->grad) {
|
||||
const int n_past = ((int32_t *) tensor->op_params)[0];
|
||||
//const int n_past = ((int32_t *) tensor->op_params)[0];
|
||||
const int n_dims = ((int32_t *) tensor->op_params)[1];
|
||||
const int mode = ((int32_t *) tensor->op_params)[2];
|
||||
const int n_ctx = ((int32_t *) tensor->op_params)[3];
|
||||
@@ -16512,7 +16579,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
||||
src0->grad,
|
||||
ggml_rope_back(ctx,
|
||||
tensor->grad,
|
||||
n_past,
|
||||
src1,
|
||||
n_dims,
|
||||
mode,
|
||||
n_ctx,
|
||||
@@ -16526,7 +16593,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
||||
case GGML_OP_ROPE_BACK:
|
||||
{
|
||||
if (src0->grad) {
|
||||
const int n_past = ((int32_t *) tensor->op_params)[0];
|
||||
//const int n_past = ((int32_t *) tensor->op_params)[0];
|
||||
const int n_dims = ((int32_t *) tensor->op_params)[1];
|
||||
const int mode = ((int32_t *) tensor->op_params)[2];
|
||||
const int n_ctx = ((int32_t *) tensor->op_params)[3];
|
||||
@@ -16543,7 +16610,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
||||
src0->grad,
|
||||
ggml_rope_impl(ctx,
|
||||
tensor->grad,
|
||||
n_past,
|
||||
src1,
|
||||
n_dims,
|
||||
mode,
|
||||
n_ctx,
|
||||
@@ -17283,10 +17350,18 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
||||
} else {
|
||||
// wait for other threads to finish
|
||||
const int last = node_n;
|
||||
do {
|
||||
//sched_yield();
|
||||
while (true) {
|
||||
// TODO: this sched_yield can have significant impact on the performance - either positive or negative
|
||||
// depending on the workload and the operating system.
|
||||
// since it is not clear what is the best approach, it should potentially become user-configurable
|
||||
// ref: https://github.com/ggerganov/ggml/issues/291
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||
sched_yield();
|
||||
#endif
|
||||
|
||||
node_n = atomic_load(&state->shared->node_n);
|
||||
} while (node_n == last);
|
||||
if (node_n != last) break;
|
||||
};
|
||||
}
|
||||
|
||||
// check if we should stop
|
||||
@@ -18337,10 +18412,11 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
|
||||
for (int i = 0; i < cgraph->n_leafs; i++) {
|
||||
struct ggml_tensor * node = cgraph->leafs[i];
|
||||
|
||||
GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n",
|
||||
GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s %16s\n",
|
||||
i,
|
||||
node->ne[0], node->ne[1],
|
||||
ggml_op_name(node->op));
|
||||
ggml_op_name(node->op),
|
||||
ggml_get_name(node));
|
||||
}
|
||||
|
||||
for (int i = 0; i < GGML_OP_COUNT; i++) {
|
||||
@@ -20099,27 +20175,27 @@ const char * gguf_type_name(enum gguf_type type) {
|
||||
return GGUF_TYPE_NAME[type];
|
||||
}
|
||||
|
||||
int gguf_get_version(struct gguf_context * ctx) {
|
||||
int gguf_get_version(const struct gguf_context * ctx) {
|
||||
return ctx->header.version;
|
||||
}
|
||||
|
||||
size_t gguf_get_alignment(struct gguf_context * ctx) {
|
||||
size_t gguf_get_alignment(const struct gguf_context * ctx) {
|
||||
return ctx->alignment;
|
||||
}
|
||||
|
||||
size_t gguf_get_data_offset(struct gguf_context * ctx) {
|
||||
size_t gguf_get_data_offset(const struct gguf_context * ctx) {
|
||||
return ctx->offset;
|
||||
}
|
||||
|
||||
void * gguf_get_data(struct gguf_context * ctx) {
|
||||
void * gguf_get_data(const struct gguf_context * ctx) {
|
||||
return ctx->data;
|
||||
}
|
||||
|
||||
int gguf_get_n_kv(struct gguf_context * ctx) {
|
||||
int gguf_get_n_kv(const struct gguf_context * ctx) {
|
||||
return ctx->header.n_kv;
|
||||
}
|
||||
|
||||
int gguf_find_key(struct gguf_context * ctx, const char * key) {
|
||||
int gguf_find_key(const struct gguf_context * ctx, const char * key) {
|
||||
// return -1 if key not found
|
||||
int keyfound = -1;
|
||||
|
||||
@@ -20135,85 +20211,85 @@ int gguf_find_key(struct gguf_context * ctx, const char * key) {
|
||||
return keyfound;
|
||||
}
|
||||
|
||||
const char * gguf_get_key(struct gguf_context * ctx, int i) {
|
||||
const char * gguf_get_key(const struct gguf_context * ctx, int i) {
|
||||
return ctx->kv[i].key.data;
|
||||
}
|
||||
|
||||
enum gguf_type gguf_get_kv_type(struct gguf_context * ctx, int i) {
|
||||
enum gguf_type gguf_get_kv_type(const struct gguf_context * ctx, int i) {
|
||||
return ctx->kv[i].type;
|
||||
}
|
||||
|
||||
enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i) {
|
||||
enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int i) {
|
||||
return ctx->kv[i].value.arr.type;
|
||||
}
|
||||
|
||||
const void * gguf_get_arr_data(struct gguf_context * ctx, int i) {
|
||||
const void * gguf_get_arr_data(const struct gguf_context * ctx, int i) {
|
||||
return ctx->kv[i].value.arr.data;
|
||||
}
|
||||
|
||||
const char * gguf_get_arr_str(struct gguf_context * ctx, int key_id, int i) {
|
||||
const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i) {
|
||||
struct gguf_kv * kv = &ctx->kv[key_id];
|
||||
struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i];
|
||||
return str->data;
|
||||
}
|
||||
|
||||
int gguf_get_arr_n(struct gguf_context * ctx, int i) {
|
||||
int gguf_get_arr_n(const struct gguf_context * ctx, int i) {
|
||||
return ctx->kv[i].value.arr.n;
|
||||
}
|
||||
|
||||
uint8_t gguf_get_val_u8(struct gguf_context * ctx, int i) {
|
||||
uint8_t gguf_get_val_u8(const struct gguf_context * ctx, int i) {
|
||||
return ctx->kv[i].value.uint8;
|
||||
}
|
||||
|
||||
int8_t gguf_get_val_i8(struct gguf_context * ctx, int i) {
|
||||
int8_t gguf_get_val_i8(const struct gguf_context * ctx, int i) {
|
||||
return ctx->kv[i].value.int8;
|
||||
}
|
||||
|
||||
uint16_t gguf_get_val_u16(struct gguf_context * ctx, int i) {
|
||||
uint16_t gguf_get_val_u16(const struct gguf_context * ctx, int i) {
|
||||
return ctx->kv[i].value.uint16;
|
||||
}
|
||||
|
||||
int16_t gguf_get_val_i16(struct gguf_context * ctx, int i) {
|
||||
int16_t gguf_get_val_i16(const struct gguf_context * ctx, int i) {
|
||||
return ctx->kv[i].value.int16;
|
||||
}
|
||||
|
||||
uint32_t gguf_get_val_u32(struct gguf_context * ctx, int i) {
|
||||
uint32_t gguf_get_val_u32(const struct gguf_context * ctx, int i) {
|
||||
return ctx->kv[i].value.uint32;
|
||||
}
|
||||
|
||||
int32_t gguf_get_val_i32(struct gguf_context * ctx, int i) {
|
||||
int32_t gguf_get_val_i32(const struct gguf_context * ctx, int i) {
|
||||
return ctx->kv[i].value.int32;
|
||||
}
|
||||
|
||||
float gguf_get_val_f32(struct gguf_context * ctx, int i) {
|
||||
float gguf_get_val_f32(const struct gguf_context * ctx, int i) {
|
||||
return ctx->kv[i].value.float32;
|
||||
}
|
||||
|
||||
uint64_t gguf_get_val_u64(struct gguf_context * ctx, int i) {
|
||||
uint64_t gguf_get_val_u64(const struct gguf_context * ctx, int i) {
|
||||
return ctx->kv[i].value.uint64;
|
||||
}
|
||||
|
||||
int64_t gguf_get_val_i64(struct gguf_context * ctx, int i) {
|
||||
int64_t gguf_get_val_i64(const struct gguf_context * ctx, int i) {
|
||||
return ctx->kv[i].value.int64;
|
||||
}
|
||||
|
||||
double gguf_get_val_f64(struct gguf_context * ctx, int i) {
|
||||
double gguf_get_val_f64(const struct gguf_context * ctx, int i) {
|
||||
return ctx->kv[i].value.float64;
|
||||
}
|
||||
|
||||
bool gguf_get_val_bool(struct gguf_context * ctx, int i) {
|
||||
bool gguf_get_val_bool(const struct gguf_context * ctx, int i) {
|
||||
return ctx->kv[i].value.bool_;
|
||||
}
|
||||
|
||||
const char * gguf_get_val_str (struct gguf_context * ctx, int i) {
|
||||
const char * gguf_get_val_str (const struct gguf_context * ctx, int i) {
|
||||
return ctx->kv[i].value.str.data;
|
||||
}
|
||||
|
||||
int gguf_get_n_tensors(struct gguf_context * ctx) {
|
||||
int gguf_get_n_tensors(const struct gguf_context * ctx) {
|
||||
return ctx->header.n_tensors;
|
||||
}
|
||||
|
||||
int gguf_find_tensor(struct gguf_context * ctx, const char * name) {
|
||||
int gguf_find_tensor(const struct gguf_context * ctx, const char * name) {
|
||||
// return -1 if tensor not found
|
||||
int tensorfound = -1;
|
||||
|
||||
@@ -20229,11 +20305,11 @@ int gguf_find_tensor(struct gguf_context * ctx, const char * name) {
|
||||
return tensorfound;
|
||||
}
|
||||
|
||||
size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i) {
|
||||
size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i) {
|
||||
return ctx->infos[i].offset;
|
||||
}
|
||||
|
||||
char * gguf_get_tensor_name(struct gguf_context * ctx, int i) {
|
||||
char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) {
|
||||
return ctx->infos[i].name.data;
|
||||
}
|
||||
|
||||
@@ -20516,7 +20592,7 @@ static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_si
|
||||
buf->offset += el_size;
|
||||
}
|
||||
|
||||
static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
|
||||
static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
|
||||
// write header
|
||||
gguf_bwrite_el(buf, &ctx->header.magic, sizeof(ctx->header.magic));
|
||||
gguf_bwrite_el(buf, &ctx->header.version, sizeof(ctx->header.version));
|
||||
@@ -20631,7 +20707,7 @@ static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf,
|
||||
}
|
||||
}
|
||||
|
||||
void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta) {
|
||||
void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) {
|
||||
FILE * file = fopen(fname, "wb");
|
||||
if (!file) {
|
||||
GGML_ASSERT(false && "failed to open file for writing");
|
||||
@@ -20648,7 +20724,7 @@ void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only
|
||||
fclose(file);
|
||||
}
|
||||
|
||||
size_t gguf_get_meta_size(struct gguf_context * ctx) {
|
||||
size_t gguf_get_meta_size(const struct gguf_context * ctx) {
|
||||
// no allocs - only compute size
|
||||
struct gguf_buf buf = gguf_buf_init(0);
|
||||
|
||||
@@ -20657,7 +20733,7 @@ size_t gguf_get_meta_size(struct gguf_context * ctx) {
|
||||
return buf.offset;
|
||||
}
|
||||
|
||||
void gguf_get_meta_data(struct gguf_context * ctx, void * data) {
|
||||
void gguf_get_meta_data(const struct gguf_context * ctx, void * data) {
|
||||
struct gguf_buf buf = gguf_buf_init(16*1024);
|
||||
|
||||
gguf_write_to_buf(ctx, &buf, true);
|
||||
@@ -20733,6 +20809,14 @@ int ggml_cpu_has_arm_fma(void) {
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_metal(void) {
|
||||
#if defined(GGML_USE_METAL)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_f16c(void) {
|
||||
#if defined(__F16C__)
|
||||
return 1;
|
||||
|
||||
126
ggml.h
126
ggml.h
@@ -195,6 +195,14 @@
|
||||
# define GGML_DEPRECATED(func, hint) func
|
||||
#endif
|
||||
|
||||
#ifndef __GNUC__
|
||||
# define GGML_ATTRIBUTE_FORMAT(...)
|
||||
#elif defined(__MINGW32__)
|
||||
# define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
|
||||
#else
|
||||
# define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <stdbool.h>
|
||||
@@ -270,7 +278,7 @@ extern "C" {
|
||||
|
||||
#if defined(__ARM_NEON) && defined(__CUDACC__)
|
||||
typedef half ggml_fp16_t;
|
||||
#elif defined(__ARM_NEON) && !defined(_MSC_VER)
|
||||
#elif defined(__ARM_NEON)
|
||||
typedef __fp16 ggml_fp16_t;
|
||||
#else
|
||||
typedef uint16_t ggml_fp16_t;
|
||||
@@ -437,6 +445,12 @@ extern "C" {
|
||||
GGML_OBJECT_WORK_BUFFER
|
||||
};
|
||||
|
||||
enum ggml_log_level {
|
||||
GGML_LOG_LEVEL_ERROR = 2,
|
||||
GGML_LOG_LEVEL_WARN = 3,
|
||||
GGML_LOG_LEVEL_INFO = 4
|
||||
};
|
||||
|
||||
// ggml object
|
||||
struct ggml_object {
|
||||
size_t offs;
|
||||
@@ -685,6 +699,7 @@ extern "C" {
|
||||
|
||||
GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
|
||||
GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
|
||||
GGML_ATTRIBUTE_FORMAT(2, 3)
|
||||
GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...);
|
||||
|
||||
//
|
||||
@@ -1040,7 +1055,6 @@ extern "C" {
|
||||
size_t nb1,
|
||||
size_t offset);
|
||||
|
||||
|
||||
// a -> b, return view(b)
|
||||
GGML_API struct ggml_tensor * ggml_cpy(
|
||||
struct ggml_context * ctx,
|
||||
@@ -1063,6 +1077,33 @@ extern "C" {
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
// make contiguous, with new shape
|
||||
GGML_API struct ggml_tensor * ggml_cont_1d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int64_t ne0);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_cont_2d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int64_t ne0,
|
||||
int64_t ne1);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_cont_3d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int64_t ne0,
|
||||
int64_t ne1,
|
||||
int64_t ne2);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_cont_4d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int64_t ne0,
|
||||
int64_t ne1,
|
||||
int64_t ne2,
|
||||
int64_t ne3);
|
||||
|
||||
// return view(a), b specifies the new shape
|
||||
// TODO: when we start computing gradient, make a copy instead of view
|
||||
GGML_API struct ggml_tensor * ggml_reshape(
|
||||
@@ -1210,14 +1251,15 @@ extern "C" {
|
||||
struct ggml_tensor * b);
|
||||
|
||||
// rotary position embedding
|
||||
// if mode & 1 == 1, skip n_past elements
|
||||
// if mode & 1 == 1, skip n_past elements (DEPRECATED)
|
||||
// if mode & 2 == 1, GPT-NeoX style
|
||||
// if mode & 4 == 1, ChatGLM style
|
||||
// TODO: avoid creating a new tensor every time
|
||||
//
|
||||
// b is an int32 vector with size a->ne[2], it contains the positions
|
||||
GGML_API struct ggml_tensor * ggml_rope(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int n_past,
|
||||
struct ggml_tensor * b,
|
||||
int n_dims,
|
||||
int mode,
|
||||
int n_ctx);
|
||||
@@ -1226,7 +1268,7 @@ extern "C" {
|
||||
GGML_API struct ggml_tensor * ggml_rope_inplace(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int n_past,
|
||||
struct ggml_tensor * b,
|
||||
int n_dims,
|
||||
int mode,
|
||||
int n_ctx);
|
||||
@@ -1235,7 +1277,7 @@ extern "C" {
|
||||
GGML_API struct ggml_tensor * ggml_rope_custom(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int n_past,
|
||||
struct ggml_tensor * b,
|
||||
int n_dims,
|
||||
int mode,
|
||||
int n_ctx,
|
||||
@@ -1246,7 +1288,7 @@ extern "C" {
|
||||
GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int n_past,
|
||||
struct ggml_tensor * b,
|
||||
int n_dims,
|
||||
int mode,
|
||||
int n_ctx,
|
||||
@@ -1257,7 +1299,7 @@ extern "C" {
|
||||
GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int n_past,
|
||||
struct ggml_tensor * b,
|
||||
int n_dims,
|
||||
float base,
|
||||
bool down);
|
||||
@@ -1267,7 +1309,7 @@ extern "C" {
|
||||
GGML_API struct ggml_tensor * ggml_rope_back(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int n_past,
|
||||
struct ggml_tensor * b,
|
||||
int n_dims,
|
||||
int mode,
|
||||
int n_ctx,
|
||||
@@ -1682,6 +1724,7 @@ extern "C" {
|
||||
};
|
||||
|
||||
typedef void (*ggml_opt_callback)(void * data, float * sched);
|
||||
typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
|
||||
|
||||
// optimization parameters
|
||||
//
|
||||
@@ -1866,39 +1909,39 @@ extern "C" {
|
||||
|
||||
GGML_API const char * gguf_type_name(enum gguf_type type);
|
||||
|
||||
GGML_API int gguf_get_version (struct gguf_context * ctx);
|
||||
GGML_API size_t gguf_get_alignment (struct gguf_context * ctx);
|
||||
GGML_API size_t gguf_get_data_offset(struct gguf_context * ctx);
|
||||
GGML_API void * gguf_get_data (struct gguf_context * ctx);
|
||||
GGML_API int gguf_get_version (const struct gguf_context * ctx);
|
||||
GGML_API size_t gguf_get_alignment (const struct gguf_context * ctx);
|
||||
GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx);
|
||||
GGML_API void * gguf_get_data (const struct gguf_context * ctx);
|
||||
|
||||
GGML_API int gguf_get_n_kv(struct gguf_context * ctx);
|
||||
GGML_API int gguf_find_key(struct gguf_context * ctx, const char * key);
|
||||
GGML_API const char * gguf_get_key (struct gguf_context * ctx, int i);
|
||||
GGML_API int gguf_get_n_kv(const struct gguf_context * ctx);
|
||||
GGML_API int gguf_find_key(const struct gguf_context * ctx, const char * key);
|
||||
GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int i);
|
||||
|
||||
GGML_API enum gguf_type gguf_get_kv_type (struct gguf_context * ctx, int i);
|
||||
GGML_API enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i);
|
||||
GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int i);
|
||||
GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int i);
|
||||
|
||||
// results are undefined if the wrong type is used for the key
|
||||
GGML_API uint8_t gguf_get_val_u8 (struct gguf_context * ctx, int i);
|
||||
GGML_API int8_t gguf_get_val_i8 (struct gguf_context * ctx, int i);
|
||||
GGML_API uint16_t gguf_get_val_u16 (struct gguf_context * ctx, int i);
|
||||
GGML_API int16_t gguf_get_val_i16 (struct gguf_context * ctx, int i);
|
||||
GGML_API uint32_t gguf_get_val_u32 (struct gguf_context * ctx, int i);
|
||||
GGML_API int32_t gguf_get_val_i32 (struct gguf_context * ctx, int i);
|
||||
GGML_API float gguf_get_val_f32 (struct gguf_context * ctx, int i);
|
||||
GGML_API uint64_t gguf_get_val_u64 (struct gguf_context * ctx, int i);
|
||||
GGML_API int64_t gguf_get_val_i64 (struct gguf_context * ctx, int i);
|
||||
GGML_API double gguf_get_val_f64 (struct gguf_context * ctx, int i);
|
||||
GGML_API bool gguf_get_val_bool(struct gguf_context * ctx, int i);
|
||||
GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i);
|
||||
GGML_API int gguf_get_arr_n (struct gguf_context * ctx, int i);
|
||||
GGML_API const void * gguf_get_arr_data(struct gguf_context * ctx, int i);
|
||||
GGML_API const char * gguf_get_arr_str (struct gguf_context * ctx, int key_id, int i);
|
||||
GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int i);
|
||||
GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int i);
|
||||
GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int i);
|
||||
GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int i);
|
||||
GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int i);
|
||||
GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int i);
|
||||
GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int i);
|
||||
GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int i);
|
||||
GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int i);
|
||||
GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int i);
|
||||
GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int i);
|
||||
GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int i);
|
||||
GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int i);
|
||||
GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int i);
|
||||
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
|
||||
|
||||
GGML_API int gguf_get_n_tensors (struct gguf_context * ctx);
|
||||
GGML_API int gguf_find_tensor (struct gguf_context * ctx, const char * name);
|
||||
GGML_API size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i);
|
||||
GGML_API char * gguf_get_tensor_name (struct gguf_context * ctx, int i);
|
||||
GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx);
|
||||
GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name);
|
||||
GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
|
||||
GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i);
|
||||
|
||||
// overrides existing values or adds a new one
|
||||
GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
|
||||
@@ -1943,11 +1986,11 @@ extern "C" {
|
||||
//
|
||||
|
||||
// write the entire context to a binary file
|
||||
GGML_API void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta);
|
||||
GGML_API void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta);
|
||||
|
||||
// get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
|
||||
GGML_API size_t gguf_get_meta_size(struct gguf_context * ctx);
|
||||
GGML_API void gguf_get_meta_data(struct gguf_context * ctx, void * data);
|
||||
GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
|
||||
GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data);
|
||||
|
||||
//
|
||||
// system info
|
||||
@@ -1961,6 +2004,7 @@ extern "C" {
|
||||
GGML_API int ggml_cpu_has_fma (void);
|
||||
GGML_API int ggml_cpu_has_neon (void);
|
||||
GGML_API int ggml_cpu_has_arm_fma (void);
|
||||
GGML_API int ggml_cpu_has_metal (void);
|
||||
GGML_API int ggml_cpu_has_f16c (void);
|
||||
GGML_API int ggml_cpu_has_fp16_va (void);
|
||||
GGML_API int ggml_cpu_has_wasm_simd (void);
|
||||
|
||||
@@ -32,7 +32,7 @@ KEY_GENERAL_URL = "general.url"
|
||||
KEY_GENERAL_DESCRIPTION = "general.description"
|
||||
KEY_GENERAL_LICENSE = "general.license"
|
||||
KEY_GENERAL_SOURCE_URL = "general.source.url"
|
||||
KEY_GENERAL_SOURCE_HF_REPO = "general.source.hugginface.repository"
|
||||
KEY_GENERAL_SOURCE_HF_REPO = "general.source.huggingface.repository"
|
||||
KEY_GENERAL_FILE_TYPE = "general.file_type"
|
||||
|
||||
# LLM
|
||||
@@ -77,12 +77,14 @@ KEY_TOKENIZER_RWKV = "tokenizer.rwkv.world"
|
||||
|
||||
|
||||
class MODEL_ARCH(IntEnum):
|
||||
LLAMA : int = auto()
|
||||
FALCON : int = auto()
|
||||
GPT2 : int = auto()
|
||||
GPTJ : int = auto()
|
||||
GPTNEOX: int = auto()
|
||||
MPT : int = auto()
|
||||
LLAMA : int = auto()
|
||||
FALCON : int = auto()
|
||||
BAICHUAN : int = auto()
|
||||
GPT2 : int = auto()
|
||||
GPTJ : int = auto()
|
||||
GPTNEOX : int = auto()
|
||||
MPT : int = auto()
|
||||
STARCODER : int = auto()
|
||||
|
||||
|
||||
class MODEL_TENSOR(IntEnum):
|
||||
@@ -106,12 +108,14 @@ class MODEL_TENSOR(IntEnum):
|
||||
|
||||
|
||||
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.LLAMA: "llama",
|
||||
MODEL_ARCH.FALCON: "falcon",
|
||||
MODEL_ARCH.GPT2: "gpt2",
|
||||
MODEL_ARCH.GPTJ: "gptj",
|
||||
MODEL_ARCH.GPTNEOX: "gptneox",
|
||||
MODEL_ARCH.MPT: "mpt",
|
||||
MODEL_ARCH.LLAMA: "llama",
|
||||
MODEL_ARCH.FALCON: "falcon",
|
||||
MODEL_ARCH.BAICHUAN: "baichuan",
|
||||
MODEL_ARCH.GPT2: "gpt2",
|
||||
MODEL_ARCH.GPTJ: "gptj",
|
||||
MODEL_ARCH.GPTNEOX: "gptneox",
|
||||
MODEL_ARCH.MPT: "mpt",
|
||||
MODEL_ARCH.STARCODER: "starcoder",
|
||||
}
|
||||
|
||||
MODEL_TENSOR_NAMES: dict[MODEL_ARCH, dict[MODEL_TENSOR, str]] = {
|
||||
@@ -153,6 +157,34 @@ MODEL_TENSOR_NAMES: dict[MODEL_ARCH, dict[MODEL_TENSOR, str]] = {
|
||||
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
|
||||
},
|
||||
MODEL_ARCH.BAICHUAN: {
|
||||
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
|
||||
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
|
||||
MODEL_TENSOR.OUTPUT: "output",
|
||||
MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
|
||||
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
|
||||
MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
|
||||
MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
|
||||
MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
|
||||
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
|
||||
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
|
||||
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
|
||||
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
|
||||
},
|
||||
MODEL_ARCH.STARCODER: {
|
||||
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
|
||||
MODEL_TENSOR.POS_EMBD: "position_embd",
|
||||
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
|
||||
MODEL_TENSOR.OUTPUT: "output",
|
||||
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
|
||||
MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
|
||||
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
|
||||
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
|
||||
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
|
||||
},
|
||||
MODEL_ARCH.GPT2: {
|
||||
# TODO
|
||||
},
|
||||
@@ -165,6 +197,10 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.ROPE_FREQS,
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||
],
|
||||
MODEL_ARCH.BAICHUAN: [
|
||||
MODEL_TENSOR.ROPE_FREQS,
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
@@ -187,7 +223,7 @@ class TensorNameMap:
|
||||
# Output
|
||||
MODEL_TENSOR.OUTPUT: (
|
||||
"embed_out", # gptneox
|
||||
"lm_head", # gpt2 mpt falcon llama-hf
|
||||
"lm_head", # gpt2 mpt falcon llama-hf baichuan
|
||||
"output", # llama-pth
|
||||
),
|
||||
|
||||
@@ -195,7 +231,7 @@ class TensorNameMap:
|
||||
MODEL_TENSOR.OUTPUT_NORM: (
|
||||
"gpt_neox.final_layer_norm", # gptneox
|
||||
"transformer.ln_f", # gpt2 falcon
|
||||
"model.norm", # llama-hf
|
||||
"model.norm", # llama-hf baichuan
|
||||
"norm", # llama-pth
|
||||
),
|
||||
|
||||
@@ -311,6 +347,7 @@ class TensorNameMap:
|
||||
tensor_name = tensor_names.get(tensor)
|
||||
if tensor_name is None:
|
||||
continue
|
||||
mapping[tensor_name] = (tensor, tensor_name)
|
||||
for key in keys:
|
||||
mapping[key] = (tensor, tensor_name)
|
||||
for bid in range(n_blocks):
|
||||
@@ -319,11 +356,12 @@ class TensorNameMap:
|
||||
if tensor_name is None:
|
||||
continue
|
||||
tensor_name = tensor_name.format(bid = bid)
|
||||
mapping[tensor_name] = (tensor, tensor_name)
|
||||
for key in keys:
|
||||
key = key.format(bid = bid)
|
||||
mapping[key] = (tensor, tensor_name)
|
||||
|
||||
def get_type_and_name(self, key: str, try_suffixes: Sequence[str]) -> tuple[MODEL_TENSOR, str] | None:
|
||||
def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None:
|
||||
result = self.mapping.get(key)
|
||||
if result is not None:
|
||||
return result
|
||||
@@ -334,13 +372,13 @@ class TensorNameMap:
|
||||
return (result[0], result[1] + suffix)
|
||||
return None
|
||||
|
||||
def get_name(self, key: str, try_suffixes: Sequence[str]) -> str | None:
|
||||
def get_name(self, key: str, try_suffixes: Sequence[str] = ()) -> str | None:
|
||||
result = self.get_type_and_name(key, try_suffixes = try_suffixes)
|
||||
if result is None:
|
||||
return None
|
||||
return result[1]
|
||||
|
||||
def get_type(self, key: str, try_suffixes: Sequence[str]) -> MODEL_TENSOR | None:
|
||||
def get_type(self, key: str, try_suffixes: Sequence[str] = ()) -> MODEL_TENSOR | None:
|
||||
result = self.get_type_and_name(key, try_suffixes = try_suffixes)
|
||||
if result is None:
|
||||
return None
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[tool.poetry]
|
||||
name = "gguf"
|
||||
version = "0.3.2"
|
||||
version = "0.3.3"
|
||||
description = "Write ML models in GGUF for GGML"
|
||||
authors = ["GGML <ggml@ggml.ai>"]
|
||||
packages = [
|
||||
|
||||
348
llama.h
348
llama.h
@@ -37,6 +37,8 @@
|
||||
|
||||
#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
|
||||
|
||||
#define LLAMA_MAX_RNG_STATE (64*1024)
|
||||
|
||||
#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
|
||||
|
||||
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
|
||||
@@ -60,13 +62,9 @@ extern "C" {
|
||||
struct llama_model;
|
||||
struct llama_context;
|
||||
|
||||
typedef int llama_token;
|
||||
|
||||
enum llama_log_level {
|
||||
LLAMA_LOG_LEVEL_ERROR = 2,
|
||||
LLAMA_LOG_LEVEL_WARN = 3,
|
||||
LLAMA_LOG_LEVEL_INFO = 4
|
||||
};
|
||||
typedef int32_t llama_pos;
|
||||
typedef int32_t llama_token;
|
||||
typedef int32_t llama_seq_id;
|
||||
|
||||
enum llama_vocab_type {
|
||||
LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
|
||||
@@ -86,24 +84,24 @@ extern "C" {
|
||||
// model file types
|
||||
enum llama_ftype {
|
||||
LLAMA_FTYPE_ALL_F32 = 0,
|
||||
LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
|
||||
// LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
|
||||
// LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
|
||||
LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q2_K = 10,// except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q3_K_S = 11,// except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q3_K_M = 12,// except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q3_K_L = 13,// except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_K_S = 14,// except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_K_M = 15,// except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,// except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
|
||||
// LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
|
||||
// LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
|
||||
LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q3_K_S = 11, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q3_K_M = 12, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q3_K_L = 13, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_K_S = 14, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_K_M = 15, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q6_K = 18, // except 1d tensors
|
||||
|
||||
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
|
||||
};
|
||||
@@ -122,6 +120,35 @@ extern "C" {
|
||||
|
||||
typedef void (*llama_progress_callback)(float progress, void *ctx);
|
||||
|
||||
// Input data for llama_decode
|
||||
// A llama_batch object can contain input about one or many sequences
|
||||
// The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
|
||||
//
|
||||
// - token : the token ids of the input (used when embd is NULL)
|
||||
// - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
|
||||
// - pos : the positions of the respective token in the sequence
|
||||
// - seq_id : the sequence to which the respective token belongs
|
||||
// - logits : if zero, the logits for the respective token will not be output
|
||||
//
|
||||
typedef struct llama_batch {
|
||||
int32_t n_tokens;
|
||||
|
||||
llama_token * token;
|
||||
float * embd;
|
||||
llama_pos * pos;
|
||||
llama_seq_id * seq_id;
|
||||
int8_t * logits;
|
||||
|
||||
// NOTE: helpers for smooth API transition - can be deprecated in the future
|
||||
// for future-proof code, use the above fields instead and ignore everything below
|
||||
//
|
||||
// pos[i] = all_pos_0 + i*all_pos_1
|
||||
//
|
||||
llama_pos all_pos_0; // used if pos == NULL
|
||||
llama_pos all_pos_1; // used if pos == NULL
|
||||
llama_seq_id all_seq_id; // used if seq_id == NULL
|
||||
} llama_batch;
|
||||
|
||||
struct llama_context_params {
|
||||
uint32_t seed; // RNG seed, -1 for random
|
||||
int32_t n_ctx; // text context
|
||||
@@ -151,13 +178,6 @@ extern "C" {
|
||||
bool embedding; // embedding mode only
|
||||
};
|
||||
|
||||
// Signature for logging events
|
||||
// Note that text includes the new line character at the end for most events.
|
||||
// If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
|
||||
// if it exists.
|
||||
// It might not exist for progress report where '.' is output repeatedly.
|
||||
typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);
|
||||
|
||||
// model quantization parameters
|
||||
typedef struct llama_model_quantize_params {
|
||||
int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
|
||||
@@ -215,6 +235,7 @@ extern "C" {
|
||||
int32_t n_eval;
|
||||
};
|
||||
|
||||
// Helpers for getting default parameters
|
||||
LLAMA_API struct llama_context_params llama_context_default_params(void);
|
||||
LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);
|
||||
|
||||
@@ -259,8 +280,10 @@ extern "C" {
|
||||
|
||||
// Get a string describing the model type
|
||||
LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
|
||||
|
||||
// Returns the total size of all the tensors in the model in bytes
|
||||
LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
|
||||
|
||||
// Returns the total number of parameters in the model
|
||||
LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
|
||||
|
||||
@@ -281,7 +304,7 @@ extern "C" {
|
||||
const char * path_lora,
|
||||
const char * path_base_model,
|
||||
int n_threads),
|
||||
"please use llama_model_apply_lora_from_file instead");
|
||||
"use llama_model_apply_lora_from_file instead");
|
||||
|
||||
LLAMA_API int llama_model_apply_lora_from_file(
|
||||
const struct llama_model * model,
|
||||
@@ -289,11 +312,53 @@ extern "C" {
|
||||
const char * path_base_model,
|
||||
int n_threads);
|
||||
|
||||
// Returns the number of tokens in the KV cache
|
||||
LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
|
||||
//
|
||||
// KV cache
|
||||
//
|
||||
|
||||
// Sets the current rng seed.
|
||||
LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
|
||||
// Returns the number of tokens in the KV cache
|
||||
LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
|
||||
"avoid using this, it will be removed in the future, instead - count the tokens in user code");
|
||||
|
||||
// Remove all tokens data of cells in [c0, c1)
|
||||
LLAMA_API void llama_kv_cache_tokens_rm(
|
||||
struct llama_context * ctx,
|
||||
int32_t c0,
|
||||
int32_t c1);
|
||||
|
||||
// Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
|
||||
LLAMA_API void llama_kv_cache_seq_rm(
|
||||
struct llama_context * ctx,
|
||||
llama_seq_id seq_id,
|
||||
llama_pos p0,
|
||||
llama_pos p1);
|
||||
|
||||
// Copy all tokens that belong to the specified sequence to another sequence
|
||||
// Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
|
||||
LLAMA_API void llama_kv_cache_seq_cp(
|
||||
struct llama_context * ctx,
|
||||
llama_seq_id seq_id_src,
|
||||
llama_seq_id seq_id_dst,
|
||||
llama_pos p0,
|
||||
llama_pos p1);
|
||||
|
||||
// Removes all tokens that do not belong to the specified sequence
|
||||
LLAMA_API void llama_kv_cache_seq_keep(
|
||||
struct llama_context * ctx,
|
||||
llama_seq_id seq_id);
|
||||
|
||||
// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
|
||||
// If the KV cache is RoPEd, the KV data is updated accordingly
|
||||
LLAMA_API void llama_kv_cache_seq_shift(
|
||||
struct llama_context * ctx,
|
||||
llama_seq_id seq_id,
|
||||
llama_pos p0,
|
||||
llama_pos p1,
|
||||
llama_pos delta);
|
||||
|
||||
//
|
||||
// State / sessions
|
||||
//
|
||||
|
||||
// Returns the maximum size in bytes of the state (rng, logits, embedding
|
||||
// and kv_cache) - will often be smaller after compacting tokens
|
||||
@@ -302,48 +367,100 @@ extern "C" {
|
||||
// Copies the state to the specified destination address.
|
||||
// Destination needs to have allocated enough memory.
|
||||
// Returns the number of bytes copied
|
||||
LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst);
|
||||
LLAMA_API size_t llama_copy_state_data(
|
||||
struct llama_context * ctx,
|
||||
uint8_t * dst);
|
||||
|
||||
// Set the state reading from the specified address
|
||||
// Returns the number of bytes read
|
||||
LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src);
|
||||
LLAMA_API size_t llama_set_state_data(
|
||||
struct llama_context * ctx,
|
||||
uint8_t * src);
|
||||
|
||||
// Save/load session file
|
||||
LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
|
||||
LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
|
||||
LLAMA_API bool llama_load_session_file(
|
||||
struct llama_context * ctx,
|
||||
const char * path_session,
|
||||
llama_token * tokens_out,
|
||||
size_t n_token_capacity,
|
||||
size_t * n_token_count_out);
|
||||
|
||||
// Run the llama inference to obtain the logits and probabilities for the next token.
|
||||
LLAMA_API bool llama_save_session_file(
|
||||
struct llama_context * ctx,
|
||||
const char * path_session,
|
||||
const llama_token * tokens,
|
||||
size_t n_token_count);
|
||||
|
||||
//
|
||||
// Decoding
|
||||
//
|
||||
|
||||
// Run the llama inference to obtain the logits and probabilities for the next token(s).
|
||||
// tokens + n_tokens is the provided batch of new tokens to process
|
||||
// n_past is the number of tokens to use from previous eval calls
|
||||
// Returns 0 on success
|
||||
LLAMA_API int llama_eval(
|
||||
// DEPRECATED: use llama_decode() instead
|
||||
LLAMA_API DEPRECATED(int llama_eval(
|
||||
struct llama_context * ctx,
|
||||
const llama_token * tokens,
|
||||
int n_tokens,
|
||||
llama_token * tokens,
|
||||
int32_t n_tokens,
|
||||
int n_past,
|
||||
int n_threads);
|
||||
int n_threads),
|
||||
"use llama_decode() instead");
|
||||
|
||||
// Same as llama_eval, but use float matrix input directly.
|
||||
LLAMA_API int llama_eval_embd(
|
||||
// DEPRECATED: use llama_decode() instead
|
||||
LLAMA_API DEPRECATED(int llama_eval_embd(
|
||||
struct llama_context * ctx,
|
||||
const float * embd,
|
||||
int n_tokens,
|
||||
float * embd,
|
||||
int32_t n_tokens,
|
||||
int n_past,
|
||||
int n_threads);
|
||||
int n_threads),
|
||||
"use llama_decode() instead");
|
||||
|
||||
// Export a static computation graph for context of 511 and batch size of 1
|
||||
// NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these
|
||||
// parameters here to keep things simple
|
||||
// IMPORTANT: do not use for anything else other than debugging and testing!
|
||||
LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname);
|
||||
// Return batch for single sequence of tokens starting at pos_0
|
||||
//
|
||||
// NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
|
||||
//
|
||||
LLAMA_API struct llama_batch llama_batch_get_one(
|
||||
llama_token * tokens,
|
||||
int32_t n_tokens,
|
||||
llama_pos pos_0,
|
||||
llama_seq_id seq_id);
|
||||
|
||||
// Allocates a batch of tokens on the heap
|
||||
// The batch has to be freed with llama_batch_free()
|
||||
// If embd != 0, llama_batch.embd will be allocated with size of n_tokens * embd * sizeof(float)
|
||||
// Otherwise, llama_batch.token will be allocated to store n_tokens llama_token
|
||||
// The rest of the llama_batch members are allocated with size n_tokens
|
||||
// All members are left uninitialized
|
||||
LLAMA_API struct llama_batch llama_batch_init(
|
||||
int32_t n_tokens,
|
||||
int32_t embd);
|
||||
|
||||
// Frees a batch of tokens allocated with llama_batch_init()
|
||||
LLAMA_API void llama_batch_free(struct llama_batch batch);
|
||||
|
||||
// Positive return values does not mean a fatal error, but rather a warning.
|
||||
// 0 - success
|
||||
// 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
|
||||
// < 0 - error
|
||||
LLAMA_API int llama_decode(
|
||||
struct llama_context * ctx,
|
||||
struct llama_batch batch,
|
||||
int n_threads);
|
||||
|
||||
// Token logits obtained from the last call to llama_eval()
|
||||
// The logits for the last token are stored in the last row
|
||||
// Can be mutated in order to change the probabilities of the next token
|
||||
// Rows: n_tokens
|
||||
// Logits for which llama_batch.logits[i] == 0 are undefined
|
||||
// Rows: n_tokens provided with llama_batch
|
||||
// Cols: n_vocab
|
||||
LLAMA_API float * llama_get_logits(struct llama_context * ctx);
|
||||
|
||||
// Logits for the ith token. Equivalent to:
|
||||
// llama_get_logits(ctx) + i*n_vocab
|
||||
LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
|
||||
|
||||
// Get the embeddings for the input
|
||||
// shape: [n_embd] (1-dimensional)
|
||||
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
|
||||
@@ -374,6 +491,7 @@ extern "C" {
|
||||
LLAMA_API int llama_tokenize(
|
||||
struct llama_context * ctx,
|
||||
const char * text,
|
||||
int text_len,
|
||||
llama_token * tokens,
|
||||
int n_max_tokens,
|
||||
bool add_bos);
|
||||
@@ -381,6 +499,7 @@ extern "C" {
|
||||
LLAMA_API int llama_tokenize_with_model(
|
||||
const struct llama_model * model,
|
||||
const char * text,
|
||||
int text_len,
|
||||
llama_token * tokens,
|
||||
int n_max_tokens,
|
||||
bool add_bos);
|
||||
@@ -418,11 +537,25 @@ extern "C" {
|
||||
// Sampling functions
|
||||
//
|
||||
|
||||
// Sets the current rng seed.
|
||||
LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
|
||||
|
||||
/// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
|
||||
LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
|
||||
LLAMA_API void llama_sample_repetition_penalty(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates,
|
||||
const llama_token * last_tokens,
|
||||
size_t last_tokens_size,
|
||||
float penalty);
|
||||
|
||||
/// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
|
||||
LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
|
||||
LLAMA_API void llama_sample_frequency_and_presence_penalties(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates,
|
||||
const llama_token * last_tokens,
|
||||
size_t last_tokens_size,
|
||||
float alpha_frequency,
|
||||
float alpha_presence);
|
||||
|
||||
/// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
|
||||
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
|
||||
@@ -435,23 +568,54 @@ extern "C" {
|
||||
float scale);
|
||||
|
||||
/// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
|
||||
LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
|
||||
LLAMA_API void llama_sample_softmax(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates);
|
||||
|
||||
/// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
|
||||
LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep);
|
||||
LLAMA_API void llama_sample_top_k(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates,
|
||||
int k,
|
||||
size_t min_keep);
|
||||
|
||||
/// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
|
||||
LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
|
||||
LLAMA_API void llama_sample_top_p(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates,
|
||||
float p,
|
||||
size_t min_keep);
|
||||
|
||||
/// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
|
||||
LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep);
|
||||
LLAMA_API void llama_sample_tail_free(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates,
|
||||
float z,
|
||||
size_t min_keep);
|
||||
|
||||
/// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
|
||||
LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
|
||||
LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
|
||||
LLAMA_API void llama_sample_typical(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates,
|
||||
float p,
|
||||
size_t min_keep);
|
||||
|
||||
LLAMA_API void llama_sample_temp(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates,
|
||||
float temp);
|
||||
|
||||
LLAMA_API DEPRECATED(void llama_sample_temperature(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates,
|
||||
float temp),
|
||||
"use llama_sample_temp instead");
|
||||
|
||||
/// @details Apply constraints from grammar
|
||||
LLAMA_API void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar);
|
||||
LLAMA_API void llama_sample_grammar(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates,
|
||||
const struct llama_grammar * grammar);
|
||||
|
||||
/// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
|
||||
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
|
||||
@@ -459,23 +623,41 @@ extern "C" {
|
||||
/// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
|
||||
/// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
|
||||
/// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
|
||||
LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu);
|
||||
LLAMA_API llama_token llama_sample_token_mirostat(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates,
|
||||
float tau,
|
||||
float eta,
|
||||
int m,
|
||||
float * mu);
|
||||
|
||||
/// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
|
||||
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
|
||||
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
|
||||
/// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
|
||||
/// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
|
||||
LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu);
|
||||
LLAMA_API llama_token llama_sample_token_mirostat_v2(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates,
|
||||
float tau,
|
||||
float eta,
|
||||
float * mu);
|
||||
|
||||
/// @details Selects the token with the highest probability.
|
||||
LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates);
|
||||
LLAMA_API llama_token llama_sample_token_greedy(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates);
|
||||
|
||||
/// @details Randomly selects a token from the candidates based on their probabilities.
|
||||
LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
|
||||
LLAMA_API llama_token llama_sample_token(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates);
|
||||
|
||||
/// @details Accepts the sampled token into the grammar
|
||||
LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
|
||||
LLAMA_API void llama_grammar_accept_token(
|
||||
struct llama_context * ctx,
|
||||
struct llama_grammar * grammar,
|
||||
llama_token token);
|
||||
|
||||
//
|
||||
// Beam search
|
||||
@@ -483,9 +665,10 @@ extern "C" {
|
||||
|
||||
struct llama_beam_view {
|
||||
const llama_token * tokens;
|
||||
|
||||
size_t n_tokens;
|
||||
float p; // Cumulative beam probability (renormalized relative to all beams)
|
||||
bool eob; // Callback should set this to true when a beam is at end-of-beam.
|
||||
float p; // Cumulative beam probability (renormalized relative to all beams)
|
||||
bool eob; // Callback should set this to true when a beam is at end-of-beam.
|
||||
};
|
||||
|
||||
// Passed to beam_search_callback function.
|
||||
@@ -494,9 +677,10 @@ extern "C" {
|
||||
// These pointers are valid only during the synchronous callback, so should not be saved.
|
||||
struct llama_beams_state {
|
||||
struct llama_beam_view * beam_views;
|
||||
|
||||
size_t n_beams; // Number of elements in beam_views[].
|
||||
size_t common_prefix_length; // Current max length of prefix tokens shared by all beams.
|
||||
bool last_call; // True iff this is the last callback invocation.
|
||||
bool last_call; // True iff this is the last callback invocation.
|
||||
};
|
||||
|
||||
// Type of pointer to the beam_search_callback function.
|
||||
@@ -512,10 +696,18 @@ extern "C" {
|
||||
/// @param n_past Number of tokens already evaluated.
|
||||
/// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
|
||||
/// @param n_threads Number of threads as passed to llama_eval().
|
||||
LLAMA_API void llama_beam_search(struct llama_context * ctx, llama_beam_search_callback_fn_t callback, void * callback_data, size_t n_beams, int n_past, int n_predict, int n_threads);
|
||||
LLAMA_API void llama_beam_search(
|
||||
struct llama_context * ctx,
|
||||
llama_beam_search_callback_fn_t callback,
|
||||
void * callback_data,
|
||||
size_t n_beams,
|
||||
int n_past,
|
||||
int n_predict,
|
||||
int n_threads);
|
||||
|
||||
// Performance information
|
||||
LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
|
||||
|
||||
LLAMA_API void llama_print_timings(struct llama_context * ctx);
|
||||
LLAMA_API void llama_reset_timings(struct llama_context * ctx);
|
||||
|
||||
@@ -524,7 +716,7 @@ extern "C" {
|
||||
|
||||
// Set callback for all future logging events.
|
||||
// If this is not called, or NULL is supplied, everything is output on stderr.
|
||||
LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
|
||||
LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
|
||||
|
||||
LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
|
||||
|
||||
@@ -540,7 +732,9 @@ extern "C" {
|
||||
|
||||
struct ggml_tensor;
|
||||
|
||||
const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
|
||||
const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
|
||||
struct llama_context * ctx
|
||||
);
|
||||
|
||||
#endif // LLAMA_API_INTERNAL
|
||||
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
|
||||
constexpr int kVecSize = 1 << 18;
|
||||
|
||||
float drawFromGaussianPdf(std::mt19937& rndm) {
|
||||
static float drawFromGaussianPdf(std::mt19937& rndm) {
|
||||
constexpr double kScale = 1./(1. + std::mt19937::max());
|
||||
constexpr double kTwoPiTimesScale = 6.28318530717958647692*kScale;
|
||||
static float lastX;
|
||||
@@ -28,7 +28,8 @@ float drawFromGaussianPdf(std::mt19937& rndm) {
|
||||
haveX = true;
|
||||
return r*cos(phi);
|
||||
}
|
||||
void fillRandomGaussianFloats(std::vector<float>& values, std::mt19937& rndm, float mean = 0) {
|
||||
|
||||
static void fillRandomGaussianFloats(std::vector<float>& values, std::mt19937& rndm, float mean = 0) {
|
||||
for (auto& v : values) v = mean + drawFromGaussianPdf(rndm);
|
||||
}
|
||||
|
||||
|
||||
4
prompts/chat-with-baichuan.txt
Normal file
4
prompts/chat-with-baichuan.txt
Normal file
@@ -0,0 +1,4 @@
|
||||
以下内容为人类用户与与一位智能助手的对话。
|
||||
|
||||
用户:你好!
|
||||
助手:
|
||||
@@ -13,7 +13,7 @@ CLI_ARGS_MAIN_PERPLEXITY = [
|
||||
"hellaswag-tasks", "ignore-eos", "in-prefix", "in-prefix-bos", "in-suffix", "instruct",
|
||||
"interactive", "interactive-first", "keep", "logdir", "logit-bias", "lora", "lora-base",
|
||||
"low-vram", "main-gpu", "memory-f32", "mirostat", "mirostat-ent", "mirostat-lr", "mlock",
|
||||
"model", "mtest", "multiline-input", "n-gpu-layers", "n-predict", "no-mmap", "no-mul-mat-q",
|
||||
"model", "multiline-input", "n-gpu-layers", "n-predict", "no-mmap", "no-mul-mat-q",
|
||||
"np-penalize-nl", "numa", "ppl-output-type", "ppl-stride", "presence-penalty", "prompt",
|
||||
"prompt-cache", "prompt-cache-all", "prompt-cache-ro", "random-prompt", "repeat-last-n",
|
||||
"repeat-penalty", "reverse-prompt", "rope-freq-base", "rope-freq-scale", "rope-scale", "seed",
|
||||
|
||||
69
scripts/LlamaConfig.cmake.in
Normal file
69
scripts/LlamaConfig.cmake.in
Normal file
@@ -0,0 +1,69 @@
|
||||
set(LLAMA_VERSION @LLAMA_INSTALL_VERSION@)
|
||||
set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
|
||||
set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
|
||||
set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
|
||||
set(LLAMA_BLAS @LLAMA_BLAS@)
|
||||
set(LLAMA_CUBLAS @LLAMA_CUBLAS@)
|
||||
set(LLAMA_METAL @LLAMA_METAL@)
|
||||
set(LLAMA_MPI @LLAMA_MPI@)
|
||||
set(LLAMA_CLBLAST @LLAMA_CLBLAST@)
|
||||
set(LLAMA_HIPBLAS @LLAMA_HIPBLAS@)
|
||||
set(LLAMA_ACCELERATE @LLAMA_ACCELERATE@)
|
||||
|
||||
@PACKAGE_INIT@
|
||||
|
||||
set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
|
||||
set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
|
||||
set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")
|
||||
|
||||
# Ensure transient dependencies satisfied
|
||||
|
||||
find_package(Threads REQUIRED)
|
||||
if (APPLE AND LLAMA_ACCELERATE)
|
||||
find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
|
||||
endif()
|
||||
|
||||
if (LLAMA_BLAS)
|
||||
find_package(BLAS REQUIRED)
|
||||
endif()
|
||||
|
||||
if (LLAMA_CUBLAS)
|
||||
find_package(CUDAToolkit REQUIRED)
|
||||
endif()
|
||||
|
||||
if (LLAMA_METAL)
|
||||
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
|
||||
find_library(METAL_FRAMEWORK Metal REQUIRED)
|
||||
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
|
||||
endif()
|
||||
|
||||
if (LLAMA_MPI)
|
||||
find_package(MPI REQUIRED)
|
||||
endif()
|
||||
|
||||
if (LLAMA_CLBLAST)
|
||||
find_package(CLBlast REQUIRED)
|
||||
endif()
|
||||
|
||||
if (LLAMA_HIPBLAS)
|
||||
find_package(hip REQUIRED)
|
||||
find_package(hipblas REQUIRED)
|
||||
find_package(rocblas REQUIRED)
|
||||
endif()
|
||||
|
||||
find_library(llama_LIBRARY llama
|
||||
REQUIRED
|
||||
HINTS ${LLAMA_LIB_DIR})
|
||||
|
||||
set(_llama_link_deps "Threads::Threads" "@LLAMA_EXTRA_LIBS@")
|
||||
add_library(llama UNKNOWN IMPORTED)
|
||||
set_target_properties(llama
|
||||
PROPERTIES
|
||||
INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
|
||||
INTERFACE_LINK_LIBRARIES "${_llama_link_deps}"
|
||||
IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
|
||||
IMPORTED_LOCATION "${llama_LIBRARY}"
|
||||
INTERFACE_COMPILE_FEATURES cxx_std_11
|
||||
POSITION_INDEPENDENT_CODE ON )
|
||||
|
||||
check_required_components(Llama)
|
||||
@@ -2,20 +2,18 @@ set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.h.in")
|
||||
set(HEADER_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h")
|
||||
set(BUILD_NUMBER 0)
|
||||
set(BUILD_COMMIT "unknown")
|
||||
set(BUILD_COMPILER "unknown")
|
||||
set(BUILD_TARGET "unknown")
|
||||
|
||||
# Look for git
|
||||
find_package(Git)
|
||||
if(NOT Git_FOUND)
|
||||
execute_process(
|
||||
COMMAND which git
|
||||
OUTPUT_VARIABLE GIT_EXECUTABLE
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE
|
||||
)
|
||||
if(NOT GIT_EXECUTABLE STREQUAL "")
|
||||
find_program(GIT_EXECUTABLE NAMES git git.exe)
|
||||
if(GIT_EXECUTABLE)
|
||||
set(Git_FOUND TRUE)
|
||||
message(STATUS "Found Git using 'which': ${GIT_EXECUTABLE}")
|
||||
message(STATUS "Found Git: ${GIT_EXECUTABLE}")
|
||||
else()
|
||||
message(WARNING "Git not found using 'find_package' or 'which'. Build info will not be accurate. Consider installing Git or ensuring it is in the PATH.")
|
||||
message(WARNING "Git not found. Build info will not be accurate.")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
@@ -26,26 +24,49 @@ if(Git_FOUND)
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
|
||||
OUTPUT_VARIABLE HEAD
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE
|
||||
RESULT_VARIABLE GIT_HEAD_RESULT
|
||||
)
|
||||
execute_process(
|
||||
COMMAND ${GIT_EXECUTABLE} rev-list --count HEAD
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
|
||||
OUTPUT_VARIABLE COUNT
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE
|
||||
RESULT_VARIABLE GIT_COUNT_RESULT
|
||||
)
|
||||
if(GIT_HEAD_RESULT EQUAL 0 AND GIT_COUNT_RESULT EQUAL 0)
|
||||
set(BUILD_COMMIT ${HEAD})
|
||||
set(BUILD_NUMBER ${COUNT})
|
||||
endif()
|
||||
set(BUILD_COMMIT ${HEAD})
|
||||
set(BUILD_NUMBER ${COUNT})
|
||||
endif()
|
||||
|
||||
if(MSVC)
|
||||
set(BUILD_COMPILER "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
|
||||
set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
|
||||
else()
|
||||
execute_process(
|
||||
COMMAND sh -c "$@ --version | head -1" _ ${CMAKE_C_COMPILER}
|
||||
OUTPUT_VARIABLE OUT
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE
|
||||
)
|
||||
set(BUILD_COMPILER ${OUT})
|
||||
execute_process(
|
||||
COMMAND ${CMAKE_C_COMPILER} -dumpmachine
|
||||
OUTPUT_VARIABLE OUT
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE
|
||||
)
|
||||
set(BUILD_TARGET ${OUT})
|
||||
endif()
|
||||
|
||||
# Only write the header if it's changed to prevent unnecessary recompilation
|
||||
if(EXISTS ${HEADER_FILE})
|
||||
file(STRINGS ${HEADER_FILE} CONTENTS REGEX "BUILD_COMMIT \"([^\"]*)\"")
|
||||
list(GET CONTENTS 0 EXISTING)
|
||||
if(NOT EXISTING STREQUAL "#define BUILD_COMMIT \"${BUILD_COMMIT}\"")
|
||||
file(READ ${HEADER_FILE} CONTENTS)
|
||||
string(REGEX MATCH "BUILD_COMMIT \"([^\"]*)\"" _ ${CONTENTS})
|
||||
set(OLD_COMMIT ${CMAKE_MATCH_1})
|
||||
string(REGEX MATCH "BUILD_COMPILER \"([^\"]*)\"" _ ${CONTENTS})
|
||||
set(OLD_COMPILER ${CMAKE_MATCH_1})
|
||||
string(REGEX MATCH "BUILD_TARGET \"([^\"]*)\"" _ ${CONTENTS})
|
||||
set(OLD_TARGET ${CMAKE_MATCH_1})
|
||||
if (
|
||||
NOT OLD_COMMIT STREQUAL BUILD_COMMIT OR
|
||||
NOT OLD_COMPILER STREQUAL BUILD_COMPILER OR
|
||||
NOT OLD_TARGET STREQUAL BUILD_TARGET
|
||||
)
|
||||
configure_file(${TEMPLATE_FILE} ${HEADER_FILE})
|
||||
endif()
|
||||
else()
|
||||
|
||||
@@ -3,5 +3,7 @@
|
||||
|
||||
#define BUILD_NUMBER @BUILD_NUMBER@
|
||||
#define BUILD_COMMIT "@BUILD_COMMIT@"
|
||||
#define BUILD_COMPILER "@BUILD_COMPILER@"
|
||||
#define BUILD_TARGET "@BUILD_TARGET@"
|
||||
|
||||
#endif // BUILD_INFO_H
|
||||
|
||||
@@ -1,23 +1,35 @@
|
||||
#!/bin/sh
|
||||
|
||||
BUILD_NUMBER="0"
|
||||
BUILD_COMMIT="unknown"
|
||||
CC=$1
|
||||
|
||||
REV_LIST=$(git rev-list --count HEAD)
|
||||
if [ $? -eq 0 ]; then
|
||||
BUILD_NUMBER=$REV_LIST
|
||||
build_number="0"
|
||||
build_commit="unknown"
|
||||
build_compiler="unknown"
|
||||
build_target="unknown"
|
||||
|
||||
if out=$(git rev-list --count HEAD); then
|
||||
# git is broken on WSL so we need to strip extra newlines
|
||||
build_number=$(printf '%s' "$out" | tr -d '\n')
|
||||
fi
|
||||
|
||||
REV_PARSE=$(git rev-parse --short HEAD)
|
||||
if [ $? -eq 0 ]; then
|
||||
BUILD_COMMIT=$REV_PARSE
|
||||
if out=$(git rev-parse --short HEAD); then
|
||||
build_commit=$(printf '%s' "$out" | tr -d '\n')
|
||||
fi
|
||||
|
||||
if out=$($CC --version | head -1); then
|
||||
build_compiler=$out
|
||||
fi
|
||||
|
||||
if out=$($CC -dumpmachine); then
|
||||
build_target=$out
|
||||
fi
|
||||
|
||||
echo "#ifndef BUILD_INFO_H"
|
||||
echo "#define BUILD_INFO_H"
|
||||
echo ""
|
||||
echo "#define BUILD_NUMBER $BUILD_NUMBER" | tr -d '\n'
|
||||
echo ""
|
||||
echo "#define BUILD_COMMIT \"$BUILD_COMMIT\"" | tr -d '\n'
|
||||
echo ""
|
||||
echo
|
||||
echo "#define BUILD_NUMBER $build_number"
|
||||
echo "#define BUILD_COMMIT \"$build_commit\""
|
||||
echo "#define BUILD_COMPILER \"$build_compiler\""
|
||||
echo "#define BUILD_TARGET \"$build_target\""
|
||||
echo
|
||||
echo "#endif // BUILD_INFO_H"
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/bin/env python3
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import os
|
||||
import hashlib
|
||||
|
||||
@@ -29,15 +29,16 @@ llama_build_executable(test-tokenizer-0-llama.cpp)
|
||||
llama_test_executable (test-tokenizer-0-llama test-tokenizer-0-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
|
||||
llama_build_executable(test-tokenizer-0-falcon.cpp)
|
||||
#llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
||||
llama_build_executable(test-tokenizer-1.cpp)
|
||||
# test-tokenizer-1 requires a BPE vocab. re-enable when we have one.
|
||||
#llama_test_executable (test-tokenizer-1.llama test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
||||
llama_build_executable(test-tokenizer-1-llama.cpp)
|
||||
llama_test_executable (test-tokenizer-1-llama test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
|
||||
#llama_test_executable(test-tokenizer-1.aquila test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
|
||||
llama_build_and_test_executable(test-grammar-parser.cpp)
|
||||
llama_build_and_test_executable(test-llama-grammar.cpp)
|
||||
llama_build_and_test_executable(test-grad0.cpp) # SLOW
|
||||
# llama_build_and_test_executable(test-opt.cpp) # SLOW
|
||||
|
||||
llama_build_and_test_executable(test-rope.cpp)
|
||||
|
||||
# dummy executable - not installed
|
||||
get_filename_component(TEST_TARGET test-c.c NAME_WE)
|
||||
add_executable(${TEST_TARGET} test-c.c)
|
||||
|
||||
@@ -1404,6 +1404,11 @@ int main(int argc, const char ** argv) {
|
||||
for (int n_past = 1; n_past < ne2[2]; ++n_past) {
|
||||
x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
|
||||
|
||||
struct ggml_tensor * p = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne2[2]);
|
||||
for (int i = 0; i < ne2[2]; ++i) {
|
||||
((int32_t *) p->data)[i] = n_past + i;
|
||||
}
|
||||
|
||||
ggml_set_param(ctx0, x[0]);
|
||||
|
||||
const bool skip_past = (mode & 1);
|
||||
@@ -1415,7 +1420,7 @@ int main(int argc, const char ** argv) {
|
||||
continue;
|
||||
}
|
||||
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], n_past, n_rot, mode, 0));
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], p, n_rot, mode, 0));
|
||||
|
||||
GGML_PRINT_DEBUG("rope f32: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
|
||||
check_gradient("rope f32", ctx0, x, f, ndims, nargs, 1e-2f, 1e-3f, INFINITY);
|
||||
@@ -1438,6 +1443,11 @@ int main(int argc, const char ** argv) {
|
||||
for (int n_past = 1; n_past < ne2[2]; ++n_past) {
|
||||
x[0] = get_random_tensor_f16(ctx0, ndims, ne2, -1.0f, 1.0f);
|
||||
|
||||
struct ggml_tensor * p = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne2[2]);
|
||||
for (int i = 0; i < ne2[2]; ++i) {
|
||||
((int32_t *) p->data)[i] = n_past + i;
|
||||
}
|
||||
|
||||
ggml_set_param(ctx0, x[0]);
|
||||
|
||||
const bool skip_past = (mode & 1);
|
||||
@@ -1449,7 +1459,7 @@ int main(int argc, const char ** argv) {
|
||||
continue;
|
||||
}
|
||||
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], n_past, n_rot, mode, 0));
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], p, n_rot, mode, 0));
|
||||
|
||||
GGML_PRINT_DEBUG("rope f16: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
|
||||
check_gradient("rope f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY);
|
||||
|
||||
@@ -36,15 +36,15 @@
|
||||
#define GGML_PRINT(...) printf(__VA_ARGS__)
|
||||
|
||||
|
||||
float frand(void) {
|
||||
static float frand(void) {
|
||||
return (float)rand()/(float)RAND_MAX;
|
||||
}
|
||||
|
||||
int irand(int n) {
|
||||
static int irand(int n) {
|
||||
return rand()%n;
|
||||
}
|
||||
|
||||
void get_random_dims(int64_t * dims, int ndims) {
|
||||
static void get_random_dims(int64_t * dims, int ndims) {
|
||||
dims[0] = dims[1] = dims[2] = dims[3] = 1;
|
||||
|
||||
for (int i = 0; i < ndims; i++) {
|
||||
@@ -52,7 +52,7 @@ void get_random_dims(int64_t * dims, int ndims) {
|
||||
}
|
||||
}
|
||||
|
||||
void get_random_dims_minmax(int64_t * dims, int ndims, int min, int max) {
|
||||
static void get_random_dims_minmax(int64_t * dims, int ndims, int min, int max) {
|
||||
dims[0] = dims[1] = dims[2] = dims[3] = 1;
|
||||
|
||||
for (int i = 0; i < ndims; i++) {
|
||||
@@ -61,12 +61,9 @@ void get_random_dims_minmax(int64_t * dims, int ndims, int min, int max) {
|
||||
}
|
||||
|
||||
|
||||
struct ggml_tensor * get_random_tensor(
|
||||
struct ggml_context * ctx0,
|
||||
int ndims,
|
||||
int64_t ne[],
|
||||
float fmin,
|
||||
float fmax) {
|
||||
static struct ggml_tensor * get_random_tensor(
|
||||
struct ggml_context * ctx0, int ndims, int64_t ne[], float fmin, float fmax
|
||||
) {
|
||||
struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne);
|
||||
|
||||
switch (ndims) {
|
||||
@@ -109,11 +106,11 @@ struct ggml_tensor * get_random_tensor(
|
||||
return result;
|
||||
}
|
||||
|
||||
float get_element(const struct ggml_tensor * t, int idx) {
|
||||
static float get_element(const struct ggml_tensor * t, int idx) {
|
||||
return ((float *)t->data)[idx];
|
||||
}
|
||||
|
||||
void set_element(struct ggml_tensor * t, int idx, float value) {
|
||||
static void set_element(struct ggml_tensor * t, int idx, float value) {
|
||||
((float *)t->data)[idx] = value;
|
||||
}
|
||||
|
||||
|
||||
@@ -13,24 +13,24 @@
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
const float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001f;
|
||||
const float MAX_QUANTIZATION_TOTAL_ERROR = 0.002f;
|
||||
const float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075f;
|
||||
const float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040f;
|
||||
const float MAX_DOT_PRODUCT_ERROR = 0.02f;
|
||||
constexpr float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001f;
|
||||
constexpr float MAX_QUANTIZATION_TOTAL_ERROR = 0.002f;
|
||||
constexpr float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075f;
|
||||
constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040f;
|
||||
constexpr float MAX_DOT_PRODUCT_ERROR = 0.02f;
|
||||
|
||||
const char* RESULT_STR[] = {"ok", "FAILED"};
|
||||
static const char* RESULT_STR[] = {"ok", "FAILED"};
|
||||
|
||||
|
||||
// Generate synthetic data
|
||||
void generate_data(float offset, size_t n, float * dst) {
|
||||
static void generate_data(float offset, size_t n, float * dst) {
|
||||
for (size_t i = 0; i < n; i++) {
|
||||
dst[i] = 0.1 + 2*cosf(i + offset);
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate RMSE between two float arrays
|
||||
float array_rmse(const float * a1, const float * a2, size_t n) {
|
||||
static float array_rmse(const float * a1, const float * a2, size_t n) {
|
||||
double sum = 0;
|
||||
for (size_t i = 0; i < n; i++) {
|
||||
double diff = a1[i] - a2[i];
|
||||
@@ -40,7 +40,7 @@ float array_rmse(const float * a1, const float * a2, size_t n) {
|
||||
}
|
||||
|
||||
// Total quantization error on test data
|
||||
float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
|
||||
static float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
|
||||
std::vector<uint8_t> tmp_q(2*test_size);
|
||||
std::vector<float> tmp_out(test_size);
|
||||
|
||||
@@ -50,7 +50,7 @@ float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, cons
|
||||
}
|
||||
|
||||
// Total quantization error on test data
|
||||
float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
|
||||
static float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
|
||||
std::vector<uint8_t> tmp_q(2*test_size);
|
||||
std::vector<float> tmp_out(test_size);
|
||||
std::vector<float> tmp_out_ref(test_size);
|
||||
@@ -64,7 +64,7 @@ float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size,
|
||||
return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size);
|
||||
}
|
||||
|
||||
float dot_product(const float * a1, const float * a2, size_t test_size) {
|
||||
static float dot_product(const float * a1, const float * a2, size_t test_size) {
|
||||
double sum = 0;
|
||||
for (size_t i = 0; i < test_size; i++) {
|
||||
sum += a1[i] * a2[i];
|
||||
@@ -73,7 +73,9 @@ float dot_product(const float * a1, const float * a2, size_t test_size) {
|
||||
}
|
||||
|
||||
// Total dot product error
|
||||
float dot_product_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data1, const float *test_data2) {
|
||||
static float dot_product_error(
|
||||
ggml_type_traits_t & qfns, size_t test_size, const float * test_data1, const float *test_data2
|
||||
) {
|
||||
std::vector<uint8_t> tmp_q1(2*test_size);
|
||||
std::vector<uint8_t> tmp_q2(2*test_size);
|
||||
|
||||
|
||||
@@ -61,22 +61,22 @@ inline int64_t cpu_cycles() {
|
||||
|
||||
|
||||
// Generate synthetic data
|
||||
void generate_data(float offset, size_t n, float * dst) {
|
||||
static void generate_data(float offset, size_t n, float * dst) {
|
||||
for (size_t i = 0; i < n; i++) {
|
||||
dst[i] = 0.1 + 2*cosf(i + offset);
|
||||
}
|
||||
}
|
||||
|
||||
float gigabytes_per_second(size_t bytes, int64_t usecs) {
|
||||
static float gigabytes_per_second(size_t bytes, int64_t usecs) {
|
||||
return bytes / (float) usecs * 1000000 / (1024*1024*1024);
|
||||
}
|
||||
|
||||
void * align_with_offset(void * ptr, int offset) {
|
||||
static void * align_with_offset(void * ptr, int offset) {
|
||||
size_t dummy_size = MAX_ALIGNMENT * 4;
|
||||
return (char *) std::align(MAX_ALIGNMENT, MAX_ALIGNMENT, ptr, dummy_size) + offset;
|
||||
}
|
||||
|
||||
void benchmark_function(size_t size, size_t q_size, int64_t iterations, const std::function<size_t(void)> & function) {
|
||||
static void benchmark_function(size_t size, size_t q_size, int64_t iterations, const std::function<size_t(void)> & function) {
|
||||
int64_t min_time_us = INT64_MAX;
|
||||
int64_t total_time_us = 0;
|
||||
int64_t min_time_cycles = INT64_MAX;
|
||||
@@ -108,7 +108,7 @@ void benchmark_function(size_t size, size_t q_size, int64_t iterations, const st
|
||||
printf(" quantized throughput : %9.2f GB/s\n", gigabytes_per_second(q_size * iterations, total_time_us));
|
||||
}
|
||||
|
||||
void usage(char * argv[]) {
|
||||
static void usage(char * argv[]) {
|
||||
printf("Benchmark quantization specific functions on synthetic data\n");
|
||||
printf("\n");
|
||||
printf("usage: %s [options]\n", argv[0]);
|
||||
|
||||
221
tests/test-rope.cpp
Normal file
221
tests/test-rope.cpp
Normal file
@@ -0,0 +1,221 @@
|
||||
#include "ggml.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cassert>
|
||||
#include <vector>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
#if defined(__GNUC__)
|
||||
#pragma GCC diagnostic ignored "-Wdouble-promotion"
|
||||
#endif
|
||||
|
||||
#define MAX_NARGS 3
|
||||
|
||||
#undef MIN
|
||||
#undef MAX
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
|
||||
#define GGML_SILU_FP16
|
||||
|
||||
//
|
||||
// logging
|
||||
//
|
||||
|
||||
#if (GGML_DEBUG >= 1)
|
||||
#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
|
||||
#else
|
||||
#define GGML_PRINT_DEBUG(...)
|
||||
#endif
|
||||
|
||||
#if (GGML_DEBUG >= 5)
|
||||
#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
|
||||
#else
|
||||
#define GGML_PRINT_DEBUG_5(...)
|
||||
#endif
|
||||
|
||||
#if (GGML_DEBUG >= 10)
|
||||
#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
|
||||
#else
|
||||
#define GGML_PRINT_DEBUG_10(...)
|
||||
#endif
|
||||
|
||||
#define GGML_PRINT(...) printf(__VA_ARGS__)
|
||||
|
||||
static float frand(void) {
|
||||
return (float)rand()/(float)RAND_MAX;
|
||||
}
|
||||
|
||||
static int irand(int n) {
|
||||
if (n == 0) return 0;
|
||||
return rand()%n;
|
||||
}
|
||||
|
||||
static void get_random_dims(int64_t * dims, int ndims) {
|
||||
dims[0] = dims[1] = dims[2] = dims[3] = 1;
|
||||
|
||||
for (int i = 0; i < ndims; i++) {
|
||||
dims[i] = 1 + irand(4);
|
||||
}
|
||||
}
|
||||
|
||||
static struct ggml_tensor * get_random_tensor_f32(
|
||||
struct ggml_context * ctx0,
|
||||
int ndims,
|
||||
const int64_t ne[],
|
||||
float fmin,
|
||||
float fmax) {
|
||||
struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne);
|
||||
|
||||
switch (ndims) {
|
||||
case 1:
|
||||
for (int i0 = 0; i0 < ne[0]; i0++) {
|
||||
((float *)result->data)[i0] = frand()*(fmax - fmin) + fmin;
|
||||
}
|
||||
break;
|
||||
case 2:
|
||||
for (int i1 = 0; i1 < ne[1]; i1++) {
|
||||
for (int i0 = 0; i0 < ne[0]; i0++) {
|
||||
((float *)result->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case 3:
|
||||
for (int i2 = 0; i2 < ne[2]; i2++) {
|
||||
for (int i1 = 0; i1 < ne[1]; i1++) {
|
||||
for (int i0 = 0; i0 < ne[0]; i0++) {
|
||||
((float *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
case 4:
|
||||
for (int i3 = 0; i3 < ne[3]; i3++) {
|
||||
for (int i2 = 0; i2 < ne[2]; i2++) {
|
||||
for (int i1 = 0; i1 < ne[1]; i1++) {
|
||||
for (int i0 = 0; i0 < ne[0]; i0++) {
|
||||
((float *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
default:
|
||||
assert(false);
|
||||
};
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
|
||||
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
|
||||
|
||||
if (plan.work_size > 0) {
|
||||
buf.resize(plan.work_size);
|
||||
plan.work_data = buf.data();
|
||||
}
|
||||
|
||||
ggml_graph_compute(graph, &plan);
|
||||
}
|
||||
|
||||
int main(int /*argc*/, const char ** /*argv*/) {
|
||||
struct ggml_init_params params = {
|
||||
/* .mem_size = */ 128*1024*1024,
|
||||
/* .mem_buffer = */ NULL,
|
||||
/* .no_alloc = */ false,
|
||||
};
|
||||
|
||||
std::vector<uint8_t> work_buffer;
|
||||
|
||||
struct ggml_context * ctx0 = ggml_init(params);
|
||||
|
||||
struct ggml_tensor * x;
|
||||
|
||||
// rope f32
|
||||
for (int m = 0; m < 3; ++m) {
|
||||
const int ndims = 4;
|
||||
|
||||
const int64_t n_rot = 128;
|
||||
const int64_t ne[4] = { 2*n_rot, 32, 73, 1 };
|
||||
|
||||
const int n_past_0 = 100;
|
||||
const int n_past_2 = 33;
|
||||
|
||||
struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
|
||||
struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
|
||||
struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
|
||||
|
||||
for (int i = 0; i < ne[2]; ++i) {
|
||||
((int32_t *) p0->data)[i] = n_past_0 + i;
|
||||
((int32_t *) p1->data)[i] = n_past_2 - n_past_0;
|
||||
((int32_t *) p2->data)[i] = n_past_2 + i;
|
||||
}
|
||||
|
||||
// test mode 0, 2, 4 (standard, GPT-NeoX, GLM)
|
||||
const int mode = m == 0 ? 0 : m == 1 ? 2 : 4;
|
||||
|
||||
x = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
|
||||
|
||||
// 100, 101, 102, ..., 172
|
||||
struct ggml_tensor * r0 = ggml_rope(ctx0, x, p0, n_rot, mode, 1024);
|
||||
// -67, -67, -67, ..., -67
|
||||
struct ggml_tensor * r1 = ggml_rope(ctx0, r0, p1, n_rot, mode, 1024); // "context swap", i.e. forget n_past_0 - n_past_2 tokens
|
||||
|
||||
// 33, 34, 35, ..., 105
|
||||
struct ggml_tensor * r2 = ggml_rope(ctx0, x, p2, n_rot, mode, 1024);
|
||||
|
||||
ggml_cgraph * gf = ggml_new_graph(ctx0);
|
||||
|
||||
ggml_build_forward_expand(gf, r0);
|
||||
ggml_build_forward_expand(gf, r1);
|
||||
ggml_build_forward_expand(gf, r2);
|
||||
|
||||
ggml_graph_compute_helper(work_buffer, gf, 4);
|
||||
|
||||
// check that r1 and r2 are the same
|
||||
{
|
||||
double sum0 = 0.0f;
|
||||
double sum1 = 0.0f;
|
||||
double diff = 0.0f;
|
||||
|
||||
const float * r1_data = (float *) r1->data;
|
||||
const float * r2_data = (float *) r2->data;
|
||||
|
||||
const int n_elements = ggml_nelements(r1);
|
||||
|
||||
for (int i = 0; i < n_elements; ++i) {
|
||||
sum0 += fabs(r1_data[i]);
|
||||
sum1 += fabs(r2_data[i]);
|
||||
diff += fabs(r1_data[i] - r2_data[i]);
|
||||
//if (fabs(r1_data[i] - r2_data[i]) > 0.0001f) {
|
||||
// printf("%d: %f %f\n", i, r1_data[i], r2_data[i]);
|
||||
// printf("diff: %f\n", fabs(r1_data[i] - r2_data[i]));
|
||||
//}
|
||||
}
|
||||
|
||||
//for (int i = 4096; i < 4096 + 128; ++i) {
|
||||
// printf("%f %f\n", r1_data[i], r2_data[i]);
|
||||
//}
|
||||
|
||||
printf("mode: %d\n", mode);
|
||||
printf("sum0: %f\n", sum0);
|
||||
printf("sum1: %f\n", sum1);
|
||||
printf("diff: %f\n", diff);
|
||||
printf("rel err: %f\n", diff / sum0);
|
||||
printf("rel err: %f\n", diff / sum1);
|
||||
|
||||
GGML_ASSERT(diff / sum0 < 0.0001f);
|
||||
GGML_ASSERT(diff / sum1 < 0.0001f);
|
||||
}
|
||||
}
|
||||
|
||||
ggml_free(ctx0);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -12,7 +12,8 @@
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
|
||||
void dump(const llama_token_data_array * candidates) {
|
||||
|
||||
static void dump(const llama_token_data_array * candidates) {
|
||||
for (size_t i = 0; i < candidates->size; i++) {
|
||||
printf("%d: %f (%f)\n", candidates->data[i].id, candidates->data[i].p, candidates->data[i].logit);
|
||||
}
|
||||
@@ -21,9 +22,7 @@ void dump(const llama_token_data_array * candidates) {
|
||||
#define DUMP(__candidates) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__candidates)); printf("-\n"); } while(0)
|
||||
|
||||
|
||||
void test_top_k(const std::vector<float> & probs,
|
||||
const std::vector<float> & expected_probs,
|
||||
int k) {
|
||||
static void test_top_k(const std::vector<float> & probs, const std::vector<float> & expected_probs, int k) {
|
||||
size_t n_vocab = probs.size();
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
@@ -45,10 +44,7 @@ void test_top_k(const std::vector<float> & probs,
|
||||
}
|
||||
|
||||
|
||||
void test_top_p(const std::vector<float> & probs,
|
||||
const std::vector<float> & expected_probs,
|
||||
float p) {
|
||||
|
||||
static void test_top_p(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
|
||||
size_t n_vocab = probs.size();
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
@@ -70,9 +66,7 @@ void test_top_p(const std::vector<float> & probs,
|
||||
}
|
||||
|
||||
|
||||
void test_tfs(const std::vector<float> & probs,
|
||||
const std::vector<float> & expected_probs,
|
||||
float z) {
|
||||
static void test_tfs(const std::vector<float> & probs, const std::vector<float> & expected_probs, float z) {
|
||||
size_t n_vocab = probs.size();
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
@@ -93,9 +87,7 @@ void test_tfs(const std::vector<float> & probs,
|
||||
}
|
||||
|
||||
|
||||
void test_typical(const std::vector<float> & probs,
|
||||
const std::vector<float> & expected_probs,
|
||||
float p) {
|
||||
static void test_typical(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
|
||||
size_t n_vocab = probs.size();
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
@@ -116,11 +108,10 @@ void test_typical(const std::vector<float> & probs,
|
||||
}
|
||||
|
||||
|
||||
void test_repetition_penalty(
|
||||
const std::vector<float> & probs,
|
||||
const std::vector<llama_token> & last_tokens,
|
||||
const std::vector<float> & expected_probs,
|
||||
float penalty) {
|
||||
static void test_repetition_penalty(
|
||||
const std::vector<float> & probs, const std::vector<llama_token> & last_tokens,
|
||||
const std::vector<float> & expected_probs, float penalty
|
||||
) {
|
||||
assert(probs.size() == expected_probs.size());
|
||||
|
||||
size_t n_vocab = probs.size();
|
||||
@@ -145,11 +136,10 @@ void test_repetition_penalty(
|
||||
}
|
||||
|
||||
|
||||
void test_frequency_presence_penalty(
|
||||
const std::vector<float> & probs,
|
||||
const std::vector<llama_token> & last_tokens,
|
||||
const std::vector<float> & expected_probs,
|
||||
float alpha_frequency, float alpha_presence) {
|
||||
static void test_frequency_presence_penalty(
|
||||
const std::vector<float> & probs, const std::vector<llama_token> & last_tokens,
|
||||
const std::vector<float> & expected_probs, float alpha_frequency, float alpha_presence
|
||||
) {
|
||||
assert(probs.size() == expected_probs.size());
|
||||
|
||||
size_t n_vocab = probs.size();
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
#include "llama.h"
|
||||
#include "common.h"
|
||||
#include "console.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
@@ -35,6 +36,7 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
|
||||
{ " Hello" , { 1678, 15043, }, },
|
||||
{ " Hello" , { 268, 15043, }, },
|
||||
{ " Hello\n Hello" , { 268, 15043, 13, 1678, 15043, }, },
|
||||
{ " (" , { 29871, 313, }, },
|
||||
};
|
||||
|
||||
return _k_tests;
|
||||
@@ -89,6 +91,12 @@ int main(int argc, char **argv) {
|
||||
return 2;
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
// We need this for unicode console support
|
||||
console::init(false, false);
|
||||
atexit([]() { console::cleanup(); });
|
||||
#endif
|
||||
|
||||
bool success = true;
|
||||
|
||||
for (const auto & test_kv : k_tests()) {
|
||||
|
||||
125
tests/test-tokenizer-1-llama.cpp
Normal file
125
tests/test-tokenizer-1-llama.cpp
Normal file
@@ -0,0 +1,125 @@
|
||||
#include "llama.h"
|
||||
#include "common.h"
|
||||
#include "console.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <string>
|
||||
#include <codecvt>
|
||||
#include <map>
|
||||
#include <vector>
|
||||
#include <locale>
|
||||
|
||||
typedef int codepoint;
|
||||
|
||||
static std::string codepoint_to_utf8(codepoint cp) {
|
||||
std::string result;
|
||||
if (0x00 <= cp && cp <= 0x7f) {
|
||||
result.push_back(cp);
|
||||
} else if (0x80 <= cp && cp <= 0x7ff) {
|
||||
result.push_back(0xc0 | ((cp >> 6) & 0x1f));
|
||||
result.push_back(0x80 | (cp & 0x3f));
|
||||
} else if (0x800 <= cp && cp <= 0xffff) {
|
||||
result.push_back(0xe0 | ((cp >> 12) & 0x0f));
|
||||
result.push_back(0x80 | ((cp >> 6) & 0x3f));
|
||||
result.push_back(0x80 | (cp & 0x3f));
|
||||
} else if (0x10000 <= cp && cp <= 0x10ffff) {
|
||||
result.push_back(0xf0 | ((cp >> 18) & 0x07));
|
||||
result.push_back(0x80 | ((cp >> 12) & 0x3f));
|
||||
result.push_back(0x80 | ((cp >> 6) & 0x3f));
|
||||
result.push_back(0x80 | (cp & 0x3f));
|
||||
} else {
|
||||
throw std::invalid_argument("invalid codepoint");
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
if (argc < 2) {
|
||||
fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
const std::string fname = argv[1];
|
||||
|
||||
fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
|
||||
|
||||
llama_model * model;
|
||||
llama_context * ctx;
|
||||
|
||||
llama_backend_init(false);
|
||||
|
||||
// load the vocab
|
||||
{
|
||||
auto lparams = llama_context_default_params();
|
||||
|
||||
lparams.vocab_only = true;
|
||||
|
||||
model = llama_load_model_from_file(fname.c_str(), lparams);
|
||||
|
||||
if (model == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
|
||||
return 1;
|
||||
}
|
||||
|
||||
ctx = llama_new_context_with_model(model, lparams);
|
||||
|
||||
if (ctx == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
|
||||
llama_free_model(model);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
GGML_ASSERT(llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM);
|
||||
|
||||
#ifdef _WIN32
|
||||
// We need this for unicode console support
|
||||
console::init(false, false);
|
||||
atexit([]() { console::cleanup(); });
|
||||
#endif
|
||||
|
||||
const int n_vocab = llama_n_vocab(ctx);
|
||||
|
||||
for (int i = 0; i < n_vocab; ++i) {
|
||||
std::string str = llama_detokenize_spm(ctx, std::vector<int>(1, i));
|
||||
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
|
||||
std::string check = llama_detokenize_spm(ctx, tokens);
|
||||
if (check != str) {
|
||||
fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
|
||||
__func__, i, str.c_str(), str.length(), check.c_str(), check.length());
|
||||
return 2;
|
||||
}
|
||||
}
|
||||
|
||||
for (codepoint cp = 0x0000; cp < 0xffff; ++cp) {
|
||||
if (cp < 0xd800 || cp > 0xdfff) {
|
||||
std::string str = codepoint_to_utf8(cp);
|
||||
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
|
||||
std::string check = llama_detokenize_spm(ctx, tokens);
|
||||
if (cp != 9601 && str != check) {
|
||||
fprintf(stderr, "%s : error: codepoint %d detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
|
||||
__func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
|
||||
return 3;
|
||||
}
|
||||
}
|
||||
}
|
||||
for (codepoint cp = 0x10000; cp < 0x0010ffff; ++cp) {
|
||||
std::string str = codepoint_to_utf8(cp);
|
||||
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
|
||||
std::string check = llama_detokenize_spm(ctx, tokens);
|
||||
if (str != check) {
|
||||
fprintf(stderr, "%s : error: codepoint %d detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
|
||||
__func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
|
||||
return 4;
|
||||
}
|
||||
}
|
||||
|
||||
llama_free_model(model);
|
||||
llama_free(ctx);
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -1,108 +0,0 @@
|
||||
#include "llama.h"
|
||||
#include "common.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <string>
|
||||
#include <codecvt>
|
||||
#include <map>
|
||||
#include <vector>
|
||||
#include <locale>
|
||||
|
||||
static std::string escape_whitespace(const std::string& text) {
|
||||
std::string result = "\xe2\x96\x81";
|
||||
for (size_t offs = 0; offs < text.length(); ++offs) {
|
||||
if (text[offs] == ' ') {
|
||||
result += "\xe2\x96\x81";
|
||||
} else {
|
||||
result += text[offs];
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
if (argc < 2) {
|
||||
fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
const std::string fname = argv[1];
|
||||
|
||||
fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
|
||||
|
||||
llama_model * model;
|
||||
llama_context * ctx;
|
||||
|
||||
llama_backend_init(false);
|
||||
|
||||
// load the vocab
|
||||
{
|
||||
auto lparams = llama_context_default_params();
|
||||
|
||||
lparams.vocab_only = true;
|
||||
|
||||
model = llama_load_model_from_file(fname.c_str(), lparams);
|
||||
|
||||
if (model == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
|
||||
return 1;
|
||||
}
|
||||
|
||||
ctx = llama_new_context_with_model(model, lparams);
|
||||
|
||||
if (ctx == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
|
||||
llama_free_model(model);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
GGML_ASSERT(llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_BPE);
|
||||
|
||||
const int n_vocab = llama_n_vocab(ctx);
|
||||
|
||||
for (int i = 0; i < n_vocab; ++i) {
|
||||
std::string forward = llama_token_to_piece(ctx, i);
|
||||
std::vector<llama_token> tokens = llama_tokenize(ctx, forward, false);
|
||||
if (tokens.size() == 1) {
|
||||
if (i != tokens[0]) {
|
||||
std::string backward = llama_token_to_piece(ctx, tokens[0]);
|
||||
fprintf(stderr, "%s : error: token %d is string %s but bpe returns token %d %s\n",
|
||||
__func__, i, llama_token_to_piece(ctx, i).c_str(), tokens[0], backward.c_str());
|
||||
return 2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
std::wstring_convert<typename std::codecvt_utf8<char16_t>, char16_t> u16converter;
|
||||
for (char16_t ch = 0x0000; ch < 0xffff; ++ch) {
|
||||
std::u16string u16str(1, ch);
|
||||
std::string str = u16converter.to_bytes(u16str);
|
||||
std::vector<llama_token> tokens = llama_tokenize(ctx, escape_whitespace(str).c_str(), false);
|
||||
if (tokens.size() == 1) {
|
||||
fprintf(stderr, "%s : info: %s tokenized to %d \n",
|
||||
__func__, str.c_str(), tokens[0]);
|
||||
}
|
||||
}
|
||||
|
||||
std::wstring_convert<typename std::codecvt_utf8<char32_t>, char32_t> u32converter;
|
||||
for (char32_t ch = 0x0000; ch < 0x0010ffff; ++ch) {
|
||||
std::u32string u32str(1, ch);
|
||||
std::string str = u32converter.to_bytes(u32str);
|
||||
std::vector<llama_token> tokens = llama_tokenize(ctx, escape_whitespace(str).c_str(), false);
|
||||
if (tokens.size() == 1) {
|
||||
fprintf(stderr, "%s : info: %s tokenized to %d \n", __func__, str.c_str(), tokens[0]);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
llama_free_model(model);
|
||||
llama_free(ctx);
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
return 0;
|
||||
}
|
||||
Reference in New Issue
Block a user