mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-03-05 14:33:24 +02:00
Compare commits
18 Commits
master-db1
...
master-003
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
003ba2fb43 | ||
|
|
f9a6364912 | ||
|
|
95078cc554 | ||
|
|
1f48b0abcf | ||
|
|
e1295513a4 | ||
|
|
1b0fd45465 | ||
|
|
3924088512 | ||
|
|
173d0e6419 | ||
|
|
a3b85b28da | ||
|
|
921dcee00a | ||
|
|
2d13786e91 | ||
|
|
a90e96b266 | ||
|
|
94c5652fc0 | ||
|
|
34d9f22f44 | ||
|
|
d3e8093e9b | ||
|
|
360cfe5bec | ||
|
|
2edbdb0f99 | ||
|
|
20fbf2a2a0 |
150
.github/workflows/build.yml
vendored
150
.github/workflows/build.yml
vendored
@@ -120,7 +120,7 @@ jobs:
|
||||
make
|
||||
|
||||
macOS-latest-cmake:
|
||||
runs-on: macOS-latest
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@@ -148,22 +148,64 @@ jobs:
|
||||
|
||||
windows-latest-cmake:
|
||||
runs-on: windows-latest
|
||||
env:
|
||||
OPENBLAS_VERSION: 0.3.23
|
||||
OPENCL_VERSION: 2023.04.17
|
||||
CLBLAST_VERSION: 1.5.3
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- build: 'avx2'
|
||||
defines: ''
|
||||
- build: 'avx'
|
||||
defines: '-DLLAMA_AVX2=OFF'
|
||||
- build: 'avx512'
|
||||
defines: '-DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'avx2'
|
||||
defines: ''
|
||||
- build: 'avx'
|
||||
defines: '-DLLAMA_AVX2=OFF'
|
||||
- build: 'avx512'
|
||||
defines: '-DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'clblast'
|
||||
defines: '-DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
|
||||
- build: 'openblas'
|
||||
defines: '-DLLAMA_OPENBLAS=ON -DBLAS_LIBRARIES="/LIBPATH:$env:RUNNER_TEMP/openblas/lib" -DOPENBLAS_INC="$env:RUNNER_TEMP/openblas/include"'
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Download OpenCL SDK
|
||||
id: get_opencl
|
||||
if: ${{ matrix.build == 'clblast' }}
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/opencl.zip -L "https://github.com/KhronosGroup/OpenCL-SDK/releases/download/v${env:OPENCL_VERSION}/OpenCL-SDK-v${env:OPENCL_VERSION}-Win-x64.zip"
|
||||
mkdir $env:RUNNER_TEMP/opencl
|
||||
tar.exe -xvf $env:RUNNER_TEMP/opencl.zip --strip-components=1 -C $env:RUNNER_TEMP/opencl
|
||||
|
||||
- name: Download CLBlast
|
||||
id: get_clblast
|
||||
if: ${{ matrix.build == 'clblast' }}
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/clblast.zip -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-Windows-x64.zip"
|
||||
curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
|
||||
mkdir $env:RUNNER_TEMP/clblast
|
||||
tar.exe -xvf $env:RUNNER_TEMP/clblast.zip -C $env:RUNNER_TEMP/clblast
|
||||
foreach ($f in (gci -Recurse -Path "$env:RUNNER_TEMP/clblast" -Filter '*.cmake')) {
|
||||
$txt = Get-Content -Path $f -Raw
|
||||
$txt.Replace('C:/dependencies/opencl/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8
|
||||
}
|
||||
|
||||
- name: Download OpenBLAS
|
||||
id: get_openblas
|
||||
if: ${{ matrix.build == 'openblas' }}
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
|
||||
curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"
|
||||
mkdir $env:RUNNER_TEMP/openblas
|
||||
tar.exe -xvf $env:RUNNER_TEMP/openblas.zip -C $env:RUNNER_TEMP/openblas
|
||||
$vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
|
||||
$msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
|
||||
$lib = $(join-path $msvc 'bin\Hostx64\x64\lib.exe')
|
||||
& $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
@@ -171,6 +213,21 @@ jobs:
|
||||
cd build
|
||||
cmake .. ${{ matrix.defines }}
|
||||
cmake --build . --config Release
|
||||
cp ../LICENSE ./bin/Release/llama.cpp.txt
|
||||
|
||||
- name: Add clblast.dll
|
||||
id: add_clblast_dll
|
||||
if: ${{ matrix.build == 'clblast' }}
|
||||
run: |
|
||||
cp $env:RUNNER_TEMP/clblast/lib/clblast.dll ./build/bin/Release
|
||||
cp $env:RUNNER_TEMP/CLBlast.LICENSE.txt ./build/bin/Release/CLBlast-${env:CLBLAST_VERSION}.txt
|
||||
|
||||
- name: Add libopenblas.dll
|
||||
id: add_libopenblas_dll
|
||||
if: ${{ matrix.build == 'openblas' }}
|
||||
run: |
|
||||
cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
|
||||
cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt
|
||||
|
||||
- name: Check AVX512F support
|
||||
id: check_avx512f
|
||||
@@ -187,7 +244,7 @@ jobs:
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
if: ${{ matrix.build != 'avx512' || env.HAS_AVX512F == '1' }} # Test AVX-512 only when possible
|
||||
if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # Test AVX-512 only when possible
|
||||
run: |
|
||||
cd build
|
||||
ctest -C Release --verbose
|
||||
@@ -210,6 +267,82 @@ jobs:
|
||||
path: |
|
||||
llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
|
||||
|
||||
windows-latest-cmake-cublas:
|
||||
runs-on: windows-latest
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
cuda: ['12.1.0', '11.7.1']
|
||||
build: ['cublas']
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- uses: Jimver/cuda-toolkit@v0.2.10
|
||||
id: cuda-toolkit
|
||||
with:
|
||||
cuda: ${{ matrix.cuda }}
|
||||
# TODO(green-sky): _dev seems to fail, and non dev are not enought
|
||||
#sub-packages: '["nvcc", "cudart", "cublas", "cudart_dev", "cublas_dev"]'
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -DLLAMA_CUBLAS=ON
|
||||
cmake --build . --config Release
|
||||
|
||||
- name: Get commit hash
|
||||
id: commit
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: pr-mpt/actions-commit-hash@v2
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
run: |
|
||||
7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
|
||||
|
||||
- name: Upload artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v3
|
||||
with:
|
||||
path: |
|
||||
llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
|
||||
|
||||
- name: Copy and pack Cuda runtime
|
||||
if: ${{ matrix.cuda == '12.1.0' }}
|
||||
# TODO(green-sky): paths are cuda 12 specific
|
||||
run: |
|
||||
echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
|
||||
mkdir '.\build\bin\cudart\'
|
||||
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cudart64_12.dll" '.\build\bin\cudart\'
|
||||
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublas64_12.dll" '.\build\bin\cudart\'
|
||||
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublasLt64_12.dll" '.\build\bin\cudart\'
|
||||
7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip .\build\bin\cudart\*
|
||||
|
||||
- name: Copy and pack Cuda runtime
|
||||
if: ${{ matrix.cuda == '11.7.1' }}
|
||||
# TODO(green-sky): paths are cuda 11 specific
|
||||
run: |
|
||||
echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
|
||||
mkdir '.\build\bin\cudart\'
|
||||
ls "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin"
|
||||
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cudart64_110.dll" '.\build\bin\cudart\'
|
||||
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublas64_11.dll" '.\build\bin\cudart\'
|
||||
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublasLt64_11.dll" '.\build\bin\cudart\'
|
||||
7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip .\build\bin\cudart\*
|
||||
|
||||
- name: Upload Cuda runtime
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v3
|
||||
with:
|
||||
path: |
|
||||
cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
|
||||
|
||||
release:
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
|
||||
@@ -221,6 +354,7 @@ jobs:
|
||||
- macOS-latest-make
|
||||
- macOS-latest-cmake
|
||||
- windows-latest-cmake
|
||||
- windows-latest-cmake-cublas
|
||||
|
||||
steps:
|
||||
- name: Download artifacts
|
||||
|
||||
2
.gitignore
vendored
2
.gitignore
vendored
@@ -21,6 +21,7 @@ build-sanitize-addr/
|
||||
build-sanitize-thread/
|
||||
|
||||
models/*
|
||||
*.bin
|
||||
|
||||
/main
|
||||
/quantize
|
||||
@@ -42,5 +43,6 @@ zig-out/
|
||||
zig-cache/
|
||||
|
||||
ppl-*.txt
|
||||
qnt-*.txt
|
||||
|
||||
examples/jeopardy/results.txt
|
||||
|
||||
13
Makefile
13
Makefile
@@ -107,7 +107,11 @@ ifndef LLAMA_NO_ACCELERATE
|
||||
endif
|
||||
ifdef LLAMA_OPENBLAS
|
||||
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
|
||||
LDFLAGS += -lopenblas
|
||||
ifneq ($(shell grep -e "Arch Linux" -e "ID_LIKE=arch" /etc/os-release 2>/dev/null),)
|
||||
LDFLAGS += -lopenblas -lcblas
|
||||
else
|
||||
LDFLAGS += -lopenblas
|
||||
endif
|
||||
endif
|
||||
ifdef LLAMA_CUBLAS
|
||||
CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
|
||||
@@ -121,7 +125,12 @@ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
|
||||
endif
|
||||
ifdef LLAMA_CLBLAST
|
||||
CFLAGS += -DGGML_USE_CLBLAST
|
||||
LDFLAGS += -lclblast -lOpenCL
|
||||
# Mac provides OpenCL as a framework
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
LDFLAGS += -lclblast -framework OpenCL
|
||||
else
|
||||
LDFLAGS += -lclblast -lOpenCL
|
||||
endif
|
||||
OBJS += ggml-opencl.o
|
||||
ggml-opencl.o: ggml-opencl.c ggml-opencl.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
42
README.md
42
README.md
@@ -18,10 +18,12 @@ The main goal of `llama.cpp` is to run the LLaMA model using 4-bit integer quant
|
||||
|
||||
- Plain C/C++ implementation without dependencies
|
||||
- Apple silicon first-class citizen - optimized via ARM NEON and Accelerate framework
|
||||
- AVX2 support for x86 architectures
|
||||
- AVX, AVX2 and AVX512 support for x86 architectures
|
||||
- Mixed F16 / F32 precision
|
||||
- 4-bit integer quantization support
|
||||
- 4-bit, 5-bit and 8-bit integer quantization support
|
||||
- Runs on the CPU
|
||||
- OpenBLAS support
|
||||
- cuBLAS and CLBlast support
|
||||
|
||||
The original implementation of `llama.cpp` was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022).
|
||||
Since then, the project has improved significantly thanks to many contributions. This project is for educational purposes and serves
|
||||
@@ -43,6 +45,7 @@ as the main playground for developing new features for the [ggml](https://github
|
||||
- [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
|
||||
- [X] [Vicuna](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5533894)
|
||||
- [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
|
||||
- [X] [OpenBuddy 🐶 (Multilingual)](https://github.com/OpenBuddy/OpenBuddy)
|
||||
|
||||
**Bindings:**
|
||||
|
||||
@@ -213,7 +216,6 @@ Building the program with BLAS support may lead to some performance improvements
|
||||
```bash
|
||||
make LLAMA_OPENBLAS=1
|
||||
```
|
||||
Note: In order to build on Arch Linux with OpenBLAS support enabled you must edit the Makefile adding at the end of the line 105: `-lcblas`
|
||||
|
||||
- On Windows:
|
||||
|
||||
@@ -255,6 +257,8 @@ Building the program with BLAS support may lead to some performance improvements
|
||||
cmake --build . --config Release
|
||||
```
|
||||
|
||||
Note: Because llama.cpp uses multiple CUDA streams for matrix multiplication results [are not guaranteed to be reproducible](https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility). If you need reproducibility, set `GGML_CUDA_MAX_STREAMS` in the file `ggml-cuda.cu` to 1.
|
||||
|
||||
### Prepare Data & Run
|
||||
|
||||
```bash
|
||||
@@ -294,17 +298,25 @@ Several quantization methods are supported. They differ in the resulting model d
|
||||
|
||||
| Model | Measure | F16 | Q4_0 | Q4_1 | Q4_2 | Q5_0 | Q5_1 | Q8_0 |
|
||||
|------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|-------:|
|
||||
| 7B | perplexity | 5.9565 | 6.2103 | 6.1286 | 6.1698 | 6.0139 | 5.9934 | 5.9571 |
|
||||
| 7B | perplexity | 5.9066 | 6.1620 | 6.0910 | 6.1466 | 5.9862 | 5.9481 | 5.9069 |
|
||||
| 7B | file size | 13.0G | 4.0G | 4.8G | 4.0G | 4.4G | 4.8G | 7.1G |
|
||||
| 7B | ms/tok @ 4th | 128 | 56 | 61 | 84 | 91 | 95 | 75 |
|
||||
| 7B | ms/tok @ 8th | 128 | 47 | 55 | 48 | 53 | 59 | 75 |
|
||||
| 7B | bits/weight | 16.0 | 5.0 | 6.0 | 5.0 | 5.5 | 6.0 | 9.0 |
|
||||
| 13B | perplexity | 5.2455 | 5.3748 | 5.3471 | 5.3433 | 5.2768 | 5.2582 | 5.2458 |
|
||||
| 13B | perplexity | 5.2543 | 5.3863 | 5.3607 | 5.3513 | 5.2856 | 5.2706 | 5.2548 |
|
||||
| 13B | file size | 25.0G | 7.6G | 9.1G | 7.6G | 8.4G | 9.1G | 14G |
|
||||
| 13B | ms/tok @ 4th | 239 | 104 | 113 | 160 | 176 | 185 | 141 |
|
||||
| 13B | ms/tok @ 8th | 240 | 85 | 99 | 97 | 108 | 117 | 147 |
|
||||
| 13B | bits/weight | 16.0 | 5.0 | 6.0 | 5.0 | 5.5 | 6.0 | 9.0 |
|
||||
|
||||
### Perplexity (measuring model quality)
|
||||
|
||||
You can use the `perplexity` example to measure perplexity over a given prompt (lower perplexity is better).
|
||||
For more information, see [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity).
|
||||
|
||||
The perplexity measurements in table above are done against the `wikitext2` test dataset (https://paperswithcode.com/dataset/wikitext-2), with context length of 512.
|
||||
The time per token is measured on a MacBook M1 Pro 32GB RAM using 4 and 8 threads.
|
||||
|
||||
### Interactive mode
|
||||
|
||||
If you want a more ChatGPT-like experience, you can run in interactive mode by passing `-i` as a parameter.
|
||||
@@ -403,26 +415,6 @@ If your issue is with model generation quality, then please at least scan the fo
|
||||
- [Aligning language models to follow instructions](https://openai.com/research/instruction-following)
|
||||
- [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)
|
||||
|
||||
### Perplexity (measuring model quality)
|
||||
|
||||
You can use the `perplexity` example to measure perplexity over the given prompt. For more background, see [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity). However, in general, lower perplexity is better for LLMs.
|
||||
|
||||
#### Latest measurements
|
||||
|
||||
The latest perplexity scores for the various model sizes and quantizations are being tracked in [discussion #406](https://github.com/ggerganov/llama.cpp/discussions/406). `llama.cpp` is measuring very well compared to the baseline implementations. Quantization has a small negative impact on quality, but, as you can see, running
|
||||
13B at q4_0 beats the 7B f16 model by a significant amount.
|
||||
|
||||
All measurements are done against the wikitext2 test dataset (https://paperswithcode.com/dataset/wikitext-2), with default options (512 length context).
|
||||
Note that changing the context length will have a significant impact on perplexity (longer context = better perplexity).
|
||||
```
|
||||
Perplexity - model options
|
||||
5.5985 - 13B, q4_0
|
||||
5.9565 - 7B, f16
|
||||
6.3001 - 7B, q4_1
|
||||
6.5949 - 7B, q4_0
|
||||
6.5995 - 7B, q4_0, --memory_f16
|
||||
```
|
||||
|
||||
#### How to run
|
||||
|
||||
1. Download/extract: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
|
||||
|
||||
28
convert.py
28
convert.py
@@ -67,6 +67,7 @@ FTYPE_TO_DATA_TYPE: Dict[int, DataType] = \
|
||||
{ftype: dtype for (dtype, ftype) in DATA_TYPE_TO_FTYPE.items()}
|
||||
|
||||
DATA_TYPE_TO_NUMPY: Dict[DataType, 'np.dtype[Any]'] = {
|
||||
DT_BF16: np.dtype(np.uint16),
|
||||
DT_F16: np.dtype(np.float16),
|
||||
DT_F32: np.dtype(np.float32),
|
||||
DT_I32: np.dtype(np.int32),
|
||||
@@ -276,6 +277,12 @@ class Tensor(metaclass=ABCMeta):
|
||||
def to_ggml(self) -> 'GGMLCompatibleTensor': ...
|
||||
|
||||
|
||||
def bf16_to_fp32(bf16_arr: np.ndarray) -> np.ndarray:
|
||||
assert bf16_arr.dtype == np.uint16, f"Input array should be of dtype uint16, but got {bf16_arr.dtype}"
|
||||
fp32_arr = bf16_arr.astype(np.uint32) << 16
|
||||
return fp32_arr.view(np.float32)
|
||||
|
||||
|
||||
class UnquantizedTensor(Tensor):
|
||||
def __init__(self, ndarray: NDArray) -> None:
|
||||
assert isinstance(ndarray, np.ndarray)
|
||||
@@ -284,6 +291,8 @@ class UnquantizedTensor(Tensor):
|
||||
|
||||
def astype(self, data_type: DataType) -> Tensor:
|
||||
dtype = DATA_TYPE_TO_NUMPY[data_type]
|
||||
if self.data_type == DT_BF16:
|
||||
self.ndarray = bf16_to_fp32(self.ndarray)
|
||||
return UnquantizedTensor(self.ndarray.astype(dtype))
|
||||
|
||||
def to_ggml(self) -> 'UnquantizedTensor':
|
||||
@@ -686,6 +695,7 @@ class LazyUnpickler(pickle.Unpickler):
|
||||
description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
|
||||
return LazyStorage(load=load, kind=pid[1], description=description)
|
||||
|
||||
# @staticmethod
|
||||
def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any, # pyright: ignore[reportSelfClsParameterName]
|
||||
requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
|
||||
assert isinstance(storage, LazyStorage)
|
||||
@@ -696,12 +706,18 @@ class LazyUnpickler(pickle.Unpickler):
|
||||
description = f'pickled storage_offset={storage_offset} in {storage.description}'
|
||||
return LazyTensor(load, list(size), storage.kind.data_type, description)
|
||||
|
||||
# @staticmethod
|
||||
def rebuild_from_type_v2(func, new_type, args, state):
|
||||
return func(*args)
|
||||
|
||||
CLASSES: Dict[Any, Any] = {
|
||||
('torch._tensor', '_rebuild_from_type_v2'): rebuild_from_type_v2,
|
||||
('torch._utils', '_rebuild_tensor_v2'): lazy_rebuild_tensor_v2,
|
||||
('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16),
|
||||
('torch', 'HalfStorage'): LazyStorageKind(DT_F16),
|
||||
('torch', 'FloatStorage'): LazyStorageKind(DT_F32),
|
||||
('torch', 'IntStorage'): LazyStorageKind(DT_I32),
|
||||
('torch', 'Tensor'): LazyTensor,
|
||||
}
|
||||
|
||||
def find_class(self, module: str, name: str) -> Any:
|
||||
@@ -750,7 +766,7 @@ def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus:
|
||||
return UnquantizedTensor(np.frombuffer(buf, dtype=numpy_dtype).reshape(shape))
|
||||
description = f'safetensors begin={begin} end={end} type={data_type} path={path}'
|
||||
return LazyTensor(load, shape, data_type, description)
|
||||
model = {name: convert(info) for (name, info) in header.items()}
|
||||
model = {name: convert(info) for (name, info) in header.items() if name != '__metadata__'}
|
||||
return ModelPlus(model=model, paths=[path], format='safetensors', vocab=None)
|
||||
|
||||
|
||||
@@ -961,7 +977,7 @@ class OutputFile:
|
||||
|
||||
def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFileType:
|
||||
wq_type = model["layers.0.attention.wq.weight"].data_type
|
||||
if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
|
||||
if output_type_str == "f32" or (output_type_str is None and wq_type in (DT_F32, DT_BF16)):
|
||||
return GGMLFileType.AllF32
|
||||
if output_type_str == "f16" or (output_type_str is None and wq_type == DT_F16):
|
||||
return GGMLFileType.MostlyF16
|
||||
@@ -1035,8 +1051,12 @@ def load_some_model(path: Path) -> ModelPlus:
|
||||
'''Load a model of any supported format.'''
|
||||
# Be extra-friendly and accept either a file or a directory:
|
||||
if path.is_dir():
|
||||
globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt"]
|
||||
files = [file for glob in globs for file in path.glob(glob)]
|
||||
# Check if it's a set of safetensors files first
|
||||
files = list(path.glob("model-00001-of-*.safetensors"))
|
||||
if not files:
|
||||
# Try the PyTorch patterns too, with lower priority
|
||||
globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt"]
|
||||
files = [file for glob in globs for file in path.glob(glob)]
|
||||
if not files:
|
||||
# Try GGML too, but with lower priority, since if both a non-GGML
|
||||
# model and a GGML model exist in the same directory, we assume the
|
||||
|
||||
@@ -100,6 +100,9 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
arg = argv[i];
|
||||
|
||||
if (arg == "-s" || arg == "--seed") {
|
||||
#if defined(GGML_USE_CUBLAS)
|
||||
fprintf(stderr, "WARNING: when using cuBLAS generation results are NOT guaranteed to be reproducible.\n");
|
||||
#endif
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
@@ -324,6 +327,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
break;
|
||||
}
|
||||
params.input_prefix = argv[i];
|
||||
} else if (arg == "--in-suffix") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.input_suffix = argv[i];
|
||||
} else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
gpt_print_usage(argc, argv, default_params);
|
||||
@@ -362,6 +371,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
fprintf(stderr, " --session FNAME file to cache model state in (may be large!) (default: none)\n");
|
||||
fprintf(stderr, " --random-prompt start with a randomized prompt.\n");
|
||||
fprintf(stderr, " --in-prefix STRING string to prefix user inputs with (default: empty)\n");
|
||||
fprintf(stderr, " --in-suffix STRING string to suffix after user inputs with (default: empty)\n");
|
||||
fprintf(stderr, " -f FNAME, --file FNAME\n");
|
||||
fprintf(stderr, " prompt file to start generation.\n");
|
||||
fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
|
||||
@@ -428,8 +438,8 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
|
||||
// TODO: not great allocating this every time
|
||||
std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
|
||||
// initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
|
||||
std::vector<llama_token> res(text.size() + (int)add_bos);
|
||||
int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
|
||||
std::vector<llama_token> res(text.size() + (int) add_bos);
|
||||
const int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
|
||||
assert(n >= 0);
|
||||
res.resize(n);
|
||||
|
||||
|
||||
@@ -43,6 +43,7 @@ struct gpt_params {
|
||||
std::string prompt = "";
|
||||
std::string path_session = ""; // path to file for saving/loading model eval state
|
||||
std::string input_prefix = ""; // string to prefix user inputs with
|
||||
std::string input_suffix = ""; // string to suffix user inputs with
|
||||
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
|
||||
|
||||
std::string lora_adapter = ""; // lora adapter path
|
||||
|
||||
@@ -112,6 +112,14 @@ The `--in-prefix` flag is used to add a prefix to your input, primarily, this is
|
||||
./main -r "User:" --in-prefix " "
|
||||
```
|
||||
|
||||
### In-Suffix
|
||||
|
||||
The `--in-suffix` flag is used to add a suffix after your input. This is useful for adding an "Assistant:" prompt after the user's input. It's added after the new-line character (`\n`) that's automatically added to the end of the user's input. Here's an example of how to use the `--in-suffix` flag in conjunction with the `--reverse-prompt` flag:
|
||||
|
||||
```sh
|
||||
./main -r "User:" --in-prefix " " --in-suffix "Assistant:"
|
||||
```
|
||||
|
||||
### Instruction Mode
|
||||
|
||||
Instruction mode is particularly useful when working with Alpaca models, which are designed to follow user instructions for specific tasks:
|
||||
|
||||
@@ -260,6 +260,10 @@ int main(int argc, char ** argv) {
|
||||
if (!params.input_prefix.empty()) {
|
||||
fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str());
|
||||
}
|
||||
|
||||
if (!params.input_suffix.empty()) {
|
||||
fprintf(stderr, "Input suffix: '%s'\n", params.input_suffix.c_str());
|
||||
}
|
||||
}
|
||||
fprintf(stderr, "sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
|
||||
params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau);
|
||||
@@ -309,7 +313,8 @@ int main(int argc, char ** argv) {
|
||||
if (n_past + (int) embd.size() > n_ctx) {
|
||||
const int n_left = n_past - params.n_keep;
|
||||
|
||||
n_past = params.n_keep;
|
||||
// always keep the first token - BOS
|
||||
n_past = std::max(1, params.n_keep);
|
||||
|
||||
// insert n_left/2 tokens at the start of embd from last_n_tokens
|
||||
embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());
|
||||
@@ -327,7 +332,6 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// try to reuse a matching prefix from the loaded session instead of re-eval (via n_past)
|
||||
// REVIEW
|
||||
if (n_session_consumed < (int) session_tokens.size()) {
|
||||
size_t i = 0;
|
||||
for ( ; i < embd.size(); i++) {
|
||||
@@ -440,10 +444,10 @@ int main(int argc, char ** argv) {
|
||||
id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
|
||||
} else {
|
||||
// Temperature sampling
|
||||
llama_sample_top_k(ctx, &candidates_p, top_k);
|
||||
llama_sample_tail_free(ctx, &candidates_p, tfs_z);
|
||||
llama_sample_typical(ctx, &candidates_p, typical_p);
|
||||
llama_sample_top_p(ctx, &candidates_p, top_p);
|
||||
llama_sample_top_k(ctx, &candidates_p, top_k, 1);
|
||||
llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
|
||||
llama_sample_typical(ctx, &candidates_p, typical_p, 1);
|
||||
llama_sample_top_p(ctx, &candidates_p, top_p, 1);
|
||||
llama_sample_temperature(ctx, &candidates_p, temp);
|
||||
id = llama_sample_token(ctx, &candidates_p);
|
||||
}
|
||||
@@ -567,6 +571,11 @@ int main(int argc, char ** argv) {
|
||||
// Add tokens to embd only if the input buffer is non-empty
|
||||
// Entering a empty line lets the user pass control back
|
||||
if (buffer.length() > 1) {
|
||||
// append input suffix if any
|
||||
if (!params.input_suffix.empty()) {
|
||||
buffer += params.input_suffix;
|
||||
printf("%s", params.input_suffix.c_str());
|
||||
}
|
||||
|
||||
// instruct mode: insert instruction prefix
|
||||
if (params.instruct && !is_antiprompt) {
|
||||
|
||||
@@ -25,46 +25,68 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
|
||||
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
|
||||
// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
|
||||
// Output: `perplexity: 13.5106 [114/114]`
|
||||
// BOS tokens will be added for each chunk before eval
|
||||
auto tokens = ::llama_tokenize(ctx, params.prompt, true);
|
||||
|
||||
int count = 0;
|
||||
int seq_count = tokens.size() / params.n_ctx;
|
||||
int n_vocab = llama_n_vocab(ctx);
|
||||
int count = 0;
|
||||
|
||||
const int n_chunk = tokens.size() / params.n_ctx;
|
||||
const int n_vocab = llama_n_vocab(ctx);
|
||||
const int n_batch = params.n_batch;
|
||||
|
||||
double nll = 0.0;
|
||||
fprintf(stderr, "%s : calculating perplexity over %d chunks, batch_size=%d\n", __func__, seq_count, params.n_batch);
|
||||
fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
|
||||
|
||||
for (int i = 0; i < seq_count; ++i) {
|
||||
int start = i * params.n_ctx;
|
||||
int end = start + params.n_ctx;
|
||||
for (int i = 0; i < n_chunk; ++i) {
|
||||
const int start = i * params.n_ctx;
|
||||
const int end = start + params.n_ctx;
|
||||
|
||||
const int num_batches = (params.n_ctx + n_batch - 1) / n_batch;
|
||||
|
||||
std::vector<float> logits;
|
||||
int num_batches = (params.n_ctx + params.n_batch - 1) / params.n_batch;
|
||||
auto start_t = std::chrono::high_resolution_clock::now();
|
||||
|
||||
const auto t_start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
for (int j = 0; j < num_batches; ++j) {
|
||||
int batch_start = start + j * params.n_batch;
|
||||
int batch_size = std::min(end - batch_start, params.n_batch);
|
||||
if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * params.n_batch, params.n_threads)) {
|
||||
const int batch_start = start + j * n_batch;
|
||||
const int batch_size = std::min(end - batch_start, n_batch);
|
||||
|
||||
// save original token and restore it after eval
|
||||
const auto token_org = tokens[batch_start];
|
||||
|
||||
// add BOS token for the first batch of each chunk
|
||||
if (j == 0) {
|
||||
tokens[batch_start] = llama_token_bos();
|
||||
}
|
||||
|
||||
if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return;
|
||||
}
|
||||
auto batch_logits = llama_get_logits(ctx);
|
||||
|
||||
// restore the original token in case it was set to BOS
|
||||
tokens[batch_start] = token_org;
|
||||
|
||||
const auto batch_logits = llama_get_logits(ctx);
|
||||
logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
|
||||
}
|
||||
auto end_t = std::chrono::high_resolution_clock::now();
|
||||
|
||||
const auto t_end = std::chrono::high_resolution_clock::now();
|
||||
|
||||
if (i == 0) {
|
||||
const float seconds = std::chrono::duration<float>(end_t - start_t).count();
|
||||
printf("%.2f seconds per pass - ETA ", seconds);
|
||||
int total_seconds = (int)(seconds * seq_count);
|
||||
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
|
||||
fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
|
||||
int total_seconds = (int)(t_total * n_chunk);
|
||||
if (total_seconds >= 60*60) {
|
||||
printf("%d hours ", total_seconds / (60*60));
|
||||
fprintf(stderr, "%d hours ", total_seconds / (60*60));
|
||||
total_seconds = total_seconds % (60*60);
|
||||
}
|
||||
printf("%d minutes\n", total_seconds / 60);
|
||||
fprintf(stderr, "%d minutes\n", total_seconds / 60);
|
||||
}
|
||||
|
||||
// We get the logits for all the tokens in the context window (params.n_ctx)
|
||||
// from llama_eval above. Now, based on https://huggingface.co/docs/transformers/perplexity,
|
||||
// calculate the perplexity over the last half the window (so the model always has
|
||||
// calculate the perplexity over the last half of the window (so the model always has
|
||||
// some context to predict the token).
|
||||
//
|
||||
// We rely on the fact that attention in the forward pass only looks at previous
|
||||
@@ -76,10 +98,12 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
|
||||
// process the entire prompt.
|
||||
for (int j = std::min(512, params.n_ctx / 2); j < params.n_ctx - 1; ++j) {
|
||||
// Calculate probability of next token, given the previous ones.
|
||||
std::vector<float> tok_logits(
|
||||
logits.begin() + j * n_vocab,
|
||||
const std::vector<float> tok_logits(
|
||||
logits.begin() + (j + 0) * n_vocab,
|
||||
logits.begin() + (j + 1) * n_vocab);
|
||||
float prob = softmax(tok_logits)[tokens[start + j + 1]];
|
||||
|
||||
const float prob = softmax(tok_logits)[tokens[start + j + 1]];
|
||||
|
||||
nll += -std::log(prob);
|
||||
++count;
|
||||
}
|
||||
|
||||
@@ -6,23 +6,47 @@
|
||||
#include <map>
|
||||
#include <string>
|
||||
|
||||
static const std::map<std::string, enum llama_ftype> LLAMA_FTYPE_MAP = {
|
||||
{"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
|
||||
{"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
|
||||
{"q4_2", LLAMA_FTYPE_MOSTLY_Q4_2},
|
||||
{"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0},
|
||||
{"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1},
|
||||
{"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},
|
||||
static const std::map<std::string, llama_ftype> LLAMA_FTYPE_MAP = {
|
||||
{"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
|
||||
{"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
|
||||
{"q4_2", LLAMA_FTYPE_MOSTLY_Q4_2},
|
||||
{"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0},
|
||||
{"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1},
|
||||
{"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},
|
||||
};
|
||||
|
||||
bool try_parse_ftype(const std::string & ftype_str, llama_ftype & ftype, std::string & ftype_str_out) {
|
||||
auto it = LLAMA_FTYPE_MAP.find(ftype_str);
|
||||
if (it != LLAMA_FTYPE_MAP.end()) {
|
||||
ftype = it->second;
|
||||
ftype_str_out = it->first;
|
||||
return true;
|
||||
}
|
||||
// try to parse as an integer
|
||||
try {
|
||||
int ftype_int = std::stoi(ftype_str);
|
||||
for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) {
|
||||
if (it->second == ftype_int) {
|
||||
ftype = it->second;
|
||||
ftype_str_out = it->first;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (...) {
|
||||
// stoi failed
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// usage:
|
||||
// ./quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type
|
||||
// ./quantize models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
|
||||
//
|
||||
int main(int argc, char ** argv) {
|
||||
ggml_time_init();
|
||||
|
||||
if (argc < 4) {
|
||||
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type [nthread]\n", argv[0]);
|
||||
if (argc < 3) {
|
||||
fprintf(stderr, "usage: %s model-f32.bin [model-quant.bin] type [nthreads]\n", argv[0]);
|
||||
for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) {
|
||||
fprintf(stderr, " type = \"%s\" or %d\n", it->first.c_str(), it->second);
|
||||
}
|
||||
@@ -36,24 +60,62 @@ int main(int argc, char ** argv) {
|
||||
ggml_free(ctx);
|
||||
}
|
||||
|
||||
// parse command line arguments
|
||||
const std::string fname_inp = argv[1];
|
||||
const std::string fname_out = argv[2];
|
||||
std::string fname_out;
|
||||
int nthread;
|
||||
llama_ftype ftype;
|
||||
|
||||
enum llama_ftype ftype;
|
||||
if (argv[3][0] == 'q') {
|
||||
auto it = LLAMA_FTYPE_MAP.find(argv[3]);
|
||||
if (it == LLAMA_FTYPE_MAP.end()) {
|
||||
fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, argv[3]);
|
||||
int arg_idx = 2;
|
||||
std::string ftype_str;
|
||||
if (try_parse_ftype(argv[arg_idx], ftype, ftype_str)) {
|
||||
// argv[2] is the ftype
|
||||
std::string fpath;
|
||||
const size_t pos = fname_inp.find_last_of('/');
|
||||
if (pos != std::string::npos) {
|
||||
fpath = fname_inp.substr(0, pos + 1);
|
||||
}
|
||||
// export as [inp path]/ggml-model-[ftype].bin
|
||||
fname_out = fpath + "ggml-model-" + ftype_str + ".bin";
|
||||
arg_idx++;
|
||||
}
|
||||
else {
|
||||
// argv[2] is the output path
|
||||
fname_out = argv[arg_idx];
|
||||
arg_idx++;
|
||||
|
||||
if (argc <= arg_idx) {
|
||||
fprintf(stderr, "%s: missing ftype\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
// argv[3] is the ftype
|
||||
if (!try_parse_ftype(argv[arg_idx], ftype, ftype_str)) {
|
||||
fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]);
|
||||
return 1;
|
||||
}
|
||||
arg_idx++;
|
||||
}
|
||||
|
||||
// parse nthreads
|
||||
if (argc > arg_idx) {
|
||||
try {
|
||||
nthread = std::stoi(argv[arg_idx]);
|
||||
}
|
||||
catch (const std::exception & e) {
|
||||
fprintf(stderr, "%s: invalid nthread '%s' (%s)\n", __func__, argv[arg_idx], e.what());
|
||||
return 1;
|
||||
}
|
||||
ftype = it->second;
|
||||
} else {
|
||||
ftype = (enum llama_ftype)atoi(argv[3]);
|
||||
nthread = 0;
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
|
||||
|
||||
int nthread = argc > 4 ? atoi(argv[4]) : 0;
|
||||
fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());
|
||||
if (nthread > 0) {
|
||||
fprintf(stderr, " using %d threads", nthread);
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
const int64_t t_main_start_us = ggml_time_us();
|
||||
|
||||
|
||||
@@ -348,7 +348,7 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
|
||||
CUDA_CHECK(cudaFree(ptr));
|
||||
}
|
||||
|
||||
#define GGML_CUDA_MAX_STREAMS 8
|
||||
#define GGML_CUDA_MAX_STREAMS 8 // Set this to 1 for reproducible matrix multiplication.
|
||||
#define GGML_CUDA_MAX_EVENTS 64
|
||||
static cublasHandle_t g_cublasH = nullptr;
|
||||
static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_STREAMS] = { nullptr };
|
||||
|
||||
7
ggml.c
7
ggml.c
@@ -137,6 +137,9 @@ inline static void* ggml_aligned_malloc(size_t size) {
|
||||
|
||||
#if defined(GGML_USE_ACCELERATE)
|
||||
#include <Accelerate/Accelerate.h>
|
||||
#if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
|
||||
#include "ggml-opencl.h"
|
||||
#endif
|
||||
#elif defined(GGML_USE_OPENBLAS)
|
||||
#include <cblas.h>
|
||||
#elif defined(GGML_USE_CUBLAS)
|
||||
@@ -180,9 +183,13 @@ typedef double ggml_float;
|
||||
#undef bool
|
||||
#define bool _Bool
|
||||
#else
|
||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||
#include <intrin.h>
|
||||
#else
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef __F16C__
|
||||
|
||||
|
||||
17
llama-util.h
17
llama-util.h
@@ -14,6 +14,7 @@
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <stdexcept>
|
||||
|
||||
#ifdef __has_include
|
||||
#if __has_include(<unistd.h>)
|
||||
@@ -74,7 +75,7 @@ struct llama_file {
|
||||
llama_file(const char * fname, const char * mode) {
|
||||
fp = std::fopen(fname, mode);
|
||||
if (fp == NULL) {
|
||||
throw format("failed to open %s: %s", fname, std::strerror(errno));
|
||||
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
|
||||
}
|
||||
seek(0, SEEK_END);
|
||||
size = tell();
|
||||
@@ -107,10 +108,10 @@ struct llama_file {
|
||||
errno = 0;
|
||||
std::size_t ret = std::fread(ptr, size, 1, fp);
|
||||
if (ferror(fp)) {
|
||||
throw format("read error: %s", strerror(errno));
|
||||
throw std::runtime_error(format("read error: %s", strerror(errno)));
|
||||
}
|
||||
if (ret != 1) {
|
||||
throw std::string("unexpectedly reached end of file");
|
||||
throw std::runtime_error(std::string("unexpectedly reached end of file"));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -133,7 +134,7 @@ struct llama_file {
|
||||
errno = 0;
|
||||
size_t ret = std::fwrite(ptr, size, 1, fp);
|
||||
if (ret != 1) {
|
||||
throw format("write error: %s", strerror(errno));
|
||||
throw std::runtime_error(format("write error: %s", strerror(errno)));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -180,7 +181,7 @@ struct llama_mmap {
|
||||
#endif
|
||||
addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
|
||||
if (addr == MAP_FAILED) {
|
||||
throw format("mmap failed: %s", strerror(errno));
|
||||
throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
|
||||
}
|
||||
|
||||
if (prefetch) {
|
||||
@@ -207,7 +208,7 @@ struct llama_mmap {
|
||||
DWORD error = GetLastError();
|
||||
|
||||
if (hMapping == NULL) {
|
||||
throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
|
||||
throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
|
||||
}
|
||||
|
||||
addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
|
||||
@@ -215,7 +216,7 @@ struct llama_mmap {
|
||||
CloseHandle(hMapping);
|
||||
|
||||
if (addr == NULL) {
|
||||
throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str());
|
||||
throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
|
||||
}
|
||||
|
||||
#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
|
||||
@@ -245,7 +246,7 @@ struct llama_mmap {
|
||||
|
||||
llama_mmap(struct llama_file *, bool prefetch = true) {
|
||||
(void)prefetch;
|
||||
throw std::string("mmap not supported");
|
||||
throw std::runtime_error(std::string("mmap not supported"));
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
16
llama.cpp
16
llama.cpp
@@ -970,8 +970,6 @@ static void llama_model_load_internal(
|
||||
|
||||
// prepare memory for the weights
|
||||
{
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const uint32_t n_embd = hparams.n_embd;
|
||||
const uint32_t n_layer = hparams.n_layer;
|
||||
const uint32_t n_vocab = hparams.n_vocab;
|
||||
@@ -1052,6 +1050,13 @@ static bool llama_eval_internal(
|
||||
const int n_tokens,
|
||||
const int n_past,
|
||||
const int n_threads) {
|
||||
|
||||
// enforce that the first token is BOS
|
||||
if (n_past == 0 && tokens[0] != llama_token_bos()) {
|
||||
fprintf(stderr, "%s: first token must be BOS\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
const int64_t t_start_us = ggml_time_us();
|
||||
|
||||
const int N = n_tokens;
|
||||
@@ -1482,7 +1487,7 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
|
||||
}
|
||||
|
||||
if (bos) {
|
||||
output.push_back(1);
|
||||
output.push_back(llama_token_bos());
|
||||
}
|
||||
|
||||
tokenizer.tokenize(text, output);
|
||||
@@ -1791,7 +1796,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
|
||||
float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat);
|
||||
|
||||
// Sample the next word X using top-k sampling
|
||||
llama_sample_top_k(nullptr, candidates, int(k));
|
||||
llama_sample_top_k(nullptr, candidates, int(k), 1);
|
||||
if (ctx) {
|
||||
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||
}
|
||||
@@ -2727,11 +2732,14 @@ int llama_eval(
|
||||
fprintf(stderr, "%s: failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// get a more accurate load time, upon first eval
|
||||
// TODO: fix this
|
||||
if (!ctx->has_evaluated_once) {
|
||||
ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
|
||||
ctx->has_evaluated_once = true;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
8
llama.h
8
llama.h
@@ -202,16 +202,16 @@ extern "C" {
|
||||
LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
|
||||
|
||||
/// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
|
||||
LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep = 1);
|
||||
LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep);
|
||||
|
||||
/// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
|
||||
LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
|
||||
LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
|
||||
|
||||
/// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
|
||||
LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep = 1);
|
||||
LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep);
|
||||
|
||||
/// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
|
||||
LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
|
||||
LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
|
||||
LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
|
||||
|
||||
/// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
|
||||
|
||||
43
scripts/ppl-run-all.sh
Executable file
43
scripts/ppl-run-all.sh
Executable file
@@ -0,0 +1,43 @@
|
||||
#!/bin/bash
|
||||
|
||||
#
|
||||
# quantize
|
||||
#
|
||||
|
||||
# 7B
|
||||
time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q4_0.bin q4_0 2>&1 | tee ../qnt-7b-q4_0.txt
|
||||
time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q4_1.bin q4_1 2>&1 | tee ../qnt-7b-q4_1.txt
|
||||
time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q4_2.bin q4_2 2>&1 | tee ../qnt-7b-q4_2.txt
|
||||
time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q5_0.bin q5_0 2>&1 | tee ../qnt-7b-q5_0.txt
|
||||
time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q5_1.bin q5_1 2>&1 | tee ../qnt-7b-q5_1.txt
|
||||
time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q8_0.bin q8_0 2>&1 | tee ../qnt-7b-q8_0.txt
|
||||
|
||||
# 13B
|
||||
time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q4_0.bin q4_0 2>&1 | tee ../qnt-13b-q4_0.txt
|
||||
time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q4_1.bin q4_1 2>&1 | tee ../qnt-13b-q4_1.txt
|
||||
time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q4_2.bin q4_2 2>&1 | tee ../qnt-13b-q4_2.txt
|
||||
time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q5_0.bin q5_0 2>&1 | tee ../qnt-13b-q5_0.txt
|
||||
time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q5_1.bin q5_1 2>&1 | tee ../qnt-13b-q5_1.txt
|
||||
time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q8_0.bin q8_0 2>&1 | tee ../qnt-13b-q8_0.txt
|
||||
|
||||
#
|
||||
# perplexity
|
||||
#
|
||||
|
||||
# 7B
|
||||
time ./bin/perplexity -m ../models/7B/ggml-model-f16.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-f16.txt
|
||||
time ./bin/perplexity -m ../models/7B/ggml-model-q4_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q4_0.txt
|
||||
time ./bin/perplexity -m ../models/7B/ggml-model-q4_1.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q4_1.txt
|
||||
time ./bin/perplexity -m ../models/7B/ggml-model-q4_2.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q4_2.txt
|
||||
time ./bin/perplexity -m ../models/7B/ggml-model-q5_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q5_0.txt
|
||||
time ./bin/perplexity -m ../models/7B/ggml-model-q5_1.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q5_1.txt
|
||||
time ./bin/perplexity -m ../models/7B/ggml-model-q8_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q8_0.txt
|
||||
|
||||
# 13B
|
||||
time ./bin/perplexity -m ../models/13B/ggml-model-f16.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-f16.txt
|
||||
time ./bin/perplexity -m ../models/13B/ggml-model-q4_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q4_0.txt
|
||||
time ./bin/perplexity -m ../models/13B/ggml-model-q4_1.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q4_1.txt
|
||||
time ./bin/perplexity -m ../models/13B/ggml-model-q4_2.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q4_2.txt
|
||||
time ./bin/perplexity -m ../models/13B/ggml-model-q5_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q5_0.txt
|
||||
time ./bin/perplexity -m ../models/13B/ggml-model-q5_1.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q5_1.txt
|
||||
time ./bin/perplexity -m ../models/13B/ggml-model-q8_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q8_0.txt
|
||||
@@ -32,7 +32,7 @@ void test_top_k(const std::vector<float> & probs,
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
llama_sample_softmax(nullptr, &candidates_p);
|
||||
DUMP(&candidates_p);
|
||||
llama_sample_top_k(nullptr, &candidates_p, k);
|
||||
llama_sample_top_k(nullptr, &candidates_p, k, 1);
|
||||
DUMP(&candidates_p);
|
||||
|
||||
assert(candidates_p.size == expected_probs.size());
|
||||
@@ -57,7 +57,7 @@ void test_top_p(const std::vector<float> & probs,
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
llama_sample_softmax(nullptr, &candidates_p);
|
||||
DUMP(&candidates_p);
|
||||
llama_sample_top_p(nullptr, &candidates_p, p);
|
||||
llama_sample_top_p(nullptr, &candidates_p, p, 1);
|
||||
DUMP(&candidates_p);
|
||||
|
||||
assert(candidates_p.size == expected_probs.size());
|
||||
@@ -80,7 +80,7 @@ void test_tfs(const std::vector<float> & probs,
|
||||
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
DUMP(&candidates_p);
|
||||
llama_sample_tail_free(nullptr, &candidates_p, z);
|
||||
llama_sample_tail_free(nullptr, &candidates_p, z, 1);
|
||||
DUMP(&candidates_p);
|
||||
|
||||
assert(candidates_p.size == expected_probs.size());
|
||||
@@ -103,7 +103,7 @@ void test_typical(const std::vector<float> & probs,
|
||||
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
DUMP(&candidates_p);
|
||||
llama_sample_typical(nullptr, &candidates_p, p);
|
||||
llama_sample_typical(nullptr, &candidates_p, p, 1);
|
||||
DUMP(&candidates_p);
|
||||
|
||||
assert(candidates_p.size == expected_probs.size());
|
||||
|
||||
Reference in New Issue
Block a user