mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-02-26 14:23:22 +02:00
Compare commits
27 Commits
master-63d
...
master-fab
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
fab49c685e | ||
|
|
b8ee340abe | ||
|
|
9ecb30f959 | ||
|
|
29cf5596fe | ||
|
|
3de84b2606 | ||
|
|
affc76edfd | ||
|
|
ea600071cb | ||
|
|
07e9ace0f9 | ||
|
|
ec2e10c444 | ||
|
|
d2c59b8ba4 | ||
|
|
503db28849 | ||
|
|
8a203f9fa1 | ||
|
|
4fd3e29297 | ||
|
|
2d5db48371 | ||
|
|
6986c7835a | ||
|
|
943e6081cc | ||
|
|
7694b52b9a | ||
|
|
79e3efb0e9 | ||
|
|
4b7e245adf | ||
|
|
5ea4339273 | ||
|
|
ee9654138a | ||
|
|
dc271c52ed | ||
|
|
c238b5873a | ||
|
|
2b2646931b | ||
|
|
42627421ec | ||
|
|
9560655409 | ||
|
|
2a5ee023ad |
2
.github/workflows/build.yml
vendored
2
.github/workflows/build.yml
vendored
@@ -165,7 +165,7 @@ jobs:
|
||||
- build: 'clblast'
|
||||
defines: '-DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
|
||||
- build: 'openblas'
|
||||
defines: '-DLLAMA_OPENBLAS=ON -DBLAS_LIBRARIES="/LIBPATH:$env:RUNNER_TEMP/openblas/lib" -DOPENBLAS_INC="$env:RUNNER_TEMP/openblas/include"'
|
||||
defines: '-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include"'
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
|
||||
67
BLIS.md
Normal file
67
BLIS.md
Normal file
@@ -0,0 +1,67 @@
|
||||
BLIS Installation Manual
|
||||
------------------------
|
||||
|
||||
BLIS is a portable software framework for high-performance BLAS-like dense linear algebra libraries. It has received awards and recognition, including the 2023 James H. Wilkinson Prize for Numerical Software and the 2020 SIAM Activity Group on Supercomputing Best Paper Prize. BLIS provides a new BLAS-like API and a compatibility layer for traditional BLAS routine calls. It offers features such as object-based API, typed API, BLAS and CBLAS compatibility layers.
|
||||
|
||||
Project URL: https://github.com/flame/blis
|
||||
|
||||
### Prepare:
|
||||
|
||||
Compile BLIS:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/flame/blis
|
||||
cd blis
|
||||
./configure --enable-cblas -t openmp,pthreads auto
|
||||
# will install to /usr/local/ by default.
|
||||
make -j
|
||||
```
|
||||
|
||||
Install BLIS:
|
||||
|
||||
```bash
|
||||
sudo make install
|
||||
```
|
||||
|
||||
We recommend using openmp since it's easier to modify the cores been used.
|
||||
|
||||
### llama.cpp compilation
|
||||
|
||||
Makefile:
|
||||
|
||||
```bash
|
||||
make LLAMA_BLIS=1 -j
|
||||
# make LLAMA_BLIS=1 benchmark-matmult
|
||||
```
|
||||
|
||||
CMake:
|
||||
|
||||
```bash
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=FLAME ..
|
||||
make -j
|
||||
```
|
||||
|
||||
### llama.cpp execution
|
||||
|
||||
According to the BLIS documentation, we could set the following
|
||||
environment variables to modify the behavior of openmp:
|
||||
|
||||
```
|
||||
export GOMP_GPU_AFFINITY="0-19"
|
||||
export BLIS_NUM_THREADS=14
|
||||
```
|
||||
|
||||
And then run the binaries as normal.
|
||||
|
||||
|
||||
### Intel specific issue
|
||||
|
||||
Some might get the error message saying that `libimf.so` cannot be found.
|
||||
Please follow this [stackoverflow page](https://stackoverflow.com/questions/70687930/intel-oneapi-2022-libimf-so-no-such-file-or-directory-during-openmpi-compila).
|
||||
|
||||
### Reference:
|
||||
|
||||
1. https://github.com/flame/blis#getting-started
|
||||
2. https://github.com/flame/blis/blob/master/docs/Multithreading.md
|
||||
@@ -65,7 +65,8 @@ endif()
|
||||
|
||||
# 3rd party libs
|
||||
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
|
||||
option(LLAMA_OPENBLAS "llama: use OpenBLAS" OFF)
|
||||
option(LLAMA_BLAS "llama: use BLAS" OFF)
|
||||
option(LLAMA_BLAS_VENDOR "llama: BLA_VENDOR from https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors" Generic)
|
||||
option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
|
||||
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
|
||||
|
||||
@@ -145,36 +146,28 @@ if (APPLE AND LLAMA_ACCELERATE)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (LLAMA_OPENBLAS)
|
||||
if (LLAMA_BLAS)
|
||||
if (LLAMA_STATIC)
|
||||
set(BLA_STATIC ON)
|
||||
endif()
|
||||
|
||||
set(BLA_VENDOR OpenBLAS)
|
||||
if ($(CMAKE_VERSION) VERSION_GREATER_EQUAL 3.22)
|
||||
set(BLA_SIZEOF_INTEGER 8)
|
||||
endif()
|
||||
set(BLA_VENDOR ${LLAMA_BLAS_VENDOR})
|
||||
find_package(BLAS)
|
||||
if (BLAS_FOUND)
|
||||
message(STATUS "OpenBLAS found")
|
||||
message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")
|
||||
|
||||
add_compile_options(${BLAS_LINKER_FLAGS})
|
||||
add_compile_definitions(GGML_USE_OPENBLAS)
|
||||
add_link_options(${BLAS_LIBRARIES})
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} openblas)
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES})
|
||||
|
||||
# find header file
|
||||
set(OPENBLAS_INCLUDE_SEARCH_PATHS
|
||||
/usr/include
|
||||
/usr/include/openblas
|
||||
/usr/include/openblas-base
|
||||
/usr/local/include
|
||||
/usr/local/include/openblas
|
||||
/usr/local/include/openblas-base
|
||||
/opt/OpenBLAS/include
|
||||
$ENV{OpenBLAS_HOME}
|
||||
$ENV{OpenBLAS_HOME}/include
|
||||
)
|
||||
find_path(OPENBLAS_INC NAMES cblas.h PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
|
||||
add_compile_options(-I${OPENBLAS_INC})
|
||||
message("${BLAS_LIBRARIES} ${BLAS_INCLUDE_DIRS}")
|
||||
include_directories(${BLAS_INCLUDE_DIRS})
|
||||
else()
|
||||
message(WARNING "OpenBLAS not found")
|
||||
message(WARNING "BLAS not found, please refer to "
|
||||
"https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
|
||||
" to set correct LLAMA_BLAS_VENDOR")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
|
||||
6
Makefile
6
Makefile
@@ -115,13 +115,17 @@ ifndef LLAMA_NO_ACCELERATE
|
||||
endif
|
||||
endif
|
||||
ifdef LLAMA_OPENBLAS
|
||||
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
|
||||
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas
|
||||
ifneq ($(shell grep -e "Arch Linux" -e "ID_LIKE=arch" /etc/os-release 2>/dev/null),)
|
||||
LDFLAGS += -lopenblas -lcblas
|
||||
else
|
||||
LDFLAGS += -lopenblas
|
||||
endif
|
||||
endif
|
||||
ifdef LLAMA_BLIS
|
||||
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
|
||||
LDFLAGS += -lblis -L/usr/local/lib
|
||||
endif
|
||||
ifdef LLAMA_CUBLAS
|
||||
CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
|
||||
CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
|
||||
|
||||
41
README.md
41
README.md
@@ -9,6 +9,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
|
||||
|
||||
**Hot topics:**
|
||||
|
||||
- Quantization formats `Q4` and `Q8` have changed again (19 May) - [(info)](https://github.com/ggerganov/llama.cpp/pull/1508)
|
||||
- Quantization formats `Q4` and `Q5` have changed - requantize any old models [(info)](https://github.com/ggerganov/llama.cpp/pull/1405)
|
||||
- [Roadmap May 2023](https://github.com/ggerganov/llama.cpp/discussions/1220)
|
||||
|
||||
@@ -55,7 +56,7 @@ The main goal of `llama.cpp` is to run the LLaMA model using 4-bit integer quant
|
||||
- Mixed F16 / F32 precision
|
||||
- 4-bit, 5-bit and 8-bit integer quantization support
|
||||
- Runs on the CPU
|
||||
- OpenBLAS support
|
||||
- Supports OpenBLAS/Apple BLAS/ARM Performance Lib/ATLAS/BLIS/Intel MKL/NVHPC/ACML/SCSL/SGIMATH and [more](https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors) in BLAS
|
||||
- cuBLAS and CLBlast support
|
||||
|
||||
The original implementation of `llama.cpp` was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022).
|
||||
@@ -80,6 +81,7 @@ as the main playground for developing new features for the [ggml](https://github
|
||||
- [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
|
||||
- [X] [OpenBuddy 🐶 (Multilingual)](https://github.com/OpenBuddy/OpenBuddy)
|
||||
- [X] [Pygmalion 7B / Metharme 7B](#using-pygmalion-7b--metharme-7b)
|
||||
- [X] [WizardLM](https://github.com/nlpxucan/WizardLM)
|
||||
|
||||
**Bindings:**
|
||||
|
||||
@@ -272,10 +274,25 @@ Building the program with BLAS support may lead to some performance improvements
|
||||
```bash
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -DLLAMA_OPENBLAS=ON
|
||||
cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
|
||||
cmake --build . --config Release
|
||||
```
|
||||
|
||||
- BLIS
|
||||
|
||||
Check [BLIS.md](BLIS.md) for more information.
|
||||
|
||||
- Intel MKL
|
||||
|
||||
By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. You may also specify it by:
|
||||
|
||||
```bash
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
cmake --build . -config Release
|
||||
```
|
||||
|
||||
- cuBLAS
|
||||
|
||||
This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
|
||||
@@ -333,16 +350,16 @@ Several quantization methods are supported. They differ in the resulting model d
|
||||
|
||||
| Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 |
|
||||
|------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|
|
||||
| 7B | perplexity | 5.9066 | 6.1565 | 6.0910 | 5.9862 | 5.9481 | 5.9069 |
|
||||
| 7B | file size | 13.0G | 4.0G | 4.8G | 4.4G | 4.8G | 7.1G |
|
||||
| 7B | ms/tok @ 4th | 128 | 50 | 54 | 75 | 83 | 75 |
|
||||
| 7B | ms/tok @ 8th | 123 | 44 | 52 | 53 | 58 | 72 |
|
||||
| 7B | bits/weight | 16.0 | 5.0 | 6.0 | 5.5 | 6.0 | 9.0 |
|
||||
| 13B | perplexity | 5.2543 | 5.3860 | 5.3607 | 5.2856 | 5.2706 | 5.2548 |
|
||||
| 13B | file size | 25.0G | 7.6G | 9.1G | 8.4G | 9.1G | 14G |
|
||||
| 13B | ms/tok @ 4th | 239 | 93 | 101 | 150 | 164 | 141 |
|
||||
| 13B | ms/tok @ 8th | 240 | 81 | 96 | 96 | 104 | 136 |
|
||||
| 13B | bits/weight | 16.0 | 5.0 | 6.0 | 5.5 | 6.0 | 9.0 |
|
||||
| 7B | perplexity | 5.9066 | 6.1565 | 6.0912 | 5.9862 | 5.9481 | 5.9070 |
|
||||
| 7B | file size | 13.0G | 3.5G | 3.9G | 4.3G | 4.7G | 6.7G |
|
||||
| 7B | ms/tok @ 4th | 127 | 55 | 54 | 76 | 83 | 72 |
|
||||
| 7B | ms/tok @ 8th | 122 | 43 | 45 | 52 | 56 | 67 |
|
||||
| 7B | bits/weight | 16.0 | 4.5 | 5.0 | 5.5 | 6.0 | 8.5 |
|
||||
| 13B | perplexity | 5.2543 | 5.3860 | 5.3608 | 5.2856 | 5.2706 | 5.2548 |
|
||||
| 13B | file size | 25.0G | 6.8G | 7.6G | 8.3G | 9.1G | 13G |
|
||||
| 13B | ms/tok @ 4th | - | 103 | 105 | 148 | 160 | 131 |
|
||||
| 13B | ms/tok @ 8th | - | 73 | 82 | 98 | 105 | 128 |
|
||||
| 13B | bits/weight | 16.0 | 4.5 | 5.0 | 5.5 | 6.0 | 8.5 |
|
||||
|
||||
### Perplexity (measuring model quality)
|
||||
|
||||
|
||||
@@ -121,7 +121,6 @@ def make_tensors_list() -> List[str]:
|
||||
f'layers.{i}.feed_forward.w1.weight',
|
||||
f'layers.{i}.feed_forward.w2.weight',
|
||||
f'layers.{i}.feed_forward.w3.weight',
|
||||
f'layers.{i}.atttention_norm.weight',
|
||||
f'layers.{i}.ffn_norm.weight',
|
||||
]
|
||||
return ret
|
||||
@@ -1055,7 +1054,7 @@ def load_some_model(path: Path) -> ModelPlus:
|
||||
files = list(path.glob("model-00001-of-*.safetensors"))
|
||||
if not files:
|
||||
# Try the PyTorch patterns too, with lower priority
|
||||
globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt"]
|
||||
globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin" ]
|
||||
files = [file for glob in globs for file in path.glob(glob)]
|
||||
if not files:
|
||||
# Try GGML too, but with lower priority, since if both a non-GGML
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
#include <locale.h>
|
||||
#include "ggml.h"
|
||||
#include "build-info.h"
|
||||
|
||||
#include <locale.h>
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <cstring>
|
||||
@@ -211,6 +212,7 @@ int main(int argc, char ** argv) {
|
||||
printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
|
||||
printf("=====================================================================================\n");
|
||||
|
||||
double gflops_sum = 0;
|
||||
for (int i=0;i<benchmark_params.n_iterations ;i++) {
|
||||
|
||||
long long int start = ggml_time_us();
|
||||
@@ -219,6 +221,7 @@ int main(int argc, char ** argv) {
|
||||
long long int stop = ggml_time_us();
|
||||
long long int usec = stop-start;
|
||||
double gflops = (double)(flops_per_matrix)/usec/1000.0;
|
||||
gflops_sum += gflops;
|
||||
printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%10.2f\n",
|
||||
i,
|
||||
gf31.n_threads,
|
||||
@@ -248,4 +251,7 @@ int main(int argc, char ** argv) {
|
||||
// Running a different graph computation to make sure we override the CPU cache lines
|
||||
ggml_graph_compute(ctx, &gf32);
|
||||
}
|
||||
printf("\n");
|
||||
printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));
|
||||
printf("=====================================================================================\n");
|
||||
}
|
||||
|
||||
151
examples/chat-persistent.sh
Executable file
151
examples/chat-persistent.sh
Executable file
@@ -0,0 +1,151 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
cd "$(dirname "$0")/.." || exit
|
||||
|
||||
if [[ -z "${PROMPT_CACHE_FILE+x}" || -z "${CHAT_SAVE_DIR+x}" ]]; then
|
||||
echo >&2 "error: PROMPT_CACHE_FILE and CHAT_SAVE_DIR must be provided"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
MODEL="${MODEL:-./models/13B/ggml-model-q4_0.bin}"
|
||||
PROMPT_TEMPLATE="${PROMPT_TEMPLATE:-./prompts/chat.txt}"
|
||||
USER_NAME="${USER_NAME:-User}"
|
||||
AI_NAME="${AI_NAME:-ChatLLaMa}"
|
||||
DATE_TIME="$(date +%H:%M)"
|
||||
DATE_YEAR="$(date +%Y)"
|
||||
|
||||
LOG="${CHAT_SAVE_DIR}/main.log"
|
||||
LOG_BG="${CHAT_SAVE_DIR}/main-bg.log"
|
||||
CUR_PROMPT_FILE="${CHAT_SAVE_DIR}/current-prompt.txt"
|
||||
CUR_PROMPT_CACHE="${CHAT_SAVE_DIR}/current-cache.bin"
|
||||
NEXT_PROMPT_FILE="${CHAT_SAVE_DIR}/next-prompt.txt"
|
||||
NEXT_PROMPT_CACHE="${CHAT_SAVE_DIR}/next-cache.bin"
|
||||
|
||||
SESSION_SIZE_MSG_PATTERN='main: session file matches \d+ / \d+'
|
||||
SAMPLE_TIME_MSG_PATTERN='sample time =\s+\d+.\d+ ms /\s+\d+'
|
||||
SED_DELETE_MESSAGES="/^(${USER_NAME}:|${AI_NAME}:|\\.\\.\\.)/,\$d"
|
||||
|
||||
CTX_SIZE=2048
|
||||
CTX_ROTATE_POINT=$((CTX_SIZE * 3 / 5)) # REVIEW
|
||||
OPTS=(--model "$MODEL" --ctx_size "$CTX_SIZE" --repeat_last_n 256 "$@")
|
||||
|
||||
# An unbuffered `tail -c+N`
|
||||
skip_bytes() {
|
||||
LANG=C IFS= read -r -n "$1" -d '' c
|
||||
while LANG=C IFS= read -r -n 1 -d '' c; do
|
||||
printf '%s' "$c"
|
||||
done
|
||||
}
|
||||
|
||||
mkdir -p "$CHAT_SAVE_DIR"
|
||||
echo >"$LOG"
|
||||
trap "tail -n100 ${LOG}" EXIT
|
||||
|
||||
if [[ ! -e "$CUR_PROMPT_FILE" ]]; then
|
||||
sed -e "s/\[\[USER_NAME\]\]/${USER_NAME}/g" \
|
||||
-e "s/\[\[AI_NAME\]\]/${AI_NAME}/g" \
|
||||
-e "s/\[\[DATE_TIME\]\]/${DATE_TIME}/g" \
|
||||
-e "s/\[\[DATE_YEAR\]\]/${DATE_YEAR}/g" \
|
||||
"$PROMPT_TEMPLATE" >"$CUR_PROMPT_FILE"
|
||||
fi
|
||||
|
||||
if [[ ! -e "$NEXT_PROMPT_FILE" ]]; then
|
||||
sed -r "$SED_DELETE_MESSAGES" "$CUR_PROMPT_FILE" >"$NEXT_PROMPT_FILE"
|
||||
fi
|
||||
|
||||
if [[ "$(tail -c4 "$NEXT_PROMPT_FILE")" != "..." ]]; then
|
||||
echo '...' >>"$NEXT_PROMPT_FILE"
|
||||
fi
|
||||
|
||||
if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then
|
||||
echo 'Prompt cache does not exist, building...'
|
||||
# Default batch_size to 8 here for better user feedback during initial prompt processing
|
||||
./main 2>>"$LOG" \
|
||||
--batch_size 8 \
|
||||
"${OPTS[@]}" \
|
||||
--prompt-cache "$PROMPT_CACHE_FILE" \
|
||||
--file "$CUR_PROMPT_FILE" \
|
||||
--n_predict 1
|
||||
echo
|
||||
echo 'Done!'
|
||||
fi
|
||||
|
||||
if [[ ! -e "$CUR_PROMPT_CACHE" ]]; then
|
||||
cp "$PROMPT_CACHE_FILE" "$CUR_PROMPT_CACHE"
|
||||
fi
|
||||
if [[ ! -e "$NEXT_PROMPT_CACHE" ]]; then
|
||||
cp "$PROMPT_CACHE_FILE" "$NEXT_PROMPT_CACHE"
|
||||
fi
|
||||
|
||||
printf '%s ' "$(< "$CUR_PROMPT_FILE")"
|
||||
n_tokens=0
|
||||
|
||||
while read -e line; do
|
||||
# Limit generation to remaining context, with a buffer and estimating 2 chars/token for input
|
||||
n_predict=$((CTX_SIZE - n_tokens - ${#line} / 2 - 32))
|
||||
|
||||
# Swap prompts when we're about to run out of context
|
||||
if ((n_predict <= 0)); then
|
||||
wait # for background main (below) to finish with next prompt
|
||||
mv "$NEXT_PROMPT_FILE" "$CUR_PROMPT_FILE"
|
||||
mv "$NEXT_PROMPT_CACHE" "$CUR_PROMPT_CACHE"
|
||||
|
||||
sed -r "$SED_DELETE_MESSAGES" "$CUR_PROMPT_FILE" >"$NEXT_PROMPT_FILE"
|
||||
echo '...' >>"$NEXT_PROMPT_FILE"
|
||||
cp "$PROMPT_CACHE_FILE" "$NEXT_PROMPT_CACHE"
|
||||
|
||||
n_tokens=0
|
||||
n_predict=$((CTX_SIZE / 2))
|
||||
fi
|
||||
|
||||
echo " ${line}" >>"$CUR_PROMPT_FILE"
|
||||
if ((n_tokens > CTX_ROTATE_POINT)); then
|
||||
echo " ${line}" >>"$NEXT_PROMPT_FILE"
|
||||
fi
|
||||
|
||||
n_prompt_len_pre=$(($(wc -c <"$CUR_PROMPT_FILE")))
|
||||
|
||||
printf '%s: ' "$AI_NAME" >>"$CUR_PROMPT_FILE"
|
||||
|
||||
./main 2>>"$LOG" "${OPTS[@]}" \
|
||||
--prompt-cache "$CUR_PROMPT_CACHE" \
|
||||
--prompt-cache-all \
|
||||
--file "$CUR_PROMPT_FILE" \
|
||||
--reverse-prompt "${USER_NAME}:" \
|
||||
--n_predict "$n_predict" |
|
||||
skip_bytes 1 | # skip BOS token added by ./main
|
||||
tee "$CUR_PROMPT_FILE.tmp" | # save prompt + generation to tmp file
|
||||
skip_bytes "$n_prompt_len_pre" # print generation
|
||||
|
||||
mv "$CUR_PROMPT_FILE.tmp" "$CUR_PROMPT_FILE"
|
||||
|
||||
# if we hit n_predict instead of reverse-prompt, we need to add the prompt
|
||||
if [[ "$(tail -n1 "$CUR_PROMPT_FILE")" != "${USER_NAME}:" ]]; then
|
||||
printf '\n%s:' "$USER_NAME"
|
||||
printf '\n%s:' "$USER_NAME" >> "$CUR_PROMPT_FILE"
|
||||
fi
|
||||
|
||||
printf ' '
|
||||
|
||||
# HACK get num tokens from debug message
|
||||
# TODO get both messages in one go
|
||||
if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
|
||||
! sample_time_msg="$( tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
|
||||
echo >&2 "Couldn't get number of tokens from ./main output!"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
n_tokens=$(($(cut -d/ -f2 <<<"$session_size_msg") + $(cut -d/ -f2 <<<"$sample_time_msg")))
|
||||
|
||||
if ((n_tokens > CTX_ROTATE_POINT)); then
|
||||
tail -c+$((n_prompt_len_pre + 1)) "$CUR_PROMPT_FILE" >>"$NEXT_PROMPT_FILE"
|
||||
fi
|
||||
|
||||
# Update cache for next prompt in background, ideally during user input
|
||||
./main >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
|
||||
--prompt-cache "$NEXT_PROMPT_CACHE" \
|
||||
--file "$NEXT_PROMPT_FILE" \
|
||||
--n_predict 1 &
|
||||
done
|
||||
@@ -321,12 +321,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
} else if (arg == "--n-parts") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_parts = std::stoi(argv[i]);
|
||||
} else if (arg == "-h" || arg == "--help") {
|
||||
gpt_print_usage(argc, argv, default_params);
|
||||
exit(0);
|
||||
@@ -357,7 +351,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
}
|
||||
if (params.prompt_cache_all &&
|
||||
(params.interactive || params.interactive_first ||
|
||||
params.instruct || params.antiprompt.size())) {
|
||||
params.instruct)) {
|
||||
fprintf(stderr, "error: --prompt-cache-all not supported in interactive mode yet\n");
|
||||
gpt_print_usage(argc, argv, default_params);
|
||||
exit(1);
|
||||
@@ -379,8 +373,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
fprintf(stderr, " -ins, --instruct run in instruction mode (use with Alpaca models)\n");
|
||||
fprintf(stderr, " --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n");
|
||||
fprintf(stderr, " -r PROMPT, --reverse-prompt PROMPT\n");
|
||||
fprintf(stderr, " run in interactive mode and poll user input upon seeing PROMPT (can be\n");
|
||||
fprintf(stderr, " specified more than once for multiple prompts).\n");
|
||||
fprintf(stderr, " halt generation at PROMPT, return control in interactive mode\n");
|
||||
fprintf(stderr, " (can be specified more than once for multiple prompts).\n");
|
||||
fprintf(stderr, " --color colorise output to distinguish prompt and user input from generations\n");
|
||||
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
|
||||
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
||||
@@ -418,7 +412,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
fprintf(stderr, " --no-penalize-nl do not penalize newline token\n");
|
||||
fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value\n");
|
||||
fprintf(stderr, " --temp N temperature (default: %.1f)\n", (double)params.temp);
|
||||
fprintf(stderr, " --n-parts N number of model parts (default: -1 = determine from dimensions)\n");
|
||||
fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||
fprintf(stderr, " --perplexity compute perplexity over the prompt\n");
|
||||
fprintf(stderr, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
|
||||
@@ -473,7 +466,6 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
|
||||
auto lparams = llama_context_default_params();
|
||||
|
||||
lparams.n_ctx = params.n_ctx;
|
||||
lparams.n_parts = params.n_parts;
|
||||
lparams.n_gpu_layers = params.n_gpu_layers;
|
||||
lparams.seed = params.seed;
|
||||
lparams.f16_kv = params.memory_f16;
|
||||
@@ -586,6 +578,37 @@ void console_set_color(console_state & con_st, console_color_t color) {
|
||||
}
|
||||
|
||||
char32_t getchar32() {
|
||||
#if defined(_WIN32)
|
||||
HANDLE hConsole = GetStdHandle(STD_INPUT_HANDLE);
|
||||
wchar_t high_surrogate = 0;
|
||||
|
||||
while (true) {
|
||||
INPUT_RECORD record;
|
||||
DWORD count;
|
||||
if (!ReadConsoleInputW(hConsole, &record, 1, &count) || count == 0) {
|
||||
return WEOF;
|
||||
}
|
||||
|
||||
if (record.EventType == KEY_EVENT && record.Event.KeyEvent.bKeyDown) {
|
||||
wchar_t wc = record.Event.KeyEvent.uChar.UnicodeChar;
|
||||
if (wc == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
|
||||
high_surrogate = wc;
|
||||
continue;
|
||||
} else if ((wc >= 0xDC00) && (wc <= 0xDFFF)) { // Check if wc is a low surrogate
|
||||
if (high_surrogate != 0) { // Check if we have a high surrogate
|
||||
return ((high_surrogate - 0xD800) << 10) + (wc - 0xDC00) + 0x10000;
|
||||
}
|
||||
}
|
||||
|
||||
high_surrogate = 0; // Reset the high surrogate
|
||||
return static_cast<char32_t>(wc);
|
||||
}
|
||||
}
|
||||
#else
|
||||
wchar_t wc = getwchar();
|
||||
if (static_cast<wint_t>(wc) == WEOF) {
|
||||
return WEOF;
|
||||
@@ -604,6 +627,7 @@ char32_t getchar32() {
|
||||
#endif
|
||||
|
||||
return static_cast<char32_t>(wc);
|
||||
#endif
|
||||
}
|
||||
|
||||
void pop_cursor(console_state & con_st) {
|
||||
@@ -757,7 +781,7 @@ bool console_readline(console_state & con_st, std::string & line) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (input_char == WEOF || input_char == 0x04 /* Ctrl+D*/) {
|
||||
if (input_char == (char32_t) WEOF || input_char == 0x04 /* Ctrl+D*/) {
|
||||
end_of_stream = true;
|
||||
break;
|
||||
}
|
||||
@@ -772,7 +796,7 @@ bool console_readline(console_state & con_st, std::string & line) {
|
||||
char32_t code = getchar32();
|
||||
if (code == '[' || code == 0x1B) {
|
||||
// Discard the rest of the escape sequence
|
||||
while ((code = getchar32()) != WEOF) {
|
||||
while ((code = getchar32()) != (char32_t) WEOF) {
|
||||
if ((code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z') || code == '~') {
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -24,7 +24,6 @@ struct gpt_params {
|
||||
int32_t seed = -1; // RNG seed
|
||||
int32_t n_threads = get_num_physical_cores();
|
||||
int32_t n_predict = -1; // new tokens to predict
|
||||
int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions)
|
||||
int32_t n_ctx = 512; // context size
|
||||
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
|
||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||
@@ -45,15 +44,15 @@ struct gpt_params {
|
||||
float mirostat_tau = 5.00f; // target entropy
|
||||
float mirostat_eta = 0.10f; // learning rate
|
||||
|
||||
std::string model = "models/lamma-7B/ggml-model.bin"; // model path
|
||||
std::string prompt = "";
|
||||
std::string model = "models/7B/ggml-model.bin"; // model path
|
||||
std::string prompt = "";
|
||||
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
|
||||
std::string input_prefix = ""; // string to prefix user inputs with
|
||||
std::string input_suffix = ""; // string to suffix user inputs with
|
||||
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
|
||||
|
||||
std::string lora_adapter = ""; // lora adapter path
|
||||
std::string lora_base = ""; // base model path for the lora adapter
|
||||
std::string lora_base = ""; // base model path for the lora adapter
|
||||
|
||||
bool memory_f16 = true; // use f16 instead of f32 for memory kv
|
||||
bool random_prompt = false; // do not randomize prompt if none provided
|
||||
|
||||
@@ -6,7 +6,6 @@
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
params.model = "models/llama-7B/ggml-model.bin";
|
||||
|
||||
if (gpt_params_parse(argc, argv, params) == false) {
|
||||
return 1;
|
||||
@@ -32,6 +31,8 @@ int main(int argc, char ** argv) {
|
||||
params.prompt = gpt_random_prompt(rng);
|
||||
}
|
||||
|
||||
llama_init_backend();
|
||||
|
||||
llama_context * ctx;
|
||||
|
||||
// load the model
|
||||
|
||||
@@ -50,7 +50,6 @@ void sigint_handler(int signo) {
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
params.model = "models/llama-7B/ggml-model.bin";
|
||||
|
||||
if (gpt_params_parse(argc, argv, params) == false) {
|
||||
return 1;
|
||||
@@ -97,8 +96,7 @@ int main(int argc, char ** argv) {
|
||||
params.prompt = gpt_random_prompt(rng);
|
||||
}
|
||||
|
||||
// params.prompt = R"(// this function checks if the number n is prime
|
||||
//bool is_prime(int n) {)";
|
||||
llama_init_backend();
|
||||
|
||||
llama_context * ctx;
|
||||
g_ctx = &ctx;
|
||||
@@ -209,8 +207,8 @@ int main(int argc, char ** argv) {
|
||||
params.antiprompt.push_back("### Instruction:\n\n");
|
||||
}
|
||||
|
||||
// enable interactive mode if reverse prompt or interactive start is specified
|
||||
if (params.antiprompt.size() != 0 || params.interactive_first) {
|
||||
// enable interactive mode if interactive start is specified
|
||||
if (params.interactive_first) {
|
||||
params.interactive = true;
|
||||
}
|
||||
|
||||
@@ -242,7 +240,7 @@ int main(int argc, char ** argv) {
|
||||
sigint_action.sa_flags = 0;
|
||||
sigaction(SIGINT, &sigint_action, NULL);
|
||||
#elif defined (_WIN32)
|
||||
auto console_ctrl_handler = [](DWORD ctrl_type) -> BOOL {
|
||||
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
|
||||
return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
|
||||
};
|
||||
SetConsoleCtrlHandler(static_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
||||
@@ -306,7 +304,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
std::vector<llama_token> embd;
|
||||
|
||||
while (n_remain != 0 || params.interactive) {
|
||||
while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
|
||||
// predict
|
||||
if (embd.size() > 0) {
|
||||
// infinite text generation via context swapping
|
||||
@@ -504,9 +502,8 @@ int main(int argc, char ** argv) {
|
||||
console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
|
||||
}
|
||||
|
||||
// in interactive mode, and not currently processing queued inputs;
|
||||
// check if we should prompt the user for more
|
||||
if (params.interactive && (int) embd_inp.size() <= n_consumed) {
|
||||
// if not currently processing queued inputs;
|
||||
if ((int) embd_inp.size() <= n_consumed) {
|
||||
|
||||
// check for reverse prompt
|
||||
if (params.antiprompt.size()) {
|
||||
@@ -517,10 +514,21 @@ int main(int argc, char ** argv) {
|
||||
|
||||
is_antiprompt = false;
|
||||
// Check if each of the reverse prompts appears at the end of the output.
|
||||
// If we're not running interactively, the reverse prompt might be tokenized with some following characters
|
||||
// so we'll compensate for that by widening the search window a bit.
|
||||
for (std::string & antiprompt : params.antiprompt) {
|
||||
if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
|
||||
is_interacting = true;
|
||||
size_t extra_padding = params.interactive ? 0 : 2;
|
||||
size_t search_start_pos = last_output.length() > static_cast<size_t>(antiprompt.length() + extra_padding)
|
||||
? last_output.length() - static_cast<size_t>(antiprompt.length() + extra_padding)
|
||||
: 0;
|
||||
|
||||
if (last_output.find(antiprompt.c_str(), search_start_pos) != std::string::npos) {
|
||||
if (params.interactive) {
|
||||
is_interacting = true;
|
||||
console_set_color(con_st, CONSOLE_COLOR_USER_INPUT);
|
||||
}
|
||||
is_antiprompt = true;
|
||||
fflush(stdout);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -116,7 +116,6 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
params.model = "models/llama-7B/ggml-model.bin";
|
||||
|
||||
params.n_batch = 512;
|
||||
if (gpt_params_parse(argc, argv, params) == false) {
|
||||
@@ -144,6 +143,8 @@ int main(int argc, char ** argv) {
|
||||
params.prompt = gpt_random_prompt(rng);
|
||||
}
|
||||
|
||||
llama_init_backend();
|
||||
|
||||
llama_context * ctx;
|
||||
|
||||
// load the model and apply lora adapter, if any
|
||||
|
||||
@@ -321,7 +321,6 @@ int main(int argc, char ** argv) {
|
||||
auto lparams = llama_context_default_params();
|
||||
|
||||
lparams.n_ctx = 256;
|
||||
lparams.n_parts = 1;
|
||||
lparams.seed = 1;
|
||||
lparams.f16_kv = false;
|
||||
lparams.use_mlock = false;
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
#include "build-info.h"
|
||||
|
||||
#include "llama.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <map>
|
||||
#include <string>
|
||||
@@ -42,8 +42,6 @@ bool try_parse_ftype(const std::string & ftype_str, llama_ftype & ftype, std::st
|
||||
// ./quantize models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
|
||||
//
|
||||
int main(int argc, char ** argv) {
|
||||
ggml_time_init();
|
||||
|
||||
if (argc < 3) {
|
||||
fprintf(stderr, "usage: %s model-f32.bin [model-quant.bin] type [nthreads]\n", argv[0]);
|
||||
for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) {
|
||||
@@ -52,12 +50,7 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
// needed to initialize f16 tables
|
||||
{
|
||||
struct ggml_init_params params = { 0, NULL, false };
|
||||
struct ggml_context * ctx = ggml_init(params);
|
||||
ggml_free(ctx);
|
||||
}
|
||||
llama_init_backend();
|
||||
|
||||
// parse command line arguments
|
||||
const std::string fname_inp = argv[1];
|
||||
@@ -116,25 +109,25 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
const int64_t t_main_start_us = ggml_time_us();
|
||||
const int64_t t_main_start_us = llama_time_us();
|
||||
|
||||
int64_t t_quantize_us = 0;
|
||||
|
||||
// load the model
|
||||
{
|
||||
const int64_t t_start_us = ggml_time_us();
|
||||
const int64_t t_start_us = llama_time_us();
|
||||
|
||||
if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype, nthread)) {
|
||||
fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
|
||||
return 1;
|
||||
}
|
||||
|
||||
t_quantize_us = ggml_time_us() - t_start_us;
|
||||
t_quantize_us = llama_time_us() - t_start_us;
|
||||
}
|
||||
|
||||
// report timing
|
||||
{
|
||||
const int64_t t_main_end_us = ggml_time_us();
|
||||
const int64_t t_main_end_us = llama_time_us();
|
||||
|
||||
printf("\n");
|
||||
printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0);
|
||||
|
||||
@@ -8,7 +8,6 @@
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
params.model = "models/llama-7B/ggml-model.bin";
|
||||
params.seed = 42;
|
||||
params.n_threads = 4;
|
||||
params.repeat_last_n = 64;
|
||||
@@ -27,7 +26,6 @@ int main(int argc, char ** argv) {
|
||||
auto lparams = llama_context_default_params();
|
||||
|
||||
lparams.n_ctx = params.n_ctx;
|
||||
lparams.n_parts = params.n_parts;
|
||||
lparams.seed = params.seed;
|
||||
lparams.f16_kv = params.memory_f16;
|
||||
lparams.use_mmap = params.use_mmap;
|
||||
|
||||
137
ggml-cuda.cu
137
ggml-cuda.cu
@@ -42,19 +42,19 @@ typedef void (*dequantize_mul_mat_vec_cuda_t)(const void * vx, const float * y,
|
||||
#define QK4_0 32
|
||||
#define QR4_0 2
|
||||
typedef struct {
|
||||
float d; // delta
|
||||
half d; // delta
|
||||
uint8_t qs[QK4_0 / 2]; // nibbles / quants
|
||||
} block_q4_0;
|
||||
static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding");
|
||||
static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
|
||||
|
||||
#define QK4_1 32
|
||||
#define QR4_1 2
|
||||
typedef struct {
|
||||
float d; // delta
|
||||
float m; // min
|
||||
half d; // delta
|
||||
half m; // min
|
||||
uint8_t qs[QK4_1 / 2]; // nibbles / quants
|
||||
} block_q4_1;
|
||||
static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
|
||||
static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
|
||||
|
||||
#define QK5_0 32
|
||||
#define QR5_0 2
|
||||
@@ -78,14 +78,24 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +
|
||||
#define QK8_0 32
|
||||
#define QR8_0 1
|
||||
typedef struct {
|
||||
float d; // delta
|
||||
half d; // delta
|
||||
int8_t qs[QK8_0]; // quants
|
||||
} block_q8_0;
|
||||
static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
|
||||
static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
|
||||
|
||||
#define CUDA_MUL_BLOCK_SIZE 256
|
||||
#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
|
||||
#define CUDA_DMMV_BLOCK_SIZE 32 // dmmv = dequantize_mul_mat_vec
|
||||
|
||||
static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= kx) {
|
||||
return;
|
||||
}
|
||||
dst[i] = x[i] * y[i%ky];
|
||||
}
|
||||
|
||||
static __device__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
|
||||
const block_q4_0 * x = (const block_q4_0 *) vx;
|
||||
|
||||
@@ -228,6 +238,11 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y,
|
||||
}
|
||||
}
|
||||
|
||||
static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
|
||||
const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
|
||||
mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
|
||||
}
|
||||
|
||||
static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
|
||||
dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
||||
@@ -467,6 +482,67 @@ static cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_cuda_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src1->backend == GGML_BACKEND_CUDA);
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t ne01 = src0->ne[1];
|
||||
const int64_t ne02 = src0->ne[2];
|
||||
const int64_t ne03 = src0->ne[2];
|
||||
const int64_t ne0 = ne00 * ne01 * ne02 * ne03;
|
||||
const int64_t ne10 = src1->ne[0];
|
||||
const int64_t ne11 = src1->ne[1];
|
||||
const int64_t ne12 = src1->ne[2];
|
||||
const int64_t ne13 = src1->ne[3];
|
||||
const int nb2 = dst->nb[2];
|
||||
const int nb3 = dst->nb[3];
|
||||
size_t x_size, d_size;
|
||||
|
||||
float * d_X = (float *) ggml_cuda_pool_malloc(ne0 * sizeof(float), &x_size); // src0
|
||||
float * d_Y = (float *) src1->data; // src1 is already on device, broadcasted.
|
||||
float * d_D = (float *) ggml_cuda_pool_malloc(ne0 * sizeof(float), &d_size); // dst
|
||||
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
const int i0 = i03*ne02 + i02;
|
||||
float * c_X2 = d_X + i0*ne01*ne00;
|
||||
float * c_D2 = d_D + i0*ne01*ne00;
|
||||
|
||||
cudaStream_t cudaStream = g_cudaStreams[i0 % GGML_CUDA_MAX_STREAMS];
|
||||
cudaStream_t cudaStream2 = g_cudaStreams2[i0 % GGML_CUDA_MAX_STREAMS];
|
||||
cudaEvent_t cudaEvent = g_cudaEvents[i0 % GGML_CUDA_MAX_EVENTS];
|
||||
|
||||
// copy src0 to device
|
||||
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_X2, src0, i03, i02, cudaStream2));
|
||||
CUDA_CHECK(cudaEventRecord(cudaEvent, cudaStream2));
|
||||
|
||||
// wait for data
|
||||
CUDA_CHECK(cudaStreamWaitEvent(cudaStream, cudaEvent, 0));
|
||||
|
||||
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
||||
const int64_t i13 = i03%ne13;
|
||||
const int64_t i12 = i02%ne12;
|
||||
const int64_t i11 = i01%ne11;
|
||||
const int i1 = i13*ne12*ne11 + i12*ne11 + i11;
|
||||
|
||||
float * c_X1 = c_X2 + i01*ne00;
|
||||
float * c_Y = d_Y + i1*ne10;
|
||||
float * c_D1 = c_D2 + i01*ne00;
|
||||
|
||||
// compute
|
||||
mul_f32_cuda(c_X1, c_Y, c_D1, ne00, ne10, cudaStream);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
}
|
||||
|
||||
// copy dst to host
|
||||
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
||||
CUDA_CHECK(cudaMemcpyAsync(d, c_D2, sizeof(float)*ne00*ne01, cudaMemcpyDeviceToHost, cudaStream));
|
||||
}
|
||||
}
|
||||
CUDA_CHECK(cudaDeviceSynchronize());
|
||||
ggml_cuda_pool_free(d_X, x_size);
|
||||
ggml_cuda_pool_free(d_D, d_size);
|
||||
}
|
||||
|
||||
static void ggml_cuda_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t ne01 = src0->ne[1];
|
||||
@@ -724,6 +800,11 @@ static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor
|
||||
ggml_cuda_pool_free(d_Q, q_size);
|
||||
}
|
||||
|
||||
void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
||||
ggml_cuda_mul_f32(src0, src1, dst);
|
||||
}
|
||||
|
||||
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
|
||||
const int64_t ne10 = src1->ne[0];
|
||||
|
||||
@@ -797,14 +878,48 @@ void ggml_cuda_transform_tensor(ggml_tensor * tensor) {
|
||||
const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
|
||||
|
||||
size_t q_size;
|
||||
char * d_Q = (char *) ggml_cuda_pool_malloc(q_sz, &q_size);
|
||||
char * dst = (char *) ggml_cuda_pool_malloc(q_sz, &q_size);
|
||||
|
||||
cudaStream_t cudaStream2 = g_cudaStreams2[0];
|
||||
|
||||
// copy tensor to device
|
||||
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Q, tensor, 0, 0, cudaStream2));
|
||||
CUDA_CHECK(cudaDeviceSynchronize());
|
||||
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
||||
for (int64_t i2 = 0; i2 < ne2; i2++) {
|
||||
int i = i3*ne2 + i2;
|
||||
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(dst + i*ne0*ne1, tensor, i3, i2, cudaStream2));
|
||||
}
|
||||
}
|
||||
|
||||
tensor->data = d_Q;
|
||||
tensor->data = dst;
|
||||
tensor->backend = GGML_BACKEND_CUDA;
|
||||
}
|
||||
|
||||
void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensor, const size_t offset) {
|
||||
FILE * fp = fopen(fname, "rb");
|
||||
|
||||
const size_t size = ggml_nbytes(tensor);
|
||||
|
||||
void * buf;
|
||||
CUDA_CHECK(cudaMalloc(&buf, size));
|
||||
void * buf_host = malloc(size);
|
||||
|
||||
#ifdef _WIN32
|
||||
int ret = _fseeki64(fp, (__int64) offset, SEEK_SET);
|
||||
#else
|
||||
int ret = fseek(fp, (long) offset, SEEK_SET);
|
||||
#endif
|
||||
GGML_ASSERT(ret == 0); // same
|
||||
|
||||
size_t ret2 = fread(buf_host, size, 1, fp);
|
||||
if (ret2 != 1) {
|
||||
fprintf(stderr, "unexpectedly reached end of file");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
|
||||
cudaDeviceSynchronize();
|
||||
|
||||
tensor->data = buf;
|
||||
free(buf_host);
|
||||
fclose(fp);
|
||||
}
|
||||
|
||||
@@ -6,6 +6,7 @@ extern "C" {
|
||||
|
||||
void ggml_init_cublas(void);
|
||||
|
||||
void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
|
||||
@@ -15,6 +16,7 @@ void * ggml_cuda_host_malloc(size_t size);
|
||||
void ggml_cuda_host_free(void * ptr);
|
||||
|
||||
void ggml_cuda_transform_tensor(struct ggml_tensor * tensor);
|
||||
void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensors, size_t offset);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
||||
375
ggml-opencl.c
375
ggml-opencl.c
@@ -10,87 +10,77 @@
|
||||
#include "ggml.h"
|
||||
|
||||
#define MULTILINE_QUOTE(...) #__VA_ARGS__
|
||||
const char * clblast_dequant = MULTILINE_QUOTE(
|
||||
static const char * program_source = MULTILINE_QUOTE(
|
||||
|
||||
typedef char int8_t;
|
||||
typedef uchar uint8_t;
|
||||
typedef int int32_t;
|
||||
typedef uint uint32_t;
|
||||
|
||||
constant uint QK4_0 = 32;
|
||||
struct block_q4_0
|
||||
struct __attribute__ ((packed)) block_q4_0
|
||||
{
|
||||
float d;
|
||||
uint8_t qs[QK4_0 / 2];
|
||||
half d;
|
||||
uint8_t qs[16]; /* QK4_0 / 2 */
|
||||
};
|
||||
|
||||
constant uint QK4_1 = 32;
|
||||
struct block_q4_1
|
||||
struct __attribute__ ((packed)) block_q4_1
|
||||
{
|
||||
float d;
|
||||
float m;
|
||||
uint8_t qs[QK4_1 / 2];
|
||||
half d;
|
||||
half m;
|
||||
uint8_t qs[16]; /* QK4_1 / 2 */
|
||||
};
|
||||
|
||||
constant uint QK5_0 = 32;
|
||||
struct __attribute__ ((packed)) block_q5_0
|
||||
{
|
||||
half d;
|
||||
uint32_t qh;
|
||||
uint8_t qs[QK5_0 / 2];
|
||||
uint8_t qs[16]; /* QK5_0 / 2 */
|
||||
};
|
||||
|
||||
constant uint QK5_1 = 32;
|
||||
struct block_q5_1
|
||||
struct __attribute__ ((packed)) block_q5_1
|
||||
{
|
||||
half d;
|
||||
half m;
|
||||
uint32_t qh;
|
||||
uint8_t qs[QK5_1 / 2];
|
||||
uint8_t qs[16]; /* QK5_1 / 2 */
|
||||
};
|
||||
|
||||
constant uint QK8_0 = 32;
|
||||
struct block_q8_0
|
||||
struct __attribute__ ((packed)) block_q8_0
|
||||
{
|
||||
float d;
|
||||
uint8_t qs[QK8_0];
|
||||
half d;
|
||||
int8_t qs[32]; /* QK8_0 */
|
||||
};
|
||||
|
||||
|
||||
__kernel void dequantize_row_q4_0(__global struct block_q4_0* x, __global float* y) {
|
||||
constant uint qk = QK4_0;
|
||||
|
||||
const uint i = get_global_id(0) / qk;
|
||||
const uint i = get_global_id(0) / 32; /* QK4_0 */
|
||||
const uint j = get_local_id(0);
|
||||
|
||||
const float d = x[i].d;
|
||||
const float d = vload_half(0, (__global half*) &x[i].d);
|
||||
|
||||
const int x0 = (x[i].qs[j] & 0xf) - 8;
|
||||
const int x1 = (x[i].qs[j] >> 4) - 8;
|
||||
|
||||
y[i*qk + j + 0 ] = x0*d;
|
||||
y[i*qk + j + qk/2] = x1*d;
|
||||
y[i*32 + j + 0 ] = x0*d;
|
||||
y[i*32 + j + 16] = x1*d;
|
||||
}
|
||||
|
||||
__kernel void dequantize_row_q4_1(__global struct block_q4_1* x, __global float* y) {
|
||||
constant uint qk = QK4_1;
|
||||
|
||||
const uint i = get_global_id(0) / qk;
|
||||
const uint i = get_global_id(0) / 32; /* QK4_1 */
|
||||
const uint j = get_local_id(0);
|
||||
|
||||
const float d = x[i].d;
|
||||
const float m = x[i].m;
|
||||
const float d = vload_half(0, (__global half*) &x[i].d);
|
||||
const float m = vload_half(0, (__global half*) &x[i].m);
|
||||
|
||||
const int x0 = (x[i].qs[j] & 0xf);
|
||||
const int x1 = (x[i].qs[j] >> 4);
|
||||
|
||||
y[i*qk + j + 0 ] = x0*d + m;
|
||||
y[i*qk + j + qk/2] = x1*d + m;
|
||||
y[i*32 + j + 0 ] = x0*d + m;
|
||||
y[i*32 + j + 16] = x1*d + m;
|
||||
}
|
||||
|
||||
__kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float* y) {
|
||||
constant uint qk = QK5_0;
|
||||
|
||||
const uint i = get_global_id(0) / qk;
|
||||
const uint i = get_global_id(0) / 32; /* QK5_0 */
|
||||
const uint j = get_local_id(0);
|
||||
|
||||
const float d = vload_half(0, (__global half*) &x[i].d);
|
||||
@@ -103,14 +93,12 @@ __kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float*
|
||||
const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16;
|
||||
const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16;
|
||||
|
||||
y[i*qk + j + 0 ] = x0*d;
|
||||
y[i*qk + j + qk/2] = x1*d;
|
||||
y[i*32 + j + 0 ] = x0*d;
|
||||
y[i*32 + j + 16] = x1*d;
|
||||
}
|
||||
|
||||
__kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float* y) {
|
||||
constant uint qk = QK5_1;
|
||||
|
||||
const uint i = get_global_id(0) / qk;
|
||||
const uint i = get_global_id(0) / 32; /* QK5_1 */
|
||||
const uint j = get_local_id(0);
|
||||
|
||||
const float d = vload_half(0, (__global half*) &x[i].d);
|
||||
@@ -124,28 +112,38 @@ __kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float*
|
||||
const int x0 = (x[i].qs[j] & 0xf) | xh_0;
|
||||
const int x1 = (x[i].qs[j] >> 4) | xh_1;
|
||||
|
||||
y[i*qk + j + 0 ] = x0*d + m;
|
||||
y[i*qk + j + qk/2] = x1*d + m;
|
||||
y[i*32 + j + 0 ] = x0*d + m;
|
||||
y[i*32 + j + 16] = x1*d + m;
|
||||
}
|
||||
|
||||
__kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float* y) {
|
||||
constant uint qk = QK8_0;
|
||||
const uint i = get_global_id(0) / qk;
|
||||
const uint i = get_global_id(0) / 32; /* QK8_0 */
|
||||
const uint j = get_local_id(0);
|
||||
|
||||
const float d = x[i].d;
|
||||
y[i*qk + j] = x[i].qs[j]*d;
|
||||
const float d = vload_half(0, (__global half*) &x[i].d);
|
||||
y[i*32 + j] = x[i].qs[j]*d;
|
||||
}
|
||||
|
||||
);
|
||||
|
||||
#define CL_CHECK(err, name) \
|
||||
do { \
|
||||
cl_int err_ = (err); \
|
||||
if (err_ != CL_SUCCESS) { \
|
||||
fprintf(stderr, "OpenCL %s error %d at %s:%d\n", name, err_, __FILE__, __LINE__); \
|
||||
exit(1); \
|
||||
} \
|
||||
#define CL_CHECK(err) \
|
||||
do { \
|
||||
cl_int err_ = (err); \
|
||||
if (err_ != CL_SUCCESS) { \
|
||||
fprintf(stderr, "ggml_opencl: %s error %d at %s:%d\n", \
|
||||
#err, err_, __FILE__, __LINE__); \
|
||||
exit(1); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define CLBLAST_CHECK(err) \
|
||||
do { \
|
||||
CLBlastStatusCode err_ = (err); \
|
||||
if (err_ != CLBlastSuccess) { \
|
||||
fprintf(stderr, "ggml_opencl: %s error %d at %s:%d\n", \
|
||||
#err, err_, __FILE__, __LINE__); \
|
||||
exit(1); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
static cl_platform_id platform;
|
||||
@@ -188,48 +186,174 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co
|
||||
|
||||
void ggml_cl_init(void) {
|
||||
cl_int err = 0;
|
||||
char * GGML_CLBLAST_PLATFORM = getenv("GGML_CLBLAST_PLATFORM");
|
||||
char * GGML_CLBLAST_DEVICE = getenv("GGML_CLBLAST_DEVICE");
|
||||
int plat_num = (GGML_CLBLAST_PLATFORM == NULL ? 0 : atoi(GGML_CLBLAST_PLATFORM));
|
||||
int dev_num = (GGML_CLBLAST_DEVICE == NULL ? 0 : atoi(GGML_CLBLAST_DEVICE));
|
||||
printf("\nInitializing CLBlast (First Run)...");
|
||||
printf("\nAttempting to use: Platform=%d, Device=%d (If invalid, program will crash)\n",plat_num,dev_num);
|
||||
cl_uint num_platforms;
|
||||
clGetPlatformIDs(0, NULL, &num_platforms);
|
||||
cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id));
|
||||
clGetPlatformIDs(num_platforms, platforms, NULL);
|
||||
platform = platforms[plat_num];
|
||||
char platform_buffer[1024];
|
||||
clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_buffer), &platform_buffer, NULL);
|
||||
cl_uint num_devices;
|
||||
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
|
||||
cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
|
||||
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
|
||||
device = devices[dev_num];
|
||||
char device_buffer[1024];
|
||||
clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_buffer), &device_buffer, NULL);
|
||||
printf("Using Platform: %s Device: %s\n", platform_buffer, device_buffer);
|
||||
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
|
||||
CL_CHECK(err, "clCreateContext");
|
||||
queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
|
||||
CL_CHECK(err, "clCreateCommandQueue");
|
||||
|
||||
free(platforms);
|
||||
free(devices);
|
||||
struct cl_device;
|
||||
struct cl_platform {
|
||||
cl_platform_id id;
|
||||
unsigned number;
|
||||
char name[128];
|
||||
char vendor[128];
|
||||
struct cl_device * devices;
|
||||
unsigned n_devices;
|
||||
struct cl_device * default_device;
|
||||
};
|
||||
|
||||
program = build_program_from_source(context, device, clblast_dequant);
|
||||
struct cl_device {
|
||||
struct cl_platform * platform;
|
||||
cl_device_id id;
|
||||
unsigned number;
|
||||
cl_device_type type;
|
||||
char name[128];
|
||||
};
|
||||
|
||||
enum { NPLAT = 16, NDEV = 16 };
|
||||
|
||||
struct cl_platform platforms[NPLAT];
|
||||
unsigned n_platforms = 0;
|
||||
struct cl_device devices[NDEV];
|
||||
unsigned n_devices = 0;
|
||||
struct cl_device * default_device = NULL;
|
||||
|
||||
platform = NULL;
|
||||
device = NULL;
|
||||
|
||||
cl_platform_id platform_ids[NPLAT];
|
||||
CL_CHECK(clGetPlatformIDs(NPLAT, platform_ids, &n_platforms));
|
||||
|
||||
for (unsigned i = 0; i < n_platforms; i++) {
|
||||
struct cl_platform * p = &platforms[i];
|
||||
p->number = i;
|
||||
p->id = platform_ids[i];
|
||||
CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_NAME, sizeof(p->name), &p->name, NULL));
|
||||
CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_VENDOR, sizeof(p->vendor), &p->vendor, NULL));
|
||||
|
||||
cl_device_id device_ids[NDEV];
|
||||
cl_int clGetDeviceIDsError = clGetDeviceIDs(p->id, CL_DEVICE_TYPE_ALL, NDEV, device_ids, &p->n_devices);
|
||||
if (clGetDeviceIDsError == CL_DEVICE_NOT_FOUND) {
|
||||
p->n_devices = 0;
|
||||
} else {
|
||||
CL_CHECK(clGetDeviceIDsError);
|
||||
}
|
||||
p->devices = p->n_devices > 0 ? &devices[n_devices] : NULL;
|
||||
p->default_device = NULL;
|
||||
|
||||
for (unsigned j = 0; j < p->n_devices; j++) {
|
||||
struct cl_device * d = &devices[n_devices];
|
||||
d->number = n_devices++;
|
||||
d->id = device_ids[j];
|
||||
d->platform = p;
|
||||
CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_NAME, sizeof(d->name), &d->name, NULL));
|
||||
CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_TYPE, sizeof(d->type), &d->type, NULL));
|
||||
|
||||
if (p->default_device == NULL && d->type == CL_DEVICE_TYPE_GPU) {
|
||||
p->default_device = d;
|
||||
}
|
||||
}
|
||||
|
||||
if (default_device == NULL && p->default_device != NULL) {
|
||||
default_device = p->default_device;
|
||||
}
|
||||
}
|
||||
|
||||
if (n_devices == 0) {
|
||||
fprintf(stderr, "ggml_opencl: could find any OpenCL devices.\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
char * user_platform_string = getenv("GGML_OPENCL_PLATFORM");
|
||||
char * user_device_string = getenv("GGML_OPENCL_DEVICE");
|
||||
int user_platform_number = -1;
|
||||
int user_device_number = -1;
|
||||
|
||||
unsigned n;
|
||||
if (user_platform_string != NULL && sscanf(user_platform_string, " %u", &n) == 1 && n < n_platforms) {
|
||||
user_platform_number = (int)n;
|
||||
}
|
||||
if (user_device_string != NULL && sscanf(user_device_string, " %u", &n) == 1 && n < n_devices) {
|
||||
user_device_number = (int)n;
|
||||
}
|
||||
|
||||
struct cl_device * selected_devices = devices;
|
||||
unsigned n_selected_devices = n_devices;
|
||||
|
||||
if (user_platform_number == -1 && user_platform_string != NULL && user_platform_string[0] != 0) {
|
||||
for (unsigned i = 0; i < n_platforms; i++) {
|
||||
struct cl_platform * p = &platforms[i];
|
||||
if (strstr(p->name, user_platform_string) != NULL ||
|
||||
strstr(p->vendor, user_platform_string) != NULL) {
|
||||
user_platform_number = (int)i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (user_platform_number == -1) {
|
||||
fprintf(stderr, "ggml_opencl: no platform matching '%s' was found.\n", user_platform_string);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
if (user_platform_number != -1) {
|
||||
struct cl_platform * p = &platforms[user_platform_number];
|
||||
selected_devices = p->devices;
|
||||
n_selected_devices = p->n_devices;
|
||||
default_device = p->default_device;
|
||||
if (n_selected_devices == 0) {
|
||||
fprintf(stderr, "ggml_opencl: selected platform '%s' does not have any devices.\n", p->name);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
if (user_device_number == -1 && user_device_string != NULL && user_device_string[0] != 0) {
|
||||
for (unsigned i = 0; i < n_selected_devices; i++) {
|
||||
struct cl_device * d = &selected_devices[i];
|
||||
if (strstr(d->name, user_device_string) != NULL) {
|
||||
user_device_number = d->number;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (user_device_number == -1) {
|
||||
fprintf(stderr, "ggml_opencl: no device matching '%s' was found.\n", user_device_string);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
if (user_device_number != -1) {
|
||||
selected_devices = &devices[user_device_number];
|
||||
n_selected_devices = 1;
|
||||
default_device = &selected_devices[0];
|
||||
}
|
||||
|
||||
GGML_ASSERT(n_selected_devices > 0);
|
||||
|
||||
if (default_device == NULL) {
|
||||
default_device = &selected_devices[0];
|
||||
}
|
||||
|
||||
fprintf(stderr, "ggml_opencl: selecting platform: '%s'\n", default_device->platform->name);
|
||||
fprintf(stderr, "ggml_opencl: selecting device: '%s'\n", default_device->name);
|
||||
if (default_device->type != CL_DEVICE_TYPE_GPU) {
|
||||
fprintf(stderr, "ggml_opencl: warning, not a GPU: '%s'.\n", default_device->name);
|
||||
}
|
||||
|
||||
platform = default_device->platform->id;
|
||||
device = default_device->id;
|
||||
|
||||
cl_context_properties properties[] = {
|
||||
(intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)platform, 0
|
||||
};
|
||||
|
||||
CL_CHECK((context = clCreateContext(properties, 1, &device, NULL, NULL, &err), err));
|
||||
|
||||
CL_CHECK((queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err),
|
||||
(err != CL_INVALID_PROPERTY && err != CL_INVALID_VALUE ? err :
|
||||
(queue = clCreateCommandQueue(context, device, 0, &err), err)
|
||||
)));
|
||||
|
||||
program = build_program_from_source(context, device, program_source);
|
||||
|
||||
// Prepare dequantize kernels
|
||||
kernel_q4_0 = clCreateKernel(program, "dequantize_row_q4_0", &err);
|
||||
CL_CHECK(err, "clCreateKernel");
|
||||
kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err);
|
||||
CL_CHECK(err, "clCreateKernel");
|
||||
kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err);
|
||||
CL_CHECK(err, "clCreateKernel");
|
||||
kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err);
|
||||
CL_CHECK(err, "clCreateKernel");
|
||||
kernel_q8_0 = clCreateKernel(program, "dequantize_row_q8_0", &err);
|
||||
CL_CHECK(err, "clCreateKernel");
|
||||
CL_CHECK((kernel_q4_0 = clCreateKernel(program, "dequantize_row_q4_0", &err), err));
|
||||
CL_CHECK((kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err), err));
|
||||
CL_CHECK((kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err), err));
|
||||
CL_CHECK((kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err), err));
|
||||
CL_CHECK((kernel_q8_0 = clCreateKernel(program, "dequantize_row_q8_0", &err), err));
|
||||
}
|
||||
|
||||
static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) {
|
||||
@@ -242,9 +366,8 @@ static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags
|
||||
clReleaseMemObject(*buf);
|
||||
}
|
||||
cl_int err;
|
||||
*buf = clCreateBuffer(context, flags, req_size, NULL, &err);
|
||||
CL_CHECK((*buf = clCreateBuffer(context, flags, req_size, NULL, &err), err));
|
||||
*cur_size = req_size;
|
||||
CL_CHECK(err, "clCreateBuffer");
|
||||
}
|
||||
|
||||
void ggml_cl_sgemm_wrapper(
|
||||
@@ -253,7 +376,6 @@ void ggml_cl_sgemm_wrapper(
|
||||
const float alpha, const void *host_a, const int lda,
|
||||
const float *host_b, const int ldb, const float beta,
|
||||
float *host_c, const int ldc, const int btype) {
|
||||
cl_int err = 0;
|
||||
|
||||
cl_kernel kernel;
|
||||
size_t global = n * k, local, size_qb;
|
||||
@@ -267,13 +389,13 @@ void ggml_cl_sgemm_wrapper(
|
||||
dequant = true;
|
||||
kernel = kernel_q4_0;
|
||||
local = 16;
|
||||
size_qb = global * (sizeof(float) + local) / 32;
|
||||
size_qb = global * (sizeof(ggml_fp16_t) + local) / 32;
|
||||
break;
|
||||
case GGML_TYPE_Q4_1:
|
||||
dequant = true;
|
||||
kernel = kernel_q4_1;
|
||||
local = 16;
|
||||
size_qb = global * (sizeof(float) * 2 + local) / 32;
|
||||
size_qb = global * (sizeof(ggml_fp16_t) * 2 + local) / 32;
|
||||
break;
|
||||
case GGML_TYPE_Q5_0:
|
||||
dequant = true;
|
||||
@@ -291,7 +413,7 @@ void ggml_cl_sgemm_wrapper(
|
||||
dequant = true;
|
||||
kernel = kernel_q8_0;
|
||||
local = 32;
|
||||
size_qb = global * (sizeof(float) + local) / 32;
|
||||
size_qb = global * (sizeof(ggml_fp16_t) + local) / 32;
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr, "Error: Unsupported OpenCL btype %d\n", btype);
|
||||
@@ -313,49 +435,40 @@ void ggml_cl_sgemm_wrapper(
|
||||
cl_event ev_a, ev_qb, ev_b;
|
||||
|
||||
if (dequant) {
|
||||
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_buffer_qb);
|
||||
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_buffer_b);
|
||||
CL_CHECK(err, "clSetKernelArg");
|
||||
err = clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb);
|
||||
CL_CHECK(err, "clEnqueueWriteBuffer qb");
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_buffer_qb));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_buffer_b));
|
||||
CL_CHECK(clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb));
|
||||
} else {
|
||||
err = clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b);
|
||||
CL_CHECK(err, "clEnqueueWriteBuffer b");
|
||||
CL_CHECK(clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b));
|
||||
}
|
||||
|
||||
err = clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a);
|
||||
CL_CHECK(err, "clEnqueueWriteBuffer a");
|
||||
CL_CHECK(clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a));
|
||||
if (dequant) {
|
||||
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 1, &ev_qb, &ev_b);
|
||||
CL_CHECK(err, "clEnqueueNDRangeKernel");
|
||||
clReleaseEvent(ev_qb);
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 1, &ev_qb, &ev_b));
|
||||
CL_CHECK(clReleaseEvent(ev_qb));
|
||||
}
|
||||
clWaitForEvents(1, &ev_a);
|
||||
clWaitForEvents(1, &ev_b);
|
||||
clReleaseEvent(ev_a);
|
||||
clReleaseEvent(ev_b);
|
||||
CL_CHECK(clWaitForEvents(1, &ev_a));
|
||||
CL_CHECK(clWaitForEvents(1, &ev_b));
|
||||
CL_CHECK(clReleaseEvent(ev_a));
|
||||
CL_CHECK(clReleaseEvent(ev_b));
|
||||
|
||||
cl_event ev_sgemm;
|
||||
CLBlastStatusCode status = CLBlastSgemm((CLBlastLayout)order,
|
||||
(CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b,
|
||||
m, n, k,
|
||||
alpha,
|
||||
cl_buffer_a, 0, lda,
|
||||
cl_buffer_b, 0, ldb,
|
||||
beta,
|
||||
cl_buffer_c, 0, ldc,
|
||||
&queue, &ev_sgemm);
|
||||
|
||||
if (status != CLBlastSuccess) {
|
||||
fprintf(stderr, "Error: CLBlast SGEMM %d\n", status);
|
||||
abort();
|
||||
}
|
||||
CLBLAST_CHECK(CLBlastSgemm(
|
||||
(CLBlastLayout)order,
|
||||
(CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b,
|
||||
m, n, k,
|
||||
alpha,
|
||||
cl_buffer_a, 0, lda,
|
||||
cl_buffer_b, 0, ldb,
|
||||
beta,
|
||||
cl_buffer_c, 0, ldc,
|
||||
&queue, &ev_sgemm));
|
||||
|
||||
cl_event ev_c;
|
||||
clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, &ev_sgemm, &ev_c);
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, &ev_sgemm, &ev_c));
|
||||
|
||||
// Wait for completion
|
||||
clWaitForEvents(1, &ev_c);
|
||||
clReleaseEvent(ev_sgemm);
|
||||
clReleaseEvent(ev_c);
|
||||
CL_CHECK(clWaitForEvents(1, &ev_c));
|
||||
CL_CHECK(clReleaseEvent(ev_sgemm));
|
||||
CL_CHECK(clReleaseEvent(ev_c));
|
||||
}
|
||||
|
||||
16
ggml.h
16
ggml.h
@@ -190,7 +190,7 @@
|
||||
#define GGML_FILE_MAGIC 0x67676d6c // "ggml"
|
||||
#define GGML_FILE_VERSION 1
|
||||
|
||||
#define GGML_QNT_VERSION 1 // bump this on quantization format changes
|
||||
#define GGML_QNT_VERSION 2 // bump this on quantization format changes
|
||||
#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
|
||||
|
||||
#define GGML_MAX_DIMS 4
|
||||
@@ -313,6 +313,7 @@ extern "C" {
|
||||
GGML_OP_ROPE,
|
||||
GGML_OP_ROPE_BACK,
|
||||
GGML_OP_ALIBI,
|
||||
GGML_OP_CLAMP,
|
||||
GGML_OP_CONV_1D_1S,
|
||||
GGML_OP_CONV_1D_2S,
|
||||
|
||||
@@ -849,7 +850,7 @@ extern "C" {
|
||||
int n_past);
|
||||
|
||||
// in-place, returns view(a)
|
||||
GGML_API struct ggml_tensor * gml_diag_mask_zero_inplace(
|
||||
GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int n_past);
|
||||
@@ -897,7 +898,16 @@ extern "C" {
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int n_past,
|
||||
int n_head);
|
||||
int n_head,
|
||||
float bias_max);
|
||||
|
||||
// clamp
|
||||
// in-place, returns view(a)
|
||||
struct ggml_tensor * ggml_clamp(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
float min,
|
||||
float max);
|
||||
|
||||
// padding = 1
|
||||
// TODO: we don't support extra parameters for now
|
||||
|
||||
46
llama-util.h
46
llama-util.h
@@ -101,12 +101,12 @@ struct llama_file {
|
||||
LLAMA_ASSERT(ret == 0); // same
|
||||
}
|
||||
|
||||
void read_raw(void * ptr, size_t size) {
|
||||
if (size == 0) {
|
||||
void read_raw(void * ptr, size_t len) const {
|
||||
if (len == 0) {
|
||||
return;
|
||||
}
|
||||
errno = 0;
|
||||
std::size_t ret = std::fread(ptr, size, 1, fp);
|
||||
std::size_t ret = std::fread(ptr, len, 1, fp);
|
||||
if (ferror(fp)) {
|
||||
throw std::runtime_error(format("read error: %s", strerror(errno)));
|
||||
}
|
||||
@@ -127,12 +127,12 @@ struct llama_file {
|
||||
return std::string(chars.data(), len);
|
||||
}
|
||||
|
||||
void write_raw(const void * ptr, size_t size) {
|
||||
if (size == 0) {
|
||||
void write_raw(const void * ptr, size_t len) const {
|
||||
if (len == 0) {
|
||||
return;
|
||||
}
|
||||
errno = 0;
|
||||
size_t ret = std::fwrite(ptr, size, 1, fp);
|
||||
size_t ret = std::fwrite(ptr, len, 1, fp);
|
||||
if (ret != 1) {
|
||||
throw std::runtime_error(format("write error: %s", strerror(errno)));
|
||||
}
|
||||
@@ -172,7 +172,7 @@ struct llama_mmap {
|
||||
#ifdef _POSIX_MAPPED_FILES
|
||||
static constexpr bool SUPPORTED = true;
|
||||
|
||||
llama_mmap(struct llama_file * file, bool prefetch = true) {
|
||||
llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */) {
|
||||
size = file->size;
|
||||
int fd = fileno(file->fp);
|
||||
int flags = MAP_SHARED;
|
||||
@@ -184,9 +184,9 @@ struct llama_mmap {
|
||||
throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
|
||||
}
|
||||
|
||||
if (prefetch) {
|
||||
if (prefetch > 0) {
|
||||
// Advise the kernel to preload the mapped memory
|
||||
if (madvise(addr, file->size, MADV_WILLNEED)) {
|
||||
if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
|
||||
fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
|
||||
strerror(errno));
|
||||
}
|
||||
@@ -267,9 +267,9 @@ struct llama_mlock {
|
||||
}
|
||||
}
|
||||
|
||||
void init(void * addr) {
|
||||
LLAMA_ASSERT(this->addr == NULL && this->size == 0);
|
||||
this->addr = addr;
|
||||
void init(void * ptr) {
|
||||
LLAMA_ASSERT(addr == NULL && size == 0);
|
||||
addr = ptr;
|
||||
}
|
||||
|
||||
void grow_to(size_t target_size) {
|
||||
@@ -340,14 +340,14 @@ struct llama_mlock {
|
||||
return (size_t) si.dwPageSize;
|
||||
}
|
||||
|
||||
bool raw_lock(void * addr, size_t size) {
|
||||
bool raw_lock(void * ptr, size_t len) {
|
||||
for (int tries = 1; ; tries++) {
|
||||
if (VirtualLock(addr, size)) {
|
||||
if (VirtualLock(ptr, len)) {
|
||||
return true;
|
||||
}
|
||||
if (tries == 2) {
|
||||
fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
|
||||
size, this->size, llama_format_win_err(GetLastError()).c_str());
|
||||
len, size, llama_format_win_err(GetLastError()).c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -363,7 +363,7 @@ struct llama_mlock {
|
||||
// is equal to the number of pages in its minimum working set minus
|
||||
// a small overhead."
|
||||
// Hopefully a megabyte is enough overhead:
|
||||
size_t increment = size + 1048576;
|
||||
size_t increment = len + 1048576;
|
||||
// The minimum must be <= the maximum, so we need to increase both:
|
||||
min_ws_size += increment;
|
||||
max_ws_size += increment;
|
||||
@@ -375,8 +375,8 @@ struct llama_mlock {
|
||||
}
|
||||
}
|
||||
|
||||
void raw_unlock(void * addr, size_t size) {
|
||||
if (!VirtualUnlock(addr, size)) {
|
||||
void raw_unlock(void * ptr, size_t len) {
|
||||
if (!VirtualUnlock(ptr, len)) {
|
||||
fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
|
||||
llama_format_win_err(GetLastError()).c_str());
|
||||
}
|
||||
@@ -388,12 +388,12 @@ struct llama_mlock {
|
||||
return (size_t) 65536;
|
||||
}
|
||||
|
||||
bool raw_lock(const void * addr, size_t size) {
|
||||
bool raw_lock(const void * addr, size_t len) {
|
||||
fprintf(stderr, "warning: mlock not supported on this system\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
void raw_unlock(const void * addr, size_t size) {}
|
||||
void raw_unlock(const void * addr, size_t len) {}
|
||||
#endif
|
||||
};
|
||||
|
||||
@@ -404,10 +404,10 @@ struct llama_buffer {
|
||||
|
||||
llama_buffer() = default;
|
||||
|
||||
void resize(size_t size) {
|
||||
void resize(size_t len) {
|
||||
delete[] addr;
|
||||
addr = new uint8_t[size];
|
||||
this->size = size;
|
||||
addr = new uint8_t[len];
|
||||
size = len;
|
||||
}
|
||||
|
||||
~llama_buffer() {
|
||||
|
||||
282
llama.cpp
282
llama.cpp
@@ -1,6 +1,7 @@
|
||||
// Defines fileno on msys:
|
||||
#ifndef _GNU_SOURCE
|
||||
#define _GNU_SOURCE
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#endif
|
||||
@@ -45,6 +46,7 @@ enum e_model {
|
||||
MODEL_65B,
|
||||
};
|
||||
|
||||
|
||||
static const size_t MB = 1024*1024;
|
||||
|
||||
// computed for n_ctx == 2048
|
||||
@@ -110,7 +112,7 @@ struct llama_hparams {
|
||||
enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
|
||||
|
||||
bool operator!=(const llama_hparams & other) const {
|
||||
return memcmp(this, &other, sizeof(llama_hparams));
|
||||
return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams)));
|
||||
}
|
||||
};
|
||||
|
||||
@@ -406,6 +408,7 @@ enum llama_file_version {
|
||||
LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
|
||||
LLAMA_FILE_VERSION_GGJT_V1, // added padding
|
||||
LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
|
||||
LLAMA_FILE_VERSION_GGJT_V3, // changed Q4 and Q8 quantization format
|
||||
};
|
||||
|
||||
struct llama_file_loader {
|
||||
@@ -424,24 +427,30 @@ struct llama_file_loader {
|
||||
}
|
||||
void read_magic() {
|
||||
uint32_t magic = file.read_u32();
|
||||
uint32_t version = 0;
|
||||
|
||||
if (magic != 'ggml') {
|
||||
version = file.read_u32();
|
||||
}
|
||||
|
||||
if (magic == 'ggml' && version == 0) {
|
||||
if (magic == LLAMA_FILE_MAGIC_GGML) {
|
||||
file_version = LLAMA_FILE_VERSION_GGML;
|
||||
} else if (magic == 'ggmf' && version == 1) {
|
||||
file_version = LLAMA_FILE_VERSION_GGMF_V1;
|
||||
} else if (magic == 'ggjt' && version == 1) {
|
||||
file_version = LLAMA_FILE_VERSION_GGJT_V1;
|
||||
} else if (magic == 'ggjt' && version == 2) {
|
||||
file_version = LLAMA_FILE_VERSION_GGJT_V2;
|
||||
} else {
|
||||
throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
|
||||
magic, version);
|
||||
return;
|
||||
}
|
||||
|
||||
uint32_t version = file.read_u32();
|
||||
|
||||
switch (magic) {
|
||||
case LLAMA_FILE_MAGIC_GGMF:
|
||||
switch (version) {
|
||||
case 1: file_version = LLAMA_FILE_VERSION_GGMF_V1; return;
|
||||
}
|
||||
break;
|
||||
case LLAMA_FILE_MAGIC_GGJT:
|
||||
switch (version) {
|
||||
case 1: file_version = LLAMA_FILE_VERSION_GGJT_V1; return;
|
||||
case 2: file_version = LLAMA_FILE_VERSION_GGJT_V2; return;
|
||||
case 3: file_version = LLAMA_FILE_VERSION_GGJT_V3; return;
|
||||
}
|
||||
}
|
||||
|
||||
throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
|
||||
magic, version);
|
||||
}
|
||||
void read_hparams() {
|
||||
hparams.n_vocab = file.read_u32();
|
||||
@@ -499,7 +508,7 @@ struct llama_file_loader {
|
||||
|
||||
if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
|
||||
// skip to the next multiple of 32 bytes
|
||||
file.seek(-file.tell() & 31, SEEK_CUR);
|
||||
file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
|
||||
}
|
||||
shard.file_idx = file_idx;
|
||||
shard.file_off = file.tell();
|
||||
@@ -574,7 +583,7 @@ struct llama_file_saver {
|
||||
file.write_u32(new_type);
|
||||
file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size());
|
||||
file.write_raw(tensor.name.data(), tensor.name.size());
|
||||
file.seek(-file.tell() & 31, SEEK_CUR);
|
||||
file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
|
||||
LLAMA_ASSERT(new_size == llama_calc_tensor_size(tensor.ne, new_type));
|
||||
file.write_raw(new_data, new_size);
|
||||
}
|
||||
@@ -641,7 +650,7 @@ struct llama_model_loader {
|
||||
}
|
||||
}
|
||||
|
||||
struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne) {
|
||||
struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
|
||||
auto it = tensors_map.name_to_idx.find(name);
|
||||
if (it == tensors_map.name_to_idx.end()) {
|
||||
throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
|
||||
@@ -652,10 +661,10 @@ struct llama_model_loader {
|
||||
name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
|
||||
}
|
||||
|
||||
return get_tensor_for(lt);
|
||||
return get_tensor_for(lt, backend);
|
||||
}
|
||||
|
||||
struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) {
|
||||
struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
|
||||
struct ggml_tensor * tensor;
|
||||
if (lt.ne.size() == 2) {
|
||||
tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
|
||||
@@ -665,6 +674,7 @@ struct llama_model_loader {
|
||||
}
|
||||
ggml_set_name(tensor, lt.name.c_str());
|
||||
LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
|
||||
tensor->backend = backend;
|
||||
lt.ggml_tensor = tensor;
|
||||
num_ggml_tensors_created++;
|
||||
return tensor;
|
||||
@@ -678,12 +688,16 @@ struct llama_model_loader {
|
||||
|
||||
void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
|
||||
size_t data_size = 0;
|
||||
size_t prefetch_size = 0;
|
||||
for (const llama_load_tensor & lt : tensors_map.tensors) {
|
||||
data_size += lt.size;
|
||||
if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
|
||||
prefetch_size += lt.size;
|
||||
}
|
||||
}
|
||||
|
||||
if (use_mmap) {
|
||||
mapping.reset(new llama_mmap(&file_loaders.at(0)->file));
|
||||
mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
|
||||
if (!lmlock) {
|
||||
// Don't call the callback since the actual loading will be lazy
|
||||
// and we can't measure it.
|
||||
@@ -696,6 +710,9 @@ struct llama_model_loader {
|
||||
|
||||
size_t done_size = 0;
|
||||
for (llama_load_tensor & lt : tensors_map.tensors) {
|
||||
if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
|
||||
continue;
|
||||
}
|
||||
if (progress_callback) {
|
||||
progress_callback((float) done_size / data_size, progress_callback_user_data);
|
||||
}
|
||||
@@ -708,9 +725,6 @@ struct llama_model_loader {
|
||||
lmlock->grow_to(done_size);
|
||||
}
|
||||
}
|
||||
if (progress_callback) {
|
||||
progress_callback(1.0f, progress_callback_user_data);
|
||||
}
|
||||
}
|
||||
|
||||
void load_data_for(llama_load_tensor & lt) {
|
||||
@@ -812,10 +826,9 @@ static bool kv_cache_init(
|
||||
struct llama_context_params llama_context_default_params() {
|
||||
struct llama_context_params result = {
|
||||
/*.n_ctx =*/ 512,
|
||||
/*.n_parts =*/ -1,
|
||||
/*.gpu_layers =*/ 0,
|
||||
/*.seed =*/ -1,
|
||||
/*.f16_kv =*/ false,
|
||||
/*.f16_kv =*/ true,
|
||||
/*.logits_all =*/ false,
|
||||
/*.vocab_only =*/ false,
|
||||
/*.use_mmap =*/ true,
|
||||
@@ -836,6 +849,21 @@ bool llama_mlock_supported() {
|
||||
return llama_mlock::SUPPORTED;
|
||||
}
|
||||
|
||||
void llama_init_backend() {
|
||||
ggml_time_init();
|
||||
|
||||
// needed to initialize f16 tables
|
||||
{
|
||||
struct ggml_init_params params = { 0, NULL, false };
|
||||
struct ggml_context * ctx = ggml_init(params);
|
||||
ggml_free(ctx);
|
||||
}
|
||||
}
|
||||
|
||||
int64_t llama_time_us() {
|
||||
return ggml_time_us();
|
||||
}
|
||||
|
||||
//
|
||||
// model loading
|
||||
//
|
||||
@@ -845,7 +873,8 @@ static const char *llama_file_version_name(llama_file_version version) {
|
||||
case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
|
||||
case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
|
||||
case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
|
||||
case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (latest)";
|
||||
case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (pre #1508)";
|
||||
case LLAMA_FILE_VERSION_GGJT_V3: return "ggjt v3 (latest)";
|
||||
}
|
||||
|
||||
return "unknown";
|
||||
@@ -925,11 +954,19 @@ static void llama_model_load_internal(
|
||||
fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
|
||||
}
|
||||
|
||||
if (file_version != LLAMA_FILE_VERSION_GGJT_V2) {
|
||||
if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
|
||||
if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
|
||||
hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
|
||||
hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
|
||||
throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1305)");
|
||||
throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
|
||||
}
|
||||
}
|
||||
|
||||
if (file_version < LLAMA_FILE_VERSION_GGJT_V3) {
|
||||
if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
|
||||
hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ||
|
||||
hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
|
||||
throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -942,27 +979,7 @@ static void llama_model_load_internal(
|
||||
size_t ctx_size;
|
||||
size_t mmapped_size;
|
||||
ml->calc_sizes(&ctx_size, &mmapped_size);
|
||||
fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
|
||||
|
||||
// print memory requirements
|
||||
{
|
||||
const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
|
||||
|
||||
// this is the total memory required to run the inference
|
||||
const size_t mem_required =
|
||||
ctx_size +
|
||||
mmapped_size +
|
||||
MEM_REQ_SCRATCH0().at(model.type) +
|
||||
MEM_REQ_SCRATCH1().at(model.type) +
|
||||
MEM_REQ_EVAL().at(model.type);
|
||||
|
||||
// this is the memory required by one llama_state
|
||||
const size_t mem_required_state =
|
||||
scale*MEM_REQ_KV_SELF().at(model.type);
|
||||
|
||||
fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
|
||||
mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
|
||||
}
|
||||
fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
|
||||
|
||||
// create the ggml context
|
||||
{
|
||||
@@ -984,7 +1001,14 @@ static void llama_model_load_internal(
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CUDA
|
||||
#else
|
||||
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
|
||||
#endif
|
||||
|
||||
// prepare memory for the weights
|
||||
size_t vram_total = 0;
|
||||
{
|
||||
const uint32_t n_embd = hparams.n_embd;
|
||||
const uint32_t n_layer = hparams.n_layer;
|
||||
@@ -992,33 +1016,87 @@ static void llama_model_load_internal(
|
||||
|
||||
ml->ggml_ctx = ctx;
|
||||
|
||||
model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
|
||||
model.norm = ml->get_tensor("norm.weight", {n_embd});
|
||||
model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});
|
||||
model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
|
||||
model.norm = ml->get_tensor("norm.weight", {n_embd}, GGML_BACKEND_CPU);
|
||||
|
||||
// "output" tensor
|
||||
{
|
||||
ggml_backend backend_output;
|
||||
if (n_gpu_layers > int(n_layer)) { // NOLINT
|
||||
backend_output = LLAMA_BACKEND_OFFLOAD;
|
||||
} else {
|
||||
backend_output = GGML_BACKEND_CPU;
|
||||
}
|
||||
|
||||
model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
|
||||
}
|
||||
|
||||
const int i_gpu_start = n_layer - n_gpu_layers;
|
||||
|
||||
model.layers.resize(n_layer);
|
||||
for (uint32_t i = 0; i < n_layer; ++i) {
|
||||
const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
|
||||
|
||||
auto & layer = model.layers[i];
|
||||
|
||||
std::string layers_i = "layers." + std::to_string(i);
|
||||
|
||||
layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd});
|
||||
layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
|
||||
|
||||
layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd});
|
||||
layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd});
|
||||
layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd});
|
||||
layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd});
|
||||
layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend);
|
||||
layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend);
|
||||
layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend);
|
||||
layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend);
|
||||
|
||||
layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd});
|
||||
layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
|
||||
|
||||
layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff});
|
||||
layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd});
|
||||
layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff});
|
||||
layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend);
|
||||
layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend);
|
||||
layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend);
|
||||
|
||||
if (backend == GGML_BACKEND_CUDA) {
|
||||
vram_total +=
|
||||
ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
|
||||
ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
|
||||
ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ml->done_getting_tensors();
|
||||
|
||||
// print memory requirements
|
||||
{
|
||||
const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
|
||||
|
||||
// this is the total memory required to run the inference
|
||||
const size_t mem_required =
|
||||
ctx_size +
|
||||
mmapped_size - vram_total + // weights in VRAM not in memory
|
||||
MEM_REQ_SCRATCH0().at(model.type) +
|
||||
MEM_REQ_SCRATCH1().at(model.type) +
|
||||
MEM_REQ_EVAL().at(model.type);
|
||||
|
||||
// this is the memory required by one llama_state
|
||||
const size_t mem_required_state =
|
||||
scale*MEM_REQ_KV_SELF().at(model.type);
|
||||
|
||||
fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
|
||||
mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
|
||||
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
|
||||
|
||||
fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
|
||||
if (n_gpu_layers > (int) hparams.n_layer) {
|
||||
fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
|
||||
}
|
||||
fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
|
||||
#else
|
||||
(void) n_gpu_layers;
|
||||
#endif
|
||||
}
|
||||
|
||||
// populate `tensors_by_name`
|
||||
for (llama_load_tensor & lt : ml->tensors_map.tensors) {
|
||||
model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
|
||||
@@ -1026,36 +1104,34 @@ static void llama_model_load_internal(
|
||||
|
||||
ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
|
||||
|
||||
model.mapping = std::move(ml->mapping);
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
{
|
||||
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
|
||||
|
||||
fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
|
||||
|
||||
size_t vram_total = 0;
|
||||
|
||||
for (int i = 0; i < n_gpu; ++i) {
|
||||
const auto & layer = model.layers[i];
|
||||
|
||||
ggml_cuda_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
|
||||
ggml_cuda_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
|
||||
ggml_cuda_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
|
||||
ggml_cuda_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
|
||||
ggml_cuda_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
|
||||
ggml_cuda_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
|
||||
ggml_cuda_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
|
||||
size_t done_size = 0;
|
||||
size_t data_size = 0;
|
||||
for (llama_load_tensor & lt : ml->tensors_map.tensors) {
|
||||
data_size += lt.size;
|
||||
if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
|
||||
done_size += lt.size;
|
||||
}
|
||||
}
|
||||
if (n_gpu_layers > (int) hparams.n_layer) {
|
||||
fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
|
||||
ggml_cuda_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
|
||||
for (llama_load_tensor & lt : ml->tensors_map.tensors) {
|
||||
if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
|
||||
continue;
|
||||
}
|
||||
if (progress_callback) {
|
||||
progress_callback((float) done_size / data_size, progress_callback_user_data);
|
||||
}
|
||||
ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
|
||||
done_size += lt.size;
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
|
||||
}
|
||||
#else
|
||||
(void) n_gpu_layers;
|
||||
#endif
|
||||
#endif // GGML_USE_CUBLAS
|
||||
|
||||
if (progress_callback) {
|
||||
progress_callback(1.0f, progress_callback_user_data);
|
||||
}
|
||||
|
||||
model.mapping = std::move(ml->mapping);
|
||||
|
||||
// loading time will be recalculate after the first eval, so
|
||||
// we take page faults deferred by mmap() into consideration
|
||||
@@ -1154,10 +1230,8 @@ static bool llama_eval_internal(
|
||||
{
|
||||
cur = ggml_rms_norm(ctx0, inpL);
|
||||
|
||||
// cur = attention_norm*cur
|
||||
cur = ggml_mul(ctx0,
|
||||
ggml_repeat(ctx0, model.layers[il].attention_norm, cur),
|
||||
cur);
|
||||
// cur = cur*attention_norm(broadcasted)
|
||||
cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
|
||||
}
|
||||
|
||||
// self-attention
|
||||
@@ -1264,10 +1338,8 @@ static bool llama_eval_internal(
|
||||
{
|
||||
cur = ggml_rms_norm(ctx0, inpFF);
|
||||
|
||||
// cur = ffn_norm*cur
|
||||
cur = ggml_mul(ctx0,
|
||||
ggml_repeat(ctx0, model.layers[il].ffn_norm, cur),
|
||||
cur);
|
||||
// cur = cur*ffn_norm(broadcasted)
|
||||
cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
|
||||
}
|
||||
|
||||
struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
|
||||
@@ -1304,10 +1376,8 @@ static bool llama_eval_internal(
|
||||
|
||||
inpL = ggml_rms_norm(ctx0, inpL);
|
||||
|
||||
// inpL = norm*inpL
|
||||
inpL = ggml_mul(ctx0,
|
||||
ggml_repeat(ctx0, model.norm, inpL),
|
||||
inpL);
|
||||
// inpL = inpL*norm(broadcasted)
|
||||
inpL = ggml_mul(ctx0, inpL, model.norm);
|
||||
|
||||
embeddings = inpL;
|
||||
}
|
||||
@@ -2131,7 +2201,7 @@ struct llama_context * llama_init_from_file(
|
||||
unsigned * cur_percentage_p = (unsigned *) ctx;
|
||||
unsigned percentage = (unsigned) (100 * progress);
|
||||
while (percentage > *cur_percentage_p) {
|
||||
++*cur_percentage_p;
|
||||
*cur_percentage_p = percentage;
|
||||
fprintf(stderr, ".");
|
||||
fflush(stderr);
|
||||
if (percentage >= 100) {
|
||||
@@ -2224,7 +2294,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
|
||||
{
|
||||
uint32_t magic;
|
||||
fin.read((char *) &magic, sizeof(magic));
|
||||
if (magic != 'ggla') {
|
||||
if (magic != LLAMA_FILE_MAGIC_GGLA) {
|
||||
fprintf(stderr, "%s: bad file magic\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
@@ -2288,7 +2358,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
|
||||
|
||||
// maybe this should in llama_model_loader
|
||||
if (model_loader->use_mmap) {
|
||||
model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false));
|
||||
model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2381,7 +2451,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
|
||||
}
|
||||
size_t idx = model_loader->tensors_map.name_to_idx[base_name];
|
||||
llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
|
||||
base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
|
||||
base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
|
||||
lt.data = (uint8_t *) lt.ggml_tensor->data;
|
||||
model_loader->load_data_for(lt);
|
||||
lt.ggml_tensor->data = lt.data;
|
||||
@@ -2607,8 +2677,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
|
||||
}
|
||||
|
||||
// Sets the state reading from the specified source address
|
||||
size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
||||
const uint8_t * inp = src;
|
||||
size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
||||
uint8_t * inp = src;
|
||||
|
||||
// set rng
|
||||
{
|
||||
|
||||
48
llama.h
48
llama.h
@@ -19,10 +19,16 @@
|
||||
# define LLAMA_API
|
||||
#endif
|
||||
|
||||
#define LLAMA_FILE_VERSION 2
|
||||
#define LLAMA_FILE_MAGIC 'ggjt'
|
||||
#define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
|
||||
#define LLAMA_SESSION_MAGIC 'ggsn'
|
||||
#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
|
||||
#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
|
||||
#define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf'
|
||||
#define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml'
|
||||
#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
|
||||
|
||||
#define LLAMA_FILE_VERSION 3
|
||||
#define LLAMA_FILE_MAGIC LLAMA_FILE_MAGIC_GGJT
|
||||
#define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
|
||||
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
|
||||
#define LLAMA_SESSION_VERSION 1
|
||||
|
||||
#ifdef __cplusplus
|
||||
@@ -40,9 +46,9 @@ extern "C" {
|
||||
typedef int llama_token;
|
||||
|
||||
typedef struct llama_token_data {
|
||||
llama_token id; // token id
|
||||
float logit; // log-odds of the token
|
||||
float p; // probability of the token
|
||||
llama_token id; // token id
|
||||
float logit; // log-odds of the token
|
||||
float p; // probability of the token
|
||||
} llama_token_data;
|
||||
|
||||
typedef struct llama_token_data_array {
|
||||
@@ -55,7 +61,6 @@ extern "C" {
|
||||
|
||||
struct llama_context_params {
|
||||
int n_ctx; // text context
|
||||
int n_parts; // -1 for default
|
||||
int n_gpu_layers; // number of layers to store in VRAM
|
||||
int seed; // RNG seed, -1 for random
|
||||
|
||||
@@ -74,16 +79,16 @@ extern "C" {
|
||||
|
||||
// model file types
|
||||
enum llama_ftype {
|
||||
LLAMA_FTYPE_ALL_F32 = 0,
|
||||
LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
|
||||
LLAMA_FTYPE_ALL_F32 = 0,
|
||||
LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
|
||||
// LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
|
||||
// LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
|
||||
LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
|
||||
// LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
|
||||
// LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
|
||||
LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
|
||||
};
|
||||
|
||||
LLAMA_API struct llama_context_params llama_context_default_params();
|
||||
@@ -91,6 +96,13 @@ extern "C" {
|
||||
LLAMA_API bool llama_mmap_supported();
|
||||
LLAMA_API bool llama_mlock_supported();
|
||||
|
||||
// TODO: not great API - very likely to change
|
||||
// Initialize the llama + ggml backend
|
||||
// Call once at the start of the program
|
||||
LLAMA_API void llama_init_backend();
|
||||
|
||||
LLAMA_API int64_t llama_time_us();
|
||||
|
||||
// Various functions for loading a ggml llama model.
|
||||
// Allocate (almost) all memory needed for the model.
|
||||
// Return NULL on failure
|
||||
@@ -139,7 +151,7 @@ extern "C" {
|
||||
|
||||
// Set the state reading from the specified address
|
||||
// Returns the number of bytes read
|
||||
LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
|
||||
LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src);
|
||||
|
||||
// Save/load session file
|
||||
LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
|
||||
|
||||
@@ -1,6 +1,10 @@
|
||||
#include "llama.h"
|
||||
#include "ggml.h"
|
||||
#include <cassert>
|
||||
#include "llama.h"
|
||||
|
||||
#ifdef NDEBUG
|
||||
#undef NDEBUG
|
||||
#endif
|
||||
|
||||
#include <cmath>
|
||||
#include <numeric>
|
||||
#include <cassert>
|
||||
@@ -8,7 +12,6 @@
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
|
||||
|
||||
void dump(const llama_token_data_array * candidates) {
|
||||
for (size_t i = 0; i < candidates->size; i++) {
|
||||
printf("%d: %f (%f)\n", candidates->data[i].id, candidates->data[i].p, candidates->data[i].logit);
|
||||
|
||||
Reference in New Issue
Block a user