mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-02-26 14:23:22 +02:00
Compare commits
49 Commits
master-c2d
...
master-925
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
9254920265 | ||
|
|
e32089b2c2 | ||
|
|
2347e45e7b | ||
|
|
74d4cfa343 | ||
|
|
74a6d922f1 | ||
|
|
e4caa8da59 | ||
|
|
58970a4c39 | ||
|
|
8c0a10e64d | ||
|
|
fa84c4b3e8 | ||
|
|
12b063f0ec | ||
|
|
31d2b5f4a4 | ||
|
|
4de0334f5c | ||
|
|
3f1223155a | ||
|
|
303f5809f1 | ||
|
|
059e99066d | ||
|
|
17c10acfb4 | ||
|
|
e9b66ee982 | ||
|
|
4f0154b0ba | ||
|
|
ef3171d162 | ||
|
|
555275a693 | ||
|
|
98ed165574 | ||
|
|
ae9663f188 | ||
|
|
b33dee282f | ||
|
|
92f44ff7f7 | ||
|
|
245fc3c37d | ||
|
|
72ff5282bf | ||
|
|
0bf7cf1b29 | ||
|
|
8432d4d9f7 | ||
|
|
0f291e1f65 | ||
|
|
8fc8179919 | ||
|
|
b50b570ed9 | ||
|
|
53aba3f393 | ||
|
|
4161bdc04d | ||
|
|
0035858273 | ||
|
|
5c64a0952e | ||
|
|
5b57a5b726 | ||
|
|
4dc62c545d | ||
|
|
35a84916fb | ||
|
|
2d7bf110ed | ||
|
|
2a4e41a086 | ||
|
|
17366df842 | ||
|
|
44f906e853 | ||
|
|
d5b111f53d | ||
|
|
2d43387daf | ||
|
|
7ad7750c5c | ||
|
|
7a74dee6b4 | ||
|
|
590250f7a9 | ||
|
|
f4c55d3bd7 | ||
|
|
f1465624c2 |
@@ -16,4 +16,6 @@ COPY . .
|
||||
|
||||
RUN make
|
||||
|
||||
ENV LC_ALL=C.utf8
|
||||
|
||||
ENTRYPOINT ["/app/.devops/tools.sh"]
|
||||
|
||||
@@ -15,4 +15,6 @@ FROM ubuntu:$UBUNTU_VERSION as runtime
|
||||
|
||||
COPY --from=build /app/main /main
|
||||
|
||||
ENV LC_ALL=C.utf8
|
||||
|
||||
ENTRYPOINT [ "/main" ]
|
||||
|
||||
4
.github/workflows/build.yml
vendored
4
.github/workflows/build.yml
vendored
@@ -10,10 +10,10 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp']
|
||||
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp']
|
||||
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
|
||||
|
||||
env:
|
||||
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||
|
||||
2
.gitignore
vendored
2
.gitignore
vendored
@@ -7,6 +7,7 @@
|
||||
.envrc
|
||||
.swiftpm
|
||||
.venv
|
||||
.clang-tidy
|
||||
.vs/
|
||||
.vscode/
|
||||
|
||||
@@ -34,6 +35,7 @@ models/*
|
||||
/benchmark-matmult
|
||||
/vdot
|
||||
/Pipfile
|
||||
/libllama.so
|
||||
|
||||
build-info.h
|
||||
arm_neon.h
|
||||
|
||||
@@ -72,6 +72,7 @@ set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kern
|
||||
set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
|
||||
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
|
||||
option(LLAMA_METAL "llama: use Metal" OFF)
|
||||
option(LLAMA_K_QUANTS "llama: use k-quants" ON)
|
||||
|
||||
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
|
||||
@@ -226,6 +227,11 @@ if (LLAMA_METAL)
|
||||
)
|
||||
endif()
|
||||
|
||||
if (LLAMA_K_QUANTS)
|
||||
set(GGML_SOURCES_EXTRA ${GGML_SOURCES_EXTRA} k_quants.c k_quants.h)
|
||||
add_compile_definitions(GGML_USE_K_QUANTS)
|
||||
endif()
|
||||
|
||||
if (LLAMA_CLBLAST)
|
||||
find_package(CLBlast)
|
||||
if (CLBlast_FOUND)
|
||||
@@ -396,11 +402,10 @@ endif()
|
||||
add_library(ggml OBJECT
|
||||
ggml.c
|
||||
ggml.h
|
||||
ggml-quants-k.h
|
||||
ggml-quants-k.c
|
||||
${GGML_SOURCES_CUDA}
|
||||
${GGML_SOURCES_OPENCL}
|
||||
${GGML_SOURCES_METAL}
|
||||
${GGML_SOURCES_EXTRA}
|
||||
)
|
||||
|
||||
target_include_directories(ggml PUBLIC .)
|
||||
@@ -427,6 +432,9 @@ target_link_libraries(llama PRIVATE
|
||||
if (BUILD_SHARED_LIBS)
|
||||
set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD)
|
||||
if (LLAMA_METAL)
|
||||
set_target_properties(llama PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (GGML_SOURCES_CUDA)
|
||||
|
||||
36
Makefile
36
Makefile
@@ -107,6 +107,10 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
|
||||
# Usage AVX-only
|
||||
#CFLAGS += -mfma -mf16c -mavx
|
||||
#CXXFLAGS += -mfma -mf16c -mavx
|
||||
|
||||
# Usage SSSE3-only (Not is SSE3!)
|
||||
#CFLAGS += -mssse3
|
||||
#CXXFLAGS += -mssse3
|
||||
endif
|
||||
|
||||
ifneq ($(filter ppc64%,$(UNAME_M)),)
|
||||
@@ -121,6 +125,12 @@ ifneq ($(filter ppc64%,$(UNAME_M)),)
|
||||
endif
|
||||
endif
|
||||
|
||||
ifndef LLAMA_NO_K_QUANTS
|
||||
CFLAGS += -DGGML_USE_K_QUANTS
|
||||
CXXFLAGS += -DGGML_USE_K_QUANTS
|
||||
OBJS += k_quants.o
|
||||
endif
|
||||
|
||||
ifndef LLAMA_NO_ACCELERATE
|
||||
# Mac M1 - include Accelerate framework.
|
||||
# `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time).
|
||||
@@ -140,7 +150,7 @@ ifdef LLAMA_OPENBLAS
|
||||
endif # LLAMA_OPENBLAS
|
||||
|
||||
ifdef LLAMA_BLIS
|
||||
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
|
||||
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
|
||||
LDFLAGS += -lblis -L/usr/local/lib
|
||||
endif # LLAMA_BLIS
|
||||
|
||||
@@ -212,6 +222,11 @@ ifneq ($(filter armv8%,$(UNAME_M)),)
|
||||
CFLAGS += -mfp16-format=ieee -mno-unaligned-access
|
||||
endif
|
||||
|
||||
ifdef LLAMA_NO_K_QUANTS
|
||||
k_quants.o: k_quants.c k_quants.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
endif # LLAMA_NO_K_QUANTS
|
||||
|
||||
#
|
||||
# Print build information
|
||||
#
|
||||
@@ -231,10 +246,7 @@ $(info )
|
||||
# Build library
|
||||
#
|
||||
|
||||
ggml.o: ggml.c ggml.h ggml-cuda.h ggml-quants-k.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
ggml-quants-k.o: ggml-quants-k.c ggml-quants-k.h ggml.h ggml-cuda.h
|
||||
ggml.o: ggml.c ggml.h ggml-cuda.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h
|
||||
@@ -253,25 +265,25 @@ clean:
|
||||
# Examples
|
||||
#
|
||||
|
||||
main: examples/main/main.cpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS)
|
||||
main: examples/main/main.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
@echo
|
||||
@echo '==== Run ./main -h for help. ===='
|
||||
@echo
|
||||
|
||||
quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o ggml-quants-k.o $(OBJS)
|
||||
quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o ggml-quants-k.o $(OBJS)
|
||||
quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o common.o ggml-quants-k.o $(OBJS)
|
||||
perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o ggml-quants-k.o $(OBJS)
|
||||
embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o ggml-quants-k.o $(OBJS)
|
||||
save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
@@ -293,7 +305,7 @@ benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
./$@
|
||||
|
||||
vdot: pocs/vdot/vdot.cpp ggml.o ggml-quants-k.o $(OBJS)
|
||||
vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
|
||||
.PHONY: tests clean
|
||||
|
||||
18
README.md
18
README.md
@@ -9,6 +9,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
|
||||
|
||||
**Hot topics:**
|
||||
|
||||
- Roadmap June 2023: https://github.com/ggerganov/llama.cpp/discussions/1729
|
||||
- GPU support with Metal (Apple Silicon): https://github.com/ggerganov/llama.cpp/pull/1642
|
||||
- High-quality 2,3,4,5,6-bit quantization: https://github.com/ggerganov/llama.cpp/pull/1684
|
||||
- Multi-GPU support: https://github.com/ggerganov/llama.cpp/pull/1607
|
||||
@@ -267,11 +268,11 @@ Any value larger than 0 will offload the computation to the GPU. For example:
|
||||
|
||||
Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). BLAS doesn't affect the normal generation performance. There are currently three different implementations of it:
|
||||
|
||||
- **Accelerate Framework**:
|
||||
- #### Accelerate Framework:
|
||||
|
||||
This is only available on Mac PCs and it's enabled by default. You can just build using the normal instructions.
|
||||
|
||||
- **OpenBLAS**:
|
||||
- #### OpenBLAS:
|
||||
|
||||
This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS installed on your machine.
|
||||
|
||||
@@ -305,11 +306,11 @@ Building the program with BLAS support may lead to some performance improvements
|
||||
cmake --build . --config Release
|
||||
```
|
||||
|
||||
- **BLIS**
|
||||
- #### BLIS
|
||||
|
||||
Check [BLIS.md](BLIS.md) for more information.
|
||||
Check [BLIS.md](docs/BLIS.md) for more information.
|
||||
|
||||
- **Intel MKL**
|
||||
- #### Intel MKL
|
||||
|
||||
By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. You may also specify it by:
|
||||
|
||||
@@ -317,10 +318,10 @@ Building the program with BLAS support may lead to some performance improvements
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
cmake --build . -config Release
|
||||
cmake --build . --config Release
|
||||
```
|
||||
|
||||
- **cuBLAS**
|
||||
- #### cuBLAS
|
||||
|
||||
This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
|
||||
- Using `make`:
|
||||
@@ -339,7 +340,7 @@ Building the program with BLAS support may lead to some performance improvements
|
||||
|
||||
The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used.
|
||||
|
||||
- **CLBlast**
|
||||
- #### CLBlast
|
||||
|
||||
OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU.
|
||||
|
||||
@@ -684,3 +685,4 @@ docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /mode
|
||||
### Docs
|
||||
|
||||
- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
|
||||
- [Performance troubleshooting](./docs/token_generation_performance_tips.md)
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d models/7B/consolidated.00.pth
|
||||
666a4bb533b303bdaf89e1b6a3b6f93535d868de31d903afdc20983dc526c847 models/7B/ggml-model-f16.bin
|
||||
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q4_0.bin
|
||||
ec2f2d1f0dfb73b72a4cbac7fa121abbe04c37ab327125a38248f930c0f09ddf models/7B/ggml-model-q4_0.bin
|
||||
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q4_1.bin
|
||||
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q5_0.bin
|
||||
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q5_1.bin
|
||||
@@ -8,7 +8,7 @@ ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml
|
||||
745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08 models/13B/consolidated.00.pth
|
||||
d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085 models/13B/consolidated.01.pth
|
||||
2b206e9b21fb1076f11cafc624e2af97c9e48ea09312a0962153acc20d45f808 models/13B/ggml-model-f16.bin
|
||||
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q4_0.bin
|
||||
fad169e6f0f575402cf75945961cb4a8ecd824ba4da6be2af831f320c4348fa5 models/13B/ggml-model-q4_0.bin
|
||||
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q4_1.bin
|
||||
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q5_0.bin
|
||||
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q5_1.bin
|
||||
@@ -18,7 +18,7 @@ e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067 models/30B/con
|
||||
24a87f01028cbd3a12de551dcedb712346c0b5cbdeff1454e0ddf2df9b675378 models/30B/consolidated.02.pth
|
||||
1adfcef71420886119544949767f6a56cb6339b4d5fcde755d80fe68b49de93b models/30B/consolidated.03.pth
|
||||
7e1b524061a9f4b27c22a12d6d2a5bf13b8ebbea73e99f218809351ed9cf7d37 models/30B/ggml-model-f16.bin
|
||||
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q4_0.bin
|
||||
d2a441403944819492ec8c2002cc36fa38468149bfb4b7b4c52afc7bd9a7166d models/30B/ggml-model-q4_0.bin
|
||||
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q4_1.bin
|
||||
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q5_0.bin
|
||||
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q5_1.bin
|
||||
@@ -32,7 +32,7 @@ a287c0dfe49081626567c7fe87f74cce5831f58e459b427b5e05567641f47b78 models/65B/con
|
||||
72b4eba67a1a3b18cb67a85b70f8f1640caae9b40033ea943fb166bd80a7b36b models/65B/consolidated.06.pth
|
||||
d27f5b0677d7ff129ceacd73fd461c4d06910ad7787cf217b249948c3f3bc638 models/65B/consolidated.07.pth
|
||||
60758f2384d74e423dffddfd020ffed9d3bb186ebc54506f9c4a787d0f5367b0 models/65B/ggml-model-f16.bin
|
||||
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q4_0.bin
|
||||
cde053439fa4910ae454407e2717cc46cc2c2b4995c00c93297a2b52e790fa92 models/65B/ggml-model-q4_0.bin
|
||||
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q4_1.bin
|
||||
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q5_0.bin
|
||||
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q5_1.bin
|
||||
|
||||
40
docs/token_generation_performance_tips.md
Normal file
40
docs/token_generation_performance_tips.md
Normal file
@@ -0,0 +1,40 @@
|
||||
# Token generation performance troubleshooting
|
||||
|
||||
## Verifying that the model is running on the GPU with cuBLAS
|
||||
Make sure you compiled llama with the correct env variables according to [this guide](../README.md#cublas), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
|
||||
```shell
|
||||
./main -m "path/to/model.bin" -ngl 200000 -p "Please sir, may I have some "
|
||||
```
|
||||
|
||||
When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines:
|
||||
```shell
|
||||
llama_model_load_internal: [cublas] offloading 60 layers to GPU
|
||||
llama_model_load_internal: [cublas] offloading output layer to GPU
|
||||
llama_model_load_internal: [cublas] total VRAM used: 17223 MB
|
||||
... rest of inference
|
||||
```
|
||||
|
||||
If you see these lines, then the GPU is being used.
|
||||
|
||||
## Verifying that the CPU is not oversaturated
|
||||
llama accepts a `-t N` (or `--threads N`) parameter. It's extremely important that this parameter is not too large. If your token generation is extremely slow, try setting this number to 1. If this significantly improves your token generation speed, then your CPU is being oversaturated and you need to explicitly set this parameter to the number of the physicial CPU cores on your machine (even if you utilize a GPU). If in doubt, start with 1 and double the amount until you hit a performance bottleneck, then scale the number down.
|
||||
|
||||
# Example of runtime flags effect on inference speed benchmark
|
||||
These runs were tested on the following machine:
|
||||
GPU: A6000 (48GB VRAM)
|
||||
CPU: 7 physical cores
|
||||
RAM: 32GB
|
||||
|
||||
Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.ggmlv3.q4_0.bin` (30B parameters, 4bit quantization, GGML)
|
||||
|
||||
Run command: `./main -m "path/to/model.bin" -p "-p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
|
||||
|
||||
Result:
|
||||
|
||||
| command | tokens/second (higher is better) |
|
||||
| - | - |
|
||||
| -ngl 2000000 | N/A (less than 0.1) |
|
||||
| -t 7 | 1.7 |
|
||||
| -t 1 -ngl 2000000 | 5.5 |
|
||||
| -t 7 -ngl 2000000 | 8.7 |
|
||||
| -t 4 -ngl 2000000 | 9.1 |
|
||||
@@ -37,6 +37,7 @@ else()
|
||||
add_subdirectory(save-load-state)
|
||||
add_subdirectory(benchmark)
|
||||
add_subdirectory(baby-llama)
|
||||
add_subdirectory(train-text-from-scratch)
|
||||
if (LLAMA_METAL)
|
||||
add_subdirectory(metal)
|
||||
endif()
|
||||
|
||||
@@ -79,34 +79,39 @@ struct ggml_tensor * randomize_tensor_normal(
|
||||
int ndims,
|
||||
const int64_t ne[],
|
||||
struct random_normal_distribution * rnd) {
|
||||
float scale = 1.0; // xavier
|
||||
switch (ndims) {
|
||||
case 1:
|
||||
scale /= sqrtf(ne[0]);
|
||||
for (int i0 = 0; i0 < ne[0]; i0++) {
|
||||
((float *)tensor->data)[i0] = frand_normal(rnd);
|
||||
((float *)tensor->data)[i0] = scale * frand_normal(rnd);
|
||||
}
|
||||
break;
|
||||
case 2:
|
||||
scale /= sqrtf(ne[0]+ne[1]);
|
||||
for (int i1 = 0; i1 < ne[1]; i1++) {
|
||||
for (int i0 = 0; i0 < ne[0]; i0++) {
|
||||
((float *)tensor->data)[i1*ne[0] + i0] = frand_normal(rnd);
|
||||
((float *)tensor->data)[i1*ne[0] + i0] = scale * frand_normal(rnd);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case 3:
|
||||
scale /= sqrtf(ne[0]+ne[1]);
|
||||
for (int i2 = 0; i2 < ne[2]; i2++) {
|
||||
for (int i1 = 0; i1 < ne[1]; i1++) {
|
||||
for (int i0 = 0; i0 < ne[0]; i0++) {
|
||||
((float *)tensor->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand_normal(rnd);
|
||||
((float *)tensor->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = scale * frand_normal(rnd);
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
case 4:
|
||||
scale /= sqrtf(ne[0]+ne[1]);
|
||||
for (int i3 = 0; i3 < ne[3]; i3++) {
|
||||
for (int i2 = 0; i2 < ne[2]; i2++) {
|
||||
for (int i1 = 0; i1 < ne[1]; i1++) {
|
||||
for (int i0 = 0; i0 < ne[0]; i0++) {
|
||||
((float *)tensor->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand_normal(rnd);
|
||||
((float *)tensor->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = scale * frand_normal(rnd);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -148,8 +153,8 @@ struct llama_hparams_lora {
|
||||
uint32_t n_rot = 64;
|
||||
uint32_t n_lora = 64;
|
||||
|
||||
bool operator!=(const llama_hparams & other) const {
|
||||
return memcmp(this, &other, sizeof(llama_hparams));
|
||||
bool operator!=(const llama_hparams_lora & other) const {
|
||||
return memcmp(this, &other, sizeof(llama_hparams_lora)) != 0;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
#include <algorithm>
|
||||
#include <sstream>
|
||||
#include <unordered_set>
|
||||
#include <regex>
|
||||
|
||||
#if defined(__APPLE__) && defined(__MACH__)
|
||||
#include <sys/types.h>
|
||||
@@ -131,6 +132,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
params.path_prompt_cache = argv[i];
|
||||
} else if (arg == "--prompt-cache-all") {
|
||||
params.prompt_cache_all = true;
|
||||
} else if (arg == "--prompt-cache-ro") {
|
||||
params.prompt_cache_ro = true;
|
||||
} else if (arg == "-f" || arg == "--file") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -295,6 +298,40 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
|
||||
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
||||
#endif
|
||||
} else if (arg == "--main-gpu" || arg == "-mg") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
params.main_gpu = std::stoi(argv[i]);
|
||||
#else
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
|
||||
#endif
|
||||
} else if (arg == "--tensor-split" || arg == "-ts") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
std::string arg_next = argv[i];
|
||||
|
||||
// split string by , and /
|
||||
const std::regex regex{R"([,/]+)"};
|
||||
std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
|
||||
std::vector<std::string> split_arg{it, {}};
|
||||
GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
|
||||
|
||||
for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
|
||||
if (i < split_arg.size()) {
|
||||
params.tensor_split[i] = std::stof(split_arg[i]);
|
||||
} else {
|
||||
params.tensor_split[i] = 0.0f;
|
||||
}
|
||||
}
|
||||
#else
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
|
||||
#endif // GGML_USE_CUBLAS
|
||||
} else if (arg == "--no-mmap") {
|
||||
params.use_mmap = false;
|
||||
} else if (arg == "--mtest") {
|
||||
@@ -397,6 +434,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
fprintf(stderr, " --prompt-cache FNAME file to cache prompt state for faster startup (default: none)\n");
|
||||
fprintf(stderr, " --prompt-cache-all if specified, saves user input and generations to cache as well.\n");
|
||||
fprintf(stderr, " not supported with --interactive or other interactive options\n");
|
||||
fprintf(stderr, " --prompt-cache-ro if specified, uses the prompt cache but does not update it.\n");
|
||||
fprintf(stderr, " --random-prompt start with a randomized prompt.\n");
|
||||
fprintf(stderr, " --in-prefix STRING string to prefix user inputs with (default: empty)\n");
|
||||
fprintf(stderr, " --in-suffix STRING string to suffix after user inputs with (default: empty)\n");
|
||||
@@ -438,6 +476,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
fprintf(stderr, " -ngl N, --n-gpu-layers N\n");
|
||||
fprintf(stderr, " number of layers to store in VRAM\n");
|
||||
fprintf(stderr, " -ts SPLIT --tensor-split SPLIT\n");
|
||||
fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
|
||||
fprintf(stderr, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" );
|
||||
#endif
|
||||
fprintf(stderr, " --mtest compute maximum memory usage\n");
|
||||
fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n");
|
||||
@@ -483,7 +524,10 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
|
||||
auto lparams = llama_context_default_params();
|
||||
|
||||
lparams.n_ctx = params.n_ctx;
|
||||
lparams.n_batch = params.n_batch;
|
||||
lparams.n_gpu_layers = params.n_gpu_layers;
|
||||
lparams.main_gpu = params.main_gpu;
|
||||
memcpy(lparams.tensor_split, params.tensor_split, LLAMA_MAX_DEVICES*sizeof(float));
|
||||
lparams.seed = params.seed;
|
||||
lparams.f16_kv = params.memory_f16;
|
||||
lparams.use_mmap = params.use_mmap;
|
||||
@@ -588,6 +632,9 @@ void console_set_color(console_state & con_st, console_color_t color) {
|
||||
case CONSOLE_COLOR_USER_INPUT:
|
||||
fprintf(con_st.out, ANSI_BOLD ANSI_COLOR_GREEN);
|
||||
break;
|
||||
case CONSOLE_COLOR_ERROR:
|
||||
fprintf(con_st.out, ANSI_BOLD ANSI_COLOR_RED);
|
||||
break;
|
||||
}
|
||||
con_st.color = color;
|
||||
fflush(con_st.out);
|
||||
|
||||
@@ -21,13 +21,15 @@
|
||||
int32_t get_num_physical_cores();
|
||||
|
||||
struct gpt_params {
|
||||
int32_t seed = -1; // RNG seed
|
||||
int32_t n_threads = get_num_physical_cores();
|
||||
int32_t n_predict = -1; // new tokens to predict
|
||||
int32_t n_ctx = 512; // context size
|
||||
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
|
||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||
int32_t n_gpu_layers = 0; // number of layers to store in VRAM
|
||||
int32_t seed = -1; // RNG seed
|
||||
int32_t n_threads = get_num_physical_cores();
|
||||
int32_t n_predict = -1; // new tokens to predict
|
||||
int32_t n_ctx = 512; // context size
|
||||
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
|
||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||
int32_t n_gpu_layers = 0; // number of layers to store in VRAM
|
||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
|
||||
|
||||
// sampling parameters
|
||||
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
|
||||
@@ -60,6 +62,7 @@ struct gpt_params {
|
||||
bool use_color = false; // use color to distinguish generations and inputs
|
||||
bool interactive = false; // interactive mode
|
||||
bool prompt_cache_all = false; // save user input and generations to prompt cache
|
||||
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
|
||||
|
||||
bool embedding = false; // get only sentence embedding
|
||||
bool interactive_first = false; // wait for user input immediately
|
||||
@@ -109,7 +112,8 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params);
|
||||
enum console_color_t {
|
||||
CONSOLE_COLOR_DEFAULT=0,
|
||||
CONSOLE_COLOR_PROMPT,
|
||||
CONSOLE_COLOR_USER_INPUT
|
||||
CONSOLE_COLOR_USER_INPUT,
|
||||
CONSOLE_COLOR_ERROR
|
||||
};
|
||||
|
||||
struct console_state {
|
||||
|
||||
@@ -286,5 +286,7 @@ These options provide extra functionality and customization when running the LLa
|
||||
- `--verbose-prompt`: Print the prompt before generating text.
|
||||
- `--mtest`: Test the model's functionality by running a series of tests to ensure it's working properly.
|
||||
- `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
|
||||
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
|
||||
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
|
||||
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
|
||||
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
|
||||
|
||||
@@ -81,6 +81,9 @@ int main(int argc, char ** argv) {
|
||||
if (params.n_ctx > 2048) {
|
||||
fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
|
||||
"expect poor results\n", __func__, params.n_ctx);
|
||||
} else if (params.n_ctx < 8) {
|
||||
fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__);
|
||||
params.n_ctx = 8;
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
|
||||
@@ -328,9 +331,29 @@ int main(int argc, char ** argv) {
|
||||
|
||||
std::vector<llama_token> embd;
|
||||
|
||||
// do one empty run to warm up the model
|
||||
{
|
||||
const std::vector<llama_token> tmp = { llama_token_bos(), };
|
||||
llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
|
||||
llama_reset_timings(ctx);
|
||||
}
|
||||
|
||||
while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
|
||||
// predict
|
||||
if (embd.size() > 0) {
|
||||
// Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
|
||||
// --prompt or --file which uses the same value.
|
||||
auto max_embd_size = n_ctx - 4;
|
||||
// Ensure the input doesn't exceed the context size by truncating embd if necessary.
|
||||
if ((int)embd.size() > max_embd_size) {
|
||||
auto skipped_tokens = embd.size() - max_embd_size;
|
||||
console_set_color(con_st, CONSOLE_COLOR_ERROR);
|
||||
printf("<<input too long: skipped %ld token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
|
||||
console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
|
||||
fflush(stdout);
|
||||
embd.resize(max_embd_size);
|
||||
}
|
||||
|
||||
// infinite text generation via context swapping
|
||||
// if we run out of context:
|
||||
// - take the n_keep first tokens from the original prompt (via n_past)
|
||||
@@ -417,7 +440,7 @@ int main(int argc, char ** argv) {
|
||||
const bool penalize_nl = params.penalize_nl;
|
||||
|
||||
// optionally save the session on first sample (for faster prompt loading next time)
|
||||
if (!path_session.empty() && need_to_save_session) {
|
||||
if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) {
|
||||
need_to_save_session = false;
|
||||
llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
|
||||
}
|
||||
@@ -630,7 +653,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
}
|
||||
|
||||
if (!path_session.empty() && params.prompt_cache_all) {
|
||||
if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) {
|
||||
fprintf(stderr, "\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
|
||||
llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
|
||||
}
|
||||
|
||||
@@ -3,43 +3,136 @@
|
||||
#include "llama.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <map>
|
||||
#include <cstring>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
static const std::map<std::string, llama_ftype> LLAMA_FTYPE_MAP = {
|
||||
{"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
|
||||
{"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
|
||||
{"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0},
|
||||
{"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1},
|
||||
{"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},
|
||||
{"q2_K", LLAMA_FTYPE_MOSTLY_Q2_K},
|
||||
{"q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M},
|
||||
{"q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S},
|
||||
{"q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M},
|
||||
{"q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L},
|
||||
{"q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M},
|
||||
{"q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S},
|
||||
{"q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M},
|
||||
{"q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M},
|
||||
{"q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S},
|
||||
{"q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M},
|
||||
{"q6_K", LLAMA_FTYPE_MOSTLY_Q6_K},
|
||||
struct quant_option {
|
||||
std::string name;
|
||||
llama_ftype ftype;
|
||||
std::string desc;
|
||||
};
|
||||
|
||||
bool try_parse_ftype(const std::string & ftype_str, llama_ftype & ftype, std::string & ftype_str_out) {
|
||||
auto it = LLAMA_FTYPE_MAP.find(ftype_str);
|
||||
if (it != LLAMA_FTYPE_MAP.end()) {
|
||||
ftype = it->second;
|
||||
ftype_str_out = it->first;
|
||||
return true;
|
||||
static const std::vector<struct quant_option> QUANT_OPTIONS = {
|
||||
{
|
||||
"Q4_0",
|
||||
LLAMA_FTYPE_MOSTLY_Q4_0,
|
||||
" 3.50G, +0.2499 ppl @ 7B - small, very high quality loss - legacy, prefer using Q3_K_M",
|
||||
},
|
||||
{
|
||||
"Q4_1",
|
||||
LLAMA_FTYPE_MOSTLY_Q4_1,
|
||||
" 3.90G, +0.1846 ppl @ 7B - small, substantial quality loss - legacy, prefer using Q3_K_L",
|
||||
},
|
||||
{
|
||||
"Q5_0",
|
||||
LLAMA_FTYPE_MOSTLY_Q5_0,
|
||||
" 4.30G, +0.0796 ppl @ 7B - medium, balanced quality - legacy, prefer using Q4_K_M",
|
||||
},
|
||||
{
|
||||
"Q5_1",
|
||||
LLAMA_FTYPE_MOSTLY_Q5_1,
|
||||
" 4.70G, +0.0415 ppl @ 7B - medium, low quality loss - legacy, prefer using Q5_K_M",
|
||||
},
|
||||
#ifdef GGML_USE_K_QUANTS
|
||||
{
|
||||
"Q2_K",
|
||||
LLAMA_FTYPE_MOSTLY_Q2_K,
|
||||
" 2.67G, +0.8698 ppl @ 7B - smallest, extreme quality loss - not recommended",
|
||||
},
|
||||
{
|
||||
"Q3_K",
|
||||
LLAMA_FTYPE_MOSTLY_Q3_K_M,
|
||||
"alias for Q3_K_M"
|
||||
},
|
||||
{
|
||||
"Q3_K_S",
|
||||
LLAMA_FTYPE_MOSTLY_Q3_K_S,
|
||||
" 2.75G, +0.5505 ppl @ 7B - very small, very high quality loss",
|
||||
},
|
||||
{
|
||||
"Q3_K_M",
|
||||
LLAMA_FTYPE_MOSTLY_Q3_K_M,
|
||||
" 3.06G, +0.2437 ppl @ 7B - very small, very high quality loss",
|
||||
},
|
||||
{
|
||||
"Q3_K_L",
|
||||
LLAMA_FTYPE_MOSTLY_Q3_K_L,
|
||||
" 3.35G, +0.1803 ppl @ 7B - small, substantial quality loss",
|
||||
},
|
||||
{
|
||||
"Q4_K",
|
||||
LLAMA_FTYPE_MOSTLY_Q4_K_M,
|
||||
"alias for Q4_K_M",
|
||||
},
|
||||
{
|
||||
"Q4_K_S",
|
||||
LLAMA_FTYPE_MOSTLY_Q4_K_S,
|
||||
" 3.56G, +0.1149 ppl @ 7B - small, significant quality loss",
|
||||
},
|
||||
{
|
||||
"Q4_K_M",
|
||||
LLAMA_FTYPE_MOSTLY_Q4_K_M,
|
||||
" 3.80G, +0.0535 ppl @ 7B - medium, balanced quality - *recommended*",
|
||||
},
|
||||
{
|
||||
"Q5_K",
|
||||
LLAMA_FTYPE_MOSTLY_Q5_K_M,
|
||||
"alias for Q5_K_M",
|
||||
},
|
||||
{
|
||||
"Q5_K_S",
|
||||
LLAMA_FTYPE_MOSTLY_Q5_K_S,
|
||||
" 4.33G, +0.0353 ppl @ 7B - large, low quality loss - *recommended*",
|
||||
},
|
||||
{
|
||||
"Q5_K_M",
|
||||
LLAMA_FTYPE_MOSTLY_Q5_K_M,
|
||||
" 4.45G, +0.0142 ppl @ 7B - large, very low quality loss - *recommended*",
|
||||
},
|
||||
{
|
||||
"Q6_K",
|
||||
LLAMA_FTYPE_MOSTLY_Q6_K,
|
||||
" 5.15G, +0.0044 ppl @ 7B - very large, extremely low quality loss",
|
||||
},
|
||||
#endif
|
||||
{
|
||||
"Q8_0",
|
||||
LLAMA_FTYPE_MOSTLY_Q8_0,
|
||||
" 6.70G, +0.0004 ppl @ 7B - very large, extremely low quality loss - not recommended",
|
||||
},
|
||||
{
|
||||
"F16",
|
||||
LLAMA_FTYPE_MOSTLY_F16,
|
||||
"13.00G @ 7B - extremely large, virtually no quality loss - not recommended",
|
||||
},
|
||||
{
|
||||
"F32",
|
||||
LLAMA_FTYPE_ALL_F32,
|
||||
"26.00G @ 7B - absolutely huge, lossless - not recommended",
|
||||
},
|
||||
};
|
||||
|
||||
|
||||
bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
|
||||
std::string ftype_str;
|
||||
|
||||
for (auto ch : ftype_str_in) {
|
||||
ftype_str.push_back(std::toupper(ch));
|
||||
}
|
||||
for (auto & it : QUANT_OPTIONS) {
|
||||
if (it.name == ftype_str) {
|
||||
ftype = it.ftype;
|
||||
ftype_str_out = it.name;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
// try to parse as an integer
|
||||
try {
|
||||
int ftype_int = std::stoi(ftype_str);
|
||||
for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) {
|
||||
if (it->second == ftype_int) {
|
||||
ftype = it->second;
|
||||
ftype_str_out = it->first;
|
||||
for (auto & it : QUANT_OPTIONS) {
|
||||
if (it.ftype == ftype_int) {
|
||||
ftype = it.ftype;
|
||||
ftype_str_out = it.name;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
@@ -51,29 +144,51 @@ bool try_parse_ftype(const std::string & ftype_str, llama_ftype & ftype, std::st
|
||||
}
|
||||
|
||||
// usage:
|
||||
// ./quantize models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
|
||||
// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
|
||||
//
|
||||
void usage(const char * executable) {
|
||||
fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.bin [model-quant.bin] type [nthreads]\n\n", executable);
|
||||
fprintf(stderr, " --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
|
||||
fprintf(stderr, " --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
|
||||
fprintf(stderr, "\nAllowed quantization types:\n");
|
||||
for (auto & it : QUANT_OPTIONS) {
|
||||
printf(" %2d or %-6s : %s\n", it.ftype, it.name.c_str(), it.desc.c_str());
|
||||
}
|
||||
exit(1);
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
if (argc < 3) {
|
||||
fprintf(stderr, "usage: %s model-f32.bin [model-quant.bin] type [nthreads]\n", argv[0]);
|
||||
for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) {
|
||||
fprintf(stderr, " type = \"%s\" or %d\n", it->first.c_str(), it->second);
|
||||
usage(argv[0]);
|
||||
}
|
||||
|
||||
llama_model_quantize_params params = llama_model_quantize_default_params();
|
||||
|
||||
int arg_idx = 1;
|
||||
|
||||
for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
|
||||
if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
|
||||
params.quantize_output_tensor = false;
|
||||
} else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
|
||||
params.allow_requantize = true;
|
||||
} else {
|
||||
usage(argv[0]);
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (argc - arg_idx < 3) {
|
||||
usage(argv[0]);
|
||||
}
|
||||
|
||||
llama_init_backend();
|
||||
|
||||
// parse command line arguments
|
||||
const std::string fname_inp = argv[1];
|
||||
const std::string fname_inp = argv[arg_idx];
|
||||
arg_idx++;
|
||||
std::string fname_out;
|
||||
int nthread;
|
||||
llama_ftype ftype;
|
||||
|
||||
int arg_idx = 2;
|
||||
std::string ftype_str;
|
||||
if (try_parse_ftype(argv[arg_idx], ftype, ftype_str)) {
|
||||
// argv[2] is the ftype
|
||||
if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
|
||||
std::string fpath;
|
||||
const size_t pos = fname_inp.find_last_of('/');
|
||||
if (pos != std::string::npos) {
|
||||
@@ -84,7 +199,6 @@ int main(int argc, char ** argv) {
|
||||
arg_idx++;
|
||||
}
|
||||
else {
|
||||
// argv[2] is the output path
|
||||
fname_out = argv[arg_idx];
|
||||
arg_idx++;
|
||||
|
||||
@@ -92,8 +206,7 @@ int main(int argc, char ** argv) {
|
||||
fprintf(stderr, "%s: missing ftype\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
// argv[3] is the ftype
|
||||
if (!try_parse_ftype(argv[arg_idx], ftype, ftype_str)) {
|
||||
if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
|
||||
fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]);
|
||||
return 1;
|
||||
}
|
||||
@@ -103,21 +216,19 @@ int main(int argc, char ** argv) {
|
||||
// parse nthreads
|
||||
if (argc > arg_idx) {
|
||||
try {
|
||||
nthread = std::stoi(argv[arg_idx]);
|
||||
params.nthread = std::stoi(argv[arg_idx]);
|
||||
}
|
||||
catch (const std::exception & e) {
|
||||
fprintf(stderr, "%s: invalid nthread '%s' (%s)\n", __func__, argv[arg_idx], e.what());
|
||||
return 1;
|
||||
}
|
||||
} else {
|
||||
nthread = 0;
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
|
||||
|
||||
fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());
|
||||
if (nthread > 0) {
|
||||
fprintf(stderr, " using %d threads", nthread);
|
||||
if (params.nthread > 0) {
|
||||
fprintf(stderr, " using %d threads", params.nthread);
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
@@ -129,7 +240,7 @@ int main(int argc, char ** argv) {
|
||||
{
|
||||
const int64_t t_start_us = llama_time_us();
|
||||
|
||||
if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype, nthread)) {
|
||||
if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ¶ms)) {
|
||||
fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -287,6 +287,8 @@ Test();
|
||||
- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
|
||||
- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
|
||||
- `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
|
||||
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
|
||||
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
|
||||
- `--embedding`: Enable the embedding mode. **Completion function doesn't work in this mode**.
|
||||
- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`;
|
||||
- `--port`: Set the port to listen. Default: `8080`.
|
||||
|
||||
@@ -401,6 +401,10 @@ void server_print_usage(int /*argc*/, char **argv, const gpt_params ¶ms)
|
||||
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
fprintf(stderr, " -ngl N, --n-gpu-layers N\n");
|
||||
fprintf(stderr, " number of layers to store in VRAM\n");
|
||||
fprintf(stderr, " -ts SPLIT --tensor-split SPLIT\n");
|
||||
fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
|
||||
fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
|
||||
fprintf(stderr, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" );
|
||||
#endif
|
||||
fprintf(stderr, " -m FNAME, --model FNAME\n");
|
||||
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
|
||||
@@ -502,6 +506,50 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para
|
||||
#else
|
||||
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
|
||||
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
||||
#endif
|
||||
}
|
||||
else if (arg == "--tensor-split" || arg == "-ts")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
std::string arg_next = argv[i];
|
||||
|
||||
// split string by , and /
|
||||
const std::regex regex{R"([,/]+)"};
|
||||
std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
|
||||
std::vector<std::string> split_arg{it, {}};
|
||||
GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
|
||||
|
||||
for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i)
|
||||
{
|
||||
if (i < split_arg.size())
|
||||
{
|
||||
params.tensor_split[i] = std::stof(split_arg[i]);
|
||||
}
|
||||
else
|
||||
{
|
||||
params.tensor_split[i] = 0.0f;
|
||||
}
|
||||
}
|
||||
#else
|
||||
fprintf(stderr, "WARNING: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
|
||||
#endif // GGML_USE_CUBLAS
|
||||
}
|
||||
else if (arg == "--main-gpu" || arg == "-mg")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
params.main_gpu = std::stoi(argv[i]);
|
||||
#else
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
|
||||
#endif
|
||||
}
|
||||
else
|
||||
|
||||
4
examples/train-text-from-scratch/CMakeLists.txt
Normal file
4
examples/train-text-from-scratch/CMakeLists.txt
Normal file
@@ -0,0 +1,4 @@
|
||||
set(TARGET train-text-from-scratch)
|
||||
add_executable(${TARGET} train-text-from-scratch.cpp)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
22
examples/train-text-from-scratch/README.md
Normal file
22
examples/train-text-from-scratch/README.md
Normal file
@@ -0,0 +1,22 @@
|
||||
# train-text-from-scratch
|
||||
|
||||
Basic usage instructions:
|
||||
|
||||
```bash
|
||||
# get training data
|
||||
wget https://github.com/brunoklein99/deep-learning-notes/blob/master/shakespeare.txt
|
||||
|
||||
# train
|
||||
./bin/train-text-from-scratch \
|
||||
--vocab-model ../models/ggml-vocab.bin \
|
||||
--ctx 64 --embd 256 --head 8 --layer 16 \
|
||||
--checkpoint-in chk-shakespeare-256x16.bin \
|
||||
--checkpoint-out chk-shakespeare-256x16.bin \
|
||||
--model-out ggml-shakespeare-256x16-f32.bin \
|
||||
--train-data "shakespeare.txt" \
|
||||
-t 6 -b 16 -n 32 --seed 1 --adam-iter 16 \
|
||||
--print-details-interval 0 --predict 16 --use-flash
|
||||
|
||||
# predict
|
||||
./bin/main -m ggml-shakespeare-256x16-f32.bin
|
||||
```
|
||||
3399
examples/train-text-from-scratch/train-text-from-scratch.cpp
Normal file
3399
examples/train-text-from-scratch/train-text-from-scratch.cpp
Normal file
File diff suppressed because it is too large
Load Diff
30
flake.lock
generated
30
flake.lock
generated
@@ -1,12 +1,15 @@
|
||||
{
|
||||
"nodes": {
|
||||
"flake-utils": {
|
||||
"inputs": {
|
||||
"systems": "systems"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1676283394,
|
||||
"narHash": "sha256-XX2f9c3iySLCw54rJ/CZs+ZK6IQy7GXNY4nSOyu2QG4=",
|
||||
"lastModified": 1685518550,
|
||||
"narHash": "sha256-o2d0KcvaXzTrPRIo0kOLV0/QXHhDQ5DTi+OxcjO8xqY=",
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"rev": "3db36a8b464d0c4532ba1c7dda728f4576d6d073",
|
||||
"rev": "a1720a10a6cfe8234c0e93907ffe81be440f4cef",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
@@ -17,11 +20,11 @@
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1678470307,
|
||||
"narHash": "sha256-OEeMUr3ueLIXyW/OaFUX5jUdimyQwMg/7e+/Q0gC/QE=",
|
||||
"lastModified": 1685931219,
|
||||
"narHash": "sha256-8EWeOZ6LKQfgAjB/USffUSELPRjw88A+xTcXnOUvO5M=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "0c4800d579af4ed98ecc47d464a5e7b0870c4b1f",
|
||||
"rev": "7409480d5c8584a1a83c422530419efe4afb0d19",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
@@ -36,6 +39,21 @@
|
||||
"flake-utils": "flake-utils",
|
||||
"nixpkgs": "nixpkgs"
|
||||
}
|
||||
},
|
||||
"systems": {
|
||||
"locked": {
|
||||
"lastModified": 1681028828,
|
||||
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
|
||||
"owner": "nix-systems",
|
||||
"repo": "default",
|
||||
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "nix-systems",
|
||||
"repo": "default",
|
||||
"type": "github"
|
||||
}
|
||||
}
|
||||
},
|
||||
"root": "root",
|
||||
|
||||
26
flake.nix
26
flake.nix
@@ -6,6 +6,13 @@
|
||||
outputs = { self, nixpkgs, flake-utils }:
|
||||
flake-utils.lib.eachDefaultSystem (system:
|
||||
let
|
||||
inherit (pkgs.stdenv) isAarch64 isDarwin;
|
||||
inherit (pkgs.lib) optionals;
|
||||
isM1 = isAarch64 && isDarwin;
|
||||
osSpecific =
|
||||
if isM1 then with pkgs.darwin.apple_sdk_11_0.frameworks; [ Accelerate MetalKit MetalPerformanceShaders MetalPerformanceShadersGraph ]
|
||||
else if isDarwin then with pkgs.darwin.apple_sdk.frameworks; [ Accelerate CoreGraphics CoreVideo ]
|
||||
else [ ];
|
||||
pkgs = import nixpkgs {
|
||||
inherit system;
|
||||
};
|
||||
@@ -18,17 +25,22 @@
|
||||
packages.default = pkgs.stdenv.mkDerivation {
|
||||
name = "llama.cpp";
|
||||
src = ./.;
|
||||
postPatch =
|
||||
if isM1 then ''
|
||||
substituteInPlace ./ggml-metal.m \
|
||||
--replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/ggml-metal.metal\";"
|
||||
'' else "";
|
||||
nativeBuildInputs = with pkgs; [ cmake ];
|
||||
buildInputs = with pkgs; lib.optionals stdenv.isDarwin [
|
||||
darwin.apple_sdk.frameworks.Accelerate
|
||||
];
|
||||
cmakeFlags = with pkgs; lib.optionals (system == "aarch64-darwin") [
|
||||
buildInputs = osSpecific;
|
||||
cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" ] ++ (optionals isM1 [
|
||||
"-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
|
||||
];
|
||||
"-DLLAMA_METAL=ON"
|
||||
]);
|
||||
installPhase = ''
|
||||
mkdir -p $out/bin
|
||||
mv bin/* $out/bin/
|
||||
mv $out/bin/main $out/bin/llama
|
||||
mv $out/bin/server $out/bin/llama-server
|
||||
|
||||
echo "#!${llama-python}/bin/python" > $out/bin/convert.py
|
||||
cat ${./convert.py} >> $out/bin/convert.py
|
||||
@@ -40,9 +52,7 @@
|
||||
packages = with pkgs; [
|
||||
cmake
|
||||
llama-python
|
||||
] ++ lib.optionals stdenv.isDarwin [
|
||||
darwin.apple_sdk.frameworks.Accelerate
|
||||
];
|
||||
] ++ osSpecific;
|
||||
};
|
||||
}
|
||||
);
|
||||
|
||||
1426
ggml-cuda.cu
1426
ggml-cuda.cu
File diff suppressed because it is too large
Load Diff
18
ggml-cuda.h
18
ggml-cuda.h
@@ -1,10 +1,19 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define GGML_CUDA_MAX_DEVICES 16
|
||||
|
||||
struct ggml_tensor_extra_gpu {
|
||||
void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
|
||||
};
|
||||
|
||||
void ggml_init_cublas(void);
|
||||
void ggml_cuda_set_tensor_split(const float * tensor_split);
|
||||
|
||||
void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
@@ -15,8 +24,13 @@ void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
|
||||
void * ggml_cuda_host_malloc(size_t size);
|
||||
void ggml_cuda_host_free(void * ptr);
|
||||
|
||||
void ggml_cuda_transform_tensor(struct ggml_tensor * tensor);
|
||||
void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensors, size_t offset);
|
||||
void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
|
||||
|
||||
void ggml_cuda_free_data(struct ggml_tensor * tensor);
|
||||
void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
|
||||
void ggml_cuda_set_main_device(int main_device);
|
||||
void ggml_cuda_set_scratch_size(size_t scratch_size);
|
||||
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
||||
150
ggml-metal.m
150
ggml-metal.m
@@ -45,12 +45,26 @@ struct ggml_metal_context {
|
||||
GGML_METAL_DECL_KERNEL(scale);
|
||||
GGML_METAL_DECL_KERNEL(silu);
|
||||
GGML_METAL_DECL_KERNEL(relu);
|
||||
GGML_METAL_DECL_KERNEL(gelu);
|
||||
GGML_METAL_DECL_KERNEL(soft_max);
|
||||
GGML_METAL_DECL_KERNEL(diag_mask_inf);
|
||||
GGML_METAL_DECL_KERNEL(get_rows_f16);
|
||||
GGML_METAL_DECL_KERNEL(get_rows_q4_0);
|
||||
GGML_METAL_DECL_KERNEL(get_rows_q4_1);
|
||||
GGML_METAL_DECL_KERNEL(get_rows_q2_k);
|
||||
GGML_METAL_DECL_KERNEL(get_rows_q3_k);
|
||||
GGML_METAL_DECL_KERNEL(get_rows_q4_k);
|
||||
GGML_METAL_DECL_KERNEL(get_rows_q5_k);
|
||||
GGML_METAL_DECL_KERNEL(get_rows_q6_k);
GGML_METAL_DECL_KERNEL(rms_norm);
GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
GGML_METAL_DECL_KERNEL(mul_mat_q2_k_f32);
GGML_METAL_DECL_KERNEL(mul_mat_q3_k_f32);
GGML_METAL_DECL_KERNEL(mul_mat_q4_k_f32);
GGML_METAL_DECL_KERNEL(mul_mat_q5_k_f32);
GGML_METAL_DECL_KERNEL(mul_mat_q6_k_f32);
GGML_METAL_DECL_KERNEL(rope);
GGML_METAL_DECL_KERNEL(cpy_f32_f16);
GGML_METAL_DECL_KERNEL(cpy_f32_f32);

@@ -63,6 +77,12 @@ struct ggml_metal_context {
// for now it is easier to work in a separate file
static NSString * const msl_library_source = @"see metal.metal";

// Here to assist with NSBundle Path Hack
@interface GGMLMetalClass : NSObject
@end
@implementation GGMLMetalClass
@end

struct ggml_metal_context * ggml_metal_init(void) {
fprintf(stderr, "%s: allocating\n", __func__);

@@ -70,6 +90,7 @@ struct ggml_metal_context * ggml_metal_init(void) {

ctx->device = MTLCreateSystemDefaultDevice();
ctx->queue = [ctx->device newCommandQueue];
ctx->n_buffers = 0;

// determine if we can use MPS
if (MPSSupportsMTLDevice(ctx->device)) {

@@ -98,7 +119,8 @@ struct ggml_metal_context * ggml_metal_init(void) {
NSError * error = nil;

//NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"];
NSString * path = [[NSBundle mainBundle] pathForResource:@"ggml-metal" ofType:@"metal"];
NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
fprintf(stderr, "%s: loading '%s'\n", __func__, [path UTF8String]);

NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error];

@@ -128,12 +150,26 @@ struct ggml_metal_context * ggml_metal_init(void) {
GGML_METAL_ADD_KERNEL(scale);
GGML_METAL_ADD_KERNEL(silu);
GGML_METAL_ADD_KERNEL(relu);
GGML_METAL_ADD_KERNEL(gelu);
GGML_METAL_ADD_KERNEL(soft_max);
GGML_METAL_ADD_KERNEL(diag_mask_inf);
GGML_METAL_ADD_KERNEL(get_rows_f16);
GGML_METAL_ADD_KERNEL(get_rows_q4_0);
GGML_METAL_ADD_KERNEL(get_rows_q4_1);
GGML_METAL_ADD_KERNEL(get_rows_q2_k);
GGML_METAL_ADD_KERNEL(get_rows_q3_k);
GGML_METAL_ADD_KERNEL(get_rows_q4_k);
GGML_METAL_ADD_KERNEL(get_rows_q5_k);
GGML_METAL_ADD_KERNEL(get_rows_q6_k);
GGML_METAL_ADD_KERNEL(rms_norm);
GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
GGML_METAL_ADD_KERNEL(mul_mat_q2_k_f32);
GGML_METAL_ADD_KERNEL(mul_mat_q3_k_f32);
GGML_METAL_ADD_KERNEL(mul_mat_q4_k_f32);
GGML_METAL_ADD_KERNEL(mul_mat_q5_k_f32);
GGML_METAL_ADD_KERNEL(mul_mat_q6_k_f32);
GGML_METAL_ADD_KERNEL(rope);
GGML_METAL_ADD_KERNEL(cpy_f32_f16);
GGML_METAL_ADD_KERNEL(cpy_f32_f32);

@@ -204,6 +240,11 @@ bool ggml_metal_add_buffer(
ctx->buffers[ctx->n_buffers].name = name;
ctx->buffers[ctx->n_buffers].data = data;
ctx->buffers[ctx->n_buffers].size = size;

if (ctx->device.maxBufferLength < aligned_size) {
fprintf(stderr, "%s: buffer '%s' size %zu is larger than buffer maximum of %zu\n", __func__, name, aligned_size, ctx->device.maxBufferLength);
return false;
}
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:aligned_size options:MTLResourceStorageModeShared deallocator:nil];

if (ctx->buffers[ctx->n_buffers].metal == nil) {

@@ -401,6 +442,20 @@ void ggml_metal_graph_compute(

[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_OP_GELU:
{
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoder];
}

[encoder setComputePipelineState:ctx->pipeline_gelu];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];

const int64_t n = ggml_nelements(dst);

[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_OP_SOFT_MAX:
{
if (encoder == nil) {

@@ -493,24 +548,82 @@ void ggml_metal_graph_compute(

// use custom matrix x vector kernel
switch (src0t) {
case GGML_TYPE_F16:
{
GGML_ASSERT(ne02 == ne12);

nth0 = 64;
nth1 = 1;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
} break;
case GGML_TYPE_Q4_0:
{
GGML_ASSERT(ne02 == 1);
GGML_ASSERT(ne12 == 1);

nth0 = 8;
nth1 = 4;
nth1 = 8;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_0_f32];
} break;
case GGML_TYPE_F16:
case GGML_TYPE_Q4_1:
{
GGML_ASSERT(ne02 == ne12);
GGML_ASSERT(ne02 == 1);
GGML_ASSERT(ne12 == 1);

nth0 = 32;
nth1 = 1;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
nth0 = 8;
nth1 = 8;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_1_f32];
} break;
default: GGML_ASSERT(false && "not implemented");
case GGML_TYPE_Q2_K:
{
GGML_ASSERT(ne02 == 1);
GGML_ASSERT(ne12 == 1);

nth0 = 4;
nth1 = 16;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_k_f32];
} break;
case GGML_TYPE_Q3_K:
{
GGML_ASSERT(ne02 == 1);
GGML_ASSERT(ne12 == 1);

nth0 = 4;
nth1 = 16;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_k_f32];
} break;
case GGML_TYPE_Q4_K:
{
GGML_ASSERT(ne02 == 1);
GGML_ASSERT(ne12 == 1);

nth0 = 4;
nth1 = 16;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_k_f32];
} break;
case GGML_TYPE_Q5_K:
{
GGML_ASSERT(ne02 == 1);
GGML_ASSERT(ne12 == 1);

nth0 = 4;
nth1 = 16;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_k_f32];
} break;
case GGML_TYPE_Q6_K:
{
GGML_ASSERT(ne02 == 1);
GGML_ASSERT(ne12 == 1);

nth0 = 4;
nth1 = 16;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_k_f32];
} break;
default:
{
fprintf(stderr, "Asserting on type %d\n",(int)src0t);
GGML_ASSERT(false && "not implemented");
}
};

@@ -530,9 +643,17 @@ void ggml_metal_graph_compute(
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];

if (src0t == GGML_TYPE_Q4_0) {
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) {
[encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
}
else if (src0t == GGML_TYPE_Q2_K ||
src0t == GGML_TYPE_Q3_K ||
src0t == GGML_TYPE_Q4_K ||
src0t == GGML_TYPE_Q5_K ||
src0t == GGML_TYPE_Q6_K) {
[encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake(ne01, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} else {
[encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];

@@ -546,7 +667,14 @@ void ggml_metal_graph_compute(
}

switch (src0->type) {
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break;
case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break;
case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_k]; break;
case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_k]; break;
case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_k]; break;
case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_k]; break;
case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_k]; break;
default: GGML_ASSERT(false && "not implemented");
}
1011
ggml-metal.metal
File diff suppressed because it is too large
129
ggml-opencl.cpp
@@ -4,6 +4,7 @@
#include <atomic>
#include <sstream>
#include <vector>
#include <limits>

#define CL_TARGET_OPENCL_VERSION 110
#include <clblast.h>

@@ -604,21 +605,44 @@ struct cl_buffer {
static cl_buffer g_cl_buffer_pool[MAX_CL_BUFFERS];
static std::atomic_flag g_cl_pool_lock = ATOMIC_FLAG_INIT;

static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size, cl_mem_flags flags) {
static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size) {
scoped_spin_lock lock(g_cl_pool_lock);
cl_int err;

int best_i = -1;
size_t best_size = std::numeric_limits<size_t>::max(); //smallest unused buffer that fits our needs
int worst_i = -1;
size_t worst_size = 0; //largest unused buffer seen so far
for (int i = 0; i < MAX_CL_BUFFERS; ++i) {
cl_buffer& b = g_cl_buffer_pool[i];
if (b.size > 0 && b.size >= size) {
cl_mem mem = b.mem;
*actual_size = b.size;
b.size = 0;
return mem;
cl_buffer &b = g_cl_buffer_pool[i];
if (b.size > 0 && b.size >= size && b.size < best_size)
{
best_i = i;
best_size = b.size;
}
if (b.size > 0 && b.size > worst_size)
{
worst_i = i;
worst_size = b.size;
}
}
if(best_i!=-1) //found the smallest buffer that fits our needs
{
cl_buffer& b = g_cl_buffer_pool[best_i];
cl_mem mem = b.mem;
*actual_size = b.size;
b.size = 0;
return mem;
}
if(worst_i!=-1) //no buffer that fits our needs, resize largest one to save memory
{
cl_buffer& b = g_cl_buffer_pool[worst_i];
cl_mem mem = b.mem;
b.size = 0;
clReleaseMemObject(mem);
}
cl_mem mem;
CL_CHECK((mem = clCreateBuffer(context, flags, size, NULL, &err), err));
CL_CHECK((mem = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err), err));
*actual_size = size;
return mem;
}

@@ -638,6 +662,15 @@ static void ggml_cl_pool_free(cl_mem mem, size_t size) {
clReleaseMemObject(mem);
}

void ggml_cl_free_data(const struct ggml_tensor* tensor) {
if (tensor->backend != GGML_BACKEND_GPU) {
return;
}

cl_mem mem = (cl_mem)tensor->data;
clReleaseMemObject(mem);
}

static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t offset, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cl_event* ev) {
cl_int err;
const uint64_t ne0 = src->ne[0];

@@ -676,7 +709,7 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
}

static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src1->backend == GGML_BACKEND_CL);
GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];

@@ -692,9 +725,10 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
size_t x_size;
size_t d_size;

cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size, CL_MEM_READ_ONLY); // src0
cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
cl_mem d_Y = (cl_mem) src1->data; // src1 is already on device, broadcasted.
cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size, CL_MEM_WRITE_ONLY); // dst
cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst

for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {

@@ -789,18 +823,18 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
size_t y_size;
size_t d_size;
cl_mem d_X;
if (src0->backend == GGML_BACKEND_CL) {
if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
d_X = (cl_mem) src0->data;
} else {
d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);
d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
}
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY);
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);

for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
// copy data to device
if (src0->backend != GGML_BACKEND_CL) {
if (src0->backend != GGML_BACKEND_GPU) {
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
}
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));

@@ -829,7 +863,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
}
}

if (src0->backend != GGML_BACKEND_CL) {
if (src0->backend != GGML_BACKEND_GPU) {
ggml_cl_pool_free(d_X, x_size);
}
ggml_cl_pool_free(d_Y, y_size);

@@ -865,13 +899,13 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
size_t y_size;
size_t d_size;
cl_mem d_X;
if (src0->backend == GGML_BACKEND_CL) {
if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
d_X = (cl_mem) src0->data;
} else {
d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);
d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
}
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size, CL_MEM_READ_ONLY);
cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size);
cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size);

bool src1_cont_rows = nb10 == sizeof(float);
bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);

@@ -879,7 +913,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
// copy src0 to device
if (src0->backend != GGML_BACKEND_CL) {
if (src0->backend != GGML_BACKEND_GPU) {
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
}

@@ -936,7 +970,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
}
}

if (src0->backend != GGML_BACKEND_CL) {
if (src0->backend != GGML_BACKEND_GPU) {
ggml_cl_pool_free(d_X, x_size);
}
ggml_cl_pool_free(d_Y, y_size);

@@ -970,13 +1004,13 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
size_t q_size;
cl_mem d_X;
if (!mul_mat_vec) {
d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size, CL_MEM_READ_WRITE);
d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
}
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY);
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
cl_mem d_Q;
if (src0->backend == GGML_BACKEND_CPU) {
d_Q = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
d_Q = ggml_cl_pool_malloc(q_sz, &q_size);
}

cl_kernel* to_fp32_cl = ggml_get_to_fp32_cl(type);

@@ -992,7 +1026,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
if (src0->backend == GGML_BACKEND_CPU) {
events.emplace_back();
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
} else if (src0->backend == GGML_BACKEND_CL) {
} else if (src0->backend == GGML_BACKEND_GPU) {
d_Q = (cl_mem) src0->data;
} else {
GGML_ASSERT(false);

@@ -1077,7 +1111,7 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
src1->type == GGML_TYPE_F32 &&
dst->type == GGML_TYPE_F32 &&
((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_CL)) {
((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)) {
return true;
}

@@ -1133,7 +1167,7 @@ size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct g
return 0;
}

void ggml_cl_transform_tensor(ggml_tensor * tensor) {
void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
const int64_t ne0 = tensor->ne[0];
const int64_t ne1 = tensor->ne[1];
const int64_t ne2 = tensor->ne[2];

@@ -1143,8 +1177,9 @@ void ggml_cl_transform_tensor(ggml_tensor * tensor) {
const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);

size_t q_size;
cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size);

tensor->data = data;
// copy tensor to device
for (int64_t i3 = 0; i3 < ne3; i3++) {
for (int64_t i2 = 0; i2 < ne2; i2++) {

@@ -1156,35 +1191,5 @@ void ggml_cl_transform_tensor(ggml_tensor * tensor) {
CL_CHECK(clFinish(queue));

tensor->data = dst;
tensor->backend = GGML_BACKEND_CL;
}

void ggml_cl_load_data(const char * fname, struct ggml_tensor * tensor, const size_t offset) {
cl_int err;
FILE * fp = fopen(fname, "rb");

const size_t size = ggml_nbytes(tensor);

cl_mem dst;
CL_CHECK((dst = clCreateBuffer(context, CL_MEM_READ_ONLY, size, nullptr, &err), err));
void * buf_host = malloc(size);

#ifdef _WIN32
int ret = _fseeki64(fp, (__int64) offset, SEEK_SET);
#else
int ret = fseek(fp, (long) offset, SEEK_SET);
#endif
GGML_ASSERT(ret == 0); // same

size_t ret2 = fread(buf_host, size, 1, fp);
if (ret2 != 1) {
fprintf(stderr, "unexpectedly reached end of file");
exit(1);
}

clEnqueueWriteBuffer(queue, dst, CL_TRUE, 0, size, buf_host, 0, nullptr, nullptr);

tensor->data = dst;
free(buf_host);
fclose(fp);
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
}

@@ -16,8 +16,9 @@ void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor
void * ggml_cl_host_malloc(size_t size);
void ggml_cl_host_free(void * ptr);

void ggml_cl_transform_tensor(struct ggml_tensor * tensor);
void ggml_cl_load_data(const char * fname, struct ggml_tensor * tensor, size_t offset);
void ggml_cl_free_data(const struct ggml_tensor* tensor);

void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);

#ifdef __cplusplus
}
161
ggml.h
@@ -256,8 +256,8 @@ extern "C" {

enum ggml_backend {
GGML_BACKEND_CPU = 0,
GGML_BACKEND_CUDA = 1,
GGML_BACKEND_CL = 2,
GGML_BACKEND_GPU = 10,
GGML_BACKEND_GPU_SPLIT = 20,
};

// model file types

@@ -296,6 +296,7 @@ extern "C" {
GGML_OP_SUM_ROWS,
GGML_OP_MEAN,
GGML_OP_REPEAT,
GGML_OP_REPEAT_BACK,
GGML_OP_ABS,
GGML_OP_SGN,
GGML_OP_NEG,

@@ -309,6 +310,7 @@ extern "C" {
GGML_OP_RMS_NORM_BACK,

GGML_OP_MUL_MAT,
GGML_OP_OUT_PROD,

GGML_OP_SCALE,
GGML_OP_SET,

@@ -324,6 +326,7 @@ extern "C" {
GGML_OP_DIAG_MASK_INF,
GGML_OP_DIAG_MASK_ZERO,
GGML_OP_SOFT_MAX,
GGML_OP_SOFT_MAX_BACK,
GGML_OP_ROPE,
GGML_OP_ROPE_BACK,
GGML_OP_ALIBI,

@@ -333,10 +336,14 @@ extern "C" {

GGML_OP_FLASH_ATTN,
GGML_OP_FLASH_FF,
GGML_OP_FLASH_ATTN_BACK,

GGML_OP_MAP_UNARY,
GGML_OP_MAP_BINARY,

GGML_OP_CROSS_ENTROPY_LOSS,
GGML_OP_CROSS_ENTROPY_LOSS_BACK,

GGML_OP_COUNT,
};

@@ -387,7 +394,9 @@ extern "C" {

char name[GGML_MAX_NAME];

char padding[16];
void * extra; // extra things e.g. for ggml-cuda.cu

char padding[4];
};

static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);

@@ -425,6 +434,25 @@ extern "C" {
bool no_alloc; // don't allocate memory for the tensor data
};

// compute types
enum ggml_task_type {
GGML_TASK_INIT = 0,
GGML_TASK_COMPUTE,
GGML_TASK_FINALIZE,
};

struct ggml_compute_params {
enum ggml_task_type type;

// ith = thread index, nth = number of threads
int ith, nth;

// work buffer for all threads
size_t wsize;
void * wdata;
};

// misc

GGML_API void ggml_time_init(void); // call this once at the beginning of the program

@@ -436,9 +464,10 @@ extern "C" {
GGML_API void ggml_print_object (const struct ggml_object * obj);
GGML_API void ggml_print_objects(const struct ggml_context * ctx);

GGML_API int64_t ggml_nelements(const struct ggml_tensor * tensor);
GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);
GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);

GGML_API int ggml_blck_size (enum ggml_type type);
GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block

@@ -552,6 +581,11 @@ extern "C" {
struct ggml_tensor * a,
struct ggml_tensor * b);

GGML_API struct ggml_tensor * ggml_add1_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);

GGML_API struct ggml_tensor * ggml_acc(
struct ggml_context * ctx,
struct ggml_tensor * a,

@@ -623,6 +657,11 @@ extern "C" {
struct ggml_tensor * a,
struct ggml_tensor * b);

GGML_API struct ggml_tensor * ggml_repeat_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);

GGML_API struct ggml_tensor * ggml_abs(
struct ggml_context * ctx,
struct ggml_tensor * a);

@@ -676,14 +715,22 @@ extern "C" {
struct ggml_tensor * a,
struct ggml_tensor * b);

// A: m rows, n columns
// B: p rows, n columns (i.e. we transpose it internally)
// A: n columns, m rows
// B: n columns, p rows (i.e. we transpose it internally)
// result is m columns, p rows
GGML_API struct ggml_tensor * ggml_mul_mat(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);

// A: m columns, n rows,
// B: p columns, n rows,
// result is m columns, p rows
GGML_API struct ggml_tensor * ggml_out_prod(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);

//
// operations on tensors without backpropagation
//

@@ -894,6 +941,17 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a);

GGML_API struct ggml_tensor * ggml_soft_max_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);

// in-place, returns view(a)
GGML_API struct ggml_tensor * ggml_soft_max_back_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);

// rotary position embedding
// if mode & 1 == 1, skip n_past elements
// if mode & 2 == 1, GPT-NeoX style

@@ -960,6 +1018,14 @@ extern "C" {
struct ggml_tensor * v,
bool masked);

GGML_API struct ggml_tensor * ggml_flash_attn_back(
struct ggml_context * ctx,
struct ggml_tensor * q,
struct ggml_tensor * k,
struct ggml_tensor * v,
struct ggml_tensor * d,
bool masked);

GGML_API struct ggml_tensor * ggml_flash_ff(
struct ggml_context * ctx,
struct ggml_tensor * a,

@@ -983,6 +1049,19 @@ extern "C" {
struct ggml_tensor * b,
ggml_binary_op_f32_t fun);

// loss function

GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);

GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c);

//
// automatic differentiation
//

@@ -1077,6 +1156,8 @@ extern "C" {
struct {
int n_iter;

float sched; // schedule multiplier (fixed, decay or warmup)
float decay; // weight decay for AdamW, use 0.0f to disable
float alpha; // learning rate
float beta1;
float beta2;

@@ -1101,6 +1182,49 @@ extern "C" {
} lbfgs;
};

struct ggml_opt_context {
struct ggml_context * ctx;
struct ggml_opt_params params;

int iter;
int64_t nx; // number of parameter elements

bool just_initialized;

struct {
struct ggml_tensor * x; // view of the parameters
struct ggml_tensor * g1; // gradient
struct ggml_tensor * g2; // gradient squared
struct ggml_tensor * m; // first moment
struct ggml_tensor * v; // second moment
struct ggml_tensor * mh; // first moment hat
struct ggml_tensor * vh; // second moment hat
struct ggml_tensor * pf; // past function values
float fx_best;
float fx_prev;
int n_no_improvement;
} adam;

struct {
struct ggml_tensor * x; // current parameters
struct ggml_tensor * xp; // previous parameters
struct ggml_tensor * g; // current gradient
struct ggml_tensor * gp; // previous gradient
struct ggml_tensor * d; // search direction
struct ggml_tensor * pf; // past function values
struct ggml_tensor * lmal; // the L-BFGS memory alpha
struct ggml_tensor * lmys; // the L-BFGS memory ys
struct ggml_tensor * lms; // the L-BFGS memory s
struct ggml_tensor * lmy; // the L-BFGS memory y
float fx_best;
float step;
int j;
int k;
int end;
int n_no_improvement;
} lbfgs;
};

GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);

// optimize the function defined by the tensor f

@@ -1109,6 +1233,27 @@ extern "C" {
struct ggml_opt_params params,
struct ggml_tensor * f);

// initialize optimizer context
GGML_API void ggml_opt_init(
struct ggml_context * ctx,
struct ggml_opt_context * opt,
struct ggml_opt_params params,
int64_t nx);

// continue optimizing the function defined by the tensor f
GGML_API enum ggml_opt_result ggml_opt_resume(
struct ggml_context * ctx,
struct ggml_opt_context * opt,
struct ggml_tensor * f);

// continue optimizing the function defined by the tensor f
GGML_API enum ggml_opt_result ggml_opt_resume_g(
struct ggml_context * ctx,
struct ggml_opt_context * opt,
struct ggml_tensor * f,
struct ggml_cgraph * gf,
struct ggml_cgraph * gb);

//
// quantization
//
@@ -1,4 +1,4 @@
#include "ggml-quants-k.h"
#include "k_quants.h"
#include "ggml.h"

#include <math.h>

@@ -272,7 +272,7 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t *

//========================- 2-bit (de)-quantization

void quantize_row_q2_k_reference(const float * restrict x, block_q2_k * restrict y, int k) {
void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k) {
assert(k % QK_K == 0);
const int nb = k / QK_K;

@@ -341,7 +341,7 @@ void quantize_row_q2_k_reference(const float * restrict x, block_q2_k * restrict
}
}

void dequantize_row_q2_k(const block_q2_k * restrict x, float * restrict y, int k) {
void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k) {
assert(k % QK_K == 0);
const int nb = k / QK_K;

@@ -374,26 +374,26 @@ void dequantize_row_q2_k(const block_q2_k * restrict x, float * restrict y, int
}
}

void quantize_row_q2_k(const float * restrict x, void * restrict vy, int k) {
quantize_row_q2_k_reference(x, vy, k);
void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) {
quantize_row_q2_K_reference(x, vy, k);
}

size_t ggml_quantize_q2_k(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
const int nb = k / QK_K;

// TODO - collect histograms - although, at a second thought, I don't really care about them
(void)hist;

for (int j = 0; j < nb; j += k) {
block_q2_k * restrict y = (block_q2_k *)dst + j/QK_K;
quantize_row_q2_k_reference(src + j, y, k);
block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K;
quantize_row_q2_K_reference(src + j, y, k);
}
return (n/QK_K*sizeof(block_q2_k));
return (n/QK_K*sizeof(block_q2_K));
}

//========================= 3-bit (de)-quantization

void quantize_row_q3_k_reference(const float * restrict x, block_q3_k * restrict y, int k) {
void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k) {
assert(k % QK_K == 0);
const int nb = k / QK_K;

@@ -469,7 +469,7 @@ void quantize_row_q3_k_reference(const float * restrict x, block_q3_k * restrict
}
}

void dequantize_row_q3_k(const block_q3_k * restrict x, float * restrict y, int k) {
void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) {
assert(k % QK_K == 0);
assert(QK_K == 256);
const int nb = k / QK_K;

@@ -520,26 +520,26 @@ void dequantize_row_q3_k(const block_q3_k * restrict x, float * restrict y, int
}
}

void quantize_row_q3_k(const float * restrict x, void * restrict vy, int k) {
quantize_row_q3_k_reference(x, vy, k);
void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) {
quantize_row_q3_K_reference(x, vy, k);
}

size_t ggml_quantize_q3_k(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
const int nb = k / QK_K;

// TODO - collect histograms - although, at a second thought, I don't really care about them
(void)hist;

for (int j = 0; j < nb; j += k) {
block_q3_k * restrict y = (block_q3_k *)dst + j/QK_K;
quantize_row_q3_k_reference(src + j, y, k);
block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K;
quantize_row_q3_K_reference(src + j, y, k);
}
return (n/QK_K*sizeof(block_q3_k));
return (n/QK_K*sizeof(block_q3_K));
}

// ====================== 4-bit (de)-quantization

void quantize_row_q4_k_reference(const float * restrict x, block_q4_k * restrict y, int k) {
void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k) {
assert(k % QK_K == 0);
const int nb = k / QK_K;

@@ -604,7 +604,7 @@ void quantize_row_q4_k_reference(const float * restrict x, block_q4_k * restrict
}
}

void dequantize_row_q4_k(const block_q4_k * restrict x, float * restrict y, int k) {
void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k) {
assert(k % QK_K == 0);
const int nb = k / QK_K;

@@ -630,26 +630,26 @@ void dequantize_row_q4_k(const block_q4_k * restrict x, float * restrict y, int
}
}

void quantize_row_q4_k(const float * restrict x, void * restrict vy, int k) {
void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) {
assert(k % QK_K == 0);
block_q4_k * restrict y = vy;
quantize_row_q4_k_reference(x, y, k);
block_q4_K * restrict y = vy;
quantize_row_q4_K_reference(x, y, k);
}

size_t ggml_quantize_q4_k(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
assert(k % QK_K == 0);
const int nb = k / QK_K;
(void)hist; // TODO: collect histograms
for (int j = 0; j < nb; j += k) {
block_q4_k * restrict y = (block_q4_k *)dst + j/QK_K;
quantize_row_q4_k_reference(src + j, y, k);
block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K;
quantize_row_q4_K_reference(src + j, y, k);
}
return (n/QK_K*sizeof(block_q4_k));
return (n/QK_K*sizeof(block_q4_K));
}

// ====================== 5-bit (de)-quantization

void quantize_row_q5_k_reference(const float * restrict x, block_q5_k * restrict y, int k) {
void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k) {
assert(k % QK_K == 0);
const int nb = k / QK_K;

@@ -731,7 +731,7 @@ void quantize_row_q5_k_reference(const float * restrict x, block_q5_k * restrict
}
}

void dequantize_row_q5_k(const block_q5_k * restrict x, float * restrict y, int k) {
void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k) {
assert(k % QK_K == 0);
const int nb = k / QK_K;

@@ -759,26 +759,26 @@ void dequantize_row_q5_k(const block_q5_k * restrict x, float * restrict y, int
}
}

void quantize_row_q5_k(const float * restrict x, void * restrict vy, int k) {
void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) {
assert(k % QK_K == 0);
block_q5_k * restrict y = vy;
quantize_row_q5_k_reference(x, y, k);
block_q5_K * restrict y = vy;
quantize_row_q5_K_reference(x, y, k);
}

size_t ggml_quantize_q5_k(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
assert(k % QK_K == 0);
const int nb = k / QK_K;
(void)hist;
for (int j = 0; j < nb; j += k) {
block_q5_k * restrict y = (block_q5_k *)dst + j/QK_K;
quantize_row_q5_k_reference(src + j, y, k);
block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K;
quantize_row_q5_K_reference(src + j, y, k);
}
return (n/QK_K*sizeof(block_q5_k));
return (n/QK_K*sizeof(block_q5_K));
}

// ====================== 6-bit (de)-quantization

void quantize_row_q6_k_reference(const float * restrict x, block_q6_k * restrict y, int k) {
void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k) {
assert(k % QK_K == 0);
const int nb = k / QK_K;

@@ -842,7 +842,7 @@ void quantize_row_q6_k_reference(const float * restrict x, block_q6_k * restrict
}
}

void dequantize_row_q6_k(const block_q6_k * restrict x, float * restrict y, int k) {
void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k) {
assert(k % QK_K == 0);
const int nb = k / QK_K;

@@ -875,28 +875,28 @@ void dequantize_row_q6_k(const block_q6_k * restrict x, float * restrict y, int
}
}

void quantize_row_q6_k(const float * restrict x, void * restrict vy, int k) {
void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) {
assert(k % QK_K == 0);
block_q6_k * restrict y = vy;
quantize_row_q6_k_reference(x, y, k);
block_q6_K * restrict y = vy;
quantize_row_q6_K_reference(x, y, k);
}

size_t ggml_quantize_q6_k(const float * src, void * dst, int n, int k, int64_t * hist) {
size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist) {
assert(k % QK_K == 0);
const int nb = k / QK_K;

(void)hist; // TODO

for (int j = 0; j < nb; j += k) {
block_q6_k * restrict y = (block_q6_k *)dst + j/QK_K;
quantize_row_q6_k_reference(src + j, y, k);
block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K;
quantize_row_q6_K_reference(src + j, y, k);
}
return (n/QK_K*sizeof(block_q6_k));
return (n/QK_K*sizeof(block_q6_K));
}

//===================================== Q8_K ==============================================

void quantize_row_q8_k_reference(const float * restrict x, block_q8_k * restrict y, int k) {
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) {
assert(k % QK_K == 0);
const int nb = k / QK_K;

@@ -933,7 +933,7 @@ void quantize_row_q8_k_reference(const float * restrict x, block_q8_k * restrict
}
}

void dequantize_row_q8_k(const block_q8_k * restrict x, float * restrict y, int k) {
void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k) {
assert(k % QK_K == 0);
const int nb = k / QK_K;

@@ -944,8 +944,8 @@ void dequantize_row_q8_k(const block_q8_k * restrict x, float * restrict y, int
}
}

void quantize_row_q8_k(const float * restrict x, void * restrict y, int k) {
quantize_row_q8_k_reference(x, y, k);
void quantize_row_q8_K(const float * restrict x, void * restrict y, int k) {
quantize_row_q8_K_reference(x, y, k);
}

//===================================== Dot ptoducts =================================

@@ -1002,10 +1002,10 @@ static inline __m128i get_scale_shuffle(int i) {
}
#endif

void ggml_vec_dot_q2_k_q8_k(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {

const block_q2_k * restrict x = vx;
const block_q8_k * restrict y = vy;
const block_q2_K * restrict x = vx;
const block_q8_K * restrict y = vy;

const int nb = n / QK_K;

@@ -1201,14 +1201,14 @@ void ggml_vec_dot_q2_k_q8_k(const int n, float * restrict s, const void * restri
#endif
}

void ggml_vec_dot_q3_k_q8_k(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
assert(n % QK_K == 0);

const uint32_t kmask1 = 0x03030303;
const uint32_t kmask2 = 0x0f0f0f0f;

const block_q3_k * restrict x = vx;
const block_q8_k * restrict y = vy;
const block_q3_K * restrict x = vx;
const block_q8_K * restrict y = vy;

const int nb = n / QK_K;

@@ -1501,11 +1501,11 @@ void ggml_vec_dot_q3_k_q8_k(const int n, float * restrict s, const void * restri

}

void ggml_vec_dot_q4_k_q8_k(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
assert(n % QK_K == 0);

const block_q4_k * restrict x = vx;
const block_q8_k * restrict y = vy;
const block_q4_K * restrict x = vx;
const block_q8_K * restrict y = vy;

const int nb = n / QK_K;

@@ -1519,7 +1519,7 @@ void ggml_vec_dot_q4_k_q8_k(const int n, float * restrict s, const void * restri

const uint8x16_t m4b = vdupq_n_u8(0xf);
#ifdef __ARM_FEATURE_DOTPROD
const uint32x4_t mzero = vdupq_n_s32(0);
const int32x4_t mzero = vdupq_n_s32(0);
#endif

int8x16x2_t q4bytes;

@@ -1727,11 +1727,11 @@ void ggml_vec_dot_q4_k_q8_k(const int n, float * restrict s, const void * restri
#endif
}

void ggml_vec_dot_q5_k_q8_k(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
assert(n % QK_K == 0);

const block_q5_k * restrict x = vx;
const block_q8_k * restrict y = vy;
const block_q5_K * restrict x = vx;
const block_q8_K * restrict y = vy;

const int nb = n / QK_K;

@@ -1745,7 +1745,7 @@ void ggml_vec_dot_q5_k_q8_k(const int n, float * restrict s, const void * restri
#ifdef __ARM_NEON

const uint8x16_t m4b = vdupq_n_u8(0xf);
const uint32x4_t mzero = vdupq_n_u32(0);
const int32x4_t mzero = vdupq_n_s32(0);
const uint8x16_t mone = vdupq_n_u8(1);
const uint8x16_t mtwo = vdupq_n_u8(2);

@@ -1974,11 +1974,11 @@ void ggml_vec_dot_q5_k_q8_k(const int n, float * restrict s, const void * restri

void ggml_vec_dot_q6_k_q8_k(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
assert(n % QK_K == 0);

const block_q6_k * restrict x = vx;
const block_q8_k * restrict y = vy;
const block_q6_K * restrict x = vx;
const block_q8_K * restrict y = vy;

const int nb = n / QK_K;

@@ -2242,5 +2242,3 @@ void ggml_vec_dot_q6_k_q8_k(const int n, float * restrict s, const void * restri
*s = sumf;
#endif
}

@@ -22,8 +22,8 @@ typedef struct {
uint8_t qs[QK_K/4]; // quants
ggml_fp16_t d; // super-block scale for quantized scales
ggml_fp16_t dmin; // super-block scale for quantized mins
} block_q2_k;
static_assert(sizeof(block_q2_k) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_k block size/padding");
} block_q2_K;
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");

// 3-bit quantization
// weight is represented as x = a * q

@@ -34,8 +34,8 @@ typedef struct {
uint8_t qs[QK_K/4]; // quants - low 2 bits
uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
ggml_fp16_t d; // super-block scale
} block_q3_k;
static_assert(sizeof(block_q3_k) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_k block size/padding");
} block_q3_K;
static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_K block size/padding");

// 4-bit quantization
// 16 blocks of 32 elements each

@@ -46,8 +46,8 @@ typedef struct {
ggml_fp16_t dmin; // super-block scale for quantized mins
uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
uint8_t qs[QK_K/2]; // 4--bit quants
} block_q4_k;
static_assert(sizeof(block_q4_k) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_k block size/padding");
} block_q4_K;
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");

// 5-bit quantization
// 16 blocks of 32 elements each

@@ -59,8 +59,8 @@ typedef struct {
uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
} block_q5_k;
static_assert(sizeof(block_q5_k) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_k block size/padding");
} block_q5_K;
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");

// 6-bit quantization
// weight is represented as x = a * q

@@ -71,52 +71,52 @@ typedef struct {
uint8_t qh[QK_K/4]; // quants, upper 2 bits
int8_t scales[QK_K/16]; // scales, quantized with 8 bits
ggml_fp16_t d; // super-block scale
} block_q6_k;
static_assert(sizeof(block_q6_k) == sizeof(ggml_fp16_t) + QK_K / 16 + 3*QK_K/4, "wrong q6_k block size/padding");
} block_q6_K;
static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + QK_K / 16 + 3*QK_K/4, "wrong q6_K block size/padding");

// This is only used for intermediate quantization and dot products
typedef struct {
float d; // delta
int8_t qs[QK_K]; // quants
int16_t bsums[QK_K/16]; // sum of quants in groups of 16
} block_q8_k;
static_assert(sizeof(block_q8_k) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_k block size/padding");
} block_q8_K;
static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");

// Quantization
void quantize_row_q2_k_reference(const float * restrict x, block_q2_k * restrict y, int k);
void quantize_row_q3_k_reference(const float * restrict x, block_q3_k * restrict y, int k);
void quantize_row_q4_k_reference(const float * restrict x, block_q4_k * restrict y, int k);
void quantize_row_q5_k_reference(const float * restrict x, block_q5_k * restrict y, int k);
void quantize_row_q6_k_reference(const float * restrict x, block_q6_k * restrict y, int k);
void quantize_row_q8_k_reference(const float * restrict x, block_q8_k * restrict y, int k);
void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k);
void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k);
void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);
void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);

void quantize_row_q2_k(const float * restrict x, void * restrict y, int k);
void quantize_row_q3_k(const float * restrict x, void * restrict y, int k);
void quantize_row_q4_k(const float * restrict x, void * restrict y, int k);
void quantize_row_q5_k(const float * restrict x, void * restrict y, int k);
void quantize_row_q6_k(const float * restrict x, void * restrict y, int k);
void quantize_row_q8_k(const float * restrict x, void * restrict y, int k);
void quantize_row_q2_K(const float * restrict x, void * restrict y, int k);
void quantize_row_q3_K(const float * restrict x, void * restrict y, int k);
void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);

// Dequantization
void dequantize_row_q2_k(const block_q2_k * restrict x, float * restrict y, int k);
void dequantize_row_q3_k(const block_q3_k * restrict x, float * restrict y, int k);
void dequantize_row_q4_k(const block_q4_k * restrict x, float * restrict y, int k);
void dequantize_row_q5_k(const block_q5_k * restrict x, float * restrict y, int k);
void dequantize_row_q6_k(const block_q6_k * restrict x, float * restrict y, int k);
void dequantize_row_q8_k(const block_q8_k * restrict x, float * restrict y, int k);
void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k);
void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k);
void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);
void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k);
void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k);
void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);

// Dot product
void ggml_vec_dot_q2_k_q8_k(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q3_k_q8_k(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q4_k_q8_k(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q5_k_q8_k(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q6_k_q8_k(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);

// Quantization with histogram collection
size_t ggml_quantize_q2_k(const float * src, void * dst, int n, int k, int64_t * hist);
size_t ggml_quantize_q3_k(const float * src, void * dst, int n, int k, int64_t * hist);
size_t ggml_quantize_q4_k(const float * src, void * dst, int n, int k, int64_t * hist);
size_t ggml_quantize_q5_k(const float * src, void * dst, int n, int k, int64_t * hist);
size_t ggml_quantize_q6_k(const float * src, void * dst, int n, int k, int64_t * hist);
size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
457
llama.cpp
@@ -59,6 +59,12 @@ static const size_t MB = 1024*1024;
// TODO: dynamically determine these sizes
// needs modifications in ggml

typedef void (*offload_func_t)(struct ggml_tensor * tensor);

void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
(void) tensor;
}

static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
{
static std::map<e_model, size_t> k_sizes = {

@@ -173,6 +179,7 @@ struct llama_model {
struct ggml_tensor * output;

std::vector<llama_layer> layers;
int n_gpu_layers;

// context
struct ggml_context * ctx = NULL;

@@ -198,6 +205,16 @@ struct llama_model {
if (ctx) {
ggml_free(ctx);
}

#ifdef GGML_USE_CUBLAS
for (size_t i = 0; i < tensors_by_name.size(); ++i) {
ggml_cuda_free_data(tensors_by_name[i].second);
}
#elif defined(GGML_USE_CLBLAST)
for (size_t i = 0; i < tensors_by_name.size(); ++i) {
ggml_cl_free_data(tensors_by_name[i].second);
}
#endif
}
};

@@ -690,6 +707,9 @@ struct llama_model_loader {

struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
struct ggml_tensor * tensor;
if (backend != GGML_BACKEND_CPU) {
ggml_set_no_alloc(ggml_ctx, true);
}
if (lt.ne.size() == 2) {
tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
} else {

@@ -698,6 +718,10 @@ struct llama_model_loader {
}
ggml_set_name(tensor, lt.name.c_str());
LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor

if (backend != GGML_BACKEND_CPU) {
ggml_set_no_alloc(ggml_ctx, use_mmap);
}
tensor->backend = backend;
lt.ggml_tensor = tensor;
num_ggml_tensors_created++;

@@ -713,6 +737,7 @@ struct llama_model_loader {
void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
size_t data_size = 0;
size_t prefetch_size = 0;
size_t lock_size = 0;
for (const llama_load_tensor & lt : tensors_map.tensors) {
data_size += lt.size;
if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {

@@ -722,11 +747,6 @@ struct llama_model_loader {

if (use_mmap) {
mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
if (!lmlock) {
// Don't call the callback since the actual loading will be lazy
// and we can't measure it.
progress_callback = NULL;
}
if (lmlock) {
lmlock->init(mapping->addr);
}

@@ -734,20 +754,49 @@ struct llama_model_loader {

size_t done_size = 0;
for (llama_load_tensor & lt : tensors_map.tensors) {
if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
continue;
}
if (progress_callback) {
progress_callback((float) done_size / data_size, progress_callback_user_data);
}
LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
lt.data = (uint8_t *) lt.ggml_tensor->data;
load_data_for(lt);
lt.ggml_tensor->data = lt.data;
done_size += lt.size;
if (use_mmap && lmlock) {
lmlock->grow_to(done_size);

// allocate temp buffer if not using mmap
if (!use_mmap && lt.data == NULL) {
GGML_ASSERT(lt.ggml_tensor->backend != GGML_BACKEND_CPU);
lt.data = (uint8_t*)malloc(ggml_nbytes(lt.ggml_tensor));
}

load_data_for(lt);

switch(lt.ggml_tensor->backend) {
case GGML_BACKEND_CPU:
lt.ggml_tensor->data = lt.data;
if (use_mmap && lmlock) {
lock_size += lt.size;
lmlock->grow_to(lock_size);
}
break;
#if defined(GGML_USE_CUBLAS)
case GGML_BACKEND_GPU:
case GGML_BACKEND_GPU_SPLIT:
ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor);
if (!use_mmap) {
free(lt.data);
}
break;
#elif defined(GGML_USE_CLBLAST)
case GGML_BACKEND_GPU:
ggml_cl_transform_tensor(lt.data, lt.ggml_tensor);
if (!use_mmap) {
free(lt.data);
}
break;
#endif
default:
continue;
}

done_size += lt.size;
}
}

@@ -850,7 +899,10 @@ static bool kv_cache_init(
struct llama_context_params llama_context_default_params() {
struct llama_context_params result = {
/*.n_ctx =*/ 512,
/*.n_batch =*/ 512,
/*.gpu_layers =*/ 0,
/*.main_gpu =*/ 0,
/*.tensor_split =*/ {0},
/*.seed =*/ -1,
/*.f16_kv =*/ true,
/*.logits_all =*/ false,

@@ -865,6 +917,17 @@ struct llama_context_params llama_context_default_params() {
return result;
}

struct llama_model_quantize_params llama_model_quantize_default_params() {
struct llama_model_quantize_params result = {
/*.nthread =*/ 0,
/*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
/*.allow_requantize =*/ false,
/*.quantize_output_tensor =*/ true,
};

return result;
}

bool llama_mmap_supported() {
return llama_mmap::SUPPORTED;
}

@@ -944,7 +1007,10 @@ static void llama_model_load_internal(
const std::string & fname,
llama_context & lctx,
int n_ctx,
int n_batch,
int n_gpu_layers,
int main_gpu,
const float * tensor_split,
ggml_type memory_type,
bool use_mmap,
bool use_mlock,

@@ -959,9 +1025,9 @@ static void llama_model_load_internal(
lctx.vocab = std::move(ml->file_loaders.at(0)->vocab);
auto & model = lctx.model;
model.hparams = ml->file_loaders.at(0)->hparams;
model.n_gpu_layers = n_gpu_layers;
llama_file_version file_version = ml->file_loaders.at(0)->file_version;
auto & hparams = model.hparams;
uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;

{
switch (hparams.n_layer) {

@@ -970,11 +1036,19 @@ static void llama_model_load_internal(
case 40: model.type = e_model::MODEL_13B; break;
case 60: model.type = e_model::MODEL_30B; break;
case 80: model.type = e_model::MODEL_65B; break;
default:
{
if (hparams.n_layer < 32) {
model.type = e_model::MODEL_7B;
}
} break;
}

hparams.n_ctx = n_ctx;
}

const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;

{
fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);

@@ -1037,18 +1111,24 @@ static void llama_model_load_internal(
}
}

(void) main_gpu;
#if defined(GGML_USE_CUBLAS)
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CUDA
fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
ggml_cuda_set_main_device(main_gpu);
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
#elif defined(GGML_USE_CLBLAST)
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CL
fprintf(stderr, "%s: using OpenCL for GPU acceleration\n", __func__);
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU
#else
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU
#endif

// prepare memory for the weights
size_t vram_total = 0;
size_t vram_weights = 0;
size_t vram_scratch = 0;
{
const uint32_t n_embd = hparams.n_embd;
const uint32_t n_layer = hparams.n_layer;

@@ -1063,7 +1143,7 @@ static void llama_model_load_internal(
{
ggml_backend backend_output;
if (n_gpu_layers > int(n_layer)) { // NOLINT
backend_output = LLAMA_BACKEND_OFFLOAD;
backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
} else {
backend_output = GGML_BACKEND_CPU;
}

@@ -1075,7 +1155,8 @@ static void llama_model_load_internal(

model.layers.resize(n_layer);
for (uint32_t i = 0; i < n_layer; ++i) {
const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT

auto & layer = model.layers[i];

@@ -1083,21 +1164,21 @@ static void llama_model_load_internal(

layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);

layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend);
layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend);
layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend);
layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend);
|
||||
layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend_split);
|
||||
layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend_split);
|
||||
layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend_split);
|
||||
layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend_split);
|
||||
|
||||
layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
|
||||
|
||||
layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend);
|
||||
layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend);
|
||||
layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend);
|
||||
layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend_split);
|
||||
layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend_split);
|
||||
layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend_split);
|
||||
|
||||
if (backend == LLAMA_BACKEND_OFFLOAD) {
|
||||
vram_total +=
|
||||
if (backend == GGML_BACKEND_GPU) {
|
||||
vram_weights +=
|
||||
ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
|
||||
ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
|
||||
ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
|
||||
ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
|
||||
}
|
||||
}
|
||||
@@ -1112,7 +1193,7 @@ static void llama_model_load_internal(
|
||||
// this is the total memory required to run the inference
|
||||
const size_t mem_required =
|
||||
ctx_size +
|
||||
mmapped_size - vram_total + // weights in VRAM not in memory
|
||||
mmapped_size - vram_weights + // weights in VRAM not in memory
|
||||
MEM_REQ_SCRATCH0().at(model.type) +
|
||||
MEM_REQ_SCRATCH1().at(model.type) +
|
||||
MEM_REQ_EVAL().at (model.type);
|
||||
@@ -1124,14 +1205,25 @@ static void llama_model_load_internal(
|
||||
fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
|
||||
mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
|
||||
|
||||
(void) vram_scratch;
|
||||
(void) n_batch;
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
vram_scratch = n_batch * MB;
|
||||
ggml_cuda_set_scratch_size(vram_scratch);
|
||||
if (n_gpu_layers > 0) {
|
||||
fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n",
|
||||
__func__, vram_scratch / MB);
|
||||
}
|
||||
#endif // GGML_USE_CUBLAS
|
||||
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
||||
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
|
||||
|
||||
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
||||
fprintf(stderr, "%s: offloading %d layers to GPU\n", __func__, n_gpu);
|
||||
if (n_gpu_layers > (int) hparams.n_layer) {
|
||||
fprintf(stderr, "%s: offloading output layer to GPU\n", __func__);
|
||||
}
|
||||
fprintf(stderr, "%s: total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
|
||||
fprintf(stderr, "%s: total VRAM used: %zu MB\n",
|
||||
__func__, (vram_weights + vram_scratch + MB - 1) / MB); // round up
|
||||
#else
|
||||
(void) n_gpu_layers;
|
||||
#endif
|
||||
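A note on the scratch-buffer sizing in the hunk above: under GGML_USE_CUBLAS the scratch size is set to n_batch * MB, so with the default n_batch of 512 (shown earlier in this diff) the CUDA build reserves 512 MB of VRAM for the scratch buffer; lowering n_batch shrinks that reservation proportionally.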
@@ -1142,52 +1234,15 @@ static void llama_model_load_internal(
model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
}

ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);

(void) tensor_split;
#if defined(GGML_USE_CUBLAS)
{
size_t done_size = 0;
size_t data_size = 0;
for (llama_load_tensor & lt : ml->tensors_map.tensors) {
data_size += lt.size;
if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
done_size += lt.size;
}
}
for (llama_load_tensor & lt : ml->tensors_map.tensors) {
if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
continue;
}
if (progress_callback) {
progress_callback((float) done_size / data_size, progress_callback_user_data);
}
ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
done_size += lt.size;
}
}
#elif defined(GGML_USE_CLBLAST)
{
size_t done_size = 0;
size_t data_size = 0;
for (llama_load_tensor & lt : ml->tensors_map.tensors) {
data_size += lt.size;
if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
done_size += lt.size;
}
}
for (llama_load_tensor & lt : ml->tensors_map.tensors) {
if (lt.ggml_tensor->backend != GGML_BACKEND_CL) {
continue;
}
if (progress_callback) {
progress_callback((float) done_size / data_size, progress_callback_user_data);
}
ggml_cl_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
done_size += lt.size;
}
ggml_cuda_set_tensor_split(tensor_split);
}
#endif

ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);

if (progress_callback) {
progress_callback(1.0f, progress_callback_user_data);
}
@@ -1203,7 +1258,10 @@ static bool llama_model_load(
const std::string & fname,
llama_context & lctx,
int n_ctx,
int n_batch,
int n_gpu_layers,
int main_gpu,
float * tensor_split,
ggml_type memory_type,
bool use_mmap,
bool use_mlock,
@@ -1211,8 +1269,8 @@ static bool llama_model_load(
llama_progress_callback progress_callback,
void *progress_callback_user_data) {
try {
llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers, memory_type, use_mmap, use_mlock,
vocab_only, progress_callback, progress_callback_user_data);
llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, memory_type,
use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
return true;
} catch (const std::exception & err) {
fprintf(stderr, "error loading model: %s\n", err.what());
@@ -1253,12 +1311,13 @@ static bool llama_eval_internal(

LLAMA_ASSERT(!!kv_self.ctx);

const int n_embd = hparams.n_embd;
const int n_layer = hparams.n_layer;
const int n_ctx = hparams.n_ctx;
const int n_head = hparams.n_head;
const int n_vocab = hparams.n_vocab;
const int n_rot = hparams.n_embd/hparams.n_head;
const int n_embd = hparams.n_embd;
const int n_layer = hparams.n_layer;
const int n_ctx = hparams.n_ctx;
const int n_head = hparams.n_head;
const int n_vocab = hparams.n_vocab;
const int n_rot = hparams.n_embd/hparams.n_head;
const int n_gpu_layers = model.n_gpu_layers;

auto & mem_per_token = lctx.mem_per_token;
auto & buf_compute = lctx.buf_compute;
@@ -1283,7 +1342,18 @@ static bool llama_eval_internal(
struct ggml_tensor * cur;
struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);

const int i_gpu_start = n_layer - n_gpu_layers;
(void) i_gpu_start;

for (int il = 0; il < n_layer; ++il) {
offload_func_t offload_func = llama_nop;

#ifdef GGML_USE_CUBLAS
if (il >= i_gpu_start) {
offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
}
#endif // GGML_USE_CUBLAS

struct ggml_tensor * inpSA = inpL;

lctx.use_buf(ctx0, 0);
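To make the per-layer offload threshold above concrete (the numbers are illustrative, not from the diff): with n_layer = 32 and n_gpu_layers = 20, i_gpu_start = 32 - 20 = 12, so layers 0-11 keep the no-op llama_nop offload function while layers 12-31 get ggml_cuda_assign_buffers applied to their intermediate tensors.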
@@ -1291,20 +1361,32 @@ static bool llama_eval_internal(
// norm
{
cur = ggml_rms_norm(ctx0, inpL);
offload_func(cur);
ggml_set_name(cur, "rms_norm_0");

// cur = cur*attention_norm(broadcasted)
cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
offload_func(cur);
ggml_set_name(cur, "attention_norm_0");
}

// self-attention
{
// compute Q and K and RoPE them
struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
// offload_func(tmpq);
ggml_set_name(tmpq, "tmpq");

struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
ggml_set_name(Qcur, "Qcur");
struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
// offload_func(tmpk);
ggml_set_name(tmpk, "tmpk");

struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
ggml_set_name(Kcur, "Kcur");

struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
ggml_set_name(Qcur, "Qcur");

// store key and value to memory
{
// compute the transposed [N, n_embd] V matrix
@@ -1312,9 +1394,11 @@ static bool llama_eval_internal(
ggml_set_name(Vcur, "Vcur");

struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
ggml_set_name(k, "k");
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
( n_ctx)*ggml_element_size(kv_self.v),
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
ggml_set_name(v, "v");

// important: storing RoPE-ed version of K in the KV cache!
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
@@ -1389,63 +1473,104 @@ static bool llama_eval_internal(
cur = ggml_mul_mat(ctx0,
model.layers[il].wo,
cur);
offload_func(cur);
ggml_set_name(cur, "result_wo");
}

lctx.use_buf(ctx0, 1);
//ggml_cuda_set_scratch(1);

struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
offload_func(inpFF);
ggml_set_name(inpFF, "inpFF");

// feed-forward network
{
// norm
{
cur = ggml_rms_norm(ctx0, inpFF);
offload_func(cur);
ggml_set_name(cur, "rms_norm_1");

// cur = cur*ffn_norm(broadcasted)
cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
offload_func(cur);
ggml_set_name(cur, "ffn_norm");
}

struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
model.layers[il].w3,
cur);
offload_func(tmp);
ggml_set_name(tmp, "result_w3");

cur = ggml_mul_mat(ctx0,
model.layers[il].w1,
cur);
offload_func(cur);
ggml_set_name(cur, "result_w2");

// SILU activation
cur = ggml_silu(ctx0, cur);
offload_func(cur);
ggml_set_name(cur, "silu");

cur = ggml_mul(ctx0, cur, tmp);
offload_func(cur);
ggml_set_name(cur, "silu_x_result_w3");

cur = ggml_mul_mat(ctx0,
model.layers[il].w2,
cur);
offload_func(cur);
ggml_set_name(cur, "result_w2");
}

cur = ggml_add(ctx0, cur, inpFF);
offload_func(cur);
ggml_set_name(cur, "inpFF_+_result_w2");

// input for next layer
inpL = cur;

}

lctx.use_buf(ctx0, 0);
//ggml_cuda_set_scratch(0);

// used at the end to optionally extract the embeddings
struct ggml_tensor * embeddings = NULL;

offload_func_t offload_func = llama_nop;

#ifdef GGML_USE_CUBLAS
if (n_gpu_layers > n_layer) {
offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
}
#endif // GGML_USE_CUBLAS

// norm
{
cur = ggml_rms_norm(ctx0, inpL);
offload_func(cur);
ggml_set_name(cur, "rms_norm_inpL");

cur = ggml_rms_norm(ctx0, cur);
offload_func(cur);
ggml_set_name(cur, "rms_norm_after");

// cur = cur*norm(broadcasted)
cur = ggml_mul(ctx0, cur, model.norm);
offload_func(cur);
ggml_set_name(cur, "result_norm");

embeddings = cur;
}


// lm_head
cur = ggml_mul_mat(ctx0, model.output, cur);
ggml_set_name(cur, "result_output");

lctx.use_buf(ctx0, -1);

@@ -2044,6 +2169,10 @@ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_tok
return -log2f(candidate.p) > *mu;
}));

if (candidates->size == 0) {
candidates->size = 1;
}

// Normalize the probabilities of the remaining words
llama_sample_softmax(ctx, candidates);

@@ -2112,15 +2241,79 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
// quantization
//

static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype, int nthread) {
static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llama_buffer & output, const int nelements, const int nthread) {
if (output.size < nelements * sizeof(float)) {
output.resize(nelements * sizeof(float));
}
float * f32_output = (float *) output.addr;

quantize_fns_t qtype;
if (ggml_is_quantized(tensor.type)) {
qtype = ggml_internal_get_quantize_fn(tensor.type);
if (qtype.dequantize_row_q == NULL) {
throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
}
} else if (tensor.type != GGML_TYPE_F16) {
throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor.type)));
}

if (nthread < 2) {
if (tensor.type == GGML_TYPE_F16) {
ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements);
} else if (ggml_is_quantized(tensor.type)) {
qtype.dequantize_row_q(tensor.data, f32_output, nelements);
} else {
LLAMA_ASSERT(false); // unreachable
}
return;
}

auto block_size = tensor.type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor.type);
auto block_size_bytes = ggml_type_size(tensor.type);

LLAMA_ASSERT(nelements % block_size == 0);
auto nblocks = nelements / block_size;
auto blocks_per_thread = nblocks / nthread;
auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count

std::vector<std::thread> workers;
for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
auto thr_elems = thr_blocks * block_size; // number of elements for this thread
auto thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread

auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
if (typ == GGML_TYPE_F16) {
ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
} else {
qtype.dequantize_row_q(inbuf, outbuf, nels);
}
};
workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
in_buff_offs += thr_block_bytes;
out_buff_offs += thr_elems;
}
for (auto & worker : workers) {
worker.join();
}

}

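To illustrate the thread partitioning in llama_convert_tensor_internal with made-up numbers: a tensor of 8192 elements of a 32-element block type split across nthread = 3 gives nblocks = 256, blocks_per_thread = 85 and spare_blocks = 1, so the first two workers dequantize 85 blocks (2720 elements) each and the last worker picks up the spare, 86 blocks (2752 elements).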
static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
ggml_type quantized_type;
switch (ftype) {
llama_ftype ftype = params->ftype;
int nthread = params->nthread;

switch (params->ftype) {
case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;

#ifdef GGML_USE_K_QUANTS
// K-quants
case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
case LLAMA_FTYPE_MOSTLY_Q3_K_S:
@@ -2131,6 +2324,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
case LLAMA_FTYPE_MOSTLY_Q5_K_S:
case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
#endif
default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
}

@@ -2140,8 +2334,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
/*vocab_only*/ false));
llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);

#ifdef GGML_USE_K_QUANTS
int n_attention_wv = 0;
int n_feed_forward_w2 = 0;
for (auto& tensor : model_loader->tensors_map.tensors) {
@@ -2155,6 +2350,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

int i_attention_wv = 0;
int i_feed_forward_w2 = 0;
#endif

size_t total_size_org = 0;
size_t total_size_new = 0;
@@ -2180,11 +2376,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

// quantize only 2D tensors
quantize &= (tensor.ne.size() == 2);

// uncomment this to keep the output layer in FP16
//if (tensor.name == "output.weight") {
// quantize = false;
//}
quantize &= params->quantize_output_tensor || tensor.name != "output.weight";
quantize &= quantized_type != tensor.type;

enum ggml_type new_type;
void * new_data;
@@ -2198,41 +2391,40 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
} else {
new_type = quantized_type;
if (tensor.name == "output.weight") new_type = GGML_TYPE_Q6_K;
else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
#ifdef GGML_USE_K_QUANTS
if (tensor.name == "output.weight") {
new_type = GGML_TYPE_Q6_K;
} else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
(i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 ||
(i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
++i_attention_wv;
}
else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
} else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
(i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 ||
(i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
++i_feed_forward_w2;
}
else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
} else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
}
#endif

float * f32_data;
size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
llama_buffer f32_conv_buf;

if (tensor.type == GGML_TYPE_F32) {
f32_data = (float *) tensor.data;
} else if (tensor.type == GGML_TYPE_F16) {
f32_conv_buf.resize(nelements * sizeof(float));
f32_data = (float *) f32_conv_buf.addr;
const auto * f16_data = (const ggml_fp16_t *) tensor.data;
for (size_t i = 0; i < nelements; i++) {
f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
}
} else if (ggml_is_quantized(tensor.type) && !params->allow_requantize) {
throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor.type)));
} else {
throw std::runtime_error(format("type %s unsupported for integer quantization", ggml_type_name(tensor.type)));
llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
f32_data = (float *) f32_conv_buf.addr;
}

printf("quantizing .. ");
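Reading the attention.wv / feed_forward.w2 promotion rules above with an illustrative model that has 32 such tensors (n_attention_wv = 32) under LLAMA_FTYPE_MOSTLY_Q4_K_M: indices i < 32/8 = 4, indices i >= 7*32/8 = 28, and every index in between where (i - 4) % 3 == 2 are promoted to GGML_TYPE_Q6_K, while the remaining tensors stay at the base Q4_K type; the same pattern applies to feed_forward.w2.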
@@ -2360,9 +2552,9 @@ struct llama_context * llama_init_from_file(

ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers, memory_type,
params.use_mmap, params.use_mlock, params.vocab_only,
params.progress_callback, params.progress_callback_user_data)) {
if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers,
params.main_gpu, params.tensor_split, memory_type, params.use_mmap, params.use_mlock,
params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
fprintf(stderr, "%s: failed to load model\n", __func__);
llama_free(ctx);
return nullptr;
@@ -2405,17 +2597,30 @@ struct llama_context * llama_init_from_file(
// this allocates all Metal resources and memory buffers
ctx->ctx_metal = ggml_metal_init();

void *data_ptr = NULL;
size_t data_size = 0;
if (params.use_mmap) {
ggml_metal_add_buffer(ctx->ctx_metal, "data", ctx->model.mapping->addr, ctx->model.mapping->size);
ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size);
data_ptr = ctx->model.mapping->addr;
data_size= ctx->model.mapping->size;
} else {
ggml_metal_add_buffer(ctx->ctx_metal, "data", ggml_get_mem_buffer(ctx->model.ctx), ggml_get_mem_size(ctx->model.ctx));
ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size);
data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
data_size= ggml_get_mem_size(ctx->model.ctx);
}

ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size);
ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size);
ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size);
#define LLAMA_METAL_CHECK_BUF(result) \
if (!(result)) { \
fprintf(stderr, "%s: failed to add buffer\n", __func__); \
llama_free(ctx); \
return NULL; \
}

LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size));
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size));

LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size));
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size));
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size));
#undef LLAMA_METAL_CHECK_BUF
}
#endif

@@ -2429,10 +2634,9 @@ void llama_free(struct llama_context * ctx) {
int llama_model_quantize(
const char * fname_inp,
const char * fname_out,
enum llama_ftype ftype,
int nthread) {
const llama_model_quantize_params *params) {
try {
llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread);
llama_model_quantize_internal(fname_inp, fname_out, params);
return 0;
} catch (const std::exception & err) {
fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.what());
@@ -3095,6 +3299,19 @@ int llama_n_embd(const struct llama_context * ctx) {
return ctx->model.hparams.n_embd;
}

int llama_get_vocab(
const struct llama_context * ctx,
const char * * strings,
float * scores,
int capacity) {
int n = std::min(capacity, (int) ctx->vocab.id_to_token.size());
for (int i = 0; i<n; ++i) {
strings[i] = ctx->vocab.id_to_token[i].tok.c_str();
scores[i] = ctx->vocab.id_to_token[i].score;
}
return n;
}

float * llama_get_logits(struct llama_context * ctx) {
return ctx->logits.data();
}
llama.h (38 changed lines)
@@ -1,6 +1,13 @@
#ifndef LLAMA_H
#define LLAMA_H

#include "ggml.h"
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
#define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
#else
#define LLAMA_MAX_DEVICES 1
#endif // GGML_USE_CUBLAS
#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>
@@ -65,9 +72,12 @@ extern "C" {
typedef void (*llama_progress_callback)(float progress, void *ctx);

struct llama_context_params {
int n_ctx; // text context
int n_gpu_layers; // number of layers to store in VRAM
int seed; // RNG seed, -1 for random
int n_ctx; // text context
int n_batch; // prompt processing batch size
int n_gpu_layers; // number of layers to store in VRAM
int main_gpu; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
int seed; // RNG seed, -1 for random

bool f16_kv; // use fp16 for KV cache
bool logits_all; // the llama_eval() call computes all logits, not just the last one
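A usage sketch (not part of the diff) for the new multi-GPU context fields, assuming a CUDA build with at least two devices (LLAMA_MAX_DEVICES >= 2); the model path and all numeric values are made up for illustration:

#include "llama.h"

static struct llama_context * init_multi_gpu_ctx(void) {
    struct llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx           = 2048;
    cparams.n_batch         = 512;   // prompt processing batch size
    cparams.n_gpu_layers    = 32;    // number of layers to keep in VRAM
    cparams.main_gpu        = 0;     // device used for scratch and small tensors
    cparams.tensor_split[0] = 0.6f;  // share of the split layers on GPU 0
    cparams.tensor_split[1] = 0.4f;  // share on GPU 1
    return llama_init_from_file("models/7B/ggml-model-q4_0.bin", cparams); // hypothetical path
}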
@@ -105,7 +115,16 @@ extern "C" {
LLAMA_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors
};

// model quantization parameters
typedef struct llama_model_quantize_params {
int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
enum llama_ftype ftype; // quantize to this llama_ftype
bool allow_requantize; // allow quantizing non-f32/f16 tensors
bool quantize_output_tensor; // quantize output.weight
} llama_model_quantize_params;

LLAMA_API struct llama_context_params llama_context_default_params();
LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();

LLAMA_API bool llama_mmap_supported();
LLAMA_API bool llama_mlock_supported();
@@ -127,14 +146,11 @@ extern "C" {
// Frees all allocated memory
LLAMA_API void llama_free(struct llama_context * ctx);

// TODO: not great API - very likely to change
// Returns 0 on success
// nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
LLAMA_API int llama_model_quantize(
const char * fname_inp,
const char * fname_out,
enum llama_ftype ftype,
int nthread);
const llama_model_quantize_params * params);

// Apply a LoRA adapter to a loaded model
// path_base_model is the path to a higher quality model to use as a base for
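A minimal sketch of how a caller might use the new parameter-struct form of the quantization API shown above; the file names are hypothetical and the field values are arbitrary:

#include <stdio.h>
#include "llama.h"

static int quantize_to_q4_k_m(void) {
    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype   = LLAMA_FTYPE_MOSTLY_Q4_K_M;  // target file type
    qparams.nthread = 8;                          // <=0 would fall back to hardware_concurrency()
    qparams.quantize_output_tensor = true;        // also quantize output.weight
    // returns 0 on success, per the header comment above
    if (llama_model_quantize("ggml-model-f16.bin", "ggml-model-q4_k_m.bin", &qparams) != 0) {
        fprintf(stderr, "quantization failed\n");
        return 1;
    }
    return 0;
}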
@@ -204,6 +220,14 @@ extern "C" {
LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
LLAMA_API int llama_n_embd (const struct llama_context * ctx);

// Get the vocabulary as output parameters.
// Returns number of results.
LLAMA_API int llama_get_vocab(
const struct llama_context * ctx,
const char * * strings,
float * scores,
int capacity);

// Token logits obtained from the last call to llama_eval()
// The logits for the last token are stored in the last row
// Can be mutated in order to change the probabilities of the next token
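A short sketch (again, not part of the diff) of how a caller might drive the new llama_get_vocab accessor, assuming a previously created context and the existing llama_n_vocab accessor for sizing the buffers:

#include <cstdio>
#include <vector>
#include "llama.h"

static void dump_vocab(struct llama_context * ctx) {
    const int capacity = llama_n_vocab(ctx);
    std::vector<const char *> strings(capacity);
    std::vector<float>        scores(capacity);
    const int n = llama_get_vocab(ctx, strings.data(), scores.data(), capacity);
    for (int i = 0; i < n; ++i) {
        printf("%5d  %s  %8.3f\n", i, strings[i], scores[i]);
    }
}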
@@ -5,7 +5,7 @@
#include <stdlib.h>
#include <assert.h>

#define MAX_NARGS 2
#define MAX_NARGS 3

#undef MIN
#undef MAX
@@ -1090,6 +1090,25 @@ int main(int argc, const char ** argv) {
}
}

// cross_entropy_loss
{
const int nargs = 1;

int64_t ne2[4];
get_random_dims(ne2, 4);

for (int ndims = 1; ndims <= 3; ++ndims) {
x[0] = get_random_tensor(ctx0, ndims, ne2, -1.0f, 1.0f);
x[1] = get_random_tensor(ctx0, ndims, ne2, 0.0f, 1.0f);
ggml_set_param(ctx0, x[0]);

struct ggml_tensor * f = ggml_sum(ctx0, ggml_cross_entropy_loss(ctx0, x[0], x[1]));

check_gradient("cross_entropy_loss", ctx0, x, f, ndims, nargs, 1e-1f, 1e-2f, INFINITY);
// finite differences regularly fails!
}
}

// rope
{
const int nargs = 1;
@@ -1124,6 +1143,45 @@ int main(int argc, const char ** argv) {
}
}

// flash_attn
{
const int nargs = 3;

int64_t ne2[4];

get_random_dims(ne2, 4);
int64_t D = ne2[0];
int64_t N = ne2[1];
int64_t M = ne2[2] + N;
int64_t B = ne2[3];

for (int masked = 0; masked <= 1; ++masked) {
for (int ndims = 2; ndims <= 4; ++ndims) {
int64_t neq[4] = { D, N, B, ne[3] };
int64_t nek[4] = { D, M, B, ne[3] };
int64_t nev[4] = { M, D, B, ne[3] };
if (ndims == 2) {
neq[2] = 1; neq[3] = 1;
nek[2] = 1; nek[3] = 1;
nev[2] = 1; nev[3] = 1;
} else if (ndims == 3) {
neq[3] = 1;
nek[3] = 1;
nev[3] = 1;
}
x[0] = get_random_tensor(ctx0, ndims, neq, -0.1250f, 0.1250f);
x[1] = get_random_tensor(ctx0, ndims, nek, -0.1250f, 0.1250f);
x[2] = get_random_tensor(ctx0, ndims, nev, -0.1250f, 0.1250f);
ggml_set_param(ctx0, x[0]);
ggml_set_param(ctx0, x[1]);
ggml_set_param(ctx0, x[2]);

struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));

check_gradient("flash_attn", ctx0, x, f, ndims, nargs, 1.5e-4f, INFINITY, 3.5f);
}
}
}
ggml_free(ctx0);
}
|
||||
|
||||
Reference in New Issue
Block a user