mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-02-26 14:23:22 +02:00
Compare commits
8 Commits
master-3de
...
master-2e6
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2e6cd4b025 | ||
|
|
7e4ea5beff | ||
|
|
7780e4f479 | ||
|
|
265db9834e | ||
|
|
fab49c685e | ||
|
|
b8ee340abe | ||
|
|
9ecb30f959 | ||
|
|
29cf5596fe |
2
.github/workflows/build.yml
vendored
2
.github/workflows/build.yml
vendored
@@ -165,7 +165,7 @@ jobs:
|
||||
- build: 'clblast'
|
||||
defines: '-DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
|
||||
- build: 'openblas'
|
||||
defines: '-DLLAMA_OPENBLAS=ON -DBLAS_LIBRARIES="/LIBPATH:$env:RUNNER_TEMP/openblas/lib" -DOPENBLAS_INC="$env:RUNNER_TEMP/openblas/include"'
|
||||
defines: '-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include"'
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
|
||||
67
BLIS.md
Normal file
67
BLIS.md
Normal file
@@ -0,0 +1,67 @@
|
||||
BLIS Installation Manual
|
||||
------------------------
|
||||
|
||||
BLIS is a portable software framework for high-performance BLAS-like dense linear algebra libraries. It has received awards and recognition, including the 2023 James H. Wilkinson Prize for Numerical Software and the 2020 SIAM Activity Group on Supercomputing Best Paper Prize. BLIS provides a new BLAS-like API and a compatibility layer for traditional BLAS routine calls. It offers features such as object-based API, typed API, BLAS and CBLAS compatibility layers.
|
||||
|
||||
Project URL: https://github.com/flame/blis
|
||||
|
||||
### Prepare:
|
||||
|
||||
Compile BLIS:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/flame/blis
|
||||
cd blis
|
||||
./configure --enable-cblas -t openmp,pthreads auto
|
||||
# will install to /usr/local/ by default.
|
||||
make -j
|
||||
```
|
||||
|
||||
Install BLIS:
|
||||
|
||||
```bash
|
||||
sudo make install
|
||||
```
|
||||
|
||||
We recommend using openmp since it's easier to modify the cores been used.
|
||||
|
||||
### llama.cpp compilation
|
||||
|
||||
Makefile:
|
||||
|
||||
```bash
|
||||
make LLAMA_BLIS=1 -j
|
||||
# make LLAMA_BLIS=1 benchmark-matmult
|
||||
```
|
||||
|
||||
CMake:
|
||||
|
||||
```bash
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=FLAME ..
|
||||
make -j
|
||||
```
|
||||
|
||||
### llama.cpp execution
|
||||
|
||||
According to the BLIS documentation, we could set the following
|
||||
environment variables to modify the behavior of openmp:
|
||||
|
||||
```
|
||||
export GOMP_GPU_AFFINITY="0-19"
|
||||
export BLIS_NUM_THREADS=14
|
||||
```
|
||||
|
||||
And then run the binaries as normal.
|
||||
|
||||
|
||||
### Intel specific issue
|
||||
|
||||
Some might get the error message saying that `libimf.so` cannot be found.
|
||||
Please follow this [stackoverflow page](https://stackoverflow.com/questions/70687930/intel-oneapi-2022-libimf-so-no-such-file-or-directory-during-openmpi-compila).
|
||||
|
||||
### Reference:
|
||||
|
||||
1. https://github.com/flame/blis#getting-started
|
||||
2. https://github.com/flame/blis/blob/master/docs/Multithreading.md
|
||||
@@ -65,12 +65,14 @@ endif()
|
||||
|
||||
# 3rd party libs
|
||||
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
|
||||
option(LLAMA_OPENBLAS "llama: use OpenBLAS" OFF)
|
||||
option(LLAMA_BLAS "llama: use BLAS" OFF)
|
||||
option(LLAMA_BLAS_VENDOR "llama: BLA_VENDOR from https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors" Generic)
|
||||
option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
|
||||
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
|
||||
|
||||
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_SERVER "llama: build server example" OFF)
|
||||
|
||||
#
|
||||
# Build info header
|
||||
@@ -145,36 +147,28 @@ if (APPLE AND LLAMA_ACCELERATE)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (LLAMA_OPENBLAS)
|
||||
if (LLAMA_BLAS)
|
||||
if (LLAMA_STATIC)
|
||||
set(BLA_STATIC ON)
|
||||
endif()
|
||||
|
||||
set(BLA_VENDOR OpenBLAS)
|
||||
if ($(CMAKE_VERSION) VERSION_GREATER_EQUAL 3.22)
|
||||
set(BLA_SIZEOF_INTEGER 8)
|
||||
endif()
|
||||
set(BLA_VENDOR ${LLAMA_BLAS_VENDOR})
|
||||
find_package(BLAS)
|
||||
if (BLAS_FOUND)
|
||||
message(STATUS "OpenBLAS found")
|
||||
message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")
|
||||
|
||||
add_compile_options(${BLAS_LINKER_FLAGS})
|
||||
add_compile_definitions(GGML_USE_OPENBLAS)
|
||||
add_link_options(${BLAS_LIBRARIES})
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} openblas)
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES})
|
||||
|
||||
# find header file
|
||||
set(OPENBLAS_INCLUDE_SEARCH_PATHS
|
||||
/usr/include
|
||||
/usr/include/openblas
|
||||
/usr/include/openblas-base
|
||||
/usr/local/include
|
||||
/usr/local/include/openblas
|
||||
/usr/local/include/openblas-base
|
||||
/opt/OpenBLAS/include
|
||||
$ENV{OpenBLAS_HOME}
|
||||
$ENV{OpenBLAS_HOME}/include
|
||||
)
|
||||
find_path(OPENBLAS_INC NAMES cblas.h PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
|
||||
add_compile_options(-I${OPENBLAS_INC})
|
||||
message("${BLAS_LIBRARIES} ${BLAS_INCLUDE_DIRS}")
|
||||
include_directories(${BLAS_INCLUDE_DIRS})
|
||||
else()
|
||||
message(WARNING "OpenBLAS not found")
|
||||
message(WARNING "BLAS not found, please refer to "
|
||||
"https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
|
||||
" to set correct LLAMA_BLAS_VENDOR")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
@@ -207,7 +201,7 @@ if (LLAMA_CLBLAST)
|
||||
if (CLBlast_FOUND)
|
||||
message(STATUS "CLBlast found")
|
||||
|
||||
set(GGML_OPENCL_SOURCES ggml-opencl.c ggml-opencl.h)
|
||||
set(GGML_OPENCL_SOURCES ggml-opencl.cpp ggml-opencl.h)
|
||||
|
||||
add_compile_definitions(GGML_USE_CLBLAST)
|
||||
|
||||
|
||||
11
Makefile
11
Makefile
@@ -122,6 +122,10 @@ ifdef LLAMA_OPENBLAS
|
||||
LDFLAGS += -lopenblas
|
||||
endif
|
||||
endif
|
||||
ifdef LLAMA_BLIS
|
||||
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
|
||||
LDFLAGS += -lblis -L/usr/local/lib
|
||||
endif
|
||||
ifdef LLAMA_CUBLAS
|
||||
CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
|
||||
CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
|
||||
@@ -134,6 +138,7 @@ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
|
||||
endif
|
||||
ifdef LLAMA_CLBLAST
|
||||
CFLAGS += -DGGML_USE_CLBLAST
|
||||
CXXFLAGS += -DGGML_USE_CLBLAST
|
||||
# Mac provides OpenCL as a framework
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
LDFLAGS += -lclblast -framework OpenCL
|
||||
@@ -141,8 +146,8 @@ ifdef LLAMA_CLBLAST
|
||||
LDFLAGS += -lclblast -lOpenCL
|
||||
endif
|
||||
OBJS += ggml-opencl.o
|
||||
ggml-opencl.o: ggml-opencl.c ggml-opencl.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
endif
|
||||
ifneq ($(filter aarch64%,$(UNAME_M)),)
|
||||
# Apple M1, M2, etc.
|
||||
@@ -241,6 +246,6 @@ benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o
|
||||
vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
|
||||
.PHONY: tests
|
||||
.PHONY: tests clean
|
||||
tests:
|
||||
bash ./tests/run-tests.sh
|
||||
|
||||
19
README.md
19
README.md
@@ -56,7 +56,7 @@ The main goal of `llama.cpp` is to run the LLaMA model using 4-bit integer quant
|
||||
- Mixed F16 / F32 precision
|
||||
- 4-bit, 5-bit and 8-bit integer quantization support
|
||||
- Runs on the CPU
|
||||
- OpenBLAS support
|
||||
- Supports OpenBLAS/Apple BLAS/ARM Performance Lib/ATLAS/BLIS/Intel MKL/NVHPC/ACML/SCSL/SGIMATH and [more](https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors) in BLAS
|
||||
- cuBLAS and CLBlast support
|
||||
|
||||
The original implementation of `llama.cpp` was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022).
|
||||
@@ -274,10 +274,25 @@ Building the program with BLAS support may lead to some performance improvements
|
||||
```bash
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -DLLAMA_OPENBLAS=ON
|
||||
cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
|
||||
cmake --build . --config Release
|
||||
```
|
||||
|
||||
- BLIS
|
||||
|
||||
Check [BLIS.md](BLIS.md) for more information.
|
||||
|
||||
- Intel MKL
|
||||
|
||||
By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. You may also specify it by:
|
||||
|
||||
```bash
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
cmake --build . -config Release
|
||||
```
|
||||
|
||||
- cuBLAS
|
||||
|
||||
This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
|
||||
|
||||
@@ -37,4 +37,7 @@ else()
|
||||
add_subdirectory(save-load-state)
|
||||
add_subdirectory(benchmark)
|
||||
add_subdirectory(baby-llama)
|
||||
if(LLAMA_BUILD_SERVER)
|
||||
add_subdirectory(server)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
8
examples/server/CMakeLists.txt
Normal file
8
examples/server/CMakeLists.txt
Normal file
@@ -0,0 +1,8 @@
|
||||
set(TARGET server)
|
||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
|
||||
add_executable(${TARGET} server.cpp json.hpp httplib.h)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
endif()
|
||||
311
examples/server/README.md
Normal file
311
examples/server/README.md
Normal file
@@ -0,0 +1,311 @@
|
||||
# llama.cpp/example/server
|
||||
|
||||
This example allow you to have a llama.cpp http server to interact from a web page or consume the API.
|
||||
|
||||
## Table of Contents
|
||||
|
||||
1. [Quick Start](#quick-start)
|
||||
2. [Node JS Test](#node-js-test)
|
||||
3. [API Endpoints](#api-endpoints)
|
||||
4. [More examples](#more-examples)
|
||||
5. [Common Options](#common-options)
|
||||
6. [Performance Tuning and Memory Options](#performance-tuning-and-memory-options)
|
||||
|
||||
## Quick Start
|
||||
|
||||
To get started right away, run the following command, making sure to use the correct path for the model you have:
|
||||
|
||||
#### Unix-based systems (Linux, macOS, etc.):
|
||||
|
||||
```bash
|
||||
./server -m models/7B/ggml-model.bin --ctx_size 2048
|
||||
```
|
||||
|
||||
#### Windows:
|
||||
|
||||
```powershell
|
||||
server.exe -m models\7B\ggml-model.bin --ctx_size 2048
|
||||
```
|
||||
|
||||
That will start a server that by default listens on `127.0.0.1:8080`. You can consume the endpoints with Postman or NodeJS with axios library.
|
||||
|
||||
## Node JS Test
|
||||
|
||||
You need to have [Node.js](https://nodejs.org/en) installed.
|
||||
|
||||
```bash
|
||||
mkdir llama-client
|
||||
cd llama-client
|
||||
npm init
|
||||
npm install axios
|
||||
```
|
||||
|
||||
Create a index.js file and put inside this:
|
||||
|
||||
```javascript
|
||||
const axios = require("axios");
|
||||
|
||||
const prompt = `Building a website can be done in 10 simple steps:`;
|
||||
|
||||
async function Test() {
|
||||
let result = await axios.post("http://127.0.0.1:8080/completion", {
|
||||
prompt,
|
||||
batch_size: 128,
|
||||
n_predict: 512,
|
||||
});
|
||||
|
||||
// the response is received until completion finish
|
||||
console.log(result.data.content);
|
||||
}
|
||||
|
||||
Test();
|
||||
```
|
||||
|
||||
And run it:
|
||||
|
||||
```bash
|
||||
node .
|
||||
```
|
||||
|
||||
## API Endpoints
|
||||
|
||||
You can interact with this API Endpoints. This implementations just support chat style interaction.
|
||||
|
||||
- **POST** `hostname:port/completion`: Setting up the Llama Context to begin the completions tasks.
|
||||
|
||||
*Options:*
|
||||
|
||||
`batch_size`: Set the batch size for prompt processing (default: 512).
|
||||
|
||||
`temperature`: Adjust the randomness of the generated text (default: 0.8).
|
||||
|
||||
`top_k`: Limit the next token selection to the K most probable tokens (default: 40).
|
||||
|
||||
`top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9).
|
||||
|
||||
`n_predict`: Set the number of tokens to predict when generating text (default: 128, -1 = infinity).
|
||||
|
||||
`threads`: Set the number of threads to use during computation.
|
||||
|
||||
`n_keep`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context. By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt.
|
||||
|
||||
`as_loop`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
|
||||
|
||||
`interactive`: It allows interacting with the completion, and the completion stops as soon as it encounters a `stop word`. To enable this, set to `true`.
|
||||
|
||||
`prompt`: Provide a prompt. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate.
|
||||
|
||||
`stop`: Specify the words or characters that indicate a stop. These words will not be included in the completion, so make sure to add them to the prompt for the next iteration.
|
||||
|
||||
`exclude`: Specify the words or characters you do not want to appear in the completion. These words will not be included in the completion, so make sure to add them to the prompt for the next iteration.
|
||||
|
||||
- **POST** `hostname:port/embedding`: Generate embedding of a given text
|
||||
|
||||
*Options:*
|
||||
|
||||
`content`: Set the text to get generate the embedding.
|
||||
|
||||
`threads`: Set the number of threads to use during computation.
|
||||
|
||||
To use this endpoint, you need to start the server with the `--embedding` option added.
|
||||
|
||||
- **POST** `hostname:port/tokenize`: Tokenize a given text
|
||||
|
||||
*Options:*
|
||||
|
||||
`content`: Set the text to tokenize.
|
||||
|
||||
- **GET** `hostname:port/next-token`: Receive the next token predicted, execute this request in a loop. Make sure set `as_loop` as `true` in the completion request.
|
||||
|
||||
*Options:*
|
||||
|
||||
`stop`: Set `hostname:port/next-token?stop=true` to stop the token generation.
|
||||
|
||||
## More examples
|
||||
|
||||
### Interactive mode
|
||||
|
||||
This mode allows interacting in a chat-like manner. It is recommended for models designed as assistants such as `Vicuna`, `WizardLM`, `Koala`, among others. Make sure to add the correct stop word for the corresponding model.
|
||||
|
||||
The prompt should be generated by you, according to the model's guidelines. You should keep adding the model's completions to the context as well.
|
||||
|
||||
This example works well for `Vicuna - version 1`.
|
||||
|
||||
```javascript
|
||||
const axios = require("axios");
|
||||
|
||||
let prompt = `A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
|
||||
### Human: Hello, Assistant.
|
||||
### Assistant: Hello. How may I help you today?
|
||||
### Human: Please tell me the largest city in Europe.
|
||||
### Assistant: Sure. The largest city in Europe is Moscow, the capital of Russia.`;
|
||||
|
||||
async function ChatCompletion(answer) {
|
||||
// the user's next question to the prompt
|
||||
prompt += `\n### Human: ${answer}\n`
|
||||
|
||||
result = await axios.post("http://127.0.0.1:8080/completion", {
|
||||
prompt,
|
||||
batch_size: 128,
|
||||
temperature: 0.2,
|
||||
top_k: 40,
|
||||
top_p: 0.9,
|
||||
n_keep: -1,
|
||||
n_predict: 2048,
|
||||
stop: ["\n### Human:"], // when detect this, stop completion
|
||||
exclude: ["### Assistant:"], // no show in the completion
|
||||
threads: 8,
|
||||
as_loop: true, // use this to request the completion token by token
|
||||
interactive: true, // enable the detection of a stop word
|
||||
});
|
||||
|
||||
// create a loop to receive every token predicted
|
||||
// note: this operation is blocking, avoid use this in a ui thread
|
||||
|
||||
let message = "";
|
||||
while (true) {
|
||||
// you can stop the inference adding '?stop=true' like this http://127.0.0.1:8080/next-token?stop=true
|
||||
result = await axios.get("http://127.0.0.1:8080/next-token");
|
||||
process.stdout.write(result.data.content);
|
||||
message += result.data.content;
|
||||
|
||||
// to avoid an infinite loop
|
||||
if (result.data.stop) {
|
||||
console.log("Completed");
|
||||
// make sure to add the completion to the prompt.
|
||||
prompt += `### Assistant: ${message}`;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// This function should be called every time a question to the model is needed.
|
||||
async function Test() {
|
||||
// the server can't inference in paralell
|
||||
await ChatCompletion("Write a long story about a time magician in a fantasy world");
|
||||
await ChatCompletion("Summary the story");
|
||||
}
|
||||
|
||||
Test();
|
||||
```
|
||||
|
||||
### Alpaca example
|
||||
|
||||
**Temporaly note:** no tested, if you have the model, please test it and report me some issue
|
||||
|
||||
```javascript
|
||||
const axios = require("axios");
|
||||
|
||||
let prompt = `Below is an instruction that describes a task. Write a response that appropriately completes the request.
|
||||
`;
|
||||
|
||||
async function DoInstruction(instruction) {
|
||||
prompt += `\n\n### Instruction:\n\n${instruction}\n\n### Response:\n\n`;
|
||||
result = await axios.post("http://127.0.0.1:8080/completion", {
|
||||
prompt,
|
||||
batch_size: 128,
|
||||
temperature: 0.2,
|
||||
top_k: 40,
|
||||
top_p: 0.9,
|
||||
n_keep: -1,
|
||||
n_predict: 2048,
|
||||
stop: ["### Instruction:\n\n"], // when detect this, stop completion
|
||||
exclude: [], // no show in the completion
|
||||
threads: 8,
|
||||
as_loop: true, // use this to request the completion token by token
|
||||
interactive: true, // enable the detection of a stop word
|
||||
});
|
||||
|
||||
// create a loop to receive every token predicted
|
||||
// note: this operation is blocking, avoid use this in a ui thread
|
||||
|
||||
let message = "";
|
||||
while (true) {
|
||||
result = await axios.get("http://127.0.0.1:8080/next-token");
|
||||
process.stdout.write(result.data.content);
|
||||
message += result.data.content;
|
||||
|
||||
// to avoid an infinite loop
|
||||
if (result.data.stop) {
|
||||
console.log("Completed");
|
||||
// make sure to add the completion and the user's next question to the prompt.
|
||||
prompt += message;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// This function should be called every time a instruction to the model is needed.
|
||||
DoInstruction("Destroy the world"); // as joke
|
||||
```
|
||||
|
||||
### Embeddings
|
||||
|
||||
First, run the server with `--embedding` option:
|
||||
|
||||
```bash
|
||||
server -m models/7B/ggml-model.bin --ctx_size 2048 --embedding
|
||||
```
|
||||
|
||||
Run this code in NodeJS:
|
||||
|
||||
```javascript
|
||||
const axios = require('axios');
|
||||
|
||||
async function Test() {
|
||||
let result = await axios.post("http://127.0.0.1:8080/embedding", {
|
||||
content: `Hello`,
|
||||
threads: 5
|
||||
});
|
||||
// print the embedding array
|
||||
console.log(result.data.embedding);
|
||||
}
|
||||
|
||||
Test();
|
||||
```
|
||||
|
||||
### Tokenize
|
||||
|
||||
Run this code in NodeJS:
|
||||
|
||||
```javascript
|
||||
const axios = require('axios');
|
||||
|
||||
async function Test() {
|
||||
let result = await axios.post("http://127.0.0.1:8080/tokenize", {
|
||||
content: `Hello`
|
||||
});
|
||||
// print the embedding array
|
||||
console.log(result.data.tokens);
|
||||
}
|
||||
|
||||
Test();
|
||||
```
|
||||
|
||||
## Common Options
|
||||
|
||||
- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
|
||||
- `-c N, --ctx_size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
|
||||
- `--embedding`: Enable the embedding mode. **Completion function doesn't work in this mode**.
|
||||
- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`;
|
||||
- `--port`: Set the port to listen. Default: `8080`.
|
||||
|
||||
### RNG Seed
|
||||
|
||||
- `-s SEED, --seed SEED`: Set the random number generator (RNG) seed (default: -1, < 0 = random seed).
|
||||
|
||||
The RNG seed is used to initialize the random number generator that influences the text generation process. By setting a specific seed value, you can obtain consistent and reproducible results across multiple runs with the same input and settings. This can be helpful for testing, debugging, or comparing the effects of different options on the generated text to see when they diverge. If the seed is set to a value less than 0, a random seed will be used, which will result in different outputs on each run.
|
||||
|
||||
## Performance Tuning and Memory Options
|
||||
|
||||
### No Memory Mapping
|
||||
|
||||
- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. However, if the model is larger than your total amount of RAM or if your system is low on available memory, using mmap might increase the risk of pageouts, negatively impacting performance.
|
||||
|
||||
### Memory Float 32
|
||||
|
||||
- `--memory_f32`: Use 32-bit floats instead of 16-bit floats for memory key+value, allowing higher quality inference at the cost of higher memory usage.
|
||||
|
||||
## Limitations:
|
||||
|
||||
- The actual implementation of llama.cpp need a `llama-state` for handle multiple contexts and clients, but this could require more powerful hardware.
|
||||
8794
examples/server/httplib.h
Normal file
8794
examples/server/httplib.h
Normal file
File diff suppressed because it is too large
Load Diff
24596
examples/server/json.hpp
Normal file
24596
examples/server/json.hpp
Normal file
File diff suppressed because it is too large
Load Diff
721
examples/server/server.cpp
Normal file
721
examples/server/server.cpp
Normal file
@@ -0,0 +1,721 @@
|
||||
#include <httplib.h>
|
||||
#include <json.hpp>
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
struct server_params
|
||||
{
|
||||
std::string hostname = "127.0.0.1";
|
||||
int32_t port = 8080;
|
||||
};
|
||||
|
||||
struct llama_server_context
|
||||
{
|
||||
bool as_loop = false;
|
||||
bool has_next_token = false;
|
||||
std::string generated_text = "";
|
||||
|
||||
int32_t num_tokens_predicted = 0;
|
||||
int32_t n_past = 0;
|
||||
int32_t n_consumed = 0;
|
||||
int32_t n_session_consumed = 0;
|
||||
int32_t n_remain = 0;
|
||||
|
||||
std::vector<llama_token> embd;
|
||||
std::vector<llama_token> last_n_tokens;
|
||||
std::vector<llama_token> processed_tokens;
|
||||
std::vector<llama_token> llama_token_newline;
|
||||
std::vector<llama_token> embd_inp;
|
||||
std::vector<std::vector<llama_token>> no_show_words;
|
||||
std::vector<llama_token> tokens_predicted;
|
||||
|
||||
llama_context *ctx;
|
||||
gpt_params params;
|
||||
|
||||
void rewind() {
|
||||
as_loop = false;
|
||||
params.antiprompt.clear();
|
||||
no_show_words.clear();
|
||||
num_tokens_predicted = 0;
|
||||
generated_text = "";
|
||||
}
|
||||
|
||||
bool loadModel(gpt_params params_)
|
||||
{
|
||||
params = params_;
|
||||
ctx = llama_init_from_gpt_params(params);
|
||||
if (ctx == NULL)
|
||||
{
|
||||
fprintf(stderr, "%s: error: unable to load model\n", __func__);
|
||||
return false;
|
||||
}
|
||||
// determine newline token
|
||||
llama_token_newline = ::llama_tokenize(ctx, "\n", false);
|
||||
last_n_tokens.resize(params.n_ctx);
|
||||
std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool loadPrompt() {
|
||||
params.prompt.insert(0, 1, ' '); // always add a first space
|
||||
std::vector<llama_token> prompt_tokens = ::llama_tokenize(ctx, params.prompt, true);
|
||||
// compare the evaluated prompt with the new prompt
|
||||
int new_prompt_len = 0;
|
||||
for (int i = 0;i < prompt_tokens.size(); i++) {
|
||||
if (i < processed_tokens.size() &&
|
||||
processed_tokens[i] == prompt_tokens[i])
|
||||
{
|
||||
continue;
|
||||
}
|
||||
else
|
||||
{
|
||||
embd_inp.push_back(prompt_tokens[i]);
|
||||
if(new_prompt_len == 0) {
|
||||
if(i - 1 < n_past) {
|
||||
processed_tokens.erase(processed_tokens.begin() + i, processed_tokens.end());
|
||||
}
|
||||
// Evaluate the new fragment prompt from the last token processed.
|
||||
n_past = processed_tokens.size();
|
||||
}
|
||||
new_prompt_len ++;
|
||||
}
|
||||
}
|
||||
if(n_past > 0 && params.interactive) {
|
||||
n_remain -= new_prompt_len;
|
||||
}
|
||||
if ((int)embd_inp.size() > params.n_ctx - 4)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
has_next_token = true;
|
||||
return true;
|
||||
}
|
||||
|
||||
void beginCompletion()
|
||||
{
|
||||
if(n_remain == 0) {
|
||||
// number of tokens to keep when resetting context
|
||||
if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size())
|
||||
{
|
||||
params.n_keep = (int)embd_inp.size();
|
||||
}
|
||||
}
|
||||
n_remain = params.n_predict;
|
||||
}
|
||||
|
||||
llama_token nextToken() {
|
||||
llama_token result = -1;
|
||||
if (embd.size() > 0)
|
||||
{
|
||||
if (n_past + (int)embd.size() > params.n_ctx)
|
||||
{
|
||||
// Reset context
|
||||
const int n_left = n_past - params.n_keep;
|
||||
n_past = std::max(1, params.n_keep);
|
||||
processed_tokens.erase(processed_tokens.begin() + n_past, processed_tokens.end());
|
||||
embd.insert(embd.begin(), last_n_tokens.begin() + params.n_ctx - n_left / 2 - embd.size(), last_n_tokens.end() - embd.size());
|
||||
}
|
||||
for (int i = 0; i < (int)embd.size(); i += params.n_batch)
|
||||
{
|
||||
int n_eval = (int)embd.size() - i;
|
||||
if (n_eval > params.n_batch)
|
||||
{
|
||||
n_eval = params.n_batch;
|
||||
}
|
||||
if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads))
|
||||
{
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
has_next_token = false;
|
||||
return result;
|
||||
}
|
||||
n_past += n_eval;
|
||||
}
|
||||
}
|
||||
embd.clear();
|
||||
if ((int)embd_inp.size() <= n_consumed && has_next_token)
|
||||
{
|
||||
// out of user input, sample next token
|
||||
const float temp = params.temp;
|
||||
const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
|
||||
const float top_p = params.top_p;
|
||||
const float tfs_z = params.tfs_z;
|
||||
const float typical_p = params.typical_p;
|
||||
const int32_t repeat_last_n = params.repeat_last_n < 0 ? params.n_ctx : params.repeat_last_n;
|
||||
const float repeat_penalty = params.repeat_penalty;
|
||||
const float alpha_presence = params.presence_penalty;
|
||||
const float alpha_frequency = params.frequency_penalty;
|
||||
const int mirostat = params.mirostat;
|
||||
const float mirostat_tau = params.mirostat_tau;
|
||||
const float mirostat_eta = params.mirostat_eta;
|
||||
const bool penalize_nl = params.penalize_nl;
|
||||
llama_token id = 0;
|
||||
{
|
||||
auto logits = llama_get_logits(ctx);
|
||||
auto n_vocab = llama_n_vocab(ctx);
|
||||
|
||||
// Apply params.logit_bias map
|
||||
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++)
|
||||
{
|
||||
logits[it->first] += it->second;
|
||||
}
|
||||
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++)
|
||||
{
|
||||
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
||||
}
|
||||
|
||||
llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false};
|
||||
|
||||
// Apply penalties
|
||||
float nl_logit = logits[llama_token_nl()];
|
||||
auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), params.n_ctx);
|
||||
llama_sample_repetition_penalty(ctx, &candidates_p,
|
||||
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
|
||||
last_n_repeat, repeat_penalty);
|
||||
llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
|
||||
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
|
||||
last_n_repeat, alpha_frequency, alpha_presence);
|
||||
if (!penalize_nl)
|
||||
{
|
||||
logits[llama_token_nl()] = nl_logit;
|
||||
}
|
||||
|
||||
if (temp <= 0)
|
||||
{
|
||||
// Greedy sampling
|
||||
id = llama_sample_token_greedy(ctx, &candidates_p);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (mirostat == 1)
|
||||
{
|
||||
static float mirostat_mu = 2.0f * mirostat_tau;
|
||||
const int mirostat_m = 100;
|
||||
llama_sample_temperature(ctx, &candidates_p, temp);
|
||||
id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
|
||||
}
|
||||
else if (mirostat == 2)
|
||||
{
|
||||
static float mirostat_mu = 2.0f * mirostat_tau;
|
||||
llama_sample_temperature(ctx, &candidates_p, temp);
|
||||
id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Temperature sampling
|
||||
llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
|
||||
llama_sample_typical(ctx, &candidates_p, typical_p, 1);
|
||||
llama_sample_top_p(ctx, &candidates_p, top_p, 1);
|
||||
llama_sample_temperature(ctx, &candidates_p, temp);
|
||||
id = llama_sample_token(ctx, &candidates_p);
|
||||
}
|
||||
}
|
||||
last_n_tokens.erase(last_n_tokens.begin());
|
||||
last_n_tokens.push_back(id);
|
||||
processed_tokens.push_back(id);
|
||||
num_tokens_predicted++;
|
||||
}
|
||||
|
||||
// replace end of text token with newline token when in interactive mode
|
||||
if (id == llama_token_eos() && params.interactive)
|
||||
{
|
||||
id = llama_token_newline.front();
|
||||
if (params.antiprompt.size() != 0)
|
||||
{
|
||||
// tokenize and inject first reverse prompt
|
||||
const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
|
||||
embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
|
||||
}
|
||||
}
|
||||
|
||||
// add it to the context
|
||||
embd.push_back(id);
|
||||
for (auto id : embd)
|
||||
{
|
||||
result = id;
|
||||
}
|
||||
// decrement remaining sampling budget
|
||||
--n_remain;
|
||||
}
|
||||
else
|
||||
{
|
||||
// some user input remains from prompt or interaction, forward it to processing
|
||||
while ((int)embd_inp.size() > n_consumed)
|
||||
{
|
||||
embd.push_back(embd_inp[n_consumed]);
|
||||
last_n_tokens.erase(last_n_tokens.begin());
|
||||
last_n_tokens.push_back(embd_inp[n_consumed]);
|
||||
processed_tokens.push_back(embd_inp[n_consumed]);
|
||||
++n_consumed;
|
||||
if ((int)embd.size() >= params.n_batch)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (params.interactive && (int)embd_inp.size() <= n_consumed)
|
||||
{
|
||||
// check for reverse prompt
|
||||
if (params.antiprompt.size())
|
||||
{
|
||||
std::string last_output;
|
||||
for (auto id : last_n_tokens)
|
||||
{
|
||||
last_output += llama_token_to_str(ctx, id);
|
||||
}
|
||||
has_next_token = true;
|
||||
// Check if each of the reverse prompts appears at the end of the output.
|
||||
for (std::string &antiprompt : params.antiprompt)
|
||||
{
|
||||
if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos)
|
||||
{
|
||||
has_next_token = false;
|
||||
return result;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (n_past > 0)
|
||||
{
|
||||
has_next_token = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (!embd.empty() && embd.back() == llama_token_eos()) {
|
||||
has_next_token = false;
|
||||
}
|
||||
|
||||
if (params.interactive && n_remain <= 0 && params.n_predict != -1)
|
||||
{
|
||||
n_remain = params.n_predict;
|
||||
}
|
||||
has_next_token = n_remain != 0;
|
||||
return result;
|
||||
}
|
||||
|
||||
std::string doCompletion()
|
||||
{
|
||||
llama_token token = nextToken();
|
||||
if (token == -1) {
|
||||
return "";
|
||||
}
|
||||
tokens_predicted.clear();
|
||||
tokens_predicted.push_back(token);
|
||||
|
||||
// Avoid add the no show words to the response
|
||||
for (std::vector<llama_token> word_tokens : no_show_words)
|
||||
{
|
||||
int match_token = 1;
|
||||
if (tokens_predicted.front() == word_tokens.front())
|
||||
{
|
||||
bool execute_matching = true;
|
||||
if (tokens_predicted.size() > 1) { // if previus tokens had been tested
|
||||
for (int i = 1; i < word_tokens.size(); i++)
|
||||
{
|
||||
if (i >= tokens_predicted.size()) {
|
||||
match_token = i;
|
||||
break;
|
||||
}
|
||||
if (tokens_predicted[i] == word_tokens[i])
|
||||
{
|
||||
continue;
|
||||
}
|
||||
else
|
||||
{
|
||||
execute_matching = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
while (execute_matching) {
|
||||
if (match_token == word_tokens.size()) {
|
||||
return "";
|
||||
}
|
||||
token = nextToken();
|
||||
tokens_predicted.push_back(token);
|
||||
if (token == word_tokens[match_token])
|
||||
{ // the token follow the sequence
|
||||
match_token++;
|
||||
}
|
||||
else if (match_token < word_tokens.size())
|
||||
{ // no complete all word sequence
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if(as_loop) {
|
||||
generated_text = "";
|
||||
}
|
||||
for (llama_token tkn : tokens_predicted)
|
||||
{
|
||||
generated_text += llama_token_to_str(ctx, tkn);
|
||||
}
|
||||
return generated_text;
|
||||
}
|
||||
|
||||
std::vector<float> embedding(std::string content, int threads) {
|
||||
content.insert(0, 1, ' ');
|
||||
std::vector<llama_token> tokens = ::llama_tokenize(ctx, content, true);
|
||||
if (tokens.size() > 0)
|
||||
{
|
||||
if (llama_eval(ctx, tokens.data(), tokens.size(), 0, threads))
|
||||
{
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
std::vector<float> embeddings_;
|
||||
return embeddings_;
|
||||
}
|
||||
}
|
||||
const int n_embd = llama_n_embd(ctx);
|
||||
const auto embeddings = llama_get_embeddings(ctx);
|
||||
std::vector<float> embeddings_(embeddings, embeddings + n_embd);
|
||||
return embeddings_;
|
||||
}
|
||||
};
|
||||
|
||||
using namespace httplib;
|
||||
|
||||
using json = nlohmann::json;
|
||||
|
||||
void server_print_usage(int /*argc*/, char **argv, const gpt_params ¶ms)
|
||||
{
|
||||
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "options:\n");
|
||||
fprintf(stderr, " -h, --help show this help message and exit\n");
|
||||
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
|
||||
fprintf(stderr, " --memory_f32 use f32 instead of f16 for memory key+value\n");
|
||||
fprintf(stderr, " --embedding enable embedding mode\n");
|
||||
fprintf(stderr, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
|
||||
if (llama_mlock_supported())
|
||||
{
|
||||
fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
|
||||
}
|
||||
if (llama_mmap_supported())
|
||||
{
|
||||
fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
|
||||
}
|
||||
fprintf(stderr, " -ngl N, --n-gpu-layers N\n");
|
||||
fprintf(stderr, " number of layers to store in VRAM\n");
|
||||
fprintf(stderr, " -m FNAME, --model FNAME\n");
|
||||
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
|
||||
fprintf(stderr, " -host ip address to listen (default 127.0.0.1)\n");
|
||||
fprintf(stderr, " -port PORT port to listen (default 8080)\n");
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_params ¶ms)
|
||||
{
|
||||
gpt_params default_params;
|
||||
std::string arg;
|
||||
bool invalid_param = false;
|
||||
|
||||
for (int i = 1; i < argc; i++)
|
||||
{
|
||||
arg = argv[i];
|
||||
if (arg == "--port")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
sparams.port = std::stoi(argv[i]);
|
||||
}
|
||||
else if (arg == "--host")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
sparams.hostname = argv[i];
|
||||
}
|
||||
else if (arg == "-s" || arg == "--seed")
|
||||
{
|
||||
#if defined(GGML_USE_CUBLAS)
|
||||
fprintf(stderr, "WARNING: when using cuBLAS generation results are NOT guaranteed to be reproducible.\n");
|
||||
#endif
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.seed = std::stoi(argv[i]);
|
||||
}
|
||||
else if (arg == "-m" || arg == "--model")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.model = argv[i];
|
||||
}
|
||||
else if (arg == "--embedding")
|
||||
{
|
||||
params.embedding = true;
|
||||
}
|
||||
else if (arg == "-h" || arg == "--help")
|
||||
{
|
||||
server_print_usage(argc, argv, default_params);
|
||||
exit(0);
|
||||
}
|
||||
else if (arg == "-c" || arg == "--ctx_size")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_ctx = std::stoi(argv[i]);
|
||||
}
|
||||
else if (arg == "--memory_f32")
|
||||
{
|
||||
params.memory_f16 = false;
|
||||
}
|
||||
else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_gpu_layers = std::stoi(argv[i]);
|
||||
}
|
||||
else
|
||||
{
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
server_print_usage(argc, argv, default_params);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
if (invalid_param)
|
||||
{
|
||||
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
|
||||
server_print_usage(argc, argv, default_params);
|
||||
exit(1);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool parse_options_completion(json body, llama_server_context& llama, Response &res) {
|
||||
if (!body["threads"].is_null())
|
||||
{
|
||||
llama.params.n_threads = body["threads"].get<int>();
|
||||
}
|
||||
if (!body["n_predict"].is_null())
|
||||
{
|
||||
llama.params.n_predict = body["n_predict"].get<int>();
|
||||
}
|
||||
if (!body["top_k"].is_null())
|
||||
{
|
||||
llama.params.top_k = body["top_k"].get<int>();
|
||||
}
|
||||
if (!body["top_p"].is_null())
|
||||
{
|
||||
llama.params.top_p = body["top_p"].get<float>();
|
||||
}
|
||||
if (!body["temperature"].is_null())
|
||||
{
|
||||
llama.params.temp = body["temperature"].get<float>();
|
||||
}
|
||||
if (!body["batch_size"].is_null())
|
||||
{
|
||||
llama.params.n_batch = body["batch_size"].get<int>();
|
||||
}
|
||||
if (!body["n_keep"].is_null())
|
||||
{
|
||||
llama.params.n_keep = body["n_keep"].get<int>();
|
||||
}
|
||||
if (!body["as_loop"].is_null())
|
||||
{
|
||||
llama.as_loop = body["as_loop"].get<bool>();
|
||||
}
|
||||
if (!body["interactive"].is_null())
|
||||
{
|
||||
llama.params.interactive = body["interactive"].get<bool>();
|
||||
}
|
||||
if (!body["prompt"].is_null())
|
||||
{
|
||||
llama.params.prompt = body["prompt"].get<std::string>();
|
||||
}
|
||||
else
|
||||
{
|
||||
json data = {
|
||||
{"status", "error"},
|
||||
{"reason", "You need to pass the prompt"}};
|
||||
res.set_content(data.dump(), "application/json");
|
||||
res.status = 400;
|
||||
return false;
|
||||
}
|
||||
if (!body["stop"].is_null())
|
||||
{
|
||||
std::vector<std::string> stop_words = body["stop"].get<std::vector<std::string>>();
|
||||
for (std::string stop_word : stop_words)
|
||||
{
|
||||
llama.params.antiprompt.push_back(stop_word);
|
||||
llama.no_show_words.push_back(::llama_tokenize(llama.ctx, stop_word, false));
|
||||
}
|
||||
}
|
||||
if (!body["exclude"].is_null())
|
||||
{
|
||||
std::vector<std::string> no_show_words = body["exclude"].get<std::vector<std::string>>();
|
||||
for (std::string no_show : no_show_words)
|
||||
{
|
||||
llama.no_show_words.push_back(::llama_tokenize(llama.ctx, no_show, false));
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
// own arguments required by this example
|
||||
gpt_params params;
|
||||
server_params sparams;
|
||||
|
||||
// struct that contains llama context and inference
|
||||
llama_server_context llama;
|
||||
params.model = "ggml-model.bin";
|
||||
|
||||
if (server_params_parse(argc, argv, sparams, params) == false)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (params.seed <= 0)
|
||||
{
|
||||
params.seed = time(NULL);
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
|
||||
|
||||
// load the model
|
||||
if (!llama.loadModel(params))
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
Server svr;
|
||||
|
||||
svr.Get("/", [](const Request &req, Response &res)
|
||||
{ res.set_content("<h1>llama.cpp server works</h1>", "text/html"); });
|
||||
|
||||
svr.Post("/completion", [&llama](const Request &req, Response &res)
|
||||
{
|
||||
if(llama.params.embedding) {
|
||||
json data = {
|
||||
{"status", "error"},
|
||||
{"reason", "To use completion function disable embedding mode"}};
|
||||
res.set_content(data.dump(), "application/json");
|
||||
res.status = 400;
|
||||
return;
|
||||
}
|
||||
|
||||
llama.rewind();
|
||||
|
||||
if(parse_options_completion(json::parse(req.body), llama, res) == false){
|
||||
return;
|
||||
}
|
||||
|
||||
if (!llama.loadPrompt())
|
||||
{
|
||||
json data = {
|
||||
{"status", "error"},
|
||||
{"reason", "Context too long, please be more specific"}};
|
||||
res.set_content(data.dump(), "application/json");
|
||||
res.status = 400;
|
||||
return;
|
||||
}
|
||||
|
||||
llama.beginCompletion();
|
||||
if(llama.as_loop) {
|
||||
json data = {
|
||||
{"status", "done" } };
|
||||
return res.set_content(data.dump(), "application/json");
|
||||
} else {
|
||||
// loop inference until finish completion
|
||||
while (llama.has_next_token)
|
||||
{
|
||||
llama.doCompletion();
|
||||
}
|
||||
try
|
||||
{
|
||||
json data = {
|
||||
{"content", llama.generated_text },
|
||||
{"tokens_predicted", llama.num_tokens_predicted}};
|
||||
return res.set_content(data.dump(), "application/json");
|
||||
}
|
||||
catch (json::exception e)
|
||||
{
|
||||
// Some tokens have bad UTF-8 strings, the json parser is very sensitive
|
||||
json data = {
|
||||
{"content", "Bad encoding token"},
|
||||
{"tokens_predicted", 0}};
|
||||
return res.set_content(data.dump(), "application/json");
|
||||
}
|
||||
} });
|
||||
|
||||
svr.Post("/tokenize", [&llama](const Request &req, Response &res)
|
||||
{
|
||||
json body = json::parse(req.body);
|
||||
json data = {
|
||||
{"tokens", ::llama_tokenize(llama.ctx, body["content"].get<std::string>(), false) } };
|
||||
return res.set_content(data.dump(), "application/json");
|
||||
});
|
||||
|
||||
svr.Post("/embedding", [&llama](const Request &req, Response &res)
|
||||
{
|
||||
if(!llama.params.embedding) {
|
||||
std::vector<float> empty;
|
||||
json data = {
|
||||
{"embedding", empty}};
|
||||
fprintf(stderr, "[llama-server] : You need enable embedding mode adding: --embedding option\n");
|
||||
return res.set_content(data.dump(), "application/json");
|
||||
}
|
||||
json body = json::parse(req.body);
|
||||
std::string content = body["content"].get<std::string>();
|
||||
int threads = body["threads"].get<int>();
|
||||
json data = {
|
||||
{"embedding", llama.embedding(content, threads) } };
|
||||
return res.set_content(data.dump(), "application/json");
|
||||
});
|
||||
|
||||
svr.Get("/next-token", [&llama](const Request &req, Response &res)
|
||||
{
|
||||
if(llama.params.embedding) {
|
||||
res.set_content("{}", "application/json");
|
||||
return;
|
||||
}
|
||||
std::string result = "";
|
||||
if (req.has_param("stop")) {
|
||||
llama.has_next_token = false;
|
||||
} else {
|
||||
result = llama.doCompletion(); // inference next token
|
||||
}
|
||||
try {
|
||||
json data = {
|
||||
{"content", result },
|
||||
{"stop", !llama.has_next_token }};
|
||||
return res.set_content(data.dump(), "application/json");
|
||||
} catch (json::exception e) {
|
||||
// Some tokens have bad UTF-8 strings, the json parser is very sensitive
|
||||
json data = {
|
||||
{"content", "" },
|
||||
{"stop", !llama.has_next_token }};
|
||||
return res.set_content(data.dump(), "application/json");
|
||||
}
|
||||
});
|
||||
|
||||
fprintf(stderr, "%s: http server Listening at http://%s:%i\n", __func__, sparams.hostname.c_str(), sparams.port);
|
||||
|
||||
if(params.embedding) {
|
||||
fprintf(stderr, "NOTE: Mode embedding enabled. Completion function doesn't work in this mode.\n");
|
||||
}
|
||||
|
||||
// change hostname and port
|
||||
svr.listen(sparams.hostname, sparams.port);
|
||||
}
|
||||
361
ggml-opencl.c
361
ggml-opencl.c
@@ -1,361 +0,0 @@
|
||||
#include "ggml-opencl.h"
|
||||
|
||||
#define CL_TARGET_OPENCL_VERSION 110
|
||||
#include <clblast_c.h>
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
#define MULTILINE_QUOTE(...) #__VA_ARGS__
|
||||
const char * clblast_dequant = MULTILINE_QUOTE(
|
||||
|
||||
typedef uchar uint8_t;
|
||||
typedef int int32_t;
|
||||
typedef uint uint32_t;
|
||||
|
||||
constant uint QK4_0 = 32;
|
||||
struct block_q4_0
|
||||
{
|
||||
float d;
|
||||
uint8_t qs[QK4_0 / 2];
|
||||
};
|
||||
|
||||
constant uint QK4_1 = 32;
|
||||
struct block_q4_1
|
||||
{
|
||||
float d;
|
||||
float m;
|
||||
uint8_t qs[QK4_1 / 2];
|
||||
};
|
||||
|
||||
constant uint QK5_0 = 32;
|
||||
struct __attribute__ ((packed)) block_q5_0
|
||||
{
|
||||
half d;
|
||||
uint32_t qh;
|
||||
uint8_t qs[QK5_0 / 2];
|
||||
};
|
||||
|
||||
constant uint QK5_1 = 32;
|
||||
struct block_q5_1
|
||||
{
|
||||
half d;
|
||||
half m;
|
||||
uint32_t qh;
|
||||
uint8_t qs[QK5_1 / 2];
|
||||
};
|
||||
|
||||
constant uint QK8_0 = 32;
|
||||
struct block_q8_0
|
||||
{
|
||||
float d;
|
||||
uint8_t qs[QK8_0];
|
||||
};
|
||||
|
||||
|
||||
__kernel void dequantize_row_q4_0(__global struct block_q4_0* x, __global float* y) {
|
||||
constant uint qk = QK4_0;
|
||||
|
||||
const uint i = get_global_id(0) / qk;
|
||||
const uint j = get_local_id(0);
|
||||
|
||||
const float d = x[i].d;
|
||||
|
||||
const int x0 = (x[i].qs[j] & 0xf) - 8;
|
||||
const int x1 = (x[i].qs[j] >> 4) - 8;
|
||||
|
||||
y[i*qk + j + 0 ] = x0*d;
|
||||
y[i*qk + j + qk/2] = x1*d;
|
||||
}
|
||||
|
||||
__kernel void dequantize_row_q4_1(__global struct block_q4_1* x, __global float* y) {
|
||||
constant uint qk = QK4_1;
|
||||
|
||||
const uint i = get_global_id(0) / qk;
|
||||
const uint j = get_local_id(0);
|
||||
|
||||
const float d = x[i].d;
|
||||
const float m = x[i].m;
|
||||
|
||||
const int x0 = (x[i].qs[j] & 0xf);
|
||||
const int x1 = (x[i].qs[j] >> 4);
|
||||
|
||||
y[i*qk + j + 0 ] = x0*d + m;
|
||||
y[i*qk + j + qk/2] = x1*d + m;
|
||||
}
|
||||
|
||||
__kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float* y) {
|
||||
constant uint qk = QK5_0;
|
||||
|
||||
const uint i = get_global_id(0) / qk;
|
||||
const uint j = get_local_id(0);
|
||||
|
||||
const float d = vload_half(0, (__global half*) &x[i].d);
|
||||
|
||||
uint32_t qh = x[i].qh;
|
||||
|
||||
const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
|
||||
const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
|
||||
|
||||
const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16;
|
||||
const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16;
|
||||
|
||||
y[i*qk + j + 0 ] = x0*d;
|
||||
y[i*qk + j + qk/2] = x1*d;
|
||||
}
|
||||
|
||||
__kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float* y) {
|
||||
constant uint qk = QK5_1;
|
||||
|
||||
const uint i = get_global_id(0) / qk;
|
||||
const uint j = get_local_id(0);
|
||||
|
||||
const float d = vload_half(0, (__global half*) &x[i].d);
|
||||
const float m = vload_half(0, (__global half*) &x[i].m);
|
||||
|
||||
uint32_t qh = x[i].qh;
|
||||
|
||||
const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
|
||||
const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
|
||||
|
||||
const int x0 = (x[i].qs[j] & 0xf) | xh_0;
|
||||
const int x1 = (x[i].qs[j] >> 4) | xh_1;
|
||||
|
||||
y[i*qk + j + 0 ] = x0*d + m;
|
||||
y[i*qk + j + qk/2] = x1*d + m;
|
||||
}
|
||||
|
||||
__kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float* y) {
|
||||
constant uint qk = QK8_0;
|
||||
const uint i = get_global_id(0) / qk;
|
||||
const uint j = get_local_id(0);
|
||||
|
||||
const float d = x[i].d;
|
||||
y[i*qk + j] = x[i].qs[j]*d;
|
||||
}
|
||||
|
||||
);
|
||||
|
||||
#define CL_CHECK(err, name) \
|
||||
do { \
|
||||
cl_int err_ = (err); \
|
||||
if (err_ != CL_SUCCESS) { \
|
||||
fprintf(stderr, "OpenCL %s error %d at %s:%d\n", name, err_, __FILE__, __LINE__); \
|
||||
exit(1); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
static cl_platform_id platform;
|
||||
static cl_device_id device;
|
||||
static cl_context context;
|
||||
static cl_command_queue queue;
|
||||
static cl_program program;
|
||||
static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q5_0, kernel_q5_1, kernel_q8_0;
|
||||
static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c;
|
||||
static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0;
|
||||
|
||||
static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
|
||||
cl_program p;
|
||||
char *program_log;
|
||||
size_t program_size, log_size;
|
||||
int err;
|
||||
|
||||
program_size = strlen(program_buffer);
|
||||
|
||||
p = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err);
|
||||
if(err < 0) {
|
||||
fprintf(stderr, "OpenCL error creating program");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
err = clBuildProgram(p, 0, NULL, NULL, NULL, NULL);
|
||||
if(err < 0) {
|
||||
|
||||
clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
|
||||
program_log = (char*) malloc(log_size + 1);
|
||||
program_log[log_size] = '\0';
|
||||
clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL);
|
||||
printf("%s\n", program_log);
|
||||
free(program_log);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
void ggml_cl_init(void) {
|
||||
cl_int err = 0;
|
||||
char * GGML_CLBLAST_PLATFORM = getenv("GGML_CLBLAST_PLATFORM");
|
||||
char * GGML_CLBLAST_DEVICE = getenv("GGML_CLBLAST_DEVICE");
|
||||
int plat_num = (GGML_CLBLAST_PLATFORM == NULL ? 0 : atoi(GGML_CLBLAST_PLATFORM));
|
||||
int dev_num = (GGML_CLBLAST_DEVICE == NULL ? 0 : atoi(GGML_CLBLAST_DEVICE));
|
||||
printf("\nInitializing CLBlast (First Run)...");
|
||||
printf("\nAttempting to use: Platform=%d, Device=%d (If invalid, program will crash)\n",plat_num,dev_num);
|
||||
cl_uint num_platforms;
|
||||
clGetPlatformIDs(0, NULL, &num_platforms);
|
||||
cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id));
|
||||
clGetPlatformIDs(num_platforms, platforms, NULL);
|
||||
platform = platforms[plat_num];
|
||||
char platform_buffer[1024];
|
||||
clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_buffer), &platform_buffer, NULL);
|
||||
cl_uint num_devices;
|
||||
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
|
||||
cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
|
||||
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
|
||||
device = devices[dev_num];
|
||||
char device_buffer[1024];
|
||||
clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_buffer), &device_buffer, NULL);
|
||||
printf("Using Platform: %s Device: %s\n", platform_buffer, device_buffer);
|
||||
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
|
||||
CL_CHECK(err, "clCreateContext");
|
||||
queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
|
||||
CL_CHECK(err, "clCreateCommandQueue");
|
||||
|
||||
free(platforms);
|
||||
free(devices);
|
||||
|
||||
program = build_program_from_source(context, device, clblast_dequant);
|
||||
|
||||
// Prepare dequantize kernels
|
||||
kernel_q4_0 = clCreateKernel(program, "dequantize_row_q4_0", &err);
|
||||
CL_CHECK(err, "clCreateKernel");
|
||||
kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err);
|
||||
CL_CHECK(err, "clCreateKernel");
|
||||
kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err);
|
||||
CL_CHECK(err, "clCreateKernel");
|
||||
kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err);
|
||||
CL_CHECK(err, "clCreateKernel");
|
||||
kernel_q8_0 = clCreateKernel(program, "dequantize_row_q8_0", &err);
|
||||
CL_CHECK(err, "clCreateKernel");
|
||||
}
|
||||
|
||||
static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) {
|
||||
if (req_size <= *cur_size) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Reallocate buffer with enough space
|
||||
if (*cur_size > 0) {
|
||||
clReleaseMemObject(*buf);
|
||||
}
|
||||
cl_int err;
|
||||
*buf = clCreateBuffer(context, flags, req_size, NULL, &err);
|
||||
*cur_size = req_size;
|
||||
CL_CHECK(err, "clCreateBuffer");
|
||||
}
|
||||
|
||||
void ggml_cl_sgemm_wrapper(
|
||||
const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b,
|
||||
const int m, const int n, const int k,
|
||||
const float alpha, const void *host_a, const int lda,
|
||||
const float *host_b, const int ldb, const float beta,
|
||||
float *host_c, const int ldc, const int btype) {
|
||||
cl_int err = 0;
|
||||
|
||||
cl_kernel kernel;
|
||||
size_t global = n * k, local, size_qb;
|
||||
bool dequant;
|
||||
|
||||
switch (btype) {
|
||||
case GGML_TYPE_F32:
|
||||
dequant = false;
|
||||
break;
|
||||
case GGML_TYPE_Q4_0:
|
||||
dequant = true;
|
||||
kernel = kernel_q4_0;
|
||||
local = 16;
|
||||
size_qb = global * (sizeof(float) + local) / 32;
|
||||
break;
|
||||
case GGML_TYPE_Q4_1:
|
||||
dequant = true;
|
||||
kernel = kernel_q4_1;
|
||||
local = 16;
|
||||
size_qb = global * (sizeof(float) * 2 + local) / 32;
|
||||
break;
|
||||
case GGML_TYPE_Q5_0:
|
||||
dequant = true;
|
||||
kernel = kernel_q5_0;
|
||||
local = 16;
|
||||
size_qb = global * (sizeof(ggml_fp16_t) + sizeof(uint32_t) + local) / 32;
|
||||
break;
|
||||
case GGML_TYPE_Q5_1:
|
||||
dequant = true;
|
||||
kernel = kernel_q5_1;
|
||||
local = 16;
|
||||
size_qb = global * (sizeof(ggml_fp16_t) * 2 + sizeof(uint32_t) + local) / 32;
|
||||
break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
dequant = true;
|
||||
kernel = kernel_q8_0;
|
||||
local = 32;
|
||||
size_qb = global * (sizeof(float) + local) / 32;
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr, "Error: Unsupported OpenCL btype %d\n", btype);
|
||||
abort();
|
||||
}
|
||||
|
||||
const size_t size_a = m * k * sizeof(float);
|
||||
const size_t size_b = n * k * sizeof(float);
|
||||
const size_t size_c = m * n * sizeof(float);
|
||||
|
||||
// Prepare buffers
|
||||
ggml_cl_malloc(size_a, &cl_size_a, CL_MEM_READ_ONLY, &cl_buffer_a);
|
||||
if (dequant) {
|
||||
ggml_cl_malloc(size_qb, &cl_size_qb, CL_MEM_READ_ONLY, &cl_buffer_qb);
|
||||
}
|
||||
ggml_cl_malloc(size_b, &cl_size_b, CL_MEM_READ_WRITE, &cl_buffer_b);
|
||||
ggml_cl_malloc(size_c, &cl_size_c, CL_MEM_WRITE_ONLY, &cl_buffer_c);
|
||||
|
||||
cl_event ev_a, ev_qb, ev_b;
|
||||
|
||||
if (dequant) {
|
||||
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_buffer_qb);
|
||||
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_buffer_b);
|
||||
CL_CHECK(err, "clSetKernelArg");
|
||||
err = clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb);
|
||||
CL_CHECK(err, "clEnqueueWriteBuffer qb");
|
||||
} else {
|
||||
err = clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b);
|
||||
CL_CHECK(err, "clEnqueueWriteBuffer b");
|
||||
}
|
||||
|
||||
err = clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a);
|
||||
CL_CHECK(err, "clEnqueueWriteBuffer a");
|
||||
if (dequant) {
|
||||
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 1, &ev_qb, &ev_b);
|
||||
CL_CHECK(err, "clEnqueueNDRangeKernel");
|
||||
clReleaseEvent(ev_qb);
|
||||
}
|
||||
clWaitForEvents(1, &ev_a);
|
||||
clWaitForEvents(1, &ev_b);
|
||||
clReleaseEvent(ev_a);
|
||||
clReleaseEvent(ev_b);
|
||||
|
||||
cl_event ev_sgemm;
|
||||
CLBlastStatusCode status = CLBlastSgemm((CLBlastLayout)order,
|
||||
(CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b,
|
||||
m, n, k,
|
||||
alpha,
|
||||
cl_buffer_a, 0, lda,
|
||||
cl_buffer_b, 0, ldb,
|
||||
beta,
|
||||
cl_buffer_c, 0, ldc,
|
||||
&queue, &ev_sgemm);
|
||||
|
||||
if (status != CLBlastSuccess) {
|
||||
fprintf(stderr, "Error: CLBlast SGEMM %d\n", status);
|
||||
abort();
|
||||
}
|
||||
|
||||
cl_event ev_c;
|
||||
clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, &ev_sgemm, &ev_c);
|
||||
|
||||
// Wait for completion
|
||||
clWaitForEvents(1, &ev_c);
|
||||
clReleaseEvent(ev_sgemm);
|
||||
clReleaseEvent(ev_c);
|
||||
}
|
||||
1034
ggml-opencl.cpp
Normal file
1034
ggml-opencl.cpp
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1,23 +1,21 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
void ggml_cl_init(void);
|
||||
|
||||
enum ggml_blas_order {
|
||||
GGML_BLAS_ORDER_ROW_MAJOR = 101,
|
||||
GGML_BLAS_ORDER_COLUMN_MAJOR = 102,
|
||||
};
|
||||
bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
|
||||
|
||||
enum ggml_blas_op {
|
||||
GGML_BLAS_OP_N = 111,
|
||||
GGML_BLAS_OP_T = 112,
|
||||
GGML_BLAS_OP_C = 113,
|
||||
};
|
||||
void * ggml_cl_host_malloc(size_t size);
|
||||
void ggml_cl_host_free(void * ptr);
|
||||
|
||||
void ggml_cl_sgemm_wrapper(const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b, const int m, const int n, const int k, const float alpha, const void *host_a, const int lda, const float *host_b, const int ldb, const float beta, float *host_c, const int ldc, const int btype);
|
||||
void ggml_cl_transform_tensor(struct ggml_tensor * tensor);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
||||
192
ggml.c
192
ggml.c
@@ -740,19 +740,19 @@ inline static float vaddvq_f32(float32x4_t v) {
|
||||
return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
|
||||
}
|
||||
|
||||
float vminvq_f32(float32x4_t v) {
|
||||
inline static float vminvq_f32(float32x4_t v) {
|
||||
return
|
||||
MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
|
||||
MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
|
||||
}
|
||||
|
||||
float vmaxvq_f32(float32x4_t v) {
|
||||
inline static float vmaxvq_f32(float32x4_t v) {
|
||||
return
|
||||
MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
|
||||
MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
|
||||
}
|
||||
|
||||
int32x4_t vcvtnq_s32_f32(float32x4_t v) {
|
||||
inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
|
||||
int32x4_t res;
|
||||
|
||||
res[0] = roundf(vgetq_lane_f32(v, 0));
|
||||
@@ -766,7 +766,6 @@ int32x4_t vcvtnq_s32_f32(float32x4_t v) {
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#define QK4_0 32
|
||||
typedef struct {
|
||||
ggml_fp16_t d; // delta
|
||||
@@ -1056,6 +1055,39 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
|
||||
y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
|
||||
}
|
||||
}
|
||||
#elif defined(__wasm_simd128__)
|
||||
for (int i = 0; i < nb; i++) {
|
||||
v128_t srcv [8];
|
||||
v128_t asrcv[8];
|
||||
v128_t amaxv[8];
|
||||
|
||||
for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j);
|
||||
for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]);
|
||||
|
||||
for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]);
|
||||
for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]);
|
||||
for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]);
|
||||
|
||||
const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0),
|
||||
wasm_f32x4_extract_lane(amaxv[0], 1)),
|
||||
MAX(wasm_f32x4_extract_lane(amaxv[0], 2),
|
||||
wasm_f32x4_extract_lane(amaxv[0], 3)));
|
||||
|
||||
const float d = amax / ((1 << 7) - 1);
|
||||
const float id = d ? 1.0f/d : 0.0f;
|
||||
|
||||
y[i].d = GGML_FP32_TO_FP16(d);
|
||||
|
||||
for (int j = 0; j < 8; j++) {
|
||||
const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id));
|
||||
const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v);
|
||||
|
||||
y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0);
|
||||
y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1);
|
||||
y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2);
|
||||
y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3);
|
||||
}
|
||||
}
|
||||
#elif defined(__AVX2__) || defined(__AVX__)
|
||||
for (int i = 0; i < nb; i++) {
|
||||
// Load elements into 4 AVX vectors
|
||||
@@ -1224,6 +1256,48 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int
|
||||
|
||||
y[i].s = d * vaddvq_s32(accv);
|
||||
}
|
||||
#elif defined(__wasm_simd128__)
|
||||
for (int i = 0; i < nb; i++) {
|
||||
v128_t srcv [8];
|
||||
v128_t asrcv[8];
|
||||
v128_t amaxv[8];
|
||||
|
||||
for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j);
|
||||
for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]);
|
||||
|
||||
for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]);
|
||||
for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]);
|
||||
for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]);
|
||||
|
||||
const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0),
|
||||
wasm_f32x4_extract_lane(amaxv[0], 1)),
|
||||
MAX(wasm_f32x4_extract_lane(amaxv[0], 2),
|
||||
wasm_f32x4_extract_lane(amaxv[0], 3)));
|
||||
|
||||
const float d = amax / ((1 << 7) - 1);
|
||||
const float id = d ? 1.0f/d : 0.0f;
|
||||
|
||||
y[i].d = d;
|
||||
|
||||
v128_t accv = wasm_i32x4_splat(0);
|
||||
|
||||
for (int j = 0; j < 8; j++) {
|
||||
const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id));
|
||||
const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v);
|
||||
|
||||
y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0);
|
||||
y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1);
|
||||
y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2);
|
||||
y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3);
|
||||
|
||||
accv = wasm_i32x4_add(accv, vi);
|
||||
}
|
||||
|
||||
y[i].s = d * (wasm_i32x4_extract_lane(accv, 0) +
|
||||
wasm_i32x4_extract_lane(accv, 1) +
|
||||
wasm_i32x4_extract_lane(accv, 2) +
|
||||
wasm_i32x4_extract_lane(accv, 3));
|
||||
}
|
||||
#elif defined(__AVX2__) || defined(__AVX__)
|
||||
for (int i = 0; i < nb; i++) {
|
||||
// Load elements into 4 AVX vectors
|
||||
@@ -2598,7 +2672,6 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
|
||||
const block_q8_0 * restrict y0 = &y[i];
|
||||
|
||||
const v128_t m4b = wasm_i8x16_splat(0x0F);
|
||||
const v128_t s16b = wasm_i8x16_splat(0x10);
|
||||
|
||||
// extract the 5th bit
|
||||
memcpy(&qh, x0->qh, sizeof(qh));
|
||||
@@ -2636,15 +2709,14 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
|
||||
const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
|
||||
const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
|
||||
|
||||
const float x0d = GGML_FP16_TO_FP32(x0->d);
|
||||
|
||||
// dot product
|
||||
sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
|
||||
wasm_i32x4_add(
|
||||
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
|
||||
wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
|
||||
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
|
||||
wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
|
||||
wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
|
||||
wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d))));
|
||||
}
|
||||
|
||||
*s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
|
||||
@@ -2868,8 +2940,6 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
|
||||
const v128_t v0l = wasm_v128_and (v0, m4b);
|
||||
const v128_t v0h = wasm_u8x16_shr(v0, 4);
|
||||
|
||||
static bool x = true;
|
||||
|
||||
// add high bit
|
||||
const v128_t v0lf = wasm_v128_or(v0l, qhl);
|
||||
const v128_t v0hf = wasm_v128_or(v0h, qhh);
|
||||
@@ -2892,11 +2962,11 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
|
||||
// dot product
|
||||
sumv = wasm_f32x4_add(sumv,
|
||||
wasm_f32x4_mul(wasm_f32x4_convert_i32x4(wasm_i32x4_add(
|
||||
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
|
||||
wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
|
||||
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
|
||||
wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
|
||||
wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * y0->d));
|
||||
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
|
||||
wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
|
||||
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
|
||||
wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
|
||||
wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * y0->d)));
|
||||
}
|
||||
|
||||
*s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
|
||||
@@ -9361,7 +9431,7 @@ static void ggml_compute_forward_rms_norm_back(
|
||||
|
||||
// ggml_compute_forward_mul_mat
|
||||
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||
// helper function to determine if it is better to use BLAS or not
|
||||
// for large matrices, BLAS is faster
|
||||
static bool ggml_compute_forward_mul_mat_use_blas(
|
||||
@@ -9402,7 +9472,7 @@ static void ggml_compute_forward_mul_mat_f32(
|
||||
const int64_t ne02 = src0->ne[2];
|
||||
const int64_t ne03 = src0->ne[3];
|
||||
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||
const int64_t ne10 = src1->ne[0];
|
||||
#endif
|
||||
const int64_t ne11 = src1->ne[1];
|
||||
@@ -9466,9 +9536,16 @@ static void ggml_compute_forward_mul_mat_f32(
|
||||
}
|
||||
return;
|
||||
}
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
if (ggml_cl_can_mul_mat(src0, src1, dst)) {
|
||||
if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
|
||||
ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
|
||||
if (params->ith != 0) {
|
||||
return;
|
||||
@@ -9488,21 +9565,11 @@ static void ggml_compute_forward_mul_mat_f32(
|
||||
const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
|
||||
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
||||
|
||||
#if defined(GGML_USE_CLBLAST)
|
||||
// zT = y * xT
|
||||
ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
|
||||
ne11, ne01, ne10,
|
||||
1.0f, y, ne10,
|
||||
x, ne10,
|
||||
0.0f, d, ne01,
|
||||
GGML_TYPE_F32);
|
||||
#else
|
||||
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
||||
ne11, ne01, ne10,
|
||||
1.0f, y, ne10,
|
||||
x, ne00,
|
||||
0.0f, d, ne01);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
//printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
|
||||
@@ -9641,9 +9708,16 @@ static void ggml_compute_forward_mul_mat_f16_f32(
|
||||
}
|
||||
return;
|
||||
}
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
if (ggml_cl_can_mul_mat(src0, src1, dst)) {
|
||||
if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
|
||||
ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
|
||||
GGML_ASSERT(nb10 == sizeof(float));
|
||||
|
||||
@@ -9673,20 +9747,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
|
||||
assert(id*sizeof(float) <= params->wsize);
|
||||
}
|
||||
|
||||
#if defined(GGML_USE_CLBLAST)
|
||||
const float * x = wdata;
|
||||
const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
|
||||
|
||||
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
||||
|
||||
// zT = y * xT
|
||||
ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
|
||||
ne11, ne01, ne10,
|
||||
1.0f, y, ne10,
|
||||
x, ne10,
|
||||
0.0f, d, ne01,
|
||||
GGML_TYPE_F32);
|
||||
#else
|
||||
const float * x = wdata;
|
||||
const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
|
||||
|
||||
@@ -9698,7 +9758,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
|
||||
1.0f, y, ne10,
|
||||
x, ne00,
|
||||
0.0f, d, ne01);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9861,9 +9920,16 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
||||
}
|
||||
return;
|
||||
}
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
if (ggml_cl_can_mul_mat(src0, src1, dst)) {
|
||||
if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
|
||||
ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
|
||||
if (params->ith != 0) {
|
||||
return;
|
||||
@@ -9886,9 +9952,6 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
||||
|
||||
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
||||
|
||||
#if defined(GGML_USE_CLBLAST)
|
||||
const void* x = (char *) src0->data + i03*nb03 + i02*nb02;
|
||||
#else
|
||||
{
|
||||
size_t id = 0;
|
||||
for (int64_t i01 = 0; i01 < ne01; ++i01) {
|
||||
@@ -9900,23 +9963,12 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
||||
}
|
||||
|
||||
const float * x = wdata;
|
||||
#endif
|
||||
|
||||
#if defined(GGML_USE_CLBLAST)
|
||||
// zT = y * xT
|
||||
ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
|
||||
ne11, ne01, ne10,
|
||||
1.0f, y, ne10,
|
||||
x, ne10,
|
||||
0.0f, d, ne01,
|
||||
type);
|
||||
#else
|
||||
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
||||
ne11, ne01, ne10,
|
||||
1.0f, y, ne10,
|
||||
x, ne00,
|
||||
0.0f, d, ne01);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
@@ -14095,9 +14147,16 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
||||
cur = ggml_cuda_mul_mat_get_wsize(node->src0, node->src1, node);
|
||||
}
|
||||
else
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) {
|
||||
node->n_tasks = 1; // TODO: this actually is doing nothing
|
||||
// the threads are still spinning
|
||||
cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) {
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
|
||||
node->n_tasks = 1; // TODO: this actually is doing nothing
|
||||
// the threads are still spinning
|
||||
@@ -14111,13 +14170,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
||||
#endif
|
||||
} else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
|
||||
cur = 0;
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
|
||||
node->n_tasks = 1;
|
||||
}
|
||||
#endif
|
||||
} else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) {
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
|
||||
node->n_tasks = 1;
|
||||
cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
|
||||
@@ -14561,9 +14620,12 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
|
||||
fprintf(fp, "%s |", node->name);
|
||||
}
|
||||
|
||||
fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s",
|
||||
i, node->ne[0], node->ne[1],
|
||||
GGML_OP_SYMBOL[node->op]);
|
||||
if (node->n_dims == 2) {
|
||||
fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], GGML_OP_SYMBOL[node->op]);
|
||||
} else {
|
||||
fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], GGML_OP_SYMBOL[node->op]);
|
||||
}
|
||||
|
||||
|
||||
if (node->grad) {
|
||||
fprintf(fp, " | <g>%s\"; ]\n", GGML_OP_SYMBOL[node->grad->op]);
|
||||
|
||||
1
ggml.h
1
ggml.h
@@ -249,6 +249,7 @@ extern "C" {
|
||||
enum ggml_backend {
|
||||
GGML_BACKEND_CPU = 0,
|
||||
GGML_BACKEND_CUDA = 1,
|
||||
GGML_BACKEND_CL = 2,
|
||||
};
|
||||
|
||||
// model file types
|
||||
|
||||
72
llama.cpp
72
llama.cpp
@@ -12,6 +12,8 @@
|
||||
#include "ggml.h"
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
#include "ggml-cuda.h"
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
#include "ggml-opencl.h"
|
||||
#endif
|
||||
|
||||
#include <array>
|
||||
@@ -427,26 +429,30 @@ struct llama_file_loader {
|
||||
}
|
||||
void read_magic() {
|
||||
uint32_t magic = file.read_u32();
|
||||
uint32_t version = 0;
|
||||
|
||||
if (magic != 'ggml') {
|
||||
version = file.read_u32();
|
||||
}
|
||||
|
||||
if (magic == 'ggml' && version == 0) {
|
||||
if (magic == LLAMA_FILE_MAGIC_GGML) {
|
||||
file_version = LLAMA_FILE_VERSION_GGML;
|
||||
} else if (magic == 'ggmf' && version == 1) {
|
||||
file_version = LLAMA_FILE_VERSION_GGMF_V1;
|
||||
} else if (magic == 'ggjt' && version == 1) {
|
||||
file_version = LLAMA_FILE_VERSION_GGJT_V1;
|
||||
} else if (magic == 'ggjt' && version == 2) {
|
||||
file_version = LLAMA_FILE_VERSION_GGJT_V2;
|
||||
} else if (magic == 'ggjt' && version == 3) {
|
||||
file_version = LLAMA_FILE_VERSION_GGJT_V3;
|
||||
} else {
|
||||
throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
|
||||
magic, version);
|
||||
return;
|
||||
}
|
||||
|
||||
uint32_t version = file.read_u32();
|
||||
|
||||
switch (magic) {
|
||||
case LLAMA_FILE_MAGIC_GGMF:
|
||||
switch (version) {
|
||||
case 1: file_version = LLAMA_FILE_VERSION_GGMF_V1; return;
|
||||
}
|
||||
break;
|
||||
case LLAMA_FILE_MAGIC_GGJT:
|
||||
switch (version) {
|
||||
case 1: file_version = LLAMA_FILE_VERSION_GGJT_V1; return;
|
||||
case 2: file_version = LLAMA_FILE_VERSION_GGJT_V2; return;
|
||||
case 3: file_version = LLAMA_FILE_VERSION_GGJT_V3; return;
|
||||
}
|
||||
}
|
||||
|
||||
throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
|
||||
magic, version);
|
||||
}
|
||||
void read_hparams() {
|
||||
hparams.n_vocab = file.read_u32();
|
||||
@@ -1088,7 +1094,7 @@ static void llama_model_load_internal(
|
||||
fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
|
||||
}
|
||||
fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
|
||||
#else
|
||||
#elif !defined(GGML_USE_CLBLAST)
|
||||
(void) n_gpu_layers;
|
||||
#endif
|
||||
}
|
||||
@@ -1121,7 +1127,33 @@ static void llama_model_load_internal(
|
||||
done_size += lt.size;
|
||||
}
|
||||
}
|
||||
#endif // GGML_USE_CUBLAS
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
{
|
||||
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
|
||||
|
||||
fprintf(stderr, "ggml_opencl: offloading %d layers to GPU\n", n_gpu);
|
||||
|
||||
size_t vram_total = 0;
|
||||
|
||||
for (int i = 0; i < n_gpu; ++i) {
|
||||
const auto & layer = model.layers[i];
|
||||
|
||||
ggml_cl_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
|
||||
ggml_cl_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
|
||||
ggml_cl_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
|
||||
ggml_cl_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
|
||||
ggml_cl_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
|
||||
ggml_cl_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
|
||||
ggml_cl_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
|
||||
}
|
||||
if (n_gpu_layers > (int) hparams.n_layer) {
|
||||
fprintf(stderr, "ggml_opencl: offloading output layer to GPU\n");
|
||||
ggml_cl_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
|
||||
}
|
||||
|
||||
fprintf(stderr, "ggml_opencl: total VRAM used: %zu MB\n", vram_total / 1024 / 1024);
|
||||
}
|
||||
#endif
|
||||
|
||||
if (progress_callback) {
|
||||
progress_callback(1.0f, progress_callback_user_data);
|
||||
@@ -2290,7 +2322,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
|
||||
{
|
||||
uint32_t magic;
|
||||
fin.read((char *) &magic, sizeof(magic));
|
||||
if (magic != 'ggla') {
|
||||
if (magic != LLAMA_FILE_MAGIC_GGLA) {
|
||||
fprintf(stderr, "%s: bad file magic\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
12
llama.h
12
llama.h
@@ -19,10 +19,16 @@
|
||||
# define LLAMA_API
|
||||
#endif
|
||||
|
||||
#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
|
||||
#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
|
||||
#define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf'
|
||||
#define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml'
|
||||
#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
|
||||
|
||||
#define LLAMA_FILE_VERSION 3
|
||||
#define LLAMA_FILE_MAGIC 'ggjt'
|
||||
#define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
|
||||
#define LLAMA_SESSION_MAGIC 'ggsn'
|
||||
#define LLAMA_FILE_MAGIC LLAMA_FILE_MAGIC_GGJT
|
||||
#define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
|
||||
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
|
||||
#define LLAMA_SESSION_VERSION 1
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
Reference in New Issue
Block a user