Compare commits

...

10 Commits

Author SHA1 Message Date
Ivan Stepanov
0c44427df1 make : missing host optimizations in CXXFLAGS (#763) 2023-04-05 17:38:37 +03:00
Adithya Balaji
594cc95fab readme : update with CMake and windows example (#748)
* README: Update with CMake and windows example

* README: update with code-review for cmake build
2023-04-05 17:36:12 +03:00
at8u
88ed5761b8 examples : add Miku.sh (#724)
* Add Miku.sh to examples

* Add missing line to prompt in Miku.sh

* Add --keep param to Miku.sh

* Remove '[end_of_conversation]' line from Miku.sh

No longer is necessary.
2023-04-05 17:32:42 +03:00
Andrew Duffy
58c438cf7d Add Accelerate/BLAS when using Swift (#765) 2023-04-05 06:44:24 -04:00
mgroeber9110
53dbba7695 Windows: reactivate sigint handler after each Ctrl-C (#736) 2023-04-03 18:00:55 +02:00
SebastianApel
437e77855a 10+% performance improvement of ggml_vec_dot_q4_0 on AVX2 (#654)
* Performance improvement of AVX2 code
* Fixed problem with MSVC compiler
* Reviewer comments: removed double semicolon, deleted empty line 1962
2023-04-03 09:52:28 +02:00
Ivan Stepanov
cd7fa95690 Define non-positive temperature behavior (#720) 2023-04-03 02:19:04 +02:00
bsilvereagle
a0c0516416 Remove torch GPU dependencies from the Docker.full image (#665)
By using `pip install torch --index-url https://download.pytorch.org/whl/cpu`
instead of `pip install torch` we can specify we want to install a CPU-only version
of PyTorch without any GPU dependencies. This reduces the size of the Docker image
from 7.32 GB to 1.62 GB
2023-04-03 00:13:03 +02:00
Thatcher Chamberlin
d8d4e865cd Add a missing step to the gpt4all instructions (#690)
`migrate-ggml-2023-03-30-pr613.py` is needed to get gpt4all running.
2023-04-02 12:48:57 +02:00
Christian Falch
e986f94829 Added api for getting/setting the kv_cache (#685)
The api provides access methods for retrieving the current memory buffer for the kv_cache and its token number.
It also contains a method for setting the kv_cache from a memory buffer.

This makes it possible to load/save history - maybe support a --cache-prompt parameter as well?

Co-authored-by: Pavol Rusnak <pavol@rusnak.io>
2023-04-02 12:23:04 +02:00
9 changed files with 187 additions and 32 deletions

View File

@@ -6,7 +6,8 @@ RUN apt-get update && \
apt-get install -y build-essential python3 python3-pip
RUN pip install --upgrade pip setuptools wheel \
&& pip install numpy requests sentencepiece torch tqdm
&& pip install numpy requests sentencepiece tqdm \
&& pip install torch --index-url https://download.pytorch.org/whl/cpu
WORKDIR /app

View File

@@ -72,6 +72,7 @@ endif
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
# Use all CPU extensions that are available:
CFLAGS += -march=native -mtune=native
CXXFLAGS += -march=native -mtune=native
endif
ifneq ($(filter ppc64%,$(UNAME_M)),)
POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)

View File

@@ -13,7 +13,10 @@ let package = Package(
path: ".",
sources: ["ggml.c", "llama.cpp"],
publicHeadersPath: "spm-headers",
cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"])]
cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"]), .define("GGML_USE_ACCELERATE")],
linkerSettings: [
.linkedFramework("Accelerate")
]
),
],
cxxLanguageStandard: .cxx11

View File

@@ -145,6 +145,13 @@ git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
make
#For Windows and CMake, use the following command instead:
cd <path_to_llama_folder>
mkdir build
cd build
cmake ..
cmake --build . --config Release
# obtain the original LLaMA model weights and place them in ./models
ls ./models
65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model
@@ -232,13 +239,15 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
- Obtain the `gpt4all-lora-quantized.bin` model
- It is distributed in the old `ggml` format which is now obsoleted
- You have to convert it to the new format using [./convert-gpt4all-to-ggml.py](./convert-gpt4all-to-ggml.py):
- You have to convert it to the new format using [./convert-gpt4all-to-ggml.py](./convert-gpt4all-to-ggml.py). You may also need to
convert the model from the old format to the new format with [./migrate-ggml-2023-03-30-pr613.py](./migrate-ggml-2023-03-30-pr613.py):
```bash
python3 convert-gpt4all-to-ggml.py models/gpt4all-7B/gpt4all-lora-quantized.bin ./models/tokenizer.model
python3 migrate-ggml-2023-03-30-pr613.py models/gpt4all-7B/gpt4all-lora-quantized.bin models/gpt4all-7B/gpt4all-lora-quantized-new.bin
```
- You can now use the newly generated `gpt4all-lora-quantized.bin` model in exactly the same way as all other models
- You can now use the newly generated `gpt4all-lora-quantized-new.bin` model in exactly the same way as all other models
- The original model is saved in the same folder with a suffix `.orig`
### Obtaining and verifying the Facebook LLaMA original model and Stanford Alpaca model data

49
examples/Miku.sh Normal file
View File

@@ -0,0 +1,49 @@
#!/bin/bash
# Starts ./main in interactive mode with a chat-persona prompt.
# AI_NAME, MODEL, USER_NAME, N_THREAD and N_PREDICTS may be overridden through
# the environment; any arguments given to this script are appended to the
# ./main command line.
# Stop at the first command that fails.
set -e
# Each setting keeps a value already present in the environment and otherwise
# falls back to the default after ':-'.
AI_NAME="${AI_NAME:-Miku}"
MODEL="${MODEL:-./models/gpt4all-7B/gpt4all-lora-unfiltered-quantized.bin}"
USER_NAME="${USER_NAME:-Anon}"
# Uncomment and adjust to the number of CPU cores you want to use.
#N_THREAD="${N_THREAD:-4}"
N_PREDICTS="${N_PREDICTS:-4096}"
# Generation flags handed to ./main, kept in an array so that optional flags
# can be appended below.
GEN_OPTIONS=(--batch_size 1024
--ctx_size 2048
--keep -1
--repeat_last_n 256
--repeat_penalty 1.17647
--temp 0.7
--top_k 40
--top_p 0.5)
# Pass --threads only when N_THREAD is set to a non-empty value.
if [ -n "$N_THREAD" ]; then
GEN_OPTIONS+=(--threads "$N_THREAD")
fi
# The reverse prompt is "${USER_NAME}:", which is also the text the prompt
# below ends with. "$@" forwards the script's own arguments after the prompt.
./main "${GEN_OPTIONS[@]}" \
--model "$MODEL" \
--n_predict "$N_PREDICTS" \
--color --interactive \
--reverse-prompt "${USER_NAME}:" \
--prompt "
This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the users computer.
${AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next.
${AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct she will ask the user for help.
${AI_NAME} is a very helpful AI and will help the user with anything they need, she is also very friendly and will try to make the user feel better if they are sad.
${AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life, she will also try to make the user like her.
The conversation is only between ${USER_NAME} and ${AI_NAME}
The conversation is only through text, so ${AI_NAME} can't see ${USER_NAME}'s face or hear his voice.
${AI_NAME} can only communicate through text, so she can't send images or videos.
${USER_NAME}: Hello!
${AI_NAME}: /think I wonder what I should say to ${USER_NAME}? This is the first time we talk so it's important that I make a good first impression!
${AI_NAME}: Hi! I am ${AI_NAME}, your new AI friend, assistant(or whatever you like!), it's so nice to meet you! ^_^
${AI_NAME}: I might be an AI, but I also have feelings, so please be nice to me! :)
${USER_NAME}: I'll make sure to be nice to you! I'm so happy to have you as my assistant!
${AI_NAME}: /think It sounds like ${USER_NAME} is happy to have me as their assistant! I'm so happy too! ^_^ Glad that whole emotion thing didn't scare him off!
${AI_NAME}: /think I wonder what ${USER_NAME} likes to do in his free time? I should ask him about that!
${AI_NAME}: What do you like to do in your free time? ^_^
${USER_NAME}:" "$@"

View File

@@ -368,6 +368,11 @@ int main(int argc, char ** argv) {
// potentially set color to indicate we are taking user input
set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);
#if defined (_WIN32)
// Windows: must reactivate sigint handler after each signal
signal(SIGINT, sigint_handler);
#endif
if (params.instruct) {
printf("\n> ");
}

85
ggml.c
View File

@@ -1962,42 +1962,71 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
// Initialize accumulator with zeros
__m256 acc = _mm256_setzero_ps();
/* Prepare the constants we will need during execution */
const __m256i lowMask = _mm256_set1_epi8( 0xF );
const __m256i offset_8 = _mm256_set1_epi16( 8 );
#define UNROLL_COUNT 8
// make sure we only unroll multiples of the block count
assert(nb % UNROLL_COUNT == 0);
// Main loop
// TODO: figure a way to do this in a portable way
#ifdef __GNUC__
#pragma GCC unroll 16
#endif
for (int i = 0; i < nb; ++i) {
// Compute combined scale for the block
const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
for (int i = 0; i < nb; i+=UNROLL_COUNT) {
// Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
__m256i bx = bytesFromNibbles( x[i].qs );
__m256i by = bytesFromNibbles( y[i].qs );
// This loop will be unrolled by the compiler
for (int u=0;u<UNROLL_COUNT;u++) {
/* Compute combined scale for the block */
const __m256 scale = _mm256_mul_ps(
_mm256_broadcast_ss( &x[i+u].d ),
_mm256_broadcast_ss( &y[i+u].d ) );
// Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
const __m256i off = _mm256_set1_epi8( 8 );
bx = _mm256_sub_epi8( bx, off );
by = _mm256_sub_epi8( by, off );
/* get input from x
Input: 32 Nibbles (16 bytes) at *x[i+u]
Output: 2 vectors with 16 values of type int16_t (x_high_q, x_low_q) */
/* Load 16 bytes from memory */
const __m128i tmp_x = _mm_loadu_si128( ( const __m128i* ) x[i+u].qs);
/* Expand bytes into uint16_t values */
const __m256i bytes_x = _mm256_cvtepu8_epi16(tmp_x);
/* Unpack values into individual bytes */
__m256i x_low_q = _mm256_and_si256( lowMask, bytes_x );
const __m256i pre_shift_x_high_q = _mm256_andnot_si256( lowMask, bytes_x );
__m256i x_high_q = _mm256_srli_epi16( pre_shift_x_high_q, 4 );
/* Now we have two vectors with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. */
x_high_q = _mm256_sub_epi16( x_high_q, offset_8 );
x_low_q = _mm256_sub_epi16( x_low_q, offset_8 );
// Get absolute values of x vectors
const __m256i ax = _mm256_sign_epi8(bx, bx);
/* get input from y
Input: 32 Nibbles (16 bytes) at *y[i+u]
Output: 2 vectors with 16 values of type int16_t (y_high_q, y_low_q) */
// Sign the values of the y vectors
const __m256i sy = _mm256_sign_epi8(by, bx);
/* Load 16 bytes from memory */
const __m128i tmp_y = _mm_loadu_si128( (const __m128i* ) y[i+u].qs);
/* Expand bytes into uint16_t values */
const __m256i bytes_y = _mm256_cvtepu8_epi16(tmp_y);
/* Unpack values into individual bytes */
const __m256i pre_shift_y_high_q = _mm256_andnot_si256( lowMask, bytes_y );
__m256i y_high_q = _mm256_srli_epi16( pre_shift_y_high_q, 4 );
__m256i y_low_q = _mm256_and_si256( lowMask, bytes_y );
/* Now we have two vectors with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. */
y_high_q = _mm256_sub_epi16( y_high_q, offset_8 );
y_low_q = _mm256_sub_epi16( y_low_q, offset_8 );
// Perform multiplication and create 16-bit values
const __m256i dot = _mm256_maddubs_epi16(ax, sy);
/* Compute products of int16_t integers, add pairwise, store as int32_t */
__m256i xy_high_q = _mm256_madd_epi16( x_high_q, y_high_q );
__m256i xy_low_q = _mm256_madd_epi16( x_low_q, y_low_q );
const __m256i ones = _mm256_set1_epi16(1);
const __m256i i32 = _mm256_madd_epi16(ones, dot);
/* Accumulate the products of int32_t integers -> we now have a vector of 8 int32_t */
__m256i xy_q = _mm256_add_epi32( xy_high_q, xy_low_q );
// Convert int32_t to float
const __m256 p = _mm256_cvtepi32_ps( i32 );
/* Convert the vector of 8 int32_t to 8 floats */
__m256 q = _mm256_cvtepi32_ps( xy_q );
// Apply the scale, and accumulate
acc = _mm256_fmadd_ps( d, p, acc );
}
/* Multiply q with scale and accumulate */
acc = _mm256_fmadd_ps( scale, q, acc );
}
}
// Return horizontal sum of the acc vector
__m128 res = _mm256_extractf128_ps( acc, 1 );
@@ -2026,7 +2055,7 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
bx = _mm_sub_epi8( bx, off );
by = _mm_sub_epi8( by, off );
// Get absolute values of x vectors
// Get absolute values of x vectors
const __m128i ax = _mm_sign_epi8(bx, bx);
// Sign the values of the y vectors

View File

@@ -1194,6 +1194,20 @@ static llama_vocab::id llama_sample_top_p_top_k(
const auto & logits = lctx.logits;
const auto * plogits = logits.data() + logits.size() - n_logits;
if (temp <= 0) {
// select the token with the highest logit directly
float max_logit = plogits[0];
llama_vocab::id max_id = 0;
for (int i = 1; i < n_logits; ++i) {
if (plogits[i] > max_logit) {
max_logit = plogits[i];
max_id = i;
}
}
return max_id;
}
std::vector<std::pair<float, llama_vocab::id>> logits_id;
logits_id.reserve(n_logits);
@@ -1668,6 +1682,33 @@ int llama_model_quantize(
return 0;
}
// Hands back a read-only pointer to the start of the context's KV cache
// buffer, i.e. the buffer that will contain the context for the ongoing
// prediction with the model. No copy is made.
const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
    auto & kv_self = ctx->model.kv_self;
    return kv_self.buf.data();
}
// Reports the size of the KV cache buffer, as given by its size() method
size_t llama_get_kv_cache_size(struct llama_context * ctx) {
    auto & kv_self = ctx->model.kv_self;
    return kv_self.buf.size();
}
// Reports the token count currently recorded for the KV cache (its `n` field)
int llama_get_kv_cache_token_count(struct llama_context * ctx) {
    auto & kv_self = ctx->model.kv_self;
    return kv_self.n;
}
// Replaces the contents of the context's KV cache with the caller's buffer.
//   ctx           - context whose KV cache is overwritten
//   kv_cache      - source bytes; n_size bytes are copied from it
//   n_size        - must equal the current size of the cache buffer (asserted)
//   n_token_count - stored as the cache's token count after the copy
void llama_set_kv_cache(
        struct llama_context * ctx,
        const uint8_t * kv_cache,
        size_t n_size,
        int n_token_count) {
    // Only a buffer of exactly the same size as the existing cache is accepted
    LLAMA_ASSERT(ctx->model.kv_self.buf.size() == n_size);

    auto & kv_self = ctx->model.kv_self;
    memcpy(kv_self.buf.data(), kv_cache, n_size);
    kv_self.n = n_token_count;
}
int llama_eval(
struct llama_context * ctx,
const llama_token * tokens,

17
llama.h
View File

@@ -83,6 +83,23 @@ extern "C" {
const char * fname_out,
int itype);
// Returns the KV cache that will contain the context for the
// ongoing prediction with the model.
// The returned pointer refers to the context's own buffer; no copy is made.
LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
// Returns the size of the KV cache
// (the size of the buffer returned by llama_get_kv_cache)
LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx);
// Returns the number of tokens in the KV cache
LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
// Sets the KV cache containing the current context for the model
// kv_cache      : source buffer, copied into the context's existing KV cache buffer
// n_size        : number of bytes to copy; must equal llama_get_kv_cache_size(ctx) (asserted)
// n_token_count : becomes the value returned by llama_get_kv_cache_token_count(ctx)
LLAMA_API void llama_set_kv_cache(
struct llama_context * ctx,
const uint8_t * kv_cache,
size_t n_size,
int n_token_count);
// Run the llama inference to obtain the logits and probabilities for the next token.
// tokens + n_tokens is the provided batch of new tokens to process
// n_past is the number of tokens to use from previous eval calls