Compare commits

..

20 Commits

Author SHA1 Message Date
Stephan Walter
939ad2d3a5 Fix undefined variables in debug build, remove unused variables (#531) 2023-03-26 15:34:02 +00:00
Juan Calderon-Perez
8c2ec5e21d Add support for linux/arm64 platform during Docker Builds (#514)
* Add support for linux/arm64 platform

* Add platform to versioned builds
2023-03-26 14:48:42 +00:00
Stephan Walter
b391579db9 Update README and comments for standalone perplexity tool (#525) 2023-03-26 16:14:01 +03:00
anzz1
7a87d31f4f [main] fix infinite generation (-n == -1) (#523) 2023-03-26 16:06:10 +03:00
Georgi Gerganov
348d6926ee Add logo to README.md 2023-03-26 10:20:49 +03:00
Harald Fernengel
33e35b8fe8 Exit from interactive mode if input stream is bad (#491)
Allow exiting the interactive prompt also with CTRL-D on Unix and CTRL-Z
on Windows.
2023-03-26 08:25:46 +03:00
anzz1
19726169b3 CI: Run other sanitizer builds even if one fails (#511)
applies only to sanitizer builds so they wont be cancelled
2023-03-26 00:13:28 +02:00
jp-x-g
f732695cd5 Clarify console output in convert-pth-to-ggml.py (#512)
"Processing part 1 of 3" instead of "Processing part 0"
2023-03-25 23:53:55 +02:00
anzz1
2f7bf7dd7c CMake / CI additions (#497)
* CMake: Add AVX512 option

* CI: Add AVX/AVX512 builds (Windows)
(AVX512 tests can only be run when the worker happens to support it, building works anyway)

* CMake: Fix sanitizer linkage ( merged #468 )

* CI: Add sanitizer builds (Ubuntu)

* CI: Fix release tagging
(change @zendesk/action-create-release to @anzz1/action-create-release until upstream PR Added commitish as input zendesk/action-create-release#32 is merged)
2023-03-25 23:38:11 +02:00
anzz1
34ab526843 (Windows) Set console to UTF-8 on init (#420)
Sets console codepage to 65001 (CP_UTF8) on start for both input and output, should fix problems with UTF-8 characters.
2023-03-25 22:29:22 +02:00
Georgi Gerganov
c2b25b6912 Fix colors enabling on WIN32 2023-03-25 21:53:39 +02:00
Georgi Gerganov
79b2b266db If n_predict == -1, generate forever 2023-03-25 21:51:41 +02:00
Georgi Gerganov
e2d490dafd Inifinite generation via context swapping (#71) 2023-03-25 21:36:22 +02:00
Georgi Gerganov
03f7e33560 Cleanup STL headers + fix embedding examples + minor stuff 2023-03-25 20:51:14 +02:00
Georgi Gerganov
55ad42af84 Move chat scripts into "./examples" 2023-03-25 20:37:09 +02:00
slaren
459e93cce0 Add AVX2 implementation of dequantize_row_q4_1 (#505) 2023-03-25 20:31:48 +02:00
Georgi Gerganov
a316a425d0 Overhaul the examples structure
- main -> examples
- utils -> examples (renamed to "common")
- quantize -> examples
- separate tools for "perplexity" and "embedding"

Hope I didn't break something !
2023-03-25 20:26:40 +02:00
Georgi Gerganov
ecbe466a36 Retire the ggml_mul_mat() branch for transposed src0 (#500)
* Retire the ggml_mul_mat() for transposed src0

- It can always be made contiguous with ggml_cpy()
- The code is now simplified
- The results are deterministic in respect to num threads

* SIMD-ify dequantize_row_q4_0() for ARM_NEON (#502)

* Attempt to SIMD-ify dequantize_row_q4_0() for ARM_NEON

* Fix dequantization - forgot to interleave the quants
2023-03-25 19:47:21 +02:00
Georgi Gerganov
502a400192 Disable prompt verbosity by default and add option to enable (#480) 2023-03-25 17:17:16 +02:00
slaren
09aecbf628 Add AVX2 implementation of dequantize_row_q4_0 (#467) 2023-03-25 17:06:49 +02:00
31 changed files with 982 additions and 1003 deletions

View File

@@ -64,6 +64,40 @@ jobs:
cd build
ctest --output-on-failure
ubuntu-latest-cmake-sanitizer:
runs-on: ubuntu-latest
continue-on-error: true
strategy:
matrix:
sanitizer: [ADDRESS, THREAD, UNDEFINED]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v1
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential
- name: Build
id: cmake_build
run: |
mkdir build
cd build
cmake .. -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON
cmake --build . --config Release
- name: Test
id: cmake_test
run: |
cd build
ctest --output-on-failure
macOS-latest-make:
runs-on: macos-latest
@@ -112,6 +146,16 @@ jobs:
windows-latest-cmake:
runs-on: windows-latest
strategy:
matrix:
include:
- build: 'avx2'
defines: ''
- build: 'avx'
defines: '-DLLAMA_AVX2=OFF'
- build: 'avx512'
defines: '-DLLAMA_AVX512=ON'
steps:
- name: Clone
id: checkout
@@ -122,11 +166,21 @@ jobs:
run: |
mkdir build
cd build
cmake ..
cmake .. ${{ matrix.defines }}
cmake --build . --config Release
- name: Check AVX512F support
id: check_avx512f
if: ${{ matrix.build == 'avx512' }}
continue-on-error: true
run: |
cd build
Set-Content -Path .\avx512f.exe -Value ([Convert]::FromBase64String('TVqQAAMAAAAEAAAA//8AALgAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAyAAAAA4fug4AtAnNIbgBTM0hVGhpcyBwcm9ncmFtIGNhbm5vdCBiZSBydW4gaW4gRE9TIG1vZGUuDQ0KJAAAAAAAAAClmfXY4fibi+H4m4vh+JuL4fiai+P4m4si98aL4vibi7Xbq4vg+JuLUmljaOH4m4sAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABQRQAATAEBAGo6H2QAAAAAAAAAAOAADwELAQYAAAIAAAAAAAAAAAAADBAAAAAQAAAAIAAAAABAAAAQAAAAAgAABAAAAAAAAAAEAAAAAAAAAAAgAAAAAgAAAAAAAAMAAAAAABAAABAAAAAAEAAAEAAAAAAAABAAAAAAAAAAAAAAAFQQAAAoAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEAAADAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAC50ZXh0AAAAsgAAAAAQAAAAAgAAAAIAAAAAAAAAAAAAAAAAACAAAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACUEAAAiBAAAAAAAABVi+xRUVNTuAcAAAAPosHrEGaD4wGJXfxbg0X8MI1F+GoAUI1F/GoBUGr1/xUAEEAAUP8VBBBAAItF/FuDwND32BvAQMnDzMx8EAAAAAAAAAAAAACkEAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAlBAAAIgQAAAAAAAApANXcml0ZUZpbGUAuQFHZXRTdGRIYW5kbGUAAEtFUk5FTDMyLmRsbAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA==')) -AsByteStream
.\avx512f.exe && echo " AVX512F: YES" && ( echo HAS_AVX512F=1 >> $env:GITHUB_ENV ) || echo " AVX512F: NO"
- name: Test
id: cmake_test
if: ${{ matrix.build != 'avx512' || env.HAS_AVX512F == '1' }} # Test AVX-512 only when possible
run: |
cd build
ctest -C Release --output-on-failure
@@ -140,12 +194,39 @@ jobs:
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip .\build\bin\Release\*
7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v3
with:
path: |
llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
release:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
runs-on: ubuntu-latest
needs:
- ubuntu-latest-make
- ubuntu-latest-cmake
- macOS-latest-make
- macOS-latest-cmake
- windows-latest-cmake
steps:
- name: Download artifacts
id: download-artifact
uses: actions/download-artifact@v3
- name: Get commit hash
id: commit
uses: pr-mpt/actions-commit-hash@v2
- name: Create release
id: create_release
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: zendesk/action-create-release@v1
uses: anzz1/action-create-release@v1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
@@ -153,15 +234,25 @@ jobs:
- name: Upload release
id: upload_release
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-release-asset@v1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
uses: actions/github-script@v3
with:
upload_url: ${{ steps.create_release.outputs.upload_url }}
asset_path: .\llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip
asset_name: llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip
asset_content_type: application/octet-stream
github-token: ${{secrets.GITHUB_TOKEN}}
script: |
const path = require('path');
const fs = require('fs');
const release_id = '${{ steps.create_release.outputs.id }}';
for (let file of await fs.readdirSync('./artifact')) {
if (path.extname(file) === '.zip') {
console.log('uploadReleaseAsset', file);
await github.repos.uploadReleaseAsset({
owner: context.repo.owner,
repo: context.repo.repo,
release_id: release_id,
name: file,
data: await fs.readFileSync(`./artifact/${file}`)
});
}
}
# ubuntu-latest-gcc:
# runs-on: ubuntu-latest

View File

@@ -49,6 +49,7 @@ jobs:
with:
context: .
push: true
platforms: linux/amd64,linux/arm64
tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
file: ${{ matrix.config.dockerfile }}
@@ -57,5 +58,6 @@ jobs:
with:
context: .
push: ${{ github.event_name == 'push' }}
platforms: linux/amd64,linux/arm64
tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}"
file: ${{ matrix.config.dockerfile }}

1
.gitignore vendored
View File

@@ -19,6 +19,7 @@ models/*
/main
/quantize
/result
/perplexity
arm_neon.h
compile_commands.json

View File

@@ -54,6 +54,7 @@ option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer"
# instruction set specific
option(LLAMA_AVX "llama: enable AVX" ON)
option(LLAMA_AVX2 "llama: enable AVX2" ON)
option(LLAMA_AVX512 "llama: enable AVX512" OFF)
option(LLAMA_FMA "llama: enable FMA" ON)
# 3rd party libs
@@ -75,14 +76,17 @@ find_package(Threads REQUIRED)
if (NOT MSVC)
if (LLAMA_SANITIZE_THREAD)
add_compile_options(-fsanitize=thread)
link_libraries(-fsanitize=thread)
endif()
if (LLAMA_SANITIZE_ADDRESS)
add_compile_options(-fsanitize=address -fno-omit-frame-pointer)
link_libraries(-fsanitize=address)
endif()
if (LLAMA_SANITIZE_UNDEFINED)
add_compile_options(-fsanitize=undefined)
link_libraries(-fsanitize=undefined)
endif()
endif()
@@ -185,7 +189,9 @@ if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
message(STATUS "x86 detected")
if (MSVC)
if (LLAMA_AVX2)
if (LLAMA_AVX512)
add_compile_options(/arch:AVX512)
elseif (LLAMA_AVX2)
add_compile_options(/arch:AVX2)
elseif (LLAMA_AVX)
add_compile_options(/arch:AVX)
@@ -201,6 +207,12 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
if (LLAMA_AVX2)
add_compile_options(-mavx2)
endif()
if (LLAMA_AVX512)
add_compile_options(-mavx512f)
# add_compile_options(-mavx512cd)
# add_compile_options(-mavx512dq)
# add_compile_options(-mavx512bw)
endif()
endif()
else()
# TODO: support PowerPC
@@ -211,17 +223,6 @@ endif()
# Build libraries
#
add_library(utils OBJECT
utils.cpp
utils.h)
target_include_directories(utils PUBLIC .)
target_compile_features(utils PUBLIC cxx_std_11) # don't bump
target_link_libraries(utils PRIVATE ${LLAMA_EXTRA_LIBS})
if (BUILD_SHARED_LIBS)
set_target_properties(utils PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
add_library(ggml OBJECT
ggml.c
ggml.h)
@@ -239,22 +240,12 @@ add_library(llama
target_include_directories(llama PUBLIC .)
target_compile_features(llama PUBLIC cxx_std_11) # don't bump
target_link_libraries(llama PRIVATE utils ggml ${LLAMA_EXTRA_LIBS})
target_link_libraries(llama PRIVATE ggml ${LLAMA_EXTRA_LIBS})
if (BUILD_SHARED_LIBS)
set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD)
endif()
#
# Executables
#
add_executable(main main.cpp)
target_link_libraries(main PRIVATE llama ggml utils)
add_executable(quantize quantize.cpp)
target_link_libraries(quantize PRIVATE llama ggml utils)
#
# programs, examples and tests
#
@@ -264,6 +255,6 @@ if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
add_subdirectory(tests)
endif ()
#if (LLAMA_BUILD_EXAMPLES)
# add_subdirectory(examples)
#endif()
if (LLAMA_BUILD_EXAMPLES)
add_subdirectory(examples)
endif()

View File

@@ -212,7 +212,7 @@ $(info I CC: $(CCV))
$(info I CXX: $(CXXV))
$(info )
default: main quantize
default: main quantize perplexity
#
# Build library
@@ -224,20 +224,23 @@ ggml.o: ggml.c ggml.h
llama.o: llama.cpp llama.h
$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o
utils.o: utils.cpp utils.h
$(CXX) $(CXXFLAGS) -c utils.cpp -o utils.o
common.o: examples/common.cpp examples/common.h
$(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o
clean:
rm -f *.o main quantize
rm -vf *.o main quantize perplexity
main: main.cpp ggml.o llama.o utils.o
$(CXX) $(CXXFLAGS) main.cpp ggml.o llama.o utils.o -o main $(LDFLAGS)
main: examples/main/main.cpp ggml.o llama.o common.o
$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)
@echo
@echo '==== Run ./main -h for help. ===='
@echo
quantize: quantize.cpp ggml.o llama.o utils.o
$(CXX) $(CXXFLAGS) quantize.cpp ggml.o llama.o utils.o -o quantize $(LDFLAGS)
quantize: examples/quantize/quantize.cpp ggml.o llama.o
$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS)
perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
$(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS)
#
# Tests

View File

@@ -1,5 +1,7 @@
# llama.cpp
![llama](https://user-images.githubusercontent.com/1991296/227761327-6d83e30e-2200-41a6-bfbb-f575231c54f4.png)
[![Actions Status](https://github.com/ggerganov/llama.cpp/workflows/CI/badge.svg)](https://github.com/ggerganov/llama.cpp/actions)
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
@@ -179,7 +181,10 @@ Here is an example few-shot interaction, invoked with the command
```bash
# default arguments using 7B model
./chat.sh
./examples/chat.sh
# advanced chat with 13B model
./examples/chat-13B.sh
# custom arguments using 13B model
./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
@@ -195,7 +200,7 @@ Note the use of `--color` to distinguish between user input and generated text.
2. Run the `main` tool like this:
```
./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt -ins
./examples/alpaca.sh
```
Sample run:
@@ -243,7 +248,7 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
### Perplexity (Measuring model quality)
You can pass `--perplexity` as a command line option to measure perplexity over the given prompt. For more background,
You can use the `perplexity` example to measure perplexity over the given prompt. For more background,
see https://huggingface.co/docs/transformers/perplexity. However, in general, lower perplexity is better for LLMs.
#### Latest measurements
@@ -266,10 +271,10 @@ Perplexity - model options
#### How to run
1. Download/extract: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
2. Run `./main --perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
2. Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
3. Output:
```
Calculating perplexity over 655 chunks
perplexity : calculating perplexity over 655 chunks
24.43 seconds per pass - ETA 4.45 hours
[1]4.5970,[2]5.1807,[3]6.0382,...
```

View File

@@ -1,6 +0,0 @@
#!/bin/bash
#
# Temporary script - will be removed in the future
#
./main -m ./models/7B/ggml-model-q4_0.bin -b 128 -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt

View File

@@ -161,7 +161,7 @@ def main():
for p in range(n_parts):
print(f"Processing part {p}\n")
print(f"Processing part {p+1} of {n_parts}\n")
fname_model = f"{dir_model}/consolidated.0{p}.pth"
fname_out = f"{dir_model}/ggml-model-{ftype_str[ftype]}.bin{'' if p == 0 else '.' + str(p)}"

36
examples/CMakeLists.txt Normal file
View File

@@ -0,0 +1,36 @@
# dependencies
find_package(Threads REQUIRED)
# third-party
# ...
# common
set(TARGET common)
add_library(${TARGET} OBJECT
common.h
common.cpp
)
if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
target_include_directories(${TARGET} PUBLIC .)
target_compile_features(${TARGET} PUBLIC cxx_std_11)
target_link_libraries(${TARGET} PRIVATE llama)
# examples
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
if (EMSCRIPTEN)
else()
add_subdirectory(main)
add_subdirectory(quantize)
add_subdirectory(perplexity)
add_subdirectory(embedding)
endif()

View File

@@ -1,6 +1,10 @@
#!/bin/bash
#
# Temporary script - will be removed in the future
#
cd `dirname $0`
cd ..
./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt -ins -b 256 --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7

16
examples/chat.sh Executable file
View File

@@ -0,0 +1,16 @@
#!/bin/bash
#
# Temporary script - will be removed in the future
#
cd `dirname $0`
cd ..
# Important:
#
# "--keep 48" is based on the contents of prompts/chat-with-bob.txt
#
./main -m ./models/7B/ggml-model-q4_0.bin -c 512 -b 1024 -n 256 --keep 48 \
--repeat_penalty 1.0 --color -i \
-r "User:" -f prompts/chat-with-bob.txt

View File

@@ -1,6 +1,6 @@
#include "ggml.h"
#include "common.h"
#include "utils.h"
#include "ggml.h"
#include <cassert>
#include <cstring>
@@ -112,6 +112,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
}
params.n_batch = std::stoi(argv[i]);
params.n_batch = std::min(512, params.n_batch);
} else if (arg == "--keep") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_keep = std::stoi(argv[i]);
} else if (arg == "-m" || arg == "--model") {
if (++i >= argc) {
invalid_param = true;
@@ -134,6 +140,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
params.use_mlock = true;
} else if (arg == "--mtest") {
params.mem_test = true;
} else if (arg == "--verbose-prompt") {
params.verbose_prompt = true;
} else if (arg == "-r" || arg == "--reverse-prompt") {
if (++i >= argc) {
invalid_param = true;
@@ -196,7 +204,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stderr, " --in-prefix STRING string to prefix user inputs with (default: empty)\n");
fprintf(stderr, " -f FNAME, --file FNAME\n");
fprintf(stderr, " prompt file to start generation.\n");
fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict);
fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d, -1 - infinity)\n", params.n_predict);
fprintf(stderr, " --top_k N top-k sampling (default: %d)\n", params.top_k);
fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p);
fprintf(stderr, " --repeat_last_n N last n tokens to consider for penalize (default: %d)\n", params.repeat_last_n);
@@ -208,10 +216,12 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stderr, " --n_parts N number of model parts (default: -1 = determine from dimensions)\n");
fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch);
fprintf(stderr, " --perplexity compute perplexity over the prompt\n");
fprintf(stderr, " --keep number of tokens to keep from the initial prompt\n");
if (ggml_mlock_supported()) {
fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
}
fprintf(stderr, " --mtest compute maximum memory usage\n");
fprintf(stderr, " --verbose-prompt print prompt before generation\n");
fprintf(stderr, " -m FNAME, --model FNAME\n");
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
fprintf(stderr, "\n");

View File

@@ -21,6 +21,7 @@ struct gpt_params {
int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions)
int32_t n_ctx = 512; // context size
int32_t n_batch = 8; // batch size for prompt processing
int32_t n_keep = 0; // number of tokens to keep from initial prompt
// sampling parameters
int32_t top_k = 40;
@@ -48,6 +49,7 @@ struct gpt_params {
bool perplexity = false; // compute perplexity over the prompt
bool use_mlock = false; // use mlock to keep model in memory
bool mem_test = false; // compute maximum memory usage
bool verbose_prompt = false; // print prompt tokens before generation
};
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);

View File

@@ -0,0 +1,4 @@
set(TARGET embedding)
add_executable(${TARGET} embedding.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View File

@@ -0,0 +1,3 @@
# embedding
TODO

View File

@@ -0,0 +1,101 @@
#include "common.h"
#include "llama.h"
int main(int argc, char ** argv) {
gpt_params params;
params.model = "models/llama-7B/ggml-model.bin";
if (gpt_params_parse(argc, argv, params) == false) {
return 1;
}
params.embedding = true;
if (params.n_ctx > 2048) {
fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
"expect poor results\n", __func__, params.n_ctx);
}
if (params.seed <= 0) {
params.seed = time(NULL);
}
fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
std::mt19937 rng(params.seed);
if (params.random_prompt) {
params.prompt = gpt_random_prompt(rng);
}
llama_context * ctx;
// load the model
{
auto lparams = llama_context_default_params();
lparams.n_ctx = params.n_ctx;
lparams.n_parts = params.n_parts;
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.logits_all = params.perplexity;
lparams.use_mlock = params.use_mlock;
lparams.embedding = params.embedding;
ctx = llama_init_from_file(params.model.c_str(), lparams);
if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
return 1;
}
}
// print system information
{
fprintf(stderr, "\n");
fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
}
int n_past = 0;
// Add a space in front of the first character to match OG llama tokenizer behavior
params.prompt.insert(0, 1, ' ');
// tokenize the prompt
auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
// determine newline token
auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
if (params.verbose_prompt) {
fprintf(stderr, "\n");
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
for (int i = 0; i < (int) embd_inp.size(); i++) {
fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
}
fprintf(stderr, "\n");
}
if (params.embedding){
if (embd_inp.size() > 0) {
if (llama_eval(ctx, embd_inp.data(), embd_inp.size(), n_past, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return 1;
}
}
const int n_embd = llama_n_embd(ctx);
const auto embeddings = llama_get_embeddings(ctx);
for (int i = 0; i < n_embd; i++) {
printf("%f ", embeddings[i]);
}
printf("\n");
}
llama_print_timings(ctx);
llama_free(ctx);
return 0;
}

View File

@@ -0,0 +1,4 @@
set(TARGET main)
add_executable(${TARGET} main.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

3
examples/main/README.md Normal file
View File

@@ -0,0 +1,3 @@
# main
TODO

View File

@@ -1,5 +1,4 @@
#include "utils.h"
#include "ggml.h"
#include "common.h"
#include "llama.h"
#include <cassert>
@@ -24,6 +23,8 @@
extern "C" __declspec(dllimport) void* __stdcall GetStdHandle(unsigned long nStdHandle);
extern "C" __declspec(dllimport) int __stdcall GetConsoleMode(void* hConsoleHandle, unsigned long* lpMode);
extern "C" __declspec(dllimport) int __stdcall SetConsoleMode(void* hConsoleHandle, unsigned long dwMode);
extern "C" __declspec(dllimport) int __stdcall SetConsoleCP(unsigned int wCodePageID);
extern "C" __declspec(dllimport) int __stdcall SetConsoleOutputCP(unsigned int wCodePageID);
#endif
#define ANSI_COLOR_RED "\x1b[31m"
@@ -45,8 +46,7 @@ enum console_state {
static console_state con_st = CONSOLE_STATE_DEFAULT;
static bool con_use_color = false;
void set_console_state(console_state new_st)
{
void set_console_state(console_state new_st) {
if (!con_use_color) return;
// only emit color code if state changed
if (new_st != con_st) {
@@ -65,79 +65,6 @@ void set_console_state(console_state new_st)
}
}
std::vector<double> softmax(const std::vector<float>& logits) {
std::vector<double> probs(logits.size());
float max_logit = logits[0];
for (float v : logits) max_logit = std::max(max_logit, v);
double sum_exp = 0.0;
for (size_t i = 0; i < logits.size(); i++) {
// Subtract the maximum logit value from the current logit value for numerical stability
float logit = logits[i] - max_logit;
double exp_logit = std::exp(logit);
sum_exp += exp_logit;
probs[i] = exp_logit;
}
for (size_t i = 0; i < probs.size(); i++) probs[i] /= sum_exp;
return probs;
}
void perplexity(llama_context * ctx, const gpt_params & params) {
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
// Run `./main --perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
// Output: `perplexity: 13.5106 [114/114]`
auto tokens = ::llama_tokenize(ctx, params.prompt, true);
int count = 0;
double nll = 0.0;
int seq_count = tokens.size() / params.n_ctx;
fprintf(stderr, "%s : calculating perplexity over %d chunks\n", __func__, seq_count);
for (int i = 0; i < seq_count; ++i) {
int start = i * params.n_ctx;
int end = start + params.n_ctx - 1;
std::vector<llama_token> embd(tokens.begin() + start, tokens.begin() + end);
auto start_t = std::chrono::high_resolution_clock::now();
if (llama_eval(ctx, embd.data(), embd.size(), 0, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return;
}
auto end_t = std::chrono::high_resolution_clock::now();
if (i == 0) {
double seconds = std::chrono::duration<double>(end_t - start_t).count();
printf("%.2f seconds per pass - ETA %.2f hours\n", seconds, (seconds * seq_count) / (60.0*60.0));
}
// We get the logits for all the tokens in the context window (params.n_ctx)
// from llama_eval above. Now, based on https://huggingface.co/docs/transformers/perplexity,
// calculate the perplexity over the last half the window (so the model always has
// some context to predict the token).
//
// We rely on the fact that attention in the forward pass only looks at previous
// tokens here, so the logits returned for each token are an accurate representation
// of what the model would have predicted at that point.
//
// Example, we have a context window of 512, we will compute perplexity for each of the
// last 256 tokens. Then, we split the input up into context window size chunks to
// process the entire prompt.
auto logits = llama_get_logits(ctx);
for (int j = params.n_ctx / 2; j < params.n_ctx - 1; ++j) {
// Calculate probability of next token, given the previous ones.
int n_vocab = llama_n_vocab(ctx);
std::vector<float> tok_logits(
logits + j * n_vocab,
logits + (j + 1) * n_vocab);
double prob = softmax(tok_logits)[tokens[start + j + 1]];
nll += -std::log(prob);
++count;
}
// perplexity is e^(average negative log-likelihood)
printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
fflush(stdout);
}
printf("\n");
}
static bool is_interacting = false;
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
@@ -154,10 +81,33 @@ void sigint_handler(int signo) {
}
#endif
int main(int argc, char ** argv) {
// has to be called once at the start of the program to init ggml stuff
ggml_time_init();
#if defined (_WIN32)
void win32_console_init(void) {
unsigned long dwMode = 0;
void* hConOut = GetStdHandle((unsigned long)-11); // STD_OUTPUT_HANDLE (-11)
if (!hConOut || hConOut == (void*)-1 || !GetConsoleMode(hConOut, &dwMode)) {
hConOut = GetStdHandle((unsigned long)-12); // STD_ERROR_HANDLE (-12)
if (hConOut && (hConOut == (void*)-1 || !GetConsoleMode(hConOut, &dwMode))) {
hConOut = 0;
}
}
if (hConOut) {
// Enable ANSI colors on Windows 10+
if (con_use_color && !(dwMode & 0x4)) {
SetConsoleMode(hConOut, dwMode | 0x4); // ENABLE_VIRTUAL_TERMINAL_PROCESSING (0x4)
}
// Set console output codepage to UTF8
SetConsoleOutputCP(65001); // CP_UTF8
}
void* hConIn = GetStdHandle((unsigned long)-10); // STD_INPUT_HANDLE (-10)
if (hConIn && hConIn != (void*)-1 && GetConsoleMode(hConIn, &dwMode)) {
// Set console input codepage to UTF8
SetConsoleCP(65001); // CP_UTF8
}
}
#endif
int main(int argc, char ** argv) {
gpt_params params;
params.model = "models/llama-7B/ggml-model.bin";
@@ -165,6 +115,31 @@ int main(int argc, char ** argv) {
return 1;
}
// save choice to use color for later
// (note for later: this is a slightly awkward choice)
con_use_color = params.use_color;
#if defined (_WIN32)
win32_console_init();
#endif
if (params.perplexity) {
printf("\n************\n");
printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
printf("************\n\n");
return 0;
}
if (params.embedding) {
printf("\n************\n");
printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
printf("************\n\n");
return 0;
}
if (params.n_ctx > 2048) {
fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
"expect poor results\n", __func__, params.n_ctx);
@@ -181,10 +156,6 @@ int main(int argc, char ** argv) {
params.prompt = gpt_random_prompt(rng);
}
// save choice to use color for later
// (note for later: this is a slightly awkward choice)
con_use_color = params.use_color;
// params.prompt = R"(// this function checks if the number n is prime
//bool is_prime(int n) {)";
@@ -198,9 +169,7 @@ int main(int argc, char ** argv) {
lparams.n_parts = params.n_parts;
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.logits_all = params.perplexity;
lparams.use_mlock = params.use_mlock;
lparams.embedding = params.embedding;
ctx = llama_init_from_file(params.model.c_str(), lparams);
@@ -236,13 +205,6 @@ int main(int argc, char ** argv) {
return 0;
}
if (params.perplexity) {
perplexity(ctx, params);
exit(0);
}
int n_past = 0;
// Add a space in front of the first character to match OG llama tokenizer behavior
params.prompt.insert(0, 1, ' ');
@@ -251,7 +213,12 @@ int main(int argc, char ** argv) {
const int n_ctx = llama_n_ctx(ctx);
params.n_predict = std::min(params.n_predict, n_ctx - (int) embd_inp.size());
if ((int) embd_inp.size() > n_ctx - 4) {
fprintf(stderr, "%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
return 1;
}
params.n_keep = std::min(params.n_keep, (int) embd_inp.size());
// prefix & suffix for instruct mode
const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true);
@@ -275,13 +242,23 @@ int main(int argc, char ** argv) {
// determine newline token
auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
fprintf(stderr, "\n");
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
for (int i = 0; i < (int) embd_inp.size(); i++) {
fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
if (params.verbose_prompt) {
fprintf(stderr, "\n");
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
for (int i = 0; i < (int) embd_inp.size(); i++) {
fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
}
if (params.n_keep > 0) {
fprintf(stderr, "%s: static prompt based on n_keep: '", __func__);
for (int i = 0; i < params.n_keep; i++) {
fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i]));
}
fprintf(stderr, "'\n");
}
fprintf(stderr, "\n");
}
fprintf(stderr, "\n");
if (params.interactive) {
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
struct sigaction sigint_action;
@@ -295,7 +272,7 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s: interactive mode on.\n", __func__);
if(params.antiprompt.size()) {
if (params.antiprompt.size()) {
for (auto antiprompt : params.antiprompt) {
fprintf(stderr, "Reverse prompt: '%s'\n", antiprompt.c_str());
}
@@ -305,14 +282,12 @@ int main(int argc, char ** argv) {
fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str());
}
}
fprintf(stderr, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
fprintf(stderr, "sampling: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
fprintf(stderr, "\n\n");
std::vector<llama_token> embd;
int last_n_size = params.repeat_last_n;
std::vector<llama_token> last_n_tokens(last_n_size);
// TODO: replace with ring-buffer
std::vector<llama_token> last_n_tokens(n_ctx);
std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
if (params.interactive) {
@@ -325,48 +300,41 @@ int main(int argc, char ** argv) {
is_interacting = params.interactive_start || params.instruct;
}
int input_consumed = 0;
bool input_noecho = false;
int remaining_tokens = params.n_predict;
int n_past = 0;
int n_remain = params.n_predict;
int n_consumed = 0;
#if defined (_WIN32)
if (params.use_color) {
// Enable ANSI colors on Windows 10+
unsigned long dwMode = 0;
void* hConOut = GetStdHandle((unsigned long)-11); // STD_OUTPUT_HANDLE (-11)
if (hConOut && hConOut != (void*)-1 && GetConsoleMode(hConOut, &dwMode) && !(dwMode & 0x4)) {
SetConsoleMode(hConOut, dwMode | 0x4); // ENABLE_VIRTUAL_TERMINAL_PROCESSING (0x4)
}
}
#endif
// the first thing we will do is to output the prompt, so set color accordingly
set_console_state(CONSOLE_STATE_PROMPT);
if (params.embedding){
embd = embd_inp;
std::vector<llama_token> embd;
if (embd.size() > 0) {
if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return 1;
}
}
const auto embeddings = llama_get_embeddings(ctx);
// TODO: print / use the embeddings
if (params.use_color) {
printf(ANSI_COLOR_RESET);
}
return 0;
}
while (remaining_tokens > 0 || params.interactive) {
while (n_remain != 0 || params.interactive) {
// predict
if (embd.size() > 0) {
// infinite text generation via context swapping
// if we run out of context:
// - take the n_keep first tokens from the original prompt (via n_past)
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in a batch
if (n_past + (int) embd.size() > n_ctx) {
const int n_left = n_past - params.n_keep;
n_past = params.n_keep;
// insert n_left/2 tokens at the start of embd from last_n_tokens
embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());
//printf("\n---\n");
//printf("resetting: '");
//for (int i = 0; i < (int) embd.size(); i++) {
// printf("%s", llama_token_to_str(ctx, embd[i]));
//}
//printf("'\n");
//printf("\n---\n");
}
if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return 1;
@@ -376,7 +344,7 @@ int main(int argc, char ** argv) {
n_past += embd.size();
embd.clear();
if ((int) embd_inp.size() <= input_consumed && !is_interacting) {
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
// out of user input, sample next token
const float top_k = params.top_k;
const float top_p = params.top_p;
@@ -389,14 +357,12 @@ int main(int argc, char ** argv) {
auto logits = llama_get_logits(ctx);
if (params.ignore_eos) {
// set the logit of the eos token to zero to avoid sampling it
//logits[logits.size() - n_vocab + EOS_TOKEN_ID] = 0;
// TODO: this does not work of params.logits_all == true
assert(params.perplexity == false);
logits[llama_token_eos()] = 0;
}
id = llama_sample_top_p_top_k(ctx, last_n_tokens.data(), last_n_tokens.size(), top_k, top_p, temp, repeat_penalty);
id = llama_sample_top_p_top_k(ctx,
last_n_tokens.data() + n_ctx - params.repeat_last_n,
params.repeat_last_n, top_k, top_p, temp, repeat_penalty);
last_n_tokens.erase(last_n_tokens.begin());
last_n_tokens.push_back(id);
@@ -419,14 +385,14 @@ int main(int argc, char ** argv) {
input_noecho = false;
// decrement remaining sampling budget
--remaining_tokens;
--n_remain;
} else {
// some user input remains from prompt or interaction, forward it to processing
while ((int) embd_inp.size() > input_consumed) {
embd.push_back(embd_inp[input_consumed]);
while ((int) embd_inp.size() > n_consumed) {
embd.push_back(embd_inp[n_consumed]);
last_n_tokens.erase(last_n_tokens.begin());
last_n_tokens.push_back(embd_inp[input_consumed]);
++input_consumed;
last_n_tokens.push_back(embd_inp[n_consumed]);
++n_consumed;
if ((int) embd.size() >= params.n_batch) {
break;
}
@@ -441,13 +407,13 @@ int main(int argc, char ** argv) {
fflush(stdout);
}
// reset color to default if we there is no pending user input
if (!input_noecho && (int)embd_inp.size() == input_consumed) {
if (!input_noecho && (int)embd_inp.size() == n_consumed) {
set_console_state(CONSOLE_STATE_DEFAULT);
}
// in interactive mode, and not currently processing queued inputs;
// check if we should prompt the user for more
if (params.interactive && (int) embd_inp.size() <= input_consumed) {
if (params.interactive && (int) embd_inp.size() <= n_consumed) {
// check for reverse prompt
std::string last_output;
for (auto id : last_n_tokens) {
@@ -469,7 +435,7 @@ int main(int argc, char ** argv) {
set_console_state(CONSOLE_STATE_USER_INPUT);
if (params.instruct) {
input_consumed = embd_inp.size();
n_consumed = embd_inp.size();
embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
printf("\n> ");
@@ -484,7 +450,10 @@ int main(int argc, char ** argv) {
std::string line;
bool another_line = true;
do {
std::getline(std::cin, line);
if (!std::getline(std::cin, line)) {
// input stream is bad or EOF received
return 0;
}
if (line.empty() || line.back() != '\\') {
another_line = false;
} else {
@@ -503,7 +472,7 @@ int main(int argc, char ** argv) {
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
}
remaining_tokens -= line_inp.size();
n_remain -= line_inp.size();
input_noecho = true; // do not echo this again
}
@@ -524,8 +493,8 @@ int main(int argc, char ** argv) {
}
// In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
if (params.interactive && remaining_tokens <= 0) {
remaining_tokens = params.n_predict;
if (params.interactive && n_remain <= 0 && params.n_predict != -1) {
n_remain = params.n_predict;
is_interacting = true;
}
}

View File

@@ -0,0 +1,4 @@
set(TARGET perplexity)
add_executable(${TARGET} perplexity.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View File

@@ -0,0 +1,3 @@
# perplexity
TODO

View File

@@ -0,0 +1,138 @@
#include "common.h"
#include "llama.h"
std::vector<double> softmax(const std::vector<float>& logits) {
std::vector<double> probs(logits.size());
float max_logit = logits[0];
for (float v : logits) max_logit = std::max(max_logit, v);
double sum_exp = 0.0;
for (size_t i = 0; i < logits.size(); i++) {
// Subtract the maximum logit value from the current logit value for numerical stability
float logit = logits[i] - max_logit;
double exp_logit = std::exp(logit);
sum_exp += exp_logit;
probs[i] = exp_logit;
}
for (size_t i = 0; i < probs.size(); i++) probs[i] /= sum_exp;
return probs;
}
void perplexity(llama_context * ctx, const gpt_params & params) {
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
// Output: `perplexity: 13.5106 [114/114]`
auto tokens = ::llama_tokenize(ctx, params.prompt, true);
int count = 0;
double nll = 0.0;
int seq_count = tokens.size() / params.n_ctx;
fprintf(stderr, "%s : calculating perplexity over %d chunks\n", __func__, seq_count);
for (int i = 0; i < seq_count; ++i) {
int start = i * params.n_ctx;
int end = start + params.n_ctx - 1;
std::vector<llama_token> embd(tokens.begin() + start, tokens.begin() + end);
auto start_t = std::chrono::high_resolution_clock::now();
if (llama_eval(ctx, embd.data(), embd.size(), 0, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return;
}
auto end_t = std::chrono::high_resolution_clock::now();
if (i == 0) {
double seconds = std::chrono::duration<double>(end_t - start_t).count();
printf("%.2f seconds per pass - ETA %.2f hours\n", seconds, (seconds * seq_count) / (60.0*60.0));
}
// We get the logits for all the tokens in the context window (params.n_ctx)
// from llama_eval above. Now, based on https://huggingface.co/docs/transformers/perplexity,
// calculate the perplexity over the last half the window (so the model always has
// some context to predict the token).
//
// We rely on the fact that attention in the forward pass only looks at previous
// tokens here, so the logits returned for each token are an accurate representation
// of what the model would have predicted at that point.
//
// Example, we have a context window of 512, we will compute perplexity for each of the
// last 256 tokens. Then, we split the input up into context window size chunks to
// process the entire prompt.
auto logits = llama_get_logits(ctx);
for (int j = params.n_ctx / 2; j < params.n_ctx - 1; ++j) {
// Calculate probability of next token, given the previous ones.
int n_vocab = llama_n_vocab(ctx);
std::vector<float> tok_logits(
logits + j * n_vocab,
logits + (j + 1) * n_vocab);
double prob = softmax(tok_logits)[tokens[start + j + 1]];
nll += -std::log(prob);
++count;
}
// perplexity is e^(average negative log-likelihood)
printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
fflush(stdout);
}
printf("\n");
}
int main(int argc, char ** argv) {
gpt_params params;
params.model = "models/llama-7B/ggml-model.bin";
if (gpt_params_parse(argc, argv, params) == false) {
return 1;
}
params.perplexity = true;
if (params.n_ctx > 2048) {
fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
"expect poor results\n", __func__, params.n_ctx);
}
if (params.seed <= 0) {
params.seed = time(NULL);
}
fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
std::mt19937 rng(params.seed);
if (params.random_prompt) {
params.prompt = gpt_random_prompt(rng);
}
llama_context * ctx;
// load the model
{
auto lparams = llama_context_default_params();
lparams.n_ctx = params.n_ctx;
lparams.n_parts = params.n_parts;
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.logits_all = params.perplexity;
lparams.use_mlock = params.use_mlock;
lparams.embedding = params.embedding;
ctx = llama_init_from_file(params.model.c_str(), lparams);
if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
return 1;
}
}
// print system information
{
fprintf(stderr, "\n");
fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
}
perplexity(ctx, params);
llama_print_timings(ctx);
llama_free(ctx);
return 0;
}

View File

@@ -0,0 +1,4 @@
set(TARGET quantize)
add_executable(${TARGET} quantize.cpp)
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View File

@@ -0,0 +1,3 @@
# quantize
TODO

1125
ggml.c

File diff suppressed because it is too large Load Diff

View File

@@ -1261,10 +1261,10 @@ static llama_vocab::id llama_sample_top_p_top_k(
double repeat_penalty) {
auto & rng = lctx.rng;
const auto & vocab = lctx.vocab;
const auto & logits = lctx.logits;
const int n_logits = lctx.model.hparams.n_vocab;
int n_logits = vocab.id_to_token.size();
const auto & logits = lctx.logits;
const auto * plogits = logits.data() + logits.size() - n_logits;
std::vector<std::pair<double, llama_vocab::id>> logits_id;
logits_id.reserve(n_logits);
@@ -1276,13 +1276,13 @@ static llama_vocab::id llama_sample_top_p_top_k(
// credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
if (std::find(last_n_tokens.begin(), last_n_tokens.end(), i) != last_n_tokens.end()) {
// if score < 0 then repetition penalty has to multiplied to reduce the previous token probability
if (logits[i] < 0.0) {
logits_id.push_back(std::make_pair(logits[i]*scale*repeat_penalty, i));
if (plogits[i] < 0.0) {
logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i));
} else {
logits_id.push_back(std::make_pair(logits[i]*scale/repeat_penalty, i));
logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i));
}
} else {
logits_id.push_back(std::make_pair(logits[i]*scale, i));
logits_id.push_back(std::make_pair(plogits[i]*scale, i));
}
}
}
@@ -1677,6 +1677,8 @@ struct llama_context * llama_init_from_file(
}
const auto & hparams = ctx->model.hparams;
// resized during inference
if (params.logits_all) {
ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
} else {
@@ -1684,7 +1686,7 @@ struct llama_context * llama_init_from_file(
}
if (params.embedding){
ctx->embedding.reserve(hparams.n_embd);
ctx->embedding.resize(hparams.n_embd);
}
ctx->buf_compute.resize(MEM_REQ_EVAL.at(ctx->model.type));
@@ -1761,6 +1763,10 @@ int llama_n_ctx(struct llama_context * ctx) {
return ctx->model.hparams.n_ctx;
}
int llama_n_embd(struct llama_context * ctx) {
return ctx->model.hparams.n_embd;
}
float * llama_get_logits(struct llama_context * ctx) {
return ctx->logits.data();
}

View File

@@ -109,6 +109,7 @@ extern "C" {
LLAMA_API int llama_n_vocab(struct llama_context * ctx);
LLAMA_API int llama_n_ctx (struct llama_context * ctx);
LLAMA_API int llama_n_embd (struct llama_context * ctx);
// Token logits obtained from the last call to llama_eval()
// The logits for the last token are stored in the last row

View File

@@ -1,7 +1,7 @@
function(llama_add_test source)
get_filename_component(TEST_TARGET ${source} NAME_WE)
add_executable(${TEST_TARGET} ${source})
target_link_libraries(${TEST_TARGET} PRIVATE llama ggml utils)
target_link_libraries(${TEST_TARGET} PRIVATE llama)
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
endfunction()

View File

@@ -1,9 +1,9 @@
#include "utils.h"
#include "llama.h"
#include <cstdio>
#include <string>
#include <map>
#include <vector>
static const std::map<std::string, std::vector<llama_token>> k_tests = {
{ "Hello World", { 1, 10994, 2787, }, },
@@ -48,7 +48,9 @@ int main(int argc, char **argv) {
}
for (const auto & test_kv : k_tests) {
const auto res = ::llama_tokenize(ctx, test_kv.first, true);
std::vector<llama_token> res(test_kv.first.size());
const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), res.size(), true);
res.resize(n);
bool correct = res.size() == test_kv.second.size();