Compare commits

...

29 Commits

Author SHA1 Message Date
Jed Fox
20e1e84884 deploy : add a Package.swift for SwiftPM support (#393)
* Add a Package.swift for SwiftPM support

* Swap from exclusions to allowlist
2023-03-28 19:39:01 +03:00
Stephan Walter
c1f885067c ggml : introduce structs for the q4 data blocks (#356)
* Introduce structs for the q4 data blocks

* ggml : rename quant struct variables + fix ARM_NEON

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-03-28 18:56:03 +03:00
Georgi Gerganov
e0670260fb gitignore : add "embedding" 2023-03-28 18:34:35 +03:00
dotpy314
28ba975aea Check the existence of f16_model_path_base in quantize.py (#574)
Co-authored-by: Jincheng Miao <jincheng.miao@gmail.com>
2023-03-28 18:06:28 +03:00
slaren
a6bdc47cba Fix usage of F16C intrinsics in AVX code (#563)
* Fix usage of F16C intrinsics in AVX code when F16C is not defined
2023-03-28 17:26:55 +03:00
anzz1
7b8dbcb78b main.cpp fixes, refactoring (#571)
- main: entering empty line passes back control without new input in interactive/instruct modes
- instruct mode: keep prompt fix
- instruct mode: duplicate instruct prompt fix
- refactor: move common console code from main->common
2023-03-28 17:09:55 +03:00
RJ Adriaansen
4b8efff0e3 Add embedding example to Makefile (#540) 2023-03-28 09:11:09 +03:00
Marco Matthies
7e5395575a Fix missing ggml link in cmake for examples/* on w64-mingw32 (#542) 2023-03-27 07:55:26 +03:00
Erik Scholz
34c1072e49 ci: add debug build to sanitizer build matrix (#527) 2023-03-26 15:48:40 +00:00
Stephan Walter
939ad2d3a5 Fix undefined variables in debug build, remove unused variables (#531) 2023-03-26 15:34:02 +00:00
Juan Calderon-Perez
8c2ec5e21d Add support for linux/arm64 platform during Docker Builds (#514)
* Add support for linux/arm64 platform

* Add platform to versioned builds
2023-03-26 14:48:42 +00:00
Stephan Walter
b391579db9 Update README and comments for standalone perplexity tool (#525) 2023-03-26 16:14:01 +03:00
anzz1
7a87d31f4f [main] fix infinite generation (-n == -1) (#523) 2023-03-26 16:06:10 +03:00
Georgi Gerganov
348d6926ee Add logo to README.md 2023-03-26 10:20:49 +03:00
Harald Fernengel
33e35b8fe8 Exit from interactive mode if input stream is bad (#491)
Allow exiting the interactive prompt also with CTRL-D on Unix and CTRL-Z
on Windows.
2023-03-26 08:25:46 +03:00
anzz1
19726169b3 CI: Run other sanitizer builds even if one fails (#511)
applies only to sanitizer builds so they wont be cancelled
2023-03-26 00:13:28 +02:00
jp-x-g
f732695cd5 Clarify console output in convert-pth-to-ggml.py (#512)
"Processing part 1 of 3" instead of "Processing part 0"
2023-03-25 23:53:55 +02:00
anzz1
2f7bf7dd7c CMake / CI additions (#497)
* CMake: Add AVX512 option

* CI: Add AVX/AVX512 builds (Windows)
(AVX512 tests can only be run when the worker happens to support it, building works anyway)

* CMake: Fix sanitizer linkage ( merged #468 )

* CI: Add sanitizer builds (Ubuntu)

* CI: Fix release tagging
(change @zendesk/action-create-release to @anzz1/action-create-release until upstream PR Added commitish as input zendesk/action-create-release#32 is merged)
2023-03-25 23:38:11 +02:00
anzz1
34ab526843 (Windows) Set console to UTF-8 on init (#420)
Sets console codepage to 65001 (CP_UTF8) on start for both input and output, should fix problems with UTF-8 characters.
2023-03-25 22:29:22 +02:00
Georgi Gerganov
c2b25b6912 Fix colors enabling on WIN32 2023-03-25 21:53:39 +02:00
Georgi Gerganov
79b2b266db If n_predict == -1, generate forever 2023-03-25 21:51:41 +02:00
Georgi Gerganov
e2d490dafd Inifinite generation via context swapping (#71) 2023-03-25 21:36:22 +02:00
Georgi Gerganov
03f7e33560 Cleanup STL headers + fix embedding examples + minor stuff 2023-03-25 20:51:14 +02:00
Georgi Gerganov
55ad42af84 Move chat scripts into "./examples" 2023-03-25 20:37:09 +02:00
slaren
459e93cce0 Add AVX2 implementation of dequantize_row_q4_1 (#505) 2023-03-25 20:31:48 +02:00
Georgi Gerganov
a316a425d0 Overhaul the examples structure
- main -> examples
- utils -> examples (renamed to "common")
- quantize -> examples
- separate tools for "perplexity" and "embedding"

Hope I didn't break something !
2023-03-25 20:26:40 +02:00
Georgi Gerganov
ecbe466a36 Retire the ggml_mul_mat() branch for transposed src0 (#500)
* Retire the ggml_mul_mat() for transposed src0

- It can always be made contiguous with ggml_cpy()
- The code is now simplified
- The results are deterministic in respect to num threads

* SIMD-ify dequantize_row_q4_0() for ARM_NEON (#502)

* Attempt to SIMD-ify dequantize_row_q4_0() for ARM_NEON

* Fix dequantization - forgot to interleave the quants
2023-03-25 19:47:21 +02:00
Georgi Gerganov
502a400192 Disable prompt verbosity by default and add option to enable (#480) 2023-03-25 17:17:16 +02:00
slaren
09aecbf628 Add AVX2 implementation of dequantize_row_q4_0 (#467) 2023-03-25 17:06:49 +02:00
37 changed files with 1596 additions and 1618 deletions

View File

@@ -64,6 +64,42 @@ jobs:
cd build
ctest --output-on-failure
ubuntu-latest-cmake-sanitizer:
runs-on: ubuntu-latest
continue-on-error: true
strategy:
matrix:
sanitizer: [ADDRESS, THREAD, UNDEFINED]
build_type: [Debug, Release]
accelerate: [ON, OFF]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v1
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential
- name: Build
id: cmake_build
run: |
mkdir build
cd build
cmake .. -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DLLAMA_ACCELERATE=${{ matrix.accelerate }}
cmake --build . --config ${{ matrix.build_type }}
- name: Test
id: cmake_test
run: |
cd build
ctest --output-on-failure
macOS-latest-make:
runs-on: macos-latest
@@ -112,6 +148,16 @@ jobs:
windows-latest-cmake:
runs-on: windows-latest
strategy:
matrix:
include:
- build: 'avx2'
defines: ''
- build: 'avx'
defines: '-DLLAMA_AVX2=OFF'
- build: 'avx512'
defines: '-DLLAMA_AVX512=ON'
steps:
- name: Clone
id: checkout
@@ -122,11 +168,21 @@ jobs:
run: |
mkdir build
cd build
cmake ..
cmake .. ${{ matrix.defines }}
cmake --build . --config Release
- name: Check AVX512F support
id: check_avx512f
if: ${{ matrix.build == 'avx512' }}
continue-on-error: true
run: |
cd build
Set-Content -Path .\avx512f.exe -Value ([Convert]::FromBase64String('TVqQAAMAAAAEAAAA//8AALgAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAyAAAAA4fug4AtAnNIbgBTM0hVGhpcyBwcm9ncmFtIGNhbm5vdCBiZSBydW4gaW4gRE9TIG1vZGUuDQ0KJAAAAAAAAAClmfXY4fibi+H4m4vh+JuL4fiai+P4m4si98aL4vibi7Xbq4vg+JuLUmljaOH4m4sAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABQRQAATAEBAGo6H2QAAAAAAAAAAOAADwELAQYAAAIAAAAAAAAAAAAADBAAAAAQAAAAIAAAAABAAAAQAAAAAgAABAAAAAAAAAAEAAAAAAAAAAAgAAAAAgAAAAAAAAMAAAAAABAAABAAAAAAEAAAEAAAAAAAABAAAAAAAAAAAAAAAFQQAAAoAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEAAADAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAC50ZXh0AAAAsgAAAAAQAAAAAgAAAAIAAAAAAAAAAAAAAAAAACAAAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACUEAAAiBAAAAAAAABVi+xRUVNTuAcAAAAPosHrEGaD4wGJXfxbg0X8MI1F+GoAUI1F/GoBUGr1/xUAEEAAUP8VBBBAAItF/FuDwND32BvAQMnDzMx8EAAAAAAAAAAAAACkEAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAlBAAAIgQAAAAAAAApANXcml0ZUZpbGUAuQFHZXRTdGRIYW5kbGUAAEtFUk5FTDMyLmRsbAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA==')) -AsByteStream
.\avx512f.exe && echo " AVX512F: YES" && ( echo HAS_AVX512F=1 >> $env:GITHUB_ENV ) || echo " AVX512F: NO"
- name: Test
id: cmake_test
if: ${{ matrix.build != 'avx512' || env.HAS_AVX512F == '1' }} # Test AVX-512 only when possible
run: |
cd build
ctest -C Release --output-on-failure
@@ -140,12 +196,39 @@ jobs:
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip .\build\bin\Release\*
7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v3
with:
path: |
llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
release:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
runs-on: ubuntu-latest
needs:
- ubuntu-latest-make
- ubuntu-latest-cmake
- macOS-latest-make
- macOS-latest-cmake
- windows-latest-cmake
steps:
- name: Download artifacts
id: download-artifact
uses: actions/download-artifact@v3
- name: Get commit hash
id: commit
uses: pr-mpt/actions-commit-hash@v2
- name: Create release
id: create_release
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: zendesk/action-create-release@v1
uses: anzz1/action-create-release@v1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
@@ -153,15 +236,25 @@ jobs:
- name: Upload release
id: upload_release
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-release-asset@v1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
uses: actions/github-script@v3
with:
upload_url: ${{ steps.create_release.outputs.upload_url }}
asset_path: .\llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip
asset_name: llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip
asset_content_type: application/octet-stream
github-token: ${{secrets.GITHUB_TOKEN}}
script: |
const path = require('path');
const fs = require('fs');
const release_id = '${{ steps.create_release.outputs.id }}';
for (let file of await fs.readdirSync('./artifact')) {
if (path.extname(file) === '.zip') {
console.log('uploadReleaseAsset', file);
await github.repos.uploadReleaseAsset({
owner: context.repo.owner,
repo: context.repo.repo,
release_id: release_id,
name: file,
data: await fs.readFileSync(`./artifact/${file}`)
});
}
}
# ubuntu-latest-gcc:
# runs-on: ubuntu-latest

View File

@@ -49,6 +49,7 @@ jobs:
with:
context: .
push: true
platforms: linux/amd64,linux/arm64
tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
file: ${{ matrix.config.dockerfile }}
@@ -57,5 +58,6 @@ jobs:
with:
context: .
push: ${{ github.event_name == 'push' }}
platforms: linux/amd64,linux/arm64
tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}"
file: ${{ matrix.config.dockerfile }}

7
.gitignore vendored
View File

@@ -5,6 +5,7 @@
.vscode/
.DS_Store
.build/
build/
build-em/
build-debug/
@@ -19,9 +20,15 @@ models/*
/main
/quantize
/result
/perplexity
/embedding
arm_neon.h
compile_commands.json
.envrc
.direnv/
.venv
__pycache__
.swiftpm

View File

@@ -54,6 +54,7 @@ option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer"
# instruction set specific
option(LLAMA_AVX "llama: enable AVX" ON)
option(LLAMA_AVX2 "llama: enable AVX2" ON)
option(LLAMA_AVX512 "llama: enable AVX512" OFF)
option(LLAMA_FMA "llama: enable FMA" ON)
# 3rd party libs
@@ -75,14 +76,17 @@ find_package(Threads REQUIRED)
if (NOT MSVC)
if (LLAMA_SANITIZE_THREAD)
add_compile_options(-fsanitize=thread)
link_libraries(-fsanitize=thread)
endif()
if (LLAMA_SANITIZE_ADDRESS)
add_compile_options(-fsanitize=address -fno-omit-frame-pointer)
link_libraries(-fsanitize=address)
endif()
if (LLAMA_SANITIZE_UNDEFINED)
add_compile_options(-fsanitize=undefined)
link_libraries(-fsanitize=undefined)
endif()
endif()
@@ -185,7 +189,9 @@ if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
message(STATUS "x86 detected")
if (MSVC)
if (LLAMA_AVX2)
if (LLAMA_AVX512)
add_compile_options(/arch:AVX512)
elseif (LLAMA_AVX2)
add_compile_options(/arch:AVX2)
elseif (LLAMA_AVX)
add_compile_options(/arch:AVX)
@@ -201,6 +207,12 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
if (LLAMA_AVX2)
add_compile_options(-mavx2)
endif()
if (LLAMA_AVX512)
add_compile_options(-mavx512f)
# add_compile_options(-mavx512cd)
# add_compile_options(-mavx512dq)
# add_compile_options(-mavx512bw)
endif()
endif()
else()
# TODO: support PowerPC
@@ -211,17 +223,6 @@ endif()
# Build libraries
#
add_library(utils OBJECT
utils.cpp
utils.h)
target_include_directories(utils PUBLIC .)
target_compile_features(utils PUBLIC cxx_std_11) # don't bump
target_link_libraries(utils PRIVATE ${LLAMA_EXTRA_LIBS})
if (BUILD_SHARED_LIBS)
set_target_properties(utils PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
add_library(ggml OBJECT
ggml.c
ggml.h)
@@ -239,22 +240,12 @@ add_library(llama
target_include_directories(llama PUBLIC .)
target_compile_features(llama PUBLIC cxx_std_11) # don't bump
target_link_libraries(llama PRIVATE utils ggml ${LLAMA_EXTRA_LIBS})
target_link_libraries(llama PRIVATE ggml ${LLAMA_EXTRA_LIBS})
if (BUILD_SHARED_LIBS)
set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD)
endif()
#
# Executables
#
add_executable(main main.cpp)
target_link_libraries(main PRIVATE llama ggml utils)
add_executable(quantize quantize.cpp)
target_link_libraries(quantize PRIVATE llama ggml utils)
#
# programs, examples and tests
#
@@ -264,6 +255,6 @@ if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
add_subdirectory(tests)
endif ()
#if (LLAMA_BUILD_EXAMPLES)
# add_subdirectory(examples)
#endif()
if (LLAMA_BUILD_EXAMPLES)
add_subdirectory(examples)
endif()

View File

@@ -212,7 +212,7 @@ $(info I CC: $(CCV))
$(info I CXX: $(CXXV))
$(info )
default: main quantize
default: main quantize perplexity embedding
#
# Build library
@@ -224,20 +224,26 @@ ggml.o: ggml.c ggml.h
llama.o: llama.cpp llama.h
$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o
utils.o: utils.cpp utils.h
$(CXX) $(CXXFLAGS) -c utils.cpp -o utils.o
common.o: examples/common.cpp examples/common.h
$(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o
clean:
rm -f *.o main quantize
rm -vf *.o main quantize perplexity embedding
main: main.cpp ggml.o llama.o utils.o
$(CXX) $(CXXFLAGS) main.cpp ggml.o llama.o utils.o -o main $(LDFLAGS)
main: examples/main/main.cpp ggml.o llama.o common.o
$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)
@echo
@echo '==== Run ./main -h for help. ===='
@echo
quantize: quantize.cpp ggml.o llama.o utils.o
$(CXX) $(CXXFLAGS) quantize.cpp ggml.o llama.o utils.o -o quantize $(LDFLAGS)
quantize: examples/quantize/quantize.cpp ggml.o llama.o
$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS)
perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
$(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS)
embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
$(CXX) $(CXXFLAGS) examples/embedding/embedding.cpp ggml.o llama.o common.o -o embedding $(LDFLAGS)
#
# Tests

20
Package.swift Normal file
View File

@@ -0,0 +1,20 @@
// swift-tools-version:5.3
import PackageDescription
let package = Package(
name: "llama",
products: [
.library(name: "llama", targets: ["llama"]),
],
targets: [
.target(
name: "llama",
path: ".",
sources: ["ggml.c", "llama.cpp"],
publicHeadersPath: "spm-headers",
cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"])]
),
],
cxxLanguageStandard: .cxx11
)

View File

@@ -1,5 +1,7 @@
# llama.cpp
![llama](https://user-images.githubusercontent.com/1991296/227761327-6d83e30e-2200-41a6-bfbb-f575231c54f4.png)
[![Actions Status](https://github.com/ggerganov/llama.cpp/workflows/CI/badge.svg)](https://github.com/ggerganov/llama.cpp/actions)
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
@@ -179,7 +181,10 @@ Here is an example few-shot interaction, invoked with the command
```bash
# default arguments using 7B model
./chat.sh
./examples/chat.sh
# advanced chat with 13B model
./examples/chat-13B.sh
# custom arguments using 13B model
./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
@@ -195,7 +200,7 @@ Note the use of `--color` to distinguish between user input and generated text.
2. Run the `main` tool like this:
```
./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt -ins
./examples/alpaca.sh
```
Sample run:
@@ -243,7 +248,7 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
### Perplexity (Measuring model quality)
You can pass `--perplexity` as a command line option to measure perplexity over the given prompt. For more background,
You can use the `perplexity` example to measure perplexity over the given prompt. For more background,
see https://huggingface.co/docs/transformers/perplexity. However, in general, lower perplexity is better for LLMs.
#### Latest measurements
@@ -266,10 +271,10 @@ Perplexity - model options
#### How to run
1. Download/extract: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
2. Run `./main --perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
2. Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
3. Output:
```
Calculating perplexity over 655 chunks
perplexity : calculating perplexity over 655 chunks
24.43 seconds per pass - ETA 4.45 hours
[1]4.5970,[2]5.1807,[3]6.0382,...
```

View File

@@ -1,6 +0,0 @@
#!/bin/bash
#
# Temporary script - will be removed in the future
#
./main -m ./models/7B/ggml-model-q4_0.bin -b 128 -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt

View File

@@ -161,7 +161,7 @@ def main():
for p in range(n_parts):
print(f"Processing part {p}\n")
print(f"Processing part {p+1} of {n_parts}\n")
fname_model = f"{dir_model}/consolidated.0{p}.pth"
fname_out = f"{dir_model}/ggml-model-{ftype_str[ftype]}.bin{'' if p == 0 else '.' + str(p)}"

36
examples/CMakeLists.txt Normal file
View File

@@ -0,0 +1,36 @@
# dependencies
find_package(Threads REQUIRED)
# third-party
# ...
# common
set(TARGET common)
add_library(${TARGET} OBJECT
common.h
common.cpp
)
if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
target_include_directories(${TARGET} PUBLIC .)
target_compile_features(${TARGET} PUBLIC cxx_std_11)
target_link_libraries(${TARGET} PRIVATE llama)
# examples
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
if (EMSCRIPTEN)
else()
add_subdirectory(main)
add_subdirectory(quantize)
add_subdirectory(perplexity)
add_subdirectory(embedding)
endif()

View File

@@ -1,6 +1,10 @@
#!/bin/bash
#
# Temporary script - will be removed in the future
#
cd `dirname $0`
cd ..
./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt -ins -b 256 --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7

16
examples/chat.sh Executable file
View File

@@ -0,0 +1,16 @@
#!/bin/bash
#
# Temporary script - will be removed in the future
#
cd `dirname $0`
cd ..
# Important:
#
# "--keep 48" is based on the contents of prompts/chat-with-bob.txt
#
./main -m ./models/7B/ggml-model-q4_0.bin -c 512 -b 1024 -n 256 --keep 48 \
--repeat_penalty 1.0 --color -i \
-r "User:" -f prompts/chat-with-bob.txt

View File

@@ -1,6 +1,6 @@
#include "ggml.h"
#include "common.h"
#include "utils.h"
#include "ggml.h"
#include <cassert>
#include <cstring>
@@ -9,11 +9,20 @@
#include <iterator>
#include <algorithm>
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
#include <alloca.h>
#endif
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
#include <alloca.h>
#endif
#if defined (_WIN32)
#pragma comment(lib,"kernel32.lib")
extern "C" __declspec(dllimport) void* __stdcall GetStdHandle(unsigned long nStdHandle);
extern "C" __declspec(dllimport) int __stdcall GetConsoleMode(void* hConsoleHandle, unsigned long* lpMode);
extern "C" __declspec(dllimport) int __stdcall SetConsoleMode(void* hConsoleHandle, unsigned long dwMode);
extern "C" __declspec(dllimport) int __stdcall SetConsoleCP(unsigned int wCodePageID);
extern "C" __declspec(dllimport) int __stdcall SetConsoleOutputCP(unsigned int wCodePageID);
#endif
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
// determine sensible default number of threads.
@@ -112,6 +121,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
}
params.n_batch = std::stoi(argv[i]);
params.n_batch = std::min(512, params.n_batch);
} else if (arg == "--keep") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_keep = std::stoi(argv[i]);
} else if (arg == "-m" || arg == "--model") {
if (++i >= argc) {
invalid_param = true;
@@ -134,6 +149,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
params.use_mlock = true;
} else if (arg == "--mtest") {
params.mem_test = true;
} else if (arg == "--verbose-prompt") {
params.verbose_prompt = true;
} else if (arg == "-r" || arg == "--reverse-prompt") {
if (++i >= argc) {
invalid_param = true;
@@ -196,7 +213,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stderr, " --in-prefix STRING string to prefix user inputs with (default: empty)\n");
fprintf(stderr, " -f FNAME, --file FNAME\n");
fprintf(stderr, " prompt file to start generation.\n");
fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict);
fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
fprintf(stderr, " --top_k N top-k sampling (default: %d)\n", params.top_k);
fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p);
fprintf(stderr, " --repeat_last_n N last n tokens to consider for penalize (default: %d)\n", params.repeat_last_n);
@@ -208,10 +225,12 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stderr, " --n_parts N number of model parts (default: -1 = determine from dimensions)\n");
fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch);
fprintf(stderr, " --perplexity compute perplexity over the prompt\n");
fprintf(stderr, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
if (ggml_mlock_supported()) {
fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
}
fprintf(stderr, " --mtest compute maximum memory usage\n");
fprintf(stderr, " --verbose-prompt print prompt before generation\n");
fprintf(stderr, " -m FNAME, --model FNAME\n");
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
fprintf(stderr, "\n");
@@ -246,3 +265,47 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
return res;
}
/* Keep track of current color of output, and emit ANSI code if it changes. */
void set_console_color(console_state & con_st, console_color_t color) {
if (con_st.use_color && con_st.color != color) {
switch(color) {
case CONSOLE_COLOR_DEFAULT:
printf(ANSI_COLOR_RESET);
break;
case CONSOLE_COLOR_PROMPT:
printf(ANSI_COLOR_YELLOW);
break;
case CONSOLE_COLOR_USER_INPUT:
printf(ANSI_BOLD ANSI_COLOR_GREEN);
break;
}
con_st.color = color;
}
}
#if defined (_WIN32)
void win32_console_init(bool enable_color) {
unsigned long dwMode = 0;
void* hConOut = GetStdHandle((unsigned long)-11); // STD_OUTPUT_HANDLE (-11)
if (!hConOut || hConOut == (void*)-1 || !GetConsoleMode(hConOut, &dwMode)) {
hConOut = GetStdHandle((unsigned long)-12); // STD_ERROR_HANDLE (-12)
if (hConOut && (hConOut == (void*)-1 || !GetConsoleMode(hConOut, &dwMode))) {
hConOut = 0;
}
}
if (hConOut) {
// Enable ANSI colors on Windows 10+
if (enable_color && !(dwMode & 0x4)) {
SetConsoleMode(hConOut, dwMode | 0x4); // ENABLE_VIRTUAL_TERMINAL_PROCESSING (0x4)
}
// Set console output codepage to UTF8
SetConsoleOutputCP(65001); // CP_UTF8
}
void* hConIn = GetStdHandle((unsigned long)-10); // STD_INPUT_HANDLE (-10)
if (hConIn && hConIn != (void*)-1 && GetConsoleMode(hConIn, &dwMode)) {
// Set console input codepage to UTF8
SetConsoleCP(65001); // CP_UTF8
}
}
#endif

View File

@@ -21,6 +21,7 @@ struct gpt_params {
int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions)
int32_t n_ctx = 512; // context size
int32_t n_batch = 8; // batch size for prompt processing
int32_t n_keep = 0; // number of tokens to keep from initial prompt
// sampling parameters
int32_t top_k = 40;
@@ -48,6 +49,7 @@ struct gpt_params {
bool perplexity = false; // compute perplexity over the prompt
bool use_mlock = false; // use mlock to keep model in memory
bool mem_test = false; // compute maximum memory usage
bool verbose_prompt = false; // print prompt tokens before generation
};
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
@@ -61,3 +63,33 @@ std::string gpt_random_prompt(std::mt19937 & rng);
//
std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);
//
// Console utils
//
#define ANSI_COLOR_RED "\x1b[31m"
#define ANSI_COLOR_GREEN "\x1b[32m"
#define ANSI_COLOR_YELLOW "\x1b[33m"
#define ANSI_COLOR_BLUE "\x1b[34m"
#define ANSI_COLOR_MAGENTA "\x1b[35m"
#define ANSI_COLOR_CYAN "\x1b[36m"
#define ANSI_COLOR_RESET "\x1b[0m"
#define ANSI_BOLD "\x1b[1m"
enum console_color_t {
CONSOLE_COLOR_DEFAULT=0,
CONSOLE_COLOR_PROMPT,
CONSOLE_COLOR_USER_INPUT
};
struct console_state {
bool use_color = false;
console_color_t color = CONSOLE_COLOR_DEFAULT;
};
void set_console_color(console_state & con_st, console_color_t color);
#if defined (_WIN32)
void win32_console_init(bool enable_color);
#endif

View File

@@ -0,0 +1,4 @@
set(TARGET embedding)
add_executable(${TARGET} embedding.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ggml ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View File

@@ -0,0 +1,3 @@
# embedding
TODO

View File

@@ -0,0 +1,101 @@
#include "common.h"
#include "llama.h"
int main(int argc, char ** argv) {
gpt_params params;
params.model = "models/llama-7B/ggml-model.bin";
if (gpt_params_parse(argc, argv, params) == false) {
return 1;
}
params.embedding = true;
if (params.n_ctx > 2048) {
fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
"expect poor results\n", __func__, params.n_ctx);
}
if (params.seed <= 0) {
params.seed = time(NULL);
}
fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
std::mt19937 rng(params.seed);
if (params.random_prompt) {
params.prompt = gpt_random_prompt(rng);
}
llama_context * ctx;
// load the model
{
auto lparams = llama_context_default_params();
lparams.n_ctx = params.n_ctx;
lparams.n_parts = params.n_parts;
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.logits_all = params.perplexity;
lparams.use_mlock = params.use_mlock;
lparams.embedding = params.embedding;
ctx = llama_init_from_file(params.model.c_str(), lparams);
if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
return 1;
}
}
// print system information
{
fprintf(stderr, "\n");
fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
}
int n_past = 0;
// Add a space in front of the first character to match OG llama tokenizer behavior
params.prompt.insert(0, 1, ' ');
// tokenize the prompt
auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
// determine newline token
auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
if (params.verbose_prompt) {
fprintf(stderr, "\n");
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
for (int i = 0; i < (int) embd_inp.size(); i++) {
fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
}
fprintf(stderr, "\n");
}
if (params.embedding){
if (embd_inp.size() > 0) {
if (llama_eval(ctx, embd_inp.data(), embd_inp.size(), n_past, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return 1;
}
}
const int n_embd = llama_n_embd(ctx);
const auto embeddings = llama_get_embeddings(ctx);
for (int i = 0; i < n_embd; i++) {
printf("%f ", embeddings[i]);
}
printf("\n");
}
llama_print_timings(ctx);
llama_free(ctx);
return 0;
}

View File

@@ -0,0 +1,4 @@
set(TARGET main)
add_executable(${TARGET} main.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ggml ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

3
examples/main/README.md Normal file
View File

@@ -0,0 +1,3 @@
# main
TODO

454
examples/main/main.cpp Normal file
View File

@@ -0,0 +1,454 @@
#include "common.h"
#include "llama.h"
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
#include <unistd.h>
#elif defined (_WIN32)
#include <signal.h>
#endif
static console_state con_st;
static bool is_interacting = false;
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
void sigint_handler(int signo) {
set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
printf("\n"); // this also force flush stdout.
if (signo == SIGINT) {
if (!is_interacting) {
is_interacting=true;
} else {
_exit(130);
}
}
}
#endif
int main(int argc, char ** argv) {
gpt_params params;
params.model = "models/llama-7B/ggml-model.bin";
if (gpt_params_parse(argc, argv, params) == false) {
return 1;
}
// save choice to use color for later
// (note for later: this is a slightly awkward choice)
con_st.use_color = params.use_color;
#if defined (_WIN32)
win32_console_init(params.use_color);
#endif
if (params.perplexity) {
printf("\n************\n");
printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
printf("************\n\n");
return 0;
}
if (params.embedding) {
printf("\n************\n");
printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
printf("************\n\n");
return 0;
}
if (params.n_ctx > 2048) {
fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
"expect poor results\n", __func__, params.n_ctx);
}
if (params.seed <= 0) {
params.seed = time(NULL);
}
fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
std::mt19937 rng(params.seed);
if (params.random_prompt) {
params.prompt = gpt_random_prompt(rng);
}
// params.prompt = R"(// this function checks if the number n is prime
//bool is_prime(int n) {)";
llama_context * ctx;
// load the model
{
auto lparams = llama_context_default_params();
lparams.n_ctx = params.n_ctx;
lparams.n_parts = params.n_parts;
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.use_mlock = params.use_mlock;
ctx = llama_init_from_file(params.model.c_str(), lparams);
if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
return 1;
}
}
// print system information
{
fprintf(stderr, "\n");
fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
}
// determine the maximum memory usage needed to do inference for the given n_batch and n_predict parameters
// uncomment the "used_mem" line in llama.cpp to see the results
if (params.mem_test) {
{
const std::vector<llama_token> tmp(params.n_batch, 0);
llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
}
{
const std::vector<llama_token> tmp = { 0, };
llama_eval(ctx, tmp.data(), tmp.size(), params.n_predict - 1, params.n_threads);
}
llama_print_timings(ctx);
llama_free(ctx);
return 0;
}
// Add a space in front of the first character to match OG llama tokenizer behavior
params.prompt.insert(0, 1, ' ');
// tokenize the prompt
auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
const int n_ctx = llama_n_ctx(ctx);
if ((int) embd_inp.size() > n_ctx - 4) {
fprintf(stderr, "%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
return 1;
}
// number of tokens to keep when resetting context
if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size() || params.instruct) {
params.n_keep = (int)embd_inp.size();
}
// prefix & suffix for instruct mode
const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true);
const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false);
// in instruct mode, we inject a prefix and a suffix to each input by the user
if (params.instruct) {
params.interactive_start = true;
params.antiprompt.push_back("### Instruction:\n\n");
}
// enable interactive mode if reverse prompt or interactive start is specified
if (params.antiprompt.size() != 0 || params.interactive_start) {
params.interactive = true;
}
// determine newline token
auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
if (params.verbose_prompt) {
fprintf(stderr, "\n");
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
for (int i = 0; i < (int) embd_inp.size(); i++) {
fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
}
if (params.n_keep > 0) {
fprintf(stderr, "%s: static prompt based on n_keep: '", __func__);
for (int i = 0; i < params.n_keep; i++) {
fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i]));
}
fprintf(stderr, "'\n");
}
fprintf(stderr, "\n");
}
if (params.interactive) {
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
struct sigaction sigint_action;
sigint_action.sa_handler = sigint_handler;
sigemptyset (&sigint_action.sa_mask);
sigint_action.sa_flags = 0;
sigaction(SIGINT, &sigint_action, NULL);
#elif defined (_WIN32)
signal(SIGINT, sigint_handler);
#endif
fprintf(stderr, "%s: interactive mode on.\n", __func__);
if (params.antiprompt.size()) {
for (auto antiprompt : params.antiprompt) {
fprintf(stderr, "Reverse prompt: '%s'\n", antiprompt.c_str());
}
}
if (!params.input_prefix.empty()) {
fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str());
}
}
fprintf(stderr, "sampling: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
fprintf(stderr, "\n\n");
// TODO: replace with ring-buffer
std::vector<llama_token> last_n_tokens(n_ctx);
std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
if (params.interactive) {
fprintf(stderr, "== Running in interactive mode. ==\n"
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
" - Press Ctrl+C to interject at any time.\n"
#endif
" - Press Return to return control to LLaMa.\n"
" - If you want to submit another line, end your input in '\\'.\n\n");
is_interacting = params.interactive_start;
}
bool is_antiprompt = false;
bool input_noecho = false;
int n_past = 0;
int n_remain = params.n_predict;
int n_consumed = 0;
// the first thing we will do is to output the prompt, so set color accordingly
set_console_color(con_st, CONSOLE_COLOR_PROMPT);
std::vector<llama_token> embd;
while (n_remain != 0 || params.interactive) {
// predict
if (embd.size() > 0) {
// infinite text generation via context swapping
// if we run out of context:
// - take the n_keep first tokens from the original prompt (via n_past)
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in a batch
if (n_past + (int) embd.size() > n_ctx) {
const int n_left = n_past - params.n_keep;
n_past = params.n_keep;
// insert n_left/2 tokens at the start of embd from last_n_tokens
embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());
//printf("\n---\n");
//printf("resetting: '");
//for (int i = 0; i < (int) embd.size(); i++) {
// printf("%s", llama_token_to_str(ctx, embd[i]));
//}
//printf("'\n");
//printf("\n---\n");
}
if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return 1;
}
}
n_past += embd.size();
embd.clear();
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
// out of user input, sample next token
const float top_k = params.top_k;
const float top_p = params.top_p;
const float temp = params.temp;
const float repeat_penalty = params.repeat_penalty;
llama_token id = 0;
{
auto logits = llama_get_logits(ctx);
if (params.ignore_eos) {
logits[llama_token_eos()] = 0;
}
id = llama_sample_top_p_top_k(ctx,
last_n_tokens.data() + n_ctx - params.repeat_last_n,
params.repeat_last_n, top_k, top_p, temp, repeat_penalty);
last_n_tokens.erase(last_n_tokens.begin());
last_n_tokens.push_back(id);
}
// replace end of text token with newline token when in interactive mode
if (id == llama_token_eos() && params.interactive && !params.instruct) {
id = llama_token_newline.front();
if (params.antiprompt.size() != 0) {
// tokenize and inject first reverse prompt
const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
}
}
// add it to the context
embd.push_back(id);
// echo this to console
input_noecho = false;
// decrement remaining sampling budget
--n_remain;
} else {
// some user input remains from prompt or interaction, forward it to processing
while ((int) embd_inp.size() > n_consumed) {
embd.push_back(embd_inp[n_consumed]);
last_n_tokens.erase(last_n_tokens.begin());
last_n_tokens.push_back(embd_inp[n_consumed]);
++n_consumed;
if ((int) embd.size() >= params.n_batch) {
break;
}
}
}
// display text
if (!input_noecho) {
for (auto id : embd) {
printf("%s", llama_token_to_str(ctx, id));
}
fflush(stdout);
}
// reset color to default if we there is no pending user input
if (!input_noecho && (int)embd_inp.size() == n_consumed) {
set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
}
// in interactive mode, and not currently processing queued inputs;
// check if we should prompt the user for more
if (params.interactive && (int) embd_inp.size() <= n_consumed) {
// check for reverse prompt
if (params.antiprompt.size()) {
std::string last_output;
for (auto id : last_n_tokens) {
last_output += llama_token_to_str(ctx, id);
}
is_antiprompt = false;
// Check if each of the reverse prompts appears at the end of the output.
for (std::string & antiprompt : params.antiprompt) {
if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
is_interacting = true;
is_antiprompt = true;
set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);
fflush(stdout);
break;
}
}
}
if (n_past > 0 && is_interacting) {
// potentially set color to indicate we are taking user input
set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);
if (params.instruct) {
printf("\n> ");
}
std::string buffer;
if (!params.input_prefix.empty()) {
buffer += params.input_prefix;
printf("%s", buffer.c_str());
}
std::string line;
bool another_line = true;
do {
if (!std::getline(std::cin, line)) {
// input stream is bad or EOF received
return 0;
}
if (line.empty() || line.back() != '\\') {
another_line = false;
} else {
line.pop_back(); // Remove the continue character
}
buffer += line + '\n'; // Append the line to the result
} while (another_line);
// done taking input, reset color
set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
// Add tokens to embd only if the input buffer is non-empty
// Entering a empty line lets the user pass control back
if (buffer.length() > 1) {
// instruct mode: insert instruction prefix
if (params.instruct && !is_antiprompt) {
n_consumed = embd_inp.size();
embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
}
auto line_inp = ::llama_tokenize(ctx, buffer, false);
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
// instruct mode: insert response suffix
if (params.instruct) {
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
}
n_remain -= line_inp.size();
}
input_noecho = true; // do not echo this again
}
if (n_past > 0) {
is_interacting = false;
}
}
// end of text token
if (embd.back() == llama_token_eos()) {
if (params.instruct) {
is_interacting = true;
} else {
fprintf(stderr, " [end of text]\n");
break;
}
}
// In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
if (params.interactive && n_remain <= 0 && params.n_predict != -1) {
n_remain = params.n_predict;
is_interacting = true;
}
}
#if defined (_WIN32)
signal(SIGINT, SIG_DFL);
#endif
llama_print_timings(ctx);
llama_free(ctx);
set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
return 0;
}

View File

@@ -0,0 +1,4 @@
set(TARGET perplexity)
add_executable(${TARGET} perplexity.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ggml ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View File

@@ -0,0 +1,3 @@
# perplexity
TODO

View File

@@ -0,0 +1,138 @@
#include "common.h"
#include "llama.h"
std::vector<double> softmax(const std::vector<float>& logits) {
std::vector<double> probs(logits.size());
float max_logit = logits[0];
for (float v : logits) max_logit = std::max(max_logit, v);
double sum_exp = 0.0;
for (size_t i = 0; i < logits.size(); i++) {
// Subtract the maximum logit value from the current logit value for numerical stability
float logit = logits[i] - max_logit;
double exp_logit = std::exp(logit);
sum_exp += exp_logit;
probs[i] = exp_logit;
}
for (size_t i = 0; i < probs.size(); i++) probs[i] /= sum_exp;
return probs;
}
void perplexity(llama_context * ctx, const gpt_params & params) {
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
// Output: `perplexity: 13.5106 [114/114]`
auto tokens = ::llama_tokenize(ctx, params.prompt, true);
int count = 0;
double nll = 0.0;
int seq_count = tokens.size() / params.n_ctx;
fprintf(stderr, "%s : calculating perplexity over %d chunks\n", __func__, seq_count);
for (int i = 0; i < seq_count; ++i) {
int start = i * params.n_ctx;
int end = start + params.n_ctx - 1;
std::vector<llama_token> embd(tokens.begin() + start, tokens.begin() + end);
auto start_t = std::chrono::high_resolution_clock::now();
if (llama_eval(ctx, embd.data(), embd.size(), 0, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return;
}
auto end_t = std::chrono::high_resolution_clock::now();
if (i == 0) {
double seconds = std::chrono::duration<double>(end_t - start_t).count();
printf("%.2f seconds per pass - ETA %.2f hours\n", seconds, (seconds * seq_count) / (60.0*60.0));
}
// We get the logits for all the tokens in the context window (params.n_ctx)
// from llama_eval above. Now, based on https://huggingface.co/docs/transformers/perplexity,
// calculate the perplexity over the last half the window (so the model always has
// some context to predict the token).
//
// We rely on the fact that attention in the forward pass only looks at previous
// tokens here, so the logits returned for each token are an accurate representation
// of what the model would have predicted at that point.
//
// Example, we have a context window of 512, we will compute perplexity for each of the
// last 256 tokens. Then, we split the input up into context window size chunks to
// process the entire prompt.
auto logits = llama_get_logits(ctx);
for (int j = params.n_ctx / 2; j < params.n_ctx - 1; ++j) {
// Calculate probability of next token, given the previous ones.
int n_vocab = llama_n_vocab(ctx);
std::vector<float> tok_logits(
logits + j * n_vocab,
logits + (j + 1) * n_vocab);
double prob = softmax(tok_logits)[tokens[start + j + 1]];
nll += -std::log(prob);
++count;
}
// perplexity is e^(average negative log-likelihood)
printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
fflush(stdout);
}
printf("\n");
}
int main(int argc, char ** argv) {
gpt_params params;
params.model = "models/llama-7B/ggml-model.bin";
if (gpt_params_parse(argc, argv, params) == false) {
return 1;
}
params.perplexity = true;
if (params.n_ctx > 2048) {
fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
"expect poor results\n", __func__, params.n_ctx);
}
if (params.seed <= 0) {
params.seed = time(NULL);
}
fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
std::mt19937 rng(params.seed);
if (params.random_prompt) {
params.prompt = gpt_random_prompt(rng);
}
llama_context * ctx;
// load the model
{
auto lparams = llama_context_default_params();
lparams.n_ctx = params.n_ctx;
lparams.n_parts = params.n_parts;
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.logits_all = params.perplexity;
lparams.use_mlock = params.use_mlock;
lparams.embedding = params.embedding;
ctx = llama_init_from_file(params.model.c_str(), lparams);
if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
return 1;
}
}
// print system information
{
fprintf(stderr, "\n");
fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
}
perplexity(ctx, params);
llama_print_timings(ctx);
llama_free(ctx);
return 0;
}

View File

@@ -0,0 +1,4 @@
set(TARGET quantize)
add_executable(${TARGET} quantize.cpp)
target_link_libraries(${TARGET} PRIVATE llama ggml ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View File

@@ -0,0 +1,3 @@
# quantize
TODO

View File

@@ -4,8 +4,6 @@
#include <cstdio>
#include <string>
const int QK = 32;
// usage:
// ./llama-quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type
//
@@ -39,7 +37,7 @@ int main(int argc, char ** argv) {
{
const int64_t t_start_us = ggml_time_us();
if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), itype, QK)) {
if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), itype)) {
fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
return 1;
}

1487
ggml.c

File diff suppressed because it is too large Load Diff

4
ggml.h
View File

@@ -748,8 +748,8 @@ enum ggml_opt_result ggml_opt(
// quantization
//
size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int qk, int64_t * hist);
size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int qk, int64_t * hist);
size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
//
// system info

View File

@@ -1261,10 +1261,10 @@ static llama_vocab::id llama_sample_top_p_top_k(
double repeat_penalty) {
auto & rng = lctx.rng;
const auto & vocab = lctx.vocab;
const auto & logits = lctx.logits;
const int n_logits = lctx.model.hparams.n_vocab;
int n_logits = vocab.id_to_token.size();
const auto & logits = lctx.logits;
const auto * plogits = logits.data() + logits.size() - n_logits;
std::vector<std::pair<double, llama_vocab::id>> logits_id;
logits_id.reserve(n_logits);
@@ -1276,13 +1276,13 @@ static llama_vocab::id llama_sample_top_p_top_k(
// credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
if (std::find(last_n_tokens.begin(), last_n_tokens.end(), i) != last_n_tokens.end()) {
// if score < 0 then repetition penalty has to multiplied to reduce the previous token probability
if (logits[i] < 0.0) {
logits_id.push_back(std::make_pair(logits[i]*scale*repeat_penalty, i));
if (plogits[i] < 0.0) {
logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i));
} else {
logits_id.push_back(std::make_pair(logits[i]*scale/repeat_penalty, i));
logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i));
}
} else {
logits_id.push_back(std::make_pair(logits[i]*scale, i));
logits_id.push_back(std::make_pair(plogits[i]*scale, i));
}
}
}
@@ -1345,7 +1345,7 @@ static llama_vocab::id llama_sample_top_p_top_k(
//
// TODO: reuse code from the llama_model_load() somehow
bool llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype, int qk) {
static bool llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) {
ggml_type type = GGML_TYPE_Q4_1;
switch (itype) {
@@ -1568,11 +1568,11 @@ bool llama_model_quantize_internal(const std::string & fname_inp, const std::str
switch (type) {
case GGML_TYPE_Q4_0:
{
cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], qk, hist_cur.data());
cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q4_1:
{
cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], qk, hist_cur.data());
cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
default:
{
@@ -1677,6 +1677,8 @@ struct llama_context * llama_init_from_file(
}
const auto & hparams = ctx->model.hparams;
// resized during inference
if (params.logits_all) {
ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
} else {
@@ -1684,7 +1686,7 @@ struct llama_context * llama_init_from_file(
}
if (params.embedding){
ctx->embedding.reserve(hparams.n_embd);
ctx->embedding.resize(hparams.n_embd);
}
ctx->buf_compute.resize(MEM_REQ_EVAL.at(ctx->model.type));
@@ -1709,9 +1711,8 @@ void llama_free(struct llama_context * ctx) {
int llama_model_quantize(
const char * fname_inp,
const char * fname_out,
int itype,
int qk) {
if (!llama_model_quantize_internal(fname_inp, fname_out, itype, qk)) {
int itype) {
if (!llama_model_quantize_internal(fname_inp, fname_out, itype)) {
fprintf(stderr, "%s: failed to quantize\n", __func__);
return 1;
}
@@ -1761,6 +1762,10 @@ int llama_n_ctx(struct llama_context * ctx) {
return ctx->model.hparams.n_ctx;
}
int llama_n_embd(struct llama_context * ctx) {
return ctx->model.hparams.n_embd;
}
float * llama_get_logits(struct llama_context * ctx) {
return ctx->logits.data();
}

View File

@@ -81,8 +81,7 @@ extern "C" {
LLAMA_API int llama_model_quantize(
const char * fname_inp,
const char * fname_out,
int itype,
int qk);
int itype);
// Run the llama inference to obtain the logits and probabilities for the next token.
// tokens + n_tokens is the provided batch of new tokens to process
@@ -109,6 +108,7 @@ extern "C" {
LLAMA_API int llama_n_vocab(struct llama_context * ctx);
LLAMA_API int llama_n_ctx (struct llama_context * ctx);
LLAMA_API int llama_n_embd (struct llama_context * ctx);
// Token logits obtained from the last call to llama_eval()
// The logits for the last token are stored in the last row

543
main.cpp
View File

@@ -1,543 +0,0 @@
#include "utils.h"
#include "ggml.h"
#include "llama.h"
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
#include <unistd.h>
#elif defined (_WIN32)
#include <signal.h>
#endif
#if defined (_WIN32)
#pragma comment(lib,"kernel32.lib")
extern "C" __declspec(dllimport) void* __stdcall GetStdHandle(unsigned long nStdHandle);
extern "C" __declspec(dllimport) int __stdcall GetConsoleMode(void* hConsoleHandle, unsigned long* lpMode);
extern "C" __declspec(dllimport) int __stdcall SetConsoleMode(void* hConsoleHandle, unsigned long dwMode);
#endif
#define ANSI_COLOR_RED "\x1b[31m"
#define ANSI_COLOR_GREEN "\x1b[32m"
#define ANSI_COLOR_YELLOW "\x1b[33m"
#define ANSI_COLOR_BLUE "\x1b[34m"
#define ANSI_COLOR_MAGENTA "\x1b[35m"
#define ANSI_COLOR_CYAN "\x1b[36m"
#define ANSI_COLOR_RESET "\x1b[0m"
#define ANSI_BOLD "\x1b[1m"
/* Keep track of current color of output, and emit ANSI code if it changes. */
enum console_state {
CONSOLE_STATE_DEFAULT=0,
CONSOLE_STATE_PROMPT,
CONSOLE_STATE_USER_INPUT
};
static console_state con_st = CONSOLE_STATE_DEFAULT;
static bool con_use_color = false;
void set_console_state(console_state new_st)
{
if (!con_use_color) return;
// only emit color code if state changed
if (new_st != con_st) {
con_st = new_st;
switch(con_st) {
case CONSOLE_STATE_DEFAULT:
printf(ANSI_COLOR_RESET);
return;
case CONSOLE_STATE_PROMPT:
printf(ANSI_COLOR_YELLOW);
return;
case CONSOLE_STATE_USER_INPUT:
printf(ANSI_BOLD ANSI_COLOR_GREEN);
return;
}
}
}
std::vector<double> softmax(const std::vector<float>& logits) {
std::vector<double> probs(logits.size());
float max_logit = logits[0];
for (float v : logits) max_logit = std::max(max_logit, v);
double sum_exp = 0.0;
for (size_t i = 0; i < logits.size(); i++) {
// Subtract the maximum logit value from the current logit value for numerical stability
float logit = logits[i] - max_logit;
double exp_logit = std::exp(logit);
sum_exp += exp_logit;
probs[i] = exp_logit;
}
for (size_t i = 0; i < probs.size(); i++) probs[i] /= sum_exp;
return probs;
}
void perplexity(llama_context * ctx, const gpt_params & params) {
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
// Run `./main --perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
// Output: `perplexity: 13.5106 [114/114]`
auto tokens = ::llama_tokenize(ctx, params.prompt, true);
int count = 0;
double nll = 0.0;
int seq_count = tokens.size() / params.n_ctx;
fprintf(stderr, "%s : calculating perplexity over %d chunks\n", __func__, seq_count);
for (int i = 0; i < seq_count; ++i) {
int start = i * params.n_ctx;
int end = start + params.n_ctx - 1;
std::vector<llama_token> embd(tokens.begin() + start, tokens.begin() + end);
auto start_t = std::chrono::high_resolution_clock::now();
if (llama_eval(ctx, embd.data(), embd.size(), 0, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return;
}
auto end_t = std::chrono::high_resolution_clock::now();
if (i == 0) {
double seconds = std::chrono::duration<double>(end_t - start_t).count();
printf("%.2f seconds per pass - ETA %.2f hours\n", seconds, (seconds * seq_count) / (60.0*60.0));
}
// We get the logits for all the tokens in the context window (params.n_ctx)
// from llama_eval above. Now, based on https://huggingface.co/docs/transformers/perplexity,
// calculate the perplexity over the last half the window (so the model always has
// some context to predict the token).
//
// We rely on the fact that attention in the forward pass only looks at previous
// tokens here, so the logits returned for each token are an accurate representation
// of what the model would have predicted at that point.
//
// Example, we have a context window of 512, we will compute perplexity for each of the
// last 256 tokens. Then, we split the input up into context window size chunks to
// process the entire prompt.
auto logits = llama_get_logits(ctx);
for (int j = params.n_ctx / 2; j < params.n_ctx - 1; ++j) {
// Calculate probability of next token, given the previous ones.
int n_vocab = llama_n_vocab(ctx);
std::vector<float> tok_logits(
logits + j * n_vocab,
logits + (j + 1) * n_vocab);
double prob = softmax(tok_logits)[tokens[start + j + 1]];
nll += -std::log(prob);
++count;
}
// perplexity is e^(average negative log-likelihood)
printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
fflush(stdout);
}
printf("\n");
}
static bool is_interacting = false;
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
void sigint_handler(int signo) {
set_console_state(CONSOLE_STATE_DEFAULT);
printf("\n"); // this also force flush stdout.
if (signo == SIGINT) {
if (!is_interacting) {
is_interacting=true;
} else {
_exit(130);
}
}
}
#endif
int main(int argc, char ** argv) {
// has to be called once at the start of the program to init ggml stuff
ggml_time_init();
gpt_params params;
params.model = "models/llama-7B/ggml-model.bin";
if (gpt_params_parse(argc, argv, params) == false) {
return 1;
}
if (params.n_ctx > 2048) {
fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
"expect poor results\n", __func__, params.n_ctx);
}
if (params.seed <= 0) {
params.seed = time(NULL);
}
fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
std::mt19937 rng(params.seed);
if (params.random_prompt) {
params.prompt = gpt_random_prompt(rng);
}
// save choice to use color for later
// (note for later: this is a slightly awkward choice)
con_use_color = params.use_color;
// params.prompt = R"(// this function checks if the number n is prime
//bool is_prime(int n) {)";
llama_context * ctx;
// load the model
{
auto lparams = llama_context_default_params();
lparams.n_ctx = params.n_ctx;
lparams.n_parts = params.n_parts;
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.logits_all = params.perplexity;
lparams.use_mlock = params.use_mlock;
lparams.embedding = params.embedding;
ctx = llama_init_from_file(params.model.c_str(), lparams);
if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
return 1;
}
}
// print system information
{
fprintf(stderr, "\n");
fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
}
// determine the maximum memory usage needed to do inference for the given n_batch and n_predict parameters
// uncomment the "used_mem" line in llama.cpp to see the results
if (params.mem_test) {
{
const std::vector<llama_token> tmp(params.n_batch, 0);
llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
}
{
const std::vector<llama_token> tmp = { 0, };
llama_eval(ctx, tmp.data(), tmp.size(), params.n_predict - 1, params.n_threads);
}
llama_print_timings(ctx);
llama_free(ctx);
return 0;
}
if (params.perplexity) {
perplexity(ctx, params);
exit(0);
}
int n_past = 0;
// Add a space in front of the first character to match OG llama tokenizer behavior
params.prompt.insert(0, 1, ' ');
// tokenize the prompt
auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
const int n_ctx = llama_n_ctx(ctx);
params.n_predict = std::min(params.n_predict, n_ctx - (int) embd_inp.size());
// prefix & suffix for instruct mode
const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true);
const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false);
// in instruct mode, we inject a prefix and a suffix to each input by the user
if (params.instruct) {
params.interactive = true;
params.antiprompt.push_back("### Instruction:\n\n");
}
// enable interactive mode if reverse prompt is specified
if (params.antiprompt.size() != 0) {
params.interactive = true;
}
if (params.interactive_start) {
params.interactive = true;
}
// determine newline token
auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
fprintf(stderr, "\n");
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
for (int i = 0; i < (int) embd_inp.size(); i++) {
fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
}
fprintf(stderr, "\n");
if (params.interactive) {
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
struct sigaction sigint_action;
sigint_action.sa_handler = sigint_handler;
sigemptyset (&sigint_action.sa_mask);
sigint_action.sa_flags = 0;
sigaction(SIGINT, &sigint_action, NULL);
#elif defined (_WIN32)
signal(SIGINT, sigint_handler);
#endif
fprintf(stderr, "%s: interactive mode on.\n", __func__);
if(params.antiprompt.size()) {
for (auto antiprompt : params.antiprompt) {
fprintf(stderr, "Reverse prompt: '%s'\n", antiprompt.c_str());
}
}
if (!params.input_prefix.empty()) {
fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str());
}
}
fprintf(stderr, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
fprintf(stderr, "\n\n");
std::vector<llama_token> embd;
int last_n_size = params.repeat_last_n;
std::vector<llama_token> last_n_tokens(last_n_size);
std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
if (params.interactive) {
fprintf(stderr, "== Running in interactive mode. ==\n"
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
" - Press Ctrl+C to interject at any time.\n"
#endif
" - Press Return to return control to LLaMa.\n"
" - If you want to submit another line, end your input in '\\'.\n\n");
is_interacting = params.interactive_start || params.instruct;
}
int input_consumed = 0;
bool input_noecho = false;
int remaining_tokens = params.n_predict;
#if defined (_WIN32)
if (params.use_color) {
// Enable ANSI colors on Windows 10+
unsigned long dwMode = 0;
void* hConOut = GetStdHandle((unsigned long)-11); // STD_OUTPUT_HANDLE (-11)
if (hConOut && hConOut != (void*)-1 && GetConsoleMode(hConOut, &dwMode) && !(dwMode & 0x4)) {
SetConsoleMode(hConOut, dwMode | 0x4); // ENABLE_VIRTUAL_TERMINAL_PROCESSING (0x4)
}
}
#endif
// the first thing we will do is to output the prompt, so set color accordingly
set_console_state(CONSOLE_STATE_PROMPT);
if (params.embedding){
embd = embd_inp;
if (embd.size() > 0) {
if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return 1;
}
}
const auto embeddings = llama_get_embeddings(ctx);
// TODO: print / use the embeddings
if (params.use_color) {
printf(ANSI_COLOR_RESET);
}
return 0;
}
while (remaining_tokens > 0 || params.interactive) {
// predict
if (embd.size() > 0) {
if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return 1;
}
}
n_past += embd.size();
embd.clear();
if ((int) embd_inp.size() <= input_consumed && !is_interacting) {
// out of user input, sample next token
const float top_k = params.top_k;
const float top_p = params.top_p;
const float temp = params.temp;
const float repeat_penalty = params.repeat_penalty;
llama_token id = 0;
{
auto logits = llama_get_logits(ctx);
if (params.ignore_eos) {
// set the logit of the eos token to zero to avoid sampling it
//logits[logits.size() - n_vocab + EOS_TOKEN_ID] = 0;
// TODO: this does not work of params.logits_all == true
assert(params.perplexity == false);
logits[llama_token_eos()] = 0;
}
id = llama_sample_top_p_top_k(ctx, last_n_tokens.data(), last_n_tokens.size(), top_k, top_p, temp, repeat_penalty);
last_n_tokens.erase(last_n_tokens.begin());
last_n_tokens.push_back(id);
}
// replace end of text token with newline token when in interactive mode
if (id == llama_token_eos() && params.interactive && !params.instruct) {
id = llama_token_newline.front();
if (params.antiprompt.size() != 0) {
// tokenize and inject first reverse prompt
const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
}
}
// add it to the context
embd.push_back(id);
// echo this to console
input_noecho = false;
// decrement remaining sampling budget
--remaining_tokens;
} else {
// some user input remains from prompt or interaction, forward it to processing
while ((int) embd_inp.size() > input_consumed) {
embd.push_back(embd_inp[input_consumed]);
last_n_tokens.erase(last_n_tokens.begin());
last_n_tokens.push_back(embd_inp[input_consumed]);
++input_consumed;
if ((int) embd.size() >= params.n_batch) {
break;
}
}
}
// display text
if (!input_noecho) {
for (auto id : embd) {
printf("%s", llama_token_to_str(ctx, id));
}
fflush(stdout);
}
// reset color to default if we there is no pending user input
if (!input_noecho && (int)embd_inp.size() == input_consumed) {
set_console_state(CONSOLE_STATE_DEFAULT);
}
// in interactive mode, and not currently processing queued inputs;
// check if we should prompt the user for more
if (params.interactive && (int) embd_inp.size() <= input_consumed) {
// check for reverse prompt
std::string last_output;
for (auto id : last_n_tokens) {
last_output += llama_token_to_str(ctx, id);
}
// Check if each of the reverse prompts appears at the end of the output.
for (std::string & antiprompt : params.antiprompt) {
if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
is_interacting = true;
set_console_state(CONSOLE_STATE_USER_INPUT);
fflush(stdout);
break;
}
}
if (n_past > 0 && is_interacting) {
// potentially set color to indicate we are taking user input
set_console_state(CONSOLE_STATE_USER_INPUT);
if (params.instruct) {
input_consumed = embd_inp.size();
embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
printf("\n> ");
}
std::string buffer;
if (!params.input_prefix.empty()) {
buffer += params.input_prefix;
printf("%s", buffer.c_str());
}
std::string line;
bool another_line = true;
do {
std::getline(std::cin, line);
if (line.empty() || line.back() != '\\') {
another_line = false;
} else {
line.pop_back(); // Remove the continue character
}
buffer += line + '\n'; // Append the line to the result
} while (another_line);
// done taking input, reset color
set_console_state(CONSOLE_STATE_DEFAULT);
auto line_inp = ::llama_tokenize(ctx, buffer, false);
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
if (params.instruct) {
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
}
remaining_tokens -= line_inp.size();
input_noecho = true; // do not echo this again
}
if (n_past > 0) {
is_interacting = false;
}
}
// end of text token
if (embd.back() == llama_token_eos()) {
if (params.instruct) {
is_interacting = true;
} else {
fprintf(stderr, " [end of text]\n");
break;
}
}
// In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
if (params.interactive && remaining_tokens <= 0) {
remaining_tokens = params.n_predict;
is_interacting = true;
}
}
#if defined (_WIN32)
signal(SIGINT, SIG_DFL);
#endif
llama_print_timings(ctx);
llama_free(ctx);
set_console_state(CONSOLE_STATE_DEFAULT);
return 0;
}

View File

@@ -74,6 +74,10 @@ def main():
args.models_path, model, "ggml-model-f16.bin"
)
if not os.path.isfile(f16_model_path_base):
print(f'The file %s was not found' % f16_model_path_base)
sys.exit(1)
f16_model_parts_paths = map(
lambda filename: os.path.join(f16_model_path_base, filename),
glob.glob(f"{f16_model_path_base}*")

1
spm-headers/llama.h Symbolic link
View File

@@ -0,0 +1 @@
../llama.h

View File

@@ -1,7 +1,7 @@
function(llama_add_test source)
get_filename_component(TEST_TARGET ${source} NAME_WE)
add_executable(${TEST_TARGET} ${source})
target_link_libraries(${TEST_TARGET} PRIVATE llama ggml utils)
target_link_libraries(${TEST_TARGET} PRIVATE llama)
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
endfunction()

View File

@@ -13,7 +13,7 @@ int main(void) {
src[i] = (float)(i + 1);
}
size_t size = ggml_quantize_q4_0(src, dst, QK, QK, QK, hist);
size_t size = ggml_quantize_q4_0(src, dst, QK, QK, hist);
assert(size == 20);
float max_result = ((float *)dst)[0];
float max_expected = src[31] / ((1 << 3) - 1);
@@ -24,7 +24,7 @@ int main(void) {
assert(q4_result == q4_expected);
}
size = ggml_quantize_q4_1(src, dst, QK, QK, QK, hist);
size = ggml_quantize_q4_1(src, dst, QK, QK, hist);
assert(size == 24);
float delta_result = ((float *)dst)[0];
float delta_expected = (src[31] - src[0]) / ((1 << 4) - 1);

View File

@@ -1,9 +1,9 @@
#include "utils.h"
#include "llama.h"
#include <cstdio>
#include <string>
#include <map>
#include <vector>
static const std::map<std::string, std::vector<llama_token>> k_tests = {
{ "Hello World", { 1, 10994, 2787, }, },
@@ -48,7 +48,9 @@ int main(int argc, char **argv) {
}
for (const auto & test_kv : k_tests) {
const auto res = ::llama_tokenize(ctx, test_kv.first, true);
std::vector<llama_token> res(test_kv.first.size());
const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), res.size(), true);
res.resize(n);
bool correct = res.size() == test_kv.second.size();