mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-02-26 14:23:22 +02:00
Compare commits
39 Commits
master-162
...
master-884
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
884e7d7a2b | ||
|
|
7cd5c4a3e9 | ||
|
|
f3d4edf504 | ||
|
|
8944a13296 | ||
|
|
6667401238 | ||
|
|
77a73403ca | ||
|
|
50a8a2af97 | ||
|
|
4caebf6d40 | ||
|
|
dcdd65e296 | ||
|
|
5ecff35151 | ||
|
|
7faa7460f0 | ||
|
|
5af8e32238 | ||
|
|
42747220b4 | ||
|
|
e9298af389 | ||
|
|
4ad73137a1 | ||
|
|
315a95a4d3 | ||
|
|
efd05648c8 | ||
|
|
eb17a026fd | ||
|
|
69b740289f | ||
|
|
f266259ad9 | ||
|
|
47f61aaa5f | ||
|
|
3173a62eb9 | ||
|
|
489537e6cf | ||
|
|
2d3481c721 | ||
|
|
74f5899df4 | ||
|
|
2f7c8e014e | ||
|
|
0ad964631f | ||
|
|
e95b6554b4 | ||
|
|
aa485cee33 | ||
|
|
c12b14b77f | ||
|
|
106faaf297 | ||
|
|
c85e03d12e | ||
|
|
489093548c | ||
|
|
93265e988a | ||
|
|
c56b715269 | ||
|
|
f4d277ae17 | ||
|
|
c9a59b70a5 | ||
|
|
a32f7acc9f | ||
|
|
43ffdefb74 |
@@ -5,9 +5,10 @@ FROM ubuntu:$UBUNTU_VERSION as build
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential python3 python3-pip
|
||||
|
||||
COPY requirements.txt requirements.txt
|
||||
|
||||
RUN pip install --upgrade pip setuptools wheel \
|
||||
&& pip install numpy requests sentencepiece tqdm \
|
||||
&& pip install torch --index-url https://download.pytorch.org/whl/cpu
|
||||
&& pip install -r requirements.txt
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
|
||||
14
.github/workflows/build.yml
vendored
14
.github/workflows/build.yml
vendored
@@ -8,6 +8,8 @@ on:
|
||||
required: true
|
||||
type: boolean
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']
|
||||
pull_request:
|
||||
types: [opened, synchronize, edited, reopened, review_requested, ready_for_review]
|
||||
@@ -18,6 +20,8 @@ env:
|
||||
|
||||
jobs:
|
||||
ubuntu-latest-make:
|
||||
if: github.event.pull_request.draft == false
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
@@ -37,6 +41,8 @@ jobs:
|
||||
make
|
||||
|
||||
ubuntu-latest-cmake:
|
||||
if: github.event.pull_request.draft == false
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
@@ -65,6 +71,8 @@ jobs:
|
||||
ctest --verbose
|
||||
|
||||
ubuntu-latest-cmake-sanitizer:
|
||||
if: github.event.pull_request.draft == false
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
continue-on-error: true
|
||||
@@ -101,6 +109,8 @@ jobs:
|
||||
ctest --verbose
|
||||
|
||||
macOS-latest-make:
|
||||
if: github.event.pull_request.draft == false
|
||||
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
@@ -119,6 +129,8 @@ jobs:
|
||||
make
|
||||
|
||||
macOS-latest-cmake:
|
||||
if: github.event.pull_request.draft == false
|
||||
|
||||
runs-on: macOS-latest
|
||||
|
||||
steps:
|
||||
@@ -146,6 +158,8 @@ jobs:
|
||||
ctest --verbose
|
||||
|
||||
windows-latest-cmake:
|
||||
if: github.event.pull_request.draft == false
|
||||
|
||||
runs-on: windows-latest
|
||||
|
||||
strategy:
|
||||
|
||||
2
.github/workflows/docker.yml
vendored
2
.github/workflows/docker.yml
vendored
@@ -18,6 +18,8 @@ on:
|
||||
jobs:
|
||||
push_to_registry:
|
||||
name: Push Docker image to Docker Hub
|
||||
if: github.event.pull_request.draft == false
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
COMMIT_SHA: ${{ github.sha }}
|
||||
|
||||
16
.gitignore
vendored
16
.gitignore
vendored
@@ -1,11 +1,15 @@
|
||||
*.o
|
||||
*.a
|
||||
.DS_Store
|
||||
.build/
|
||||
.cache/
|
||||
.direnv/
|
||||
.envrc
|
||||
.swiftpm
|
||||
.venv
|
||||
.vs/
|
||||
.vscode/
|
||||
.DS_Store
|
||||
|
||||
.build/
|
||||
build/
|
||||
build-em/
|
||||
build-debug/
|
||||
@@ -24,17 +28,15 @@ models/*
|
||||
/perplexity
|
||||
/embedding
|
||||
/benchmark-q4_0-matmult
|
||||
/vdot
|
||||
/Pipfile
|
||||
|
||||
arm_neon.h
|
||||
compile_commands.json
|
||||
|
||||
.envrc
|
||||
.direnv/
|
||||
|
||||
.venv
|
||||
__pycache__
|
||||
.swiftpm
|
||||
|
||||
zig-out/
|
||||
zig-cache/
|
||||
|
||||
ppl-*.txt
|
||||
|
||||
@@ -55,6 +55,8 @@ option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer"
|
||||
option(LLAMA_AVX "llama: enable AVX" ON)
|
||||
option(LLAMA_AVX2 "llama: enable AVX2" ON)
|
||||
option(LLAMA_AVX512 "llama: enable AVX512" OFF)
|
||||
option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF)
|
||||
option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF)
|
||||
option(LLAMA_FMA "llama: enable FMA" ON)
|
||||
# in MSVC F16C is implied with AVX2/AVX512
|
||||
if (NOT MSVC)
|
||||
@@ -64,6 +66,7 @@ endif()
|
||||
# 3rd party libs
|
||||
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
|
||||
option(LLAMA_OPENBLAS "llama: use OpenBLAS" OFF)
|
||||
option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
|
||||
|
||||
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
|
||||
@@ -120,11 +123,46 @@ if (LLAMA_OPENBLAS)
|
||||
add_compile_definitions(GGML_USE_OPENBLAS)
|
||||
add_link_options(${BLAS_LIBRARIES})
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} openblas)
|
||||
|
||||
# find header file
|
||||
set(OPENBLAS_INCLUDE_SEARCH_PATHS
|
||||
/usr/include
|
||||
/usr/include/openblas
|
||||
/usr/include/openblas-base
|
||||
/usr/local/include
|
||||
/usr/local/include/openblas
|
||||
/usr/local/include/openblas-base
|
||||
/opt/OpenBLAS/include
|
||||
$ENV{OpenBLAS_HOME}
|
||||
$ENV{OpenBLAS_HOME}/include
|
||||
)
|
||||
find_path(OPENBLAS_INC NAMES cblas.h PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
|
||||
add_compile_options(-I${OPENBLAS_INC})
|
||||
else()
|
||||
message(WARNING "OpenBLAS not found")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (LLAMA_CUBLAS)
|
||||
cmake_minimum_required(VERSION 3.17)
|
||||
|
||||
find_package(CUDAToolkit)
|
||||
if (CUDAToolkit_FOUND)
|
||||
message(STATUS "cuBLAS found")
|
||||
|
||||
add_compile_definitions(GGML_USE_CUBLAS)
|
||||
|
||||
if (LLAMA_STATIC)
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
|
||||
else()
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
|
||||
endif()
|
||||
|
||||
else()
|
||||
message(WARNING "cuBLAS not found")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (LLAMA_ALL_WARNINGS)
|
||||
if (NOT MSVC)
|
||||
set(c_flags
|
||||
@@ -136,7 +174,6 @@ if (LLAMA_ALL_WARNINGS)
|
||||
-Wshadow
|
||||
-Wstrict-prototypes
|
||||
-Wpointer-arith
|
||||
-Wno-unused-function
|
||||
)
|
||||
set(cxx_flags
|
||||
-Wall
|
||||
@@ -205,6 +242,16 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
|
||||
if (MSVC)
|
||||
if (LLAMA_AVX512)
|
||||
add_compile_options(/arch:AVX512)
|
||||
# MSVC has no compile-time flags enabling specific
|
||||
# AVX512 extensions, neither it defines the
|
||||
# macros corresponding to the extensions.
|
||||
# Do it manually.
|
||||
if (LLAMA_AVX512_VBMI)
|
||||
add_compile_definitions(__AVX512VBMI__)
|
||||
endif()
|
||||
if (LLAMA_AVX512_VNNI)
|
||||
add_compile_definitions(__AVX512VNNI__)
|
||||
endif()
|
||||
elseif (LLAMA_AVX2)
|
||||
add_compile_options(/arch:AVX2)
|
||||
elseif (LLAMA_AVX)
|
||||
@@ -225,9 +272,13 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
|
||||
endif()
|
||||
if (LLAMA_AVX512)
|
||||
add_compile_options(-mavx512f)
|
||||
# add_compile_options(-mavx512cd)
|
||||
# add_compile_options(-mavx512dq)
|
||||
# add_compile_options(-mavx512bw)
|
||||
add_compile_options(-mavx512bw)
|
||||
endif()
|
||||
if (LLAMA_AVX512_VBMI)
|
||||
add_compile_options(-mavx512vbmi)
|
||||
endif()
|
||||
if (LLAMA_AVX512_VNNI)
|
||||
add_compile_options(-mavx512vnni)
|
||||
endif()
|
||||
endif()
|
||||
else()
|
||||
@@ -274,4 +325,5 @@ endif ()
|
||||
|
||||
if (LLAMA_BUILD_EXAMPLES)
|
||||
add_subdirectory(examples)
|
||||
add_subdirectory(pocs)
|
||||
endif()
|
||||
|
||||
35
Makefile
35
Makefile
@@ -36,7 +36,7 @@ CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
|
||||
LDFLAGS =
|
||||
|
||||
# warnings
|
||||
CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wno-unused-function
|
||||
CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith
|
||||
CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar
|
||||
|
||||
# OS specific
|
||||
@@ -97,6 +97,10 @@ ifdef LLAMA_OPENBLAS
|
||||
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
|
||||
LDFLAGS += -lopenblas
|
||||
endif
|
||||
ifdef LLAMA_CUBLAS
|
||||
CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include
|
||||
LDFLAGS += -lcublas_static -lculibos -lcudart_static -lcublasLt_static -lpthread -ldl -L/usr/local/cuda/lib64
|
||||
endif
|
||||
ifdef LLAMA_GPROF
|
||||
CFLAGS += -pg
|
||||
CXXFLAGS += -pg
|
||||
@@ -133,51 +137,54 @@ $(info I CC: $(CCV))
|
||||
$(info I CXX: $(CXXV))
|
||||
$(info )
|
||||
|
||||
default: main quantize perplexity embedding
|
||||
default: main quantize quantize-stats perplexity embedding vdot
|
||||
|
||||
#
|
||||
# Build library
|
||||
#
|
||||
|
||||
ggml.o: ggml.c ggml.h
|
||||
$(CC) $(CFLAGS) -c ggml.c -o ggml.o
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
llama.o: llama.cpp llama.h llama_util.h
|
||||
$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o
|
||||
llama.o: llama.cpp ggml.h llama.h llama_util.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
common.o: examples/common.cpp examples/common.h
|
||||
$(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
clean:
|
||||
rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-q4_0-matmult
|
||||
|
||||
main: examples/main/main.cpp ggml.o llama.o common.o
|
||||
$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
@echo
|
||||
@echo '==== Run ./main -h for help. ===='
|
||||
@echo
|
||||
|
||||
quantize: examples/quantize/quantize.cpp ggml.o llama.o
|
||||
$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS)
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
|
||||
quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o
|
||||
$(CXX) $(CXXFLAGS) examples/quantize-stats/quantize-stats.cpp ggml.o llama.o -o quantize-stats $(LDFLAGS)
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
|
||||
perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
|
||||
$(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS)
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
|
||||
embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
|
||||
$(CXX) $(CXXFLAGS) examples/embedding/embedding.cpp ggml.o llama.o common.o -o embedding $(LDFLAGS)
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
|
||||
vdot: pocs/vdot/vdot.cpp ggml.o
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
|
||||
libllama.so: llama.o ggml.o
|
||||
$(CXX) $(CXXFLAGS) -shared -fPIC -o libllama.so llama.o ggml.o $(LDFLAGS)
|
||||
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
|
||||
|
||||
#
|
||||
# Tests
|
||||
#
|
||||
|
||||
benchmark: ggml.o
|
||||
$(CXX) $(CXXFLAGS) examples/benchmark/benchmark-q4_0-matmult.c ggml.o -o benchmark-q4_0-matmult $(LDFLAGS)
|
||||
benchmark: examples/benchmark/benchmark-q4_0-matmult.c ggml.o
|
||||
$(CXX) $(CXXFLAGS) $^ -o benchmark-q4_0-matmult $(LDFLAGS)
|
||||
./benchmark-q4_0-matmult
|
||||
|
||||
.PHONY: tests
|
||||
|
||||
@@ -7,8 +7,13 @@
|
||||
|
||||
Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
|
||||
|
||||
**Warnings**
|
||||
|
||||
- `Q4_2` and `Q4_3` are still in development. Do not expect any kind of backward compatibility until they are finalize
|
||||
|
||||
**Hot topics:**
|
||||
|
||||
- [Added LoRA support](https://github.com/ggerganov/llama.cpp/pull/820)
|
||||
- [Add GPU support to ggml](https://github.com/ggerganov/llama.cpp/discussions/915)
|
||||
- [Roadmap Apr 2023](https://github.com/ggerganov/llama.cpp/discussions/784)
|
||||
|
||||
@@ -50,6 +55,7 @@ New features will probably be added mostly through community contributions.
|
||||
- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
|
||||
- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
|
||||
- Node.js: [hlhr202/llama-node](https://github.com/hlhr202/llama-node)
|
||||
- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
|
||||
|
||||
**UI:**
|
||||
|
||||
|
||||
124
convert-lora-to-ggml.py
Normal file
124
convert-lora-to-ggml.py
Normal file
@@ -0,0 +1,124 @@
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import struct
|
||||
import sys
|
||||
from typing import Any, Dict, Sequence, TextIO
|
||||
|
||||
import torch
|
||||
|
||||
from convert import DATA_TYPE_TO_FTYPE, NUMPY_TYPE_TO_DATA_TYPE, DataType
|
||||
|
||||
HF_SUBLAYER_TO_GGML = {
|
||||
"self_attn.q_proj": "attention.wq",
|
||||
"self_attn.k_proj": "attention.wk",
|
||||
"self_attn.v_proj": "attention.wv",
|
||||
"self_attn.o_proj": "attention.wo",
|
||||
"mlp.gate_proj": "feed_forward.w1",
|
||||
"mlp.down_proj": "feed_forward.w2",
|
||||
"mlp.up_proj": "feed_forward.w3",
|
||||
"input_layernorm": "attention_norm",
|
||||
"post_attention_layernorm": "ffn_norm",
|
||||
# "norm": "norm",
|
||||
# "embed_tokens": "tok_embeddings",
|
||||
# "lm_head": "output",
|
||||
}
|
||||
|
||||
|
||||
def translate_tensor_name(t: str) -> str:
|
||||
match = re.match(r".*layers\.(\d+)\.(\w+\.\w+)\.lora_(A|B)\.weight", t)
|
||||
if match:
|
||||
nn = match.group(1)
|
||||
sub_layer = match.group(2)
|
||||
lora_type = match.group(3)
|
||||
|
||||
sub_layer_renamed = HF_SUBLAYER_TO_GGML.get(sub_layer)
|
||||
if sub_layer_renamed is None:
|
||||
print(f"Error: unrecognized sub-layer {sub_layer} in tensor {t}")
|
||||
sys.exit(1)
|
||||
|
||||
output_string = (
|
||||
f"layers.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}"
|
||||
)
|
||||
return output_string
|
||||
else:
|
||||
print(f"Error: unrecognized tensor {t}")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def write_file_header(fout: TextIO, params: Dict[str, Any]) -> None:
|
||||
fout.write(b"ggla"[::-1]) # magic (ggml lora)
|
||||
fout.write(struct.pack("i", 1)) # file version
|
||||
fout.write(struct.pack("ii", params["r"], params["lora_alpha"]))
|
||||
|
||||
|
||||
def write_tensor_header(
|
||||
self, name: str, shape: Sequence[int], data_type: DataType
|
||||
) -> None:
|
||||
sname = name.encode("utf-8")
|
||||
fout.write(
|
||||
struct.pack(
|
||||
"iii",
|
||||
len(shape),
|
||||
len(sname),
|
||||
DATA_TYPE_TO_FTYPE[NUMPY_TYPE_TO_DATA_TYPE[data_type]],
|
||||
)
|
||||
)
|
||||
fout.write(struct.pack("i" * len(shape), *shape[::-1]))
|
||||
fout.write(sname)
|
||||
fout.seek((fout.tell() + 31) & -32)
|
||||
|
||||
|
||||
if len(sys.argv) != 2:
|
||||
print(f"Usage: python {sys.argv[0]} <path>")
|
||||
print(
|
||||
"Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
|
||||
)
|
||||
sys.exit(1)
|
||||
|
||||
input_json = os.path.join(sys.argv[1], "adapter_config.json")
|
||||
input_model = os.path.join(sys.argv[1], "adapter_model.bin")
|
||||
output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")
|
||||
|
||||
model = torch.load(input_model, map_location="cpu")
|
||||
|
||||
with open(input_json, "r") as f:
|
||||
params = json.load(f)
|
||||
|
||||
if params["peft_type"] != "LORA":
|
||||
print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
|
||||
sys.exit(1)
|
||||
|
||||
if params["fan_in_fan_out"] == True:
|
||||
print("Error: param fan_in_fan_out is not supported")
|
||||
sys.exit(1)
|
||||
|
||||
if params["bias"] is not None and params["bias"] != "none":
|
||||
print("Error: param bias is not supported")
|
||||
sys.exit(1)
|
||||
|
||||
# TODO: these seem to be layers that have been trained but without lora.
|
||||
# doesn't seem widely used but eventually should be supported
|
||||
if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0:
|
||||
print("Error: param modules_to_save is not supported")
|
||||
sys.exit(1)
|
||||
|
||||
with open(output_path, "wb") as fout:
|
||||
fout.truncate()
|
||||
|
||||
write_file_header(fout, params)
|
||||
for k, v in model.items():
|
||||
if k.endswith("lora_A.weight"):
|
||||
if v.dtype != torch.float16 and v.dtype != torch.float32:
|
||||
v = v.float()
|
||||
v = v.T
|
||||
else:
|
||||
v = v.float()
|
||||
|
||||
t = v.numpy()
|
||||
tname = translate_tensor_name(k)
|
||||
print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
|
||||
write_tensor_header(fout, tname, t.shape, t.dtype)
|
||||
t.tofile(fout)
|
||||
|
||||
print(f"Converted {input_json} and {input_model} to {output_path}")
|
||||
22
convert.py
22
convert.py
@@ -18,10 +18,12 @@ import zipfile
|
||||
from abc import ABCMeta, abstractmethod
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List,
|
||||
Literal, Optional, Sequence, Tuple, TypeVar, Union)
|
||||
|
||||
import numpy as np
|
||||
from sentencepiece import SentencePieceProcessor # type: ignore
|
||||
from typing import (IO, Any, Callable, Iterable, Literal, Optional, Sequence,
|
||||
TypeVar, Union, List, Dict, Tuple, TYPE_CHECKING)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from typing_extensions import TypeAlias
|
||||
|
||||
@@ -684,7 +686,7 @@ class LazyUnpickler(pickle.Unpickler):
|
||||
description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
|
||||
return LazyStorage(load=load, kind=pid[1], description=description)
|
||||
|
||||
def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any, # pyright: ignore[reportSelfClsParameterName]
|
||||
def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any, # pyright: ignore[reportSelfClsParameterName]
|
||||
requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
|
||||
assert isinstance(storage, LazyStorage)
|
||||
|
||||
@@ -733,7 +735,7 @@ def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus:
|
||||
header: Dict[str, Dict[str, Any]] = json.loads(fp.read(header_size))
|
||||
# Use mmap for the actual data to avoid race conditions with the file offset.
|
||||
mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ))
|
||||
byte_buf = mapped[fp.tell():]
|
||||
byte_buf = mapped[8 + header_size:]
|
||||
|
||||
def convert(info: Dict[str, Any]) -> LazyTensor:
|
||||
data_type = SAFETENSORS_DATA_TYPES[info['dtype']]
|
||||
@@ -759,7 +761,7 @@ def must_read(fp: IO[bytes], length: int) -> bytes:
|
||||
return ret
|
||||
|
||||
|
||||
def lazy_load_ggml_file(fp: IO[bytes], path: Path) -> ModelPlus:
|
||||
def lazy_load_ggml_file(fp: io.BufferedReader, path: Path) -> ModelPlus:
|
||||
magic = must_read(fp, 4)[::-1]
|
||||
if magic in (b'ggmf', b'ggjt'):
|
||||
version, = struct.unpack("i", must_read(fp, 4))
|
||||
@@ -793,7 +795,9 @@ def lazy_load_ggml_file(fp: IO[bytes], path: Path) -> ModelPlus:
|
||||
|
||||
model: LazyModel = {}
|
||||
# Use mmap for the actual data to avoid race conditions with the file offset.
|
||||
off = fp.raw.tell()
|
||||
mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ))
|
||||
fp.raw.seek(off) # needed on Windows
|
||||
|
||||
def read_tensor() -> None: # this is a function so that variables captured in `load` don't change
|
||||
shape_len, name_len, ftype = struct.unpack("iii", must_read(fp, 12))
|
||||
@@ -947,8 +951,9 @@ class OutputFile:
|
||||
|
||||
ndarrays = bounded_parallel_map(do_item, model.items(), concurrency=8)
|
||||
for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
|
||||
size = ' x '.join(map(str, lazy_tensor.shape))
|
||||
print(f"[{i+1}/{len(model)}] Writing tensor {name}, size {size}...")
|
||||
size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
|
||||
padi = len(str(len(model)))
|
||||
print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type}")
|
||||
of.write_tensor_header(name, lazy_tensor.shape, lazy_tensor.data_type)
|
||||
ndarray.tofile(of.fout)
|
||||
of.fout.close()
|
||||
@@ -1080,6 +1085,7 @@ def default_outfile(model_paths: List[Path], params: Params) -> Path:
|
||||
namestr = {
|
||||
GGMLFileType.AllF32: "f32",
|
||||
GGMLFileType.MostlyF16: "f16",
|
||||
GGMLFileType.MostlyQ4_0: "q4_0",
|
||||
GGMLFileType.MostlyQ4_1: "q4_1",
|
||||
GGMLFileType.PerLayerIsQ4_1: "q4_1",
|
||||
}[params.file_type]
|
||||
@@ -1103,7 +1109,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
|
||||
parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
|
||||
parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
|
||||
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
|
||||
parser.add_argument("--outtype", choices=["f32", "f16", "q4_1"], help="output format (default: based on input)")
|
||||
parser.add_argument("--outtype", choices=["f32", "f16", "q4_1", "q4_0"], help="output format (default: based on input)")
|
||||
parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
|
||||
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
|
||||
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
|
||||
|
||||
@@ -24,7 +24,7 @@
|
||||
|
||||
float tensor_sum_elements(struct ggml_tensor * tensor) {
|
||||
float sum = 0;
|
||||
if (tensor->type==6) {
|
||||
if (tensor->type==GGML_TYPE_F32) {
|
||||
for (int j = 0; j < tensor->ne[1]; j++) {
|
||||
for (int k = 0; k < tensor->ne[0]; k++) {
|
||||
sum += ((float *) tensor->data)[j*tensor->ne[0]+k];
|
||||
|
||||
@@ -139,6 +139,19 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
break;
|
||||
}
|
||||
params.model = argv[i];
|
||||
} else if (arg == "--lora") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.lora_adapter = argv[i];
|
||||
params.use_mmap = false;
|
||||
} else if (arg == "--lora-base") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.lora_base = argv[i];
|
||||
} else if (arg == "-i" || arg == "--interactive") {
|
||||
params.interactive = true;
|
||||
} else if (arg == "--embedding") {
|
||||
@@ -242,6 +255,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
}
|
||||
fprintf(stderr, " --mtest compute maximum memory usage\n");
|
||||
fprintf(stderr, " --verbose-prompt print prompt before generation\n");
|
||||
fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
|
||||
fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
|
||||
fprintf(stderr, " -m FNAME, --model FNAME\n");
|
||||
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
@@ -31,11 +31,12 @@ struct gpt_params {
|
||||
|
||||
std::string model = "models/lamma-7B/ggml-model.bin"; // model path
|
||||
std::string prompt = "";
|
||||
std::string input_prefix = ""; // string to prefix user inputs with
|
||||
|
||||
|
||||
std::string input_prefix = ""; // string to prefix user inputs with
|
||||
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
|
||||
|
||||
std::string lora_adapter = ""; // lora adapter path
|
||||
std::string lora_base = ""; // base model path for the lora adapter
|
||||
|
||||
bool memory_f16 = true; // use f16 instead of f32 for memory kv
|
||||
bool random_prompt = false; // do not randomize prompt if none provided
|
||||
bool use_color = false; // use color to distinguish generations and inputs
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <ctime>
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
params.model = "models/llama-7B/ggml-model.bin";
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
@@ -113,6 +114,17 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
}
|
||||
|
||||
if (!params.lora_adapter.empty()) {
|
||||
int err = llama_apply_lora_from_file(ctx,
|
||||
params.lora_adapter.c_str(),
|
||||
params.lora_base.empty() ? NULL : params.lora_base.c_str(),
|
||||
params.n_threads);
|
||||
if (err != 0) {
|
||||
fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
// print system information
|
||||
{
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
#include "llama.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <ctime>
|
||||
|
||||
std::vector<float> softmax(const std::vector<float>& logits) {
|
||||
std::vector<float> probs(logits.size());
|
||||
@@ -133,6 +134,17 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
}
|
||||
|
||||
if (!params.lora_adapter.empty()) {
|
||||
int err = llama_apply_lora_from_file(ctx,
|
||||
params.lora_adapter.c_str(),
|
||||
params.lora_base.empty() ? NULL : params.lora_base.c_str(),
|
||||
params.n_threads);
|
||||
if (err != 0) {
|
||||
fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
// print system information
|
||||
{
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
@@ -16,9 +16,6 @@
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
static const char * type_strs[] = { "q4_0", "q4_1", "i8", "i16", "i32", "f16", "f32" };
|
||||
static_assert(sizeof(type_strs) == GGML_TYPE_COUNT * sizeof(char *), "Incomplete type list");
|
||||
|
||||
struct quantize_stats_params {
|
||||
std::string model = "models/7B/ggml-model-f16.bin";
|
||||
bool verbose = false;
|
||||
@@ -224,7 +221,7 @@ int main(int argc, char ** argv) {
|
||||
break;
|
||||
}
|
||||
int j;
|
||||
for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], type_strs[j]) != 0; j++) {
|
||||
for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], ggml_type_name((ggml_type) j)) != 0; j++) {
|
||||
// find match
|
||||
}
|
||||
if (j < GGML_TYPE_COUNT) {
|
||||
@@ -279,7 +276,7 @@ int main(int argc, char ** argv) {
|
||||
continue;
|
||||
}
|
||||
if (params.verbose) {
|
||||
printf("%s: type %s, size %" PRId64 "\n", kv_tensor.first.c_str(), type_strs[kv_tensor.second->type], ggml_nelements(kv_tensor.second));
|
||||
printf("%s: type %s, size %" PRId64 "\n", kv_tensor.first.c_str(), ggml_type_name(kv_tensor.second->type), ggml_nelements(kv_tensor.second));
|
||||
}
|
||||
if (kv_tensor.second->type == GGML_TYPE_F16) {
|
||||
is_f16 = true;
|
||||
@@ -304,13 +301,14 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// loop throught quantization types
|
||||
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
||||
const ggml_type type = (ggml_type) i;
|
||||
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
|
||||
continue;
|
||||
}
|
||||
quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
|
||||
if (qfns.quantize_row_q && qfns.dequantize_row_q) {
|
||||
if (params.verbose) {
|
||||
printf("testing %s ...\n", type_strs[i]);
|
||||
printf("testing %s ...\n", ggml_type_name(type));
|
||||
}
|
||||
|
||||
error_stats global_stats {};
|
||||
@@ -322,7 +320,7 @@ int main(int argc, char ** argv) {
|
||||
if (params.verbose) {
|
||||
printf(" %s ...\n", kv_tensor.first.c_str());
|
||||
}
|
||||
std::string layer_name { type_strs[i] };
|
||||
std::string layer_name { ggml_type_name(type) };
|
||||
layer_name += "::" + kv_tensor.first;
|
||||
test_roundtrip_on_layer(
|
||||
layer_name,
|
||||
@@ -337,7 +335,7 @@ int main(int argc, char ** argv) {
|
||||
);
|
||||
}
|
||||
|
||||
print_error_stats(type_strs[i], global_stats, params.print_histogram);
|
||||
print_error_stats(ggml_type_name(type), global_stats, params.print_histogram);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -14,6 +14,7 @@ int main(int argc, char ** argv) {
|
||||
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
|
||||
fprintf(stderr, " type = %d - q4_0\n", LLAMA_FTYPE_MOSTLY_Q4_0);
|
||||
fprintf(stderr, " type = %d - q4_1\n", LLAMA_FTYPE_MOSTLY_Q4_1);
|
||||
fprintf(stderr, " type = %d - q4_2\n", LLAMA_FTYPE_MOSTLY_Q4_2);
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
@@ -10,7 +10,6 @@
|
||||
inherit system;
|
||||
};
|
||||
llama-python = pkgs.python310.withPackages (ps: with ps; [
|
||||
torch
|
||||
numpy
|
||||
sentencepiece
|
||||
]);
|
||||
|
||||
33
ggml.h
33
ggml.h
@@ -204,6 +204,8 @@ enum ggml_type {
|
||||
GGML_TYPE_F16 = 1,
|
||||
GGML_TYPE_Q4_0 = 2,
|
||||
GGML_TYPE_Q4_1 = 3,
|
||||
GGML_TYPE_Q4_2 = 4,
|
||||
GGML_TYPE_Q8_0 = 5,
|
||||
GGML_TYPE_I8,
|
||||
GGML_TYPE_I16,
|
||||
GGML_TYPE_I32,
|
||||
@@ -253,6 +255,9 @@ enum ggml_op {
|
||||
GGML_OP_FLASH_ATTN,
|
||||
GGML_OP_FLASH_FF,
|
||||
|
||||
GGML_OP_MAP_UNARY,
|
||||
GGML_OP_MAP_BINARY,
|
||||
|
||||
GGML_OP_COUNT,
|
||||
};
|
||||
|
||||
@@ -351,6 +356,8 @@ int ggml_blck_size (enum ggml_type type);
|
||||
size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
|
||||
float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
|
||||
|
||||
const char * ggml_type_name(enum ggml_type type);
|
||||
|
||||
size_t ggml_element_size(const struct ggml_tensor * tensor);
|
||||
|
||||
struct ggml_context * ggml_init(struct ggml_init_params params);
|
||||
@@ -424,6 +431,12 @@ struct ggml_tensor * ggml_add(
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
|
||||
struct ggml_tensor * ggml_add_inplace(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
struct ggml_tensor * ggml_sub(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
@@ -652,6 +665,21 @@ struct ggml_tensor * ggml_flash_ff(
|
||||
struct ggml_tensor * c0,
|
||||
struct ggml_tensor * c1);
|
||||
|
||||
// Mapping operations
|
||||
typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
|
||||
typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
|
||||
|
||||
struct ggml_tensor * ggml_map_unary_f32(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
const ggml_unary_op_f32_t fun);
|
||||
|
||||
struct ggml_tensor * ggml_map_binary_f32(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
const ggml_binary_op_f32_t fun);
|
||||
|
||||
//
|
||||
// automatic differentiation
|
||||
//
|
||||
@@ -779,6 +807,7 @@ enum ggml_opt_result ggml_opt(
|
||||
|
||||
size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
|
||||
//
|
||||
// system info
|
||||
@@ -787,6 +816,8 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t *
|
||||
int ggml_cpu_has_avx(void);
|
||||
int ggml_cpu_has_avx2(void);
|
||||
int ggml_cpu_has_avx512(void);
|
||||
int ggml_cpu_has_avx512_vbmi(void);
|
||||
int ggml_cpu_has_avx512_vnni(void);
|
||||
int ggml_cpu_has_fma(void);
|
||||
int ggml_cpu_has_neon(void);
|
||||
int ggml_cpu_has_arm_fma(void);
|
||||
@@ -794,6 +825,7 @@ int ggml_cpu_has_f16c(void);
|
||||
int ggml_cpu_has_fp16_va(void);
|
||||
int ggml_cpu_has_wasm_simd(void);
|
||||
int ggml_cpu_has_blas(void);
|
||||
int ggml_cpu_has_cublas(void);
|
||||
int ggml_cpu_has_sse3(void);
|
||||
int ggml_cpu_has_vsx(void);
|
||||
|
||||
@@ -816,6 +848,7 @@ typedef struct {
|
||||
dequantize_row_q_t dequantize_row_q;
|
||||
quantize_row_q_t quantize_row_q;
|
||||
quantize_row_q_t quantize_row_q_reference;
|
||||
quantize_row_q_t quantize_row_q_dot;
|
||||
vec_dot_q_t vec_dot_q;
|
||||
} quantize_fns_t;
|
||||
|
||||
|
||||
388
llama.cpp
388
llama.cpp
@@ -1,6 +1,8 @@
|
||||
// Defines fileno on msys:
|
||||
#ifndef _GNU_SOURCE
|
||||
#define _GNU_SOURCE
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#endif
|
||||
|
||||
#include "llama_util.h"
|
||||
@@ -9,6 +11,7 @@
|
||||
#include "ggml.h"
|
||||
|
||||
#include <array>
|
||||
#include <ctime>
|
||||
#include <cinttypes>
|
||||
#include <fstream>
|
||||
#include <random>
|
||||
@@ -41,35 +44,51 @@ static const size_t MB = 1024*1024;
|
||||
// TODO: dynamically determine these sizes
|
||||
// needs modifications in ggml
|
||||
|
||||
static const std::map<e_model, size_t> MEM_REQ_SCRATCH0 = {
|
||||
{ MODEL_7B, 512ull*MB },
|
||||
{ MODEL_13B, 512ull*MB },
|
||||
{ MODEL_30B, 512ull*MB },
|
||||
{ MODEL_65B, 512ull*MB },
|
||||
};
|
||||
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
|
||||
{
|
||||
static std::map<e_model, size_t> _MEM_REQ_SCRATCH0 = {
|
||||
{ MODEL_7B, 512ull * MB },
|
||||
{ MODEL_13B, 512ull * MB },
|
||||
{ MODEL_30B, 512ull * MB },
|
||||
{ MODEL_65B, 512ull * MB },
|
||||
};
|
||||
return _MEM_REQ_SCRATCH0;
|
||||
}
|
||||
|
||||
static const std::map<e_model, size_t> MEM_REQ_SCRATCH1 = {
|
||||
{ MODEL_7B, 512ull*MB },
|
||||
{ MODEL_13B, 512ull*MB },
|
||||
{ MODEL_30B, 512ull*MB },
|
||||
{ MODEL_65B, 512ull*MB },
|
||||
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
|
||||
{
|
||||
static std::map<e_model, size_t> _MEM_REQ_SCRATCH1 = {
|
||||
{ MODEL_7B, 512ull * MB },
|
||||
{ MODEL_13B, 512ull * MB },
|
||||
{ MODEL_30B, 512ull * MB },
|
||||
{ MODEL_65B, 512ull * MB },
|
||||
};
|
||||
return _MEM_REQ_SCRATCH1;
|
||||
};
|
||||
|
||||
// 2*n_embd*n_ctx*n_layer*sizeof(float16)
|
||||
static const std::map<e_model, size_t> MEM_REQ_KV_SELF = {
|
||||
{ MODEL_7B, 1026ull*MB },
|
||||
{ MODEL_13B, 1608ull*MB },
|
||||
{ MODEL_30B, 3124ull*MB },
|
||||
{ MODEL_65B, 5120ull*MB },
|
||||
static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
|
||||
{
|
||||
static std::map<e_model, size_t> _MEM_REQ_KV_SELF = {
|
||||
{ MODEL_7B, 1026ull * MB },
|
||||
{ MODEL_13B, 1608ull * MB },
|
||||
{ MODEL_30B, 3124ull * MB },
|
||||
{ MODEL_65B, 5120ull * MB },
|
||||
};
|
||||
return _MEM_REQ_KV_SELF;
|
||||
};
|
||||
|
||||
// this is mostly needed for temporary mul_mat buffers to dequantize the data
|
||||
// not actually needed if BLAS is disabled
|
||||
static const std::map<e_model, size_t> MEM_REQ_EVAL = {
|
||||
{ MODEL_7B, 768ull*MB },
|
||||
{ MODEL_13B, 1024ull*MB },
|
||||
{ MODEL_30B, 1280ull*MB },
|
||||
{ MODEL_65B, 1536ull*MB },
|
||||
static const std::map<e_model, size_t> & MEM_REQ_EVAL()
|
||||
{
|
||||
static std::map<e_model, size_t> _MEM_REQ_EVAL = {
|
||||
{ MODEL_7B, 768ull * MB },
|
||||
{ MODEL_13B, 1024ull * MB },
|
||||
{ MODEL_30B, 1280ull * MB },
|
||||
{ MODEL_65B, 1536ull * MB },
|
||||
};
|
||||
return _MEM_REQ_EVAL;
|
||||
};
|
||||
|
||||
// default hparams (LLaMA 7B)
|
||||
@@ -261,22 +280,12 @@ static size_t checked_div(size_t a, size_t b) {
|
||||
}
|
||||
|
||||
static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
|
||||
std::string ret = "[" + std::to_string(ne.at(0));
|
||||
char buf[256];
|
||||
snprintf(buf, sizeof(buf), "%5u", ne.at(0));
|
||||
for (size_t i = 1; i < ne.size(); i++) {
|
||||
ret += " x " + std::to_string(ne.at(i));
|
||||
}
|
||||
ret += "]";
|
||||
return ret;
|
||||
}
|
||||
|
||||
static const char * llama_format_type(enum ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_F32: return "f32";
|
||||
case GGML_TYPE_F16: return "f16";
|
||||
case GGML_TYPE_Q4_0: return "q4_0";
|
||||
case GGML_TYPE_Q4_1: return "q4_1";
|
||||
default: LLAMA_ASSERT(false);
|
||||
snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), " x %5u", ne.at(i));
|
||||
}
|
||||
return buf;
|
||||
}
|
||||
|
||||
static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) {
|
||||
@@ -469,6 +478,7 @@ struct llama_file_loader {
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_Q4_2:
|
||||
break;
|
||||
default: {
|
||||
throw format("unrecognized tensor type %u\n", shard.type);
|
||||
@@ -541,6 +551,7 @@ struct llama_file_saver {
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_Q4_2:
|
||||
break;
|
||||
default: LLAMA_ASSERT(false);
|
||||
}
|
||||
@@ -626,6 +637,7 @@ struct llama_model_loader {
|
||||
throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
|
||||
name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
|
||||
}
|
||||
|
||||
return get_tensor_for(lt);
|
||||
}
|
||||
|
||||
@@ -828,6 +840,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
|
||||
return "mostly Q4_1, some F16";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
|
||||
default: return "unknown, may not work";
|
||||
}
|
||||
}
|
||||
@@ -908,13 +921,13 @@ static void llama_model_load_internal(
|
||||
const size_t mem_required =
|
||||
ctx_size +
|
||||
mmapped_size +
|
||||
MEM_REQ_SCRATCH0.at(model.type) +
|
||||
MEM_REQ_SCRATCH1.at(model.type) +
|
||||
MEM_REQ_EVAL.at (model.type);
|
||||
MEM_REQ_SCRATCH0().at(model.type) +
|
||||
MEM_REQ_SCRATCH1().at(model.type) +
|
||||
MEM_REQ_EVAL().at(model.type);
|
||||
|
||||
// this is the memory required by one llama_state
|
||||
const size_t mem_required_state =
|
||||
scale*MEM_REQ_KV_SELF.at(model.type);
|
||||
scale*MEM_REQ_KV_SELF().at(model.type);
|
||||
|
||||
fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
|
||||
mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
|
||||
@@ -951,8 +964,8 @@ static void llama_model_load_internal(
|
||||
ml->ggml_ctx = ctx;
|
||||
|
||||
model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
|
||||
model.norm = ml->get_tensor("norm.weight", {n_embd});
|
||||
model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});
|
||||
model.norm = ml->get_tensor("norm.weight", {n_embd});
|
||||
model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});
|
||||
|
||||
model.layers.resize(n_layer);
|
||||
for (uint32_t i = 0; i < n_layer; ++i) {
|
||||
@@ -1056,7 +1069,7 @@ static bool llama_eval_internal(
|
||||
// for big prompts, if BLAS is enabled, it is better to use only one thread
|
||||
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
|
||||
ggml_cgraph gf = {};
|
||||
gf.n_threads = N >= 32 && ggml_cpu_has_blas() ? 1 : n_threads;
|
||||
gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_cublas() ? 1 : n_threads;
|
||||
|
||||
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
||||
memcpy(embd->data, tokens, N*ggml_element_size(embd));
|
||||
@@ -1561,6 +1574,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
switch (ftype) {
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
|
||||
default: throw format("invalid output file type %d\n", ftype);
|
||||
};
|
||||
|
||||
@@ -1579,10 +1593,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
tensor.data = read_data.addr;
|
||||
model_loader->load_data_for(tensor);
|
||||
|
||||
printf("[%zu/%zu] %36s - %s, type = %6s, ",
|
||||
printf("[%4zu/%4zu] %36s - %16s, type = %6s, ",
|
||||
++idx, model_loader->tensors_map.tensors.size(),
|
||||
tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
|
||||
llama_format_type(tensor.type));
|
||||
ggml_type_name(tensor.type));
|
||||
|
||||
// This used to be a regex, but <regex> has an extreme cost to compile times.
|
||||
bool quantize = tensor.name.rfind("weight") == tensor.name.size() - 6; // ends with 'weight'?
|
||||
@@ -1615,7 +1629,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
|
||||
}
|
||||
} else {
|
||||
throw format("type %s unsupported for integer quantization", llama_format_type(tensor.type));
|
||||
throw format("type %s unsupported for integer quantization", ggml_type_name(tensor.type));
|
||||
}
|
||||
|
||||
printf("quantizing .. ");
|
||||
@@ -1634,6 +1648,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
{
|
||||
new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
|
||||
} break;
|
||||
case GGML_TYPE_Q4_2:
|
||||
{
|
||||
new_size = ggml_quantize_q4_2(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
|
||||
} break;
|
||||
default:
|
||||
LLAMA_ASSERT(false);
|
||||
}
|
||||
@@ -1741,10 +1759,10 @@ struct llama_context * llama_init_from_file(
|
||||
ctx->embedding.resize(hparams.n_embd);
|
||||
}
|
||||
|
||||
ctx->buf_compute.resize(MEM_REQ_EVAL.at(ctx->model.type));
|
||||
ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
|
||||
|
||||
ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0.at(ctx->model.type));
|
||||
ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1.at(ctx->model.type));
|
||||
ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
|
||||
ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
|
||||
}
|
||||
|
||||
return ctx;
|
||||
@@ -1767,6 +1785,254 @@ int llama_model_quantize(
|
||||
}
|
||||
}
|
||||
|
||||
int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
|
||||
fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
|
||||
|
||||
auto & model = ctx->model;
|
||||
|
||||
const int64_t t_start_lora_us = ggml_time_us();
|
||||
|
||||
auto fin = std::ifstream(path_lora, std::ios::binary);
|
||||
if (!fin) {
|
||||
fprintf(stderr, "%s: failed to open '%s'\n", __func__, path_lora);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// verify magic and version
|
||||
{
|
||||
uint32_t magic;
|
||||
fin.read((char *) &magic, sizeof(magic));
|
||||
if (magic != 'ggla') {
|
||||
fprintf(stderr, "%s: bad file magic\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
uint32_t format_version;
|
||||
fin.read((char *) &format_version, sizeof(format_version));
|
||||
|
||||
if (format_version != 1) {
|
||||
fprintf(stderr, "%s: unsupported file version\n", __func__ );
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
int32_t lora_r;
|
||||
int32_t lora_alpha;
|
||||
fin.read((char *) &lora_r, sizeof(lora_r));
|
||||
fin.read((char *) &lora_alpha, sizeof(lora_alpha));
|
||||
float scaling = (float)lora_alpha / (float)lora_r;
|
||||
|
||||
fprintf(stderr, "%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
|
||||
|
||||
|
||||
// create a temporary ggml context to store the lora tensors
|
||||
// todo: calculate size from biggest possible tensor
|
||||
std::vector<uint8_t> lora_buf(1024ull * 1024ull * 1024ull);
|
||||
struct ggml_init_params params;
|
||||
params.mem_size = lora_buf.size();
|
||||
params.mem_buffer = lora_buf.data();
|
||||
params.no_alloc = false;
|
||||
|
||||
ggml_context * lora_ctx = ggml_init(params);
|
||||
std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
|
||||
|
||||
// create a name -> tensor map of the model to accelerate lookups
|
||||
std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
|
||||
for (auto & kv: model.tensors_by_name) {
|
||||
model_tensors.insert(kv);
|
||||
}
|
||||
|
||||
|
||||
// load base model
|
||||
std::unique_ptr<llama_model_loader> model_loader;
|
||||
ggml_context * base_ctx = NULL;
|
||||
llama_buffer base_buf;
|
||||
if (path_base_model) {
|
||||
fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
|
||||
model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
|
||||
|
||||
size_t ctx_size, mmapped_size;
|
||||
model_loader->calc_sizes(&ctx_size, &mmapped_size);
|
||||
base_buf.resize(ctx_size);
|
||||
|
||||
ggml_init_params base_params;
|
||||
base_params.mem_size = base_buf.size;
|
||||
base_params.mem_buffer = base_buf.addr;
|
||||
base_params.no_alloc = model_loader->use_mmap;
|
||||
|
||||
base_ctx = ggml_init(base_params);
|
||||
|
||||
model_loader->ggml_ctx = base_ctx;
|
||||
|
||||
// maybe this should in llama_model_loader
|
||||
if (model_loader->use_mmap) {
|
||||
model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false));
|
||||
}
|
||||
}
|
||||
|
||||
// read tensors and apply
|
||||
bool warned = false;
|
||||
int n_tensors = 0;
|
||||
while (true) {
|
||||
int32_t n_dims;
|
||||
int32_t length;
|
||||
int32_t ftype;
|
||||
|
||||
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
|
||||
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
|
||||
fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
|
||||
if (fin.eof()) {
|
||||
break;
|
||||
}
|
||||
|
||||
int32_t ne[2] = { 1, 1 };
|
||||
for (int i = 0; i < n_dims; ++i) {
|
||||
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
|
||||
}
|
||||
|
||||
std::string name(length, 0);
|
||||
fin.read(&name[0], length);
|
||||
|
||||
// check for lora suffix and get the type of tensor
|
||||
const std::string lora_suffix = ".lora";
|
||||
size_t pos = name.rfind(lora_suffix);
|
||||
if (pos == std::string::npos) {
|
||||
fprintf(stderr, "%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
|
||||
return 1;
|
||||
}
|
||||
|
||||
std::string lora_type = name.substr(pos + lora_suffix.length());
|
||||
std::string base_name = name;
|
||||
base_name.erase(pos);
|
||||
// fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
|
||||
|
||||
if (model_tensors.find(base_name.data()) == model_tensors.end()) {
|
||||
fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
|
||||
return 1;
|
||||
}
|
||||
|
||||
// create ggml tensor
|
||||
ggml_type wtype;
|
||||
switch (ftype) {
|
||||
case 0: wtype = GGML_TYPE_F32; break;
|
||||
case 1: wtype = GGML_TYPE_F16; break;
|
||||
default:
|
||||
{
|
||||
fprintf(stderr, "%s: invalid tensor data type '%d'\n",
|
||||
__func__, ftype);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
ggml_tensor* lora_tensor;
|
||||
if (n_dims == 2) {
|
||||
lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
|
||||
}
|
||||
else {
|
||||
fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// load tensor data
|
||||
size_t offset = fin.tellg();
|
||||
size_t tensor_data_size = ggml_nbytes(lora_tensor);
|
||||
offset = (offset + 31) & -32;
|
||||
fin.seekg(offset);
|
||||
fin.read((char*)lora_tensor->data, tensor_data_size);
|
||||
|
||||
lora_tensors[name] = lora_tensor;
|
||||
|
||||
// check if we have both A and B tensors and apply
|
||||
if (lora_tensors.find(base_name + ".loraA") != lora_tensors.end() &&
|
||||
lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
|
||||
|
||||
ggml_tensor * dest_t = model_tensors[base_name];
|
||||
ggml_tensor * base_t;
|
||||
if (model_loader) {
|
||||
// load from base model
|
||||
if (model_loader->tensors_map.name_to_idx.find(base_name) == model_loader->tensors_map.name_to_idx.end()) {
|
||||
fprintf(stderr, "%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
|
||||
return 1;
|
||||
}
|
||||
size_t idx = model_loader->tensors_map.name_to_idx[base_name];
|
||||
llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
|
||||
base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
|
||||
lt.data = (uint8_t *) lt.ggml_tensor->data;
|
||||
model_loader->load_data_for(lt);
|
||||
lt.ggml_tensor->data = lt.data;
|
||||
}
|
||||
else {
|
||||
base_t = dest_t;
|
||||
}
|
||||
|
||||
if (base_t->type == GGML_TYPE_Q4_0 || base_t->type == GGML_TYPE_Q4_1 || base_t->type == GGML_TYPE_Q4_2) {
|
||||
if (!warned) {
|
||||
fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, "
|
||||
"use a f16 or f32 base model with --lora-base\n", __func__);
|
||||
warned = true;
|
||||
}
|
||||
}
|
||||
|
||||
ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
|
||||
ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
|
||||
|
||||
if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
|
||||
fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
|
||||
" are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// w = w + BA*s
|
||||
ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
|
||||
|
||||
if (scaling != 1.0f) {
|
||||
ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
|
||||
BA = ggml_scale(lora_ctx, BA, scale_tensor);
|
||||
}
|
||||
|
||||
ggml_tensor * r;
|
||||
if (base_t == dest_t) {
|
||||
r = ggml_add_inplace(lora_ctx, dest_t, BA);
|
||||
}
|
||||
else {
|
||||
r = ggml_add(lora_ctx, base_t, BA);
|
||||
r = ggml_cpy(lora_ctx, r, dest_t);
|
||||
}
|
||||
|
||||
struct ggml_cgraph gf = ggml_build_forward(r);
|
||||
gf.n_threads = n_threads;
|
||||
ggml_graph_compute(lora_ctx, &gf);
|
||||
|
||||
// we won't need these tensors again, reset the context to save memory
|
||||
ggml_free(lora_ctx);
|
||||
lora_ctx = ggml_init(params);
|
||||
lora_tensors.clear();
|
||||
|
||||
n_tensors++;
|
||||
if (n_tensors % 4 == 0)
|
||||
fprintf(stderr, ".");
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: this should be in a destructor, it will leak on failure
|
||||
ggml_free(lora_ctx);
|
||||
if (base_ctx) {
|
||||
ggml_free(base_ctx);
|
||||
}
|
||||
|
||||
const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
|
||||
fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
|
||||
try {
|
||||
return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
|
||||
} catch (const std::string & err) {
|
||||
fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.c_str());
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the KV cache that will contain the context for the
|
||||
// ongoing prediction with the model.
|
||||
const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
|
||||
@@ -1924,18 +2190,20 @@ const char * llama_print_system_info(void) {
|
||||
static std::string s;
|
||||
|
||||
s = "";
|
||||
s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
|
||||
s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
|
||||
s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
|
||||
s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
|
||||
s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
|
||||
s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
|
||||
s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
|
||||
s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
|
||||
s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
|
||||
s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
|
||||
s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
|
||||
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
|
||||
s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
|
||||
s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
|
||||
s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
|
||||
s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
|
||||
s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
|
||||
s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
|
||||
s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
|
||||
s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
|
||||
s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
|
||||
s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
|
||||
s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
|
||||
s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
|
||||
s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
|
||||
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
|
||||
|
||||
return s.c_str();
|
||||
}
|
||||
|
||||
13
llama.h
13
llama.h
@@ -72,6 +72,7 @@ extern "C" {
|
||||
LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
|
||||
LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
|
||||
};
|
||||
|
||||
LLAMA_API struct llama_context_params llama_context_default_params();
|
||||
@@ -96,6 +97,18 @@ extern "C" {
|
||||
const char * fname_out,
|
||||
enum llama_ftype ftype);
|
||||
|
||||
// Apply a LoRA adapter to a loaded model
|
||||
// path_base_model is the path to a higher quality model to use as a base for
|
||||
// the layers modified by the adapter. Can be NULL to use the current loaded model.
|
||||
// The model needs to be reloaded before applying a new adapter, otherwise the adapter
|
||||
// will be applied on top of the previous one
|
||||
// Returns 0 on success
|
||||
LLAMA_API int llama_apply_lora_from_file(
|
||||
struct llama_context * ctx,
|
||||
const char * path_lora,
|
||||
const char * path_base_model,
|
||||
int n_threads);
|
||||
|
||||
// Returns the KV cache that will contain the context for the
|
||||
// ongoing prediction with the model.
|
||||
LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
|
||||
|
||||
38
llama_util.h
38
llama_util.h
@@ -43,8 +43,12 @@
|
||||
} while (0)
|
||||
|
||||
#ifdef __GNUC__
|
||||
#ifdef __MINGW32__
|
||||
__attribute__((format(gnu_printf, 1, 2)))
|
||||
#else
|
||||
__attribute__((format(printf, 1, 2)))
|
||||
#endif
|
||||
#endif
|
||||
static std::string format(const char * fmt, ...) {
|
||||
va_list ap, ap2;
|
||||
va_start(ap, fmt);
|
||||
@@ -57,7 +61,7 @@ static std::string format(const char * fmt, ...) {
|
||||
va_end(ap2);
|
||||
va_end(ap);
|
||||
return std::string(buf.data(), size);
|
||||
};
|
||||
}
|
||||
|
||||
struct llama_file {
|
||||
// use FILE * so we don't have to re-open the file to mmap
|
||||
@@ -164,7 +168,7 @@ struct llama_mmap {
|
||||
#ifdef _POSIX_MAPPED_FILES
|
||||
static constexpr bool SUPPORTED = true;
|
||||
|
||||
llama_mmap(struct llama_file * file) {
|
||||
llama_mmap(struct llama_file * file, bool prefetch = true) {
|
||||
size = file->size;
|
||||
int fd = fileno(file->fp);
|
||||
int flags = MAP_SHARED;
|
||||
@@ -172,15 +176,16 @@ struct llama_mmap {
|
||||
flags |= MAP_POPULATE;
|
||||
#endif
|
||||
addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
|
||||
close(fd);
|
||||
if (addr == MAP_FAILED) {
|
||||
throw format("mmap failed: %s", strerror(errno));
|
||||
}
|
||||
|
||||
// Advise the kernel to preload the mapped memory
|
||||
if (madvise(addr, file->size, MADV_WILLNEED)) {
|
||||
fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
|
||||
strerror(errno));
|
||||
if (prefetch) {
|
||||
// Advise the kernel to preload the mapped memory
|
||||
if (madvise(addr, file->size, MADV_WILLNEED)) {
|
||||
fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
|
||||
strerror(errno));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -190,14 +195,13 @@ struct llama_mmap {
|
||||
#elif defined(_WIN32)
|
||||
static constexpr bool SUPPORTED = true;
|
||||
|
||||
llama_mmap(struct llama_file * file) {
|
||||
llama_mmap(struct llama_file * file, bool prefetch = true) {
|
||||
size = file->size;
|
||||
|
||||
HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
|
||||
|
||||
HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
|
||||
DWORD error = GetLastError();
|
||||
CloseHandle(hFile);
|
||||
|
||||
if (hMapping == NULL) {
|
||||
throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
|
||||
@@ -212,13 +216,15 @@ struct llama_mmap {
|
||||
}
|
||||
|
||||
#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
|
||||
// Advise the kernel to preload the mapped memory
|
||||
WIN32_MEMORY_RANGE_ENTRY range;
|
||||
range.VirtualAddress = addr;
|
||||
range.NumberOfBytes = (SIZE_T)size;
|
||||
if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
|
||||
fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
|
||||
llama_format_win_err(GetLastError()).c_str());
|
||||
if (prefetch) {
|
||||
// Advise the kernel to preload the mapped memory
|
||||
WIN32_MEMORY_RANGE_ENTRY range;
|
||||
range.VirtualAddress = addr;
|
||||
range.NumberOfBytes = (SIZE_T)size;
|
||||
if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
|
||||
fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
|
||||
llama_format_win_err(GetLastError()).c_str());
|
||||
}
|
||||
}
|
||||
#else
|
||||
#pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
|
||||
|
||||
12
pocs/CMakeLists.txt
Normal file
12
pocs/CMakeLists.txt
Normal file
@@ -0,0 +1,12 @@
|
||||
# dependencies
|
||||
|
||||
find_package(Threads REQUIRED)
|
||||
|
||||
# third-party
|
||||
|
||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
|
||||
|
||||
if (EMSCRIPTEN)
|
||||
else()
|
||||
add_subdirectory(vdot)
|
||||
endif()
|
||||
4
pocs/vdot/CMakeLists.txt
Normal file
4
pocs/vdot/CMakeLists.txt
Normal file
@@ -0,0 +1,4 @@
|
||||
set(TARGET vdot)
|
||||
add_executable(${TARGET} vdot.cpp)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
305
pocs/vdot/vdot.cpp
Normal file
305
pocs/vdot/vdot.cpp
Normal file
@@ -0,0 +1,305 @@
|
||||
#include <cstdio>
|
||||
#include <vector>
|
||||
#include <random>
|
||||
#include <chrono>
|
||||
#include <cstdlib>
|
||||
#include <cmath>
|
||||
#include <cassert>
|
||||
#include <cstring>
|
||||
#include <array>
|
||||
|
||||
#include <ggml.h>
|
||||
|
||||
constexpr int kVecSize = 1 << 18;
|
||||
|
||||
float drawFromGaussianPdf(std::mt19937& rndm) {
|
||||
constexpr double kScale = 1./(1. + std::mt19937::max());
|
||||
constexpr double kTwoPiTimesScale = 6.28318530717958647692*kScale;
|
||||
static float lastX;
|
||||
static bool haveX = false;
|
||||
if (haveX) { haveX = false; return lastX; }
|
||||
auto r = sqrt(-2*log(1 - kScale*rndm()));
|
||||
auto phi = kTwoPiTimesScale * rndm();
|
||||
lastX = r*sin(phi);
|
||||
haveX = true;
|
||||
return r*cos(phi);
|
||||
}
|
||||
void fillRandomGaussianFloats(std::vector<float>& values, std::mt19937& rndm, float mean = 0) {
|
||||
for (auto& v : values) v = mean + drawFromGaussianPdf(rndm);
|
||||
}
|
||||
|
||||
// Copy-pasted from ggml.c
|
||||
#define QK4_0 32
|
||||
typedef struct {
|
||||
float d; // delta
|
||||
uint8_t qs[QK4_0 / 2]; // nibbles / quants
|
||||
} block_q4_0;
|
||||
static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding");
|
||||
|
||||
#define QK4_1 32
|
||||
typedef struct {
|
||||
float d; // delta
|
||||
float m; // min
|
||||
uint8_t qs[QK4_1 / 2]; // nibbles / quants
|
||||
} block_q4_1;
|
||||
static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
|
||||
|
||||
// Copy-pasted from ggml.c
|
||||
#define QK8_0 32
|
||||
typedef struct {
|
||||
float d; // delta
|
||||
int8_t qs[QK8_0]; // quants
|
||||
} block_q8_0;
|
||||
static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
|
||||
|
||||
// "Scalar" dot product between the quantized vector x and float vector y
|
||||
inline double dot(int n, const block_q4_0* x, const float* y) {
|
||||
const static float kValues[16] = {-8.f, -7.f, -6.f, -5.f, -4.f, -3.f, -2.f, -1.f, 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f};
|
||||
constexpr uint32_t kMask1 = 0x0f0f0f0f;
|
||||
uint32_t u1, u2;
|
||||
auto q1 = (const uint8_t*)&u1;
|
||||
auto q2 = (const uint8_t*)&u2;
|
||||
double sum = 0;
|
||||
for (int i=0; i<n; ++i) {
|
||||
float d = x->d;
|
||||
auto u = (const uint32_t*)x->qs;
|
||||
float s = 0;
|
||||
for (int k=0; k<4; ++k) {
|
||||
u1 = u[k] & kMask1;
|
||||
u2 = (u[k] >> 4) & kMask1;
|
||||
s += y[0]*kValues[q1[0]] + y[1]*kValues[q2[0]] +
|
||||
y[2]*kValues[q1[1]] + y[3]*kValues[q2[1]] +
|
||||
y[4]*kValues[q1[2]] + y[5]*kValues[q2[2]] +
|
||||
y[6]*kValues[q1[3]] + y[7]*kValues[q2[3]];
|
||||
y += 8;
|
||||
}
|
||||
sum += s*d;
|
||||
++x;
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
// Alternative version of the above. Faster on my Mac (~45 us vs ~55 us per dot product),
|
||||
// but about the same on X86_64 (Ryzen 7950X CPU).
|
||||
inline double dot3(int n, const block_q4_0* x, const float* y) {
|
||||
const static std::pair<float,float> kValues[256] = {
|
||||
{-8.f, -8.f}, {-7.f, -8.f}, {-6.f, -8.f}, {-5.f, -8.f}, {-4.f, -8.f}, {-3.f, -8.f}, {-2.f, -8.f}, {-1.f, -8.f},
|
||||
{ 0.f, -8.f}, { 1.f, -8.f}, { 2.f, -8.f}, { 3.f, -8.f}, { 4.f, -8.f}, { 5.f, -8.f}, { 6.f, -8.f}, { 7.f, -8.f},
|
||||
{-8.f, -7.f}, {-7.f, -7.f}, {-6.f, -7.f}, {-5.f, -7.f}, {-4.f, -7.f}, {-3.f, -7.f}, {-2.f, -7.f}, {-1.f, -7.f},
|
||||
{ 0.f, -7.f}, { 1.f, -7.f}, { 2.f, -7.f}, { 3.f, -7.f}, { 4.f, -7.f}, { 5.f, -7.f}, { 6.f, -7.f}, { 7.f, -7.f},
|
||||
{-8.f, -6.f}, {-7.f, -6.f}, {-6.f, -6.f}, {-5.f, -6.f}, {-4.f, -6.f}, {-3.f, -6.f}, {-2.f, -6.f}, {-1.f, -6.f},
|
||||
{ 0.f, -6.f}, { 1.f, -6.f}, { 2.f, -6.f}, { 3.f, -6.f}, { 4.f, -6.f}, { 5.f, -6.f}, { 6.f, -6.f}, { 7.f, -6.f},
|
||||
{-8.f, -5.f}, {-7.f, -5.f}, {-6.f, -5.f}, {-5.f, -5.f}, {-4.f, -5.f}, {-3.f, -5.f}, {-2.f, -5.f}, {-1.f, -5.f},
|
||||
{ 0.f, -5.f}, { 1.f, -5.f}, { 2.f, -5.f}, { 3.f, -5.f}, { 4.f, -5.f}, { 5.f, -5.f}, { 6.f, -5.f}, { 7.f, -5.f},
|
||||
{-8.f, -4.f}, {-7.f, -4.f}, {-6.f, -4.f}, {-5.f, -4.f}, {-4.f, -4.f}, {-3.f, -4.f}, {-2.f, -4.f}, {-1.f, -4.f},
|
||||
{ 0.f, -4.f}, { 1.f, -4.f}, { 2.f, -4.f}, { 3.f, -4.f}, { 4.f, -4.f}, { 5.f, -4.f}, { 6.f, -4.f}, { 7.f, -4.f},
|
||||
{-8.f, -3.f}, {-7.f, -3.f}, {-6.f, -3.f}, {-5.f, -3.f}, {-4.f, -3.f}, {-3.f, -3.f}, {-2.f, -3.f}, {-1.f, -3.f},
|
||||
{ 0.f, -3.f}, { 1.f, -3.f}, { 2.f, -3.f}, { 3.f, -3.f}, { 4.f, -3.f}, { 5.f, -3.f}, { 6.f, -3.f}, { 7.f, -3.f},
|
||||
{-8.f, -2.f}, {-7.f, -2.f}, {-6.f, -2.f}, {-5.f, -2.f}, {-4.f, -2.f}, {-3.f, -2.f}, {-2.f, -2.f}, {-1.f, -2.f},
|
||||
{ 0.f, -2.f}, { 1.f, -2.f}, { 2.f, -2.f}, { 3.f, -2.f}, { 4.f, -2.f}, { 5.f, -2.f}, { 6.f, -2.f}, { 7.f, -2.f},
|
||||
{-8.f, -1.f}, {-7.f, -1.f}, {-6.f, -1.f}, {-5.f, -1.f}, {-4.f, -1.f}, {-3.f, -1.f}, {-2.f, -1.f}, {-1.f, -1.f},
|
||||
{ 0.f, -1.f}, { 1.f, -1.f}, { 2.f, -1.f}, { 3.f, -1.f}, { 4.f, -1.f}, { 5.f, -1.f}, { 6.f, -1.f}, { 7.f, -1.f},
|
||||
{-8.f, 0.f}, {-7.f, 0.f}, {-6.f, 0.f}, {-5.f, 0.f}, {-4.f, 0.f}, {-3.f, 0.f}, {-2.f, 0.f}, {-1.f, 0.f},
|
||||
{ 0.f, 0.f}, { 1.f, 0.f}, { 2.f, 0.f}, { 3.f, 0.f}, { 4.f, 0.f}, { 5.f, 0.f}, { 6.f, 0.f}, { 7.f, 0.f},
|
||||
{-8.f, 1.f}, {-7.f, 1.f}, {-6.f, 1.f}, {-5.f, 1.f}, {-4.f, 1.f}, {-3.f, 1.f}, {-2.f, 1.f}, {-1.f, 1.f},
|
||||
{ 0.f, 1.f}, { 1.f, 1.f}, { 2.f, 1.f}, { 3.f, 1.f}, { 4.f, 1.f}, { 5.f, 1.f}, { 6.f, 1.f}, { 7.f, 1.f},
|
||||
{-8.f, 2.f}, {-7.f, 2.f}, {-6.f, 2.f}, {-5.f, 2.f}, {-4.f, 2.f}, {-3.f, 2.f}, {-2.f, 2.f}, {-1.f, 2.f},
|
||||
{ 0.f, 2.f}, { 1.f, 2.f}, { 2.f, 2.f}, { 3.f, 2.f}, { 4.f, 2.f}, { 5.f, 2.f}, { 6.f, 2.f}, { 7.f, 2.f},
|
||||
{-8.f, 3.f}, {-7.f, 3.f}, {-6.f, 3.f}, {-5.f, 3.f}, {-4.f, 3.f}, {-3.f, 3.f}, {-2.f, 3.f}, {-1.f, 3.f},
|
||||
{ 0.f, 3.f}, { 1.f, 3.f}, { 2.f, 3.f}, { 3.f, 3.f}, { 4.f, 3.f}, { 5.f, 3.f}, { 6.f, 3.f}, { 7.f, 3.f},
|
||||
{-8.f, 4.f}, {-7.f, 4.f}, {-6.f, 4.f}, {-5.f, 4.f}, {-4.f, 4.f}, {-3.f, 4.f}, {-2.f, 4.f}, {-1.f, 4.f},
|
||||
{ 0.f, 4.f}, { 1.f, 4.f}, { 2.f, 4.f}, { 3.f, 4.f}, { 4.f, 4.f}, { 5.f, 4.f}, { 6.f, 4.f}, { 7.f, 4.f},
|
||||
{-8.f, 5.f}, {-7.f, 5.f}, {-6.f, 5.f}, {-5.f, 5.f}, {-4.f, 5.f}, {-3.f, 5.f}, {-2.f, 5.f}, {-1.f, 5.f},
|
||||
{ 0.f, 5.f}, { 1.f, 5.f}, { 2.f, 5.f}, { 3.f, 5.f}, { 4.f, 5.f}, { 5.f, 5.f}, { 6.f, 5.f}, { 7.f, 5.f},
|
||||
{-8.f, 6.f}, {-7.f, 6.f}, {-6.f, 6.f}, {-5.f, 6.f}, {-4.f, 6.f}, {-3.f, 6.f}, {-2.f, 6.f}, {-1.f, 6.f},
|
||||
{ 0.f, 6.f}, { 1.f, 6.f}, { 2.f, 6.f}, { 3.f, 6.f}, { 4.f, 6.f}, { 5.f, 6.f}, { 6.f, 6.f}, { 7.f, 6.f},
|
||||
{-8.f, 7.f}, {-7.f, 7.f}, {-6.f, 7.f}, {-5.f, 7.f}, {-4.f, 7.f}, {-3.f, 7.f}, {-2.f, 7.f}, {-1.f, 7.f},
|
||||
{ 0.f, 7.f}, { 1.f, 7.f}, { 2.f, 7.f}, { 3.f, 7.f}, { 4.f, 7.f}, { 5.f, 7.f}, { 6.f, 7.f}, { 7.f, 7.f}
|
||||
};
|
||||
double sum = 0;
|
||||
for (int i=0; i<n; ++i) {
|
||||
float d = x->d;
|
||||
auto q = x->qs;
|
||||
float s = 0;
|
||||
for (int k=0; k<4; ++k) {
|
||||
s += y[0]*kValues[q[0]].first + y[1]*kValues[q[0]].second +
|
||||
y[2]*kValues[q[1]].first + y[3]*kValues[q[1]].second +
|
||||
y[4]*kValues[q[2]].first + y[5]*kValues[q[2]].second +
|
||||
y[6]*kValues[q[3]].first + y[7]*kValues[q[3]].second;
|
||||
y += 8; q += 4;
|
||||
}
|
||||
sum += s*d;
|
||||
++x;
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
inline double dot41(int n, const block_q4_1* x, const float* y) {
|
||||
const static float kValues[16] = {0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f};
|
||||
constexpr uint32_t kMask1 = 0x0f0f0f0f;
|
||||
uint32_t u1, u2;
|
||||
auto q1 = (const uint8_t*)&u1;
|
||||
auto q2 = (const uint8_t*)&u2;
|
||||
double sum = 0;
|
||||
for (int i=0; i<n; ++i) {
|
||||
auto u = (const uint32_t*)x->qs;
|
||||
float s = 0, s1 = 0;
|
||||
for (int k=0; k<4; ++k) {
|
||||
u1 = u[k] & kMask1;
|
||||
u2 = (u[k] >> 4) & kMask1;
|
||||
s += y[0]*kValues[q1[0]] + y[1]*kValues[q2[0]] +
|
||||
y[2]*kValues[q1[1]] + y[3]*kValues[q2[1]] +
|
||||
y[4]*kValues[q1[2]] + y[5]*kValues[q2[2]] +
|
||||
y[6]*kValues[q1[3]] + y[7]*kValues[q2[3]];
|
||||
s1 += y[0] + y[1] + y[2] + y[3] + y[4] + y[5] + y[6] + y[7];
|
||||
y += 8;
|
||||
}
|
||||
sum += s*x->d + s1*x->m;
|
||||
++x;
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
// Copy-pasted from ggml.c
|
||||
static void quantize_row_q8_0_reference(const float *x, block_q8_0 *y, int k) {
|
||||
assert(k % QK8_0 == 0);
|
||||
const int nb = k / QK8_0;
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
float amax = 0.0f; // absolute max
|
||||
|
||||
for (int l = 0; l < QK8_0; l++) {
|
||||
const float v = x[i*QK8_0 + l];
|
||||
amax = std::max(amax, fabsf(v));
|
||||
}
|
||||
|
||||
const float d = amax / ((1 << 7) - 1);
|
||||
const float id = d ? 1.0f/d : 0.0f;
|
||||
|
||||
y[i].d = d;
|
||||
|
||||
for (int l = 0; l < QK8_0; ++l) {
|
||||
const float v = x[i*QK8_0 + l]*id;
|
||||
y[i].qs[l] = roundf(v);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Copy-pasted from ggml.c
|
||||
static void dot_q4_q8(const int n, float* s, const void* vx, const void* vy) {
|
||||
const int nb = n / QK8_0;
|
||||
const block_q4_0* x = (const block_q4_0*)vx;
|
||||
const block_q8_0* y = (const block_q8_0*)vy;
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; i++) {
|
||||
const float d0 = x[i].d;
|
||||
const float d1 = y[i].d;
|
||||
|
||||
const uint8_t * p0 = x[i].qs;
|
||||
const int8_t * p1 = y[i].qs;
|
||||
|
||||
int sumi = 0;
|
||||
for (int j = 0; j < QK8_0/2; j++) {
|
||||
const uint8_t v0 = p0[j];
|
||||
|
||||
const int i0 = (int8_t) (v0 & 0xf) - 8;
|
||||
const int i1 = (int8_t) (v0 >> 4) - 8;
|
||||
|
||||
const int i2 = p1[2*j + 0];
|
||||
const int i3 = p1[2*j + 1];
|
||||
|
||||
sumi += i0*i2 + i1*i3;
|
||||
}
|
||||
sumf += d0*d1*sumi;
|
||||
}
|
||||
*s = sumf;
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
|
||||
int nloop = argc > 1 ? atoi(argv[1]) : 10;
|
||||
bool scalar = argc > 2 ? atoi(argv[2]) : false;
|
||||
bool useQ4_1 = argc > 3 ? atoi(argv[3]) : false;
|
||||
|
||||
if (scalar && useQ4_1) {
|
||||
printf("It is not possible to use Q4_1 quantization and scalar implementations\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
std::mt19937 rndm(1234);
|
||||
|
||||
std::vector<float> x1(kVecSize), y1(kVecSize);
|
||||
int n4 = useQ4_1 ? kVecSize / QK4_1 : kVecSize / QK4_0; n4 = 64*((n4 + 63)/64);
|
||||
int n8 = kVecSize / QK8_0; n8 = 64*((n8 + 63)/64);
|
||||
|
||||
auto funcs = useQ4_1 ? ggml_internal_get_quantize_fn(GGML_TYPE_Q4_1) : ggml_internal_get_quantize_fn(GGML_TYPE_Q4_0);
|
||||
|
||||
std::vector<block_q4_0> q40;
|
||||
std::vector<block_q4_1> q41;
|
||||
if (useQ4_1) q41.resize(n4);
|
||||
else q40.resize(n4);
|
||||
std::vector<block_q8_0> q8(n8);
|
||||
std::vector<int64_t> H(16, 0);
|
||||
double sumt = 0, sumt2 = 0, maxt = 0;
|
||||
double sumqt = 0, sumqt2 = 0, maxqt = 0;
|
||||
double sum = 0, sumq = 0, exactSum = 0;
|
||||
for (int iloop=0; iloop<nloop; ++iloop) {
|
||||
|
||||
// Fill vector x with random numbers
|
||||
fillRandomGaussianFloats(x1, rndm);
|
||||
|
||||
// Fill vector y with random numbers
|
||||
fillRandomGaussianFloats(y1, rndm);
|
||||
|
||||
// Compute the exact dot product
|
||||
for (int k=0; k<kVecSize; ++k) exactSum += x1[k]*y1[k];
|
||||
|
||||
// quantize x.
|
||||
// Note, we do not include this in the timing as in practical application
|
||||
// we already have the quantized model weights.
|
||||
if (useQ4_1) {
|
||||
funcs.quantize_row_q(x1.data(), q41.data(), kVecSize);
|
||||
} else {
|
||||
funcs.quantize_row_q(x1.data(), q40.data(), kVecSize);
|
||||
}
|
||||
|
||||
// Now measure time the dot product needs using the "scalar" version above
|
||||
auto t1 = std::chrono::high_resolution_clock::now();
|
||||
if (useQ4_1) sum += dot41(kVecSize / QK4_1, q41.data(), y1.data());
|
||||
else sum += dot(kVecSize / QK4_0, q40.data(), y1.data());
|
||||
auto t2 = std::chrono::high_resolution_clock::now();
|
||||
auto t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
|
||||
sumt += t; sumt2 += t*t; maxt = std::max(maxt, t);
|
||||
|
||||
// And now measure the time needed to quantize y and perform the dot product with the quantized y
|
||||
t1 = std::chrono::high_resolution_clock::now();
|
||||
float result;
|
||||
if (scalar) {
|
||||
quantize_row_q8_0_reference(y1.data(), q8.data(), kVecSize);
|
||||
dot_q4_q8(kVecSize, &result, q40.data(), q8.data());
|
||||
}
|
||||
else {
|
||||
funcs.quantize_row_q_dot(y1.data(), q8.data(), kVecSize);
|
||||
if (useQ4_1) funcs.vec_dot_q(kVecSize, &result, q41.data(), q8.data());
|
||||
else funcs.vec_dot_q(kVecSize, &result, q40.data(), q8.data());
|
||||
}
|
||||
sumq += result;
|
||||
t2 = std::chrono::high_resolution_clock::now();
|
||||
t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
|
||||
sumqt += t; sumqt2 += t*t; maxqt = std::max(maxqt, t);
|
||||
|
||||
}
|
||||
|
||||
// Report the time (and the average of the dot products so the compiler does not come up with the idea
|
||||
// of optimizing away the function calls after figuring that the result is not used).
|
||||
sum /= nloop; sumq /= nloop;
|
||||
exactSum /= nloop;
|
||||
printf("Exact result: <dot> = %g\n",exactSum);
|
||||
printf("<dot> = %g, %g\n",sum,sumq);
|
||||
sumt /= nloop; sumt2 /= nloop; sumt2 -= sumt*sumt;
|
||||
if (sumt2 > 0) sumt2 = sqrt(sumt2);
|
||||
printf("time = %g +/- %g us. maxt = %g us\n",sumt,sumt2,maxt);
|
||||
sumqt /= nloop; sumqt2 /= nloop; sumqt2 -= sumqt*sumqt;
|
||||
if (sumqt2 > 0) sumqt2 = sqrt(sumqt2);
|
||||
printf("timeq = %g +/- %g us. maxt = %g us\n",sumqt,sumqt2,maxqt);
|
||||
return 0;
|
||||
}
|
||||
@@ -1,2 +1,2 @@
|
||||
numpy==1.24
|
||||
sentencepiece==0.1.97
|
||||
sentencepiece==0.1.98
|
||||
|
||||
@@ -5,13 +5,17 @@
|
||||
#include <map>
|
||||
#include <vector>
|
||||
|
||||
static const std::map<std::string, std::vector<llama_token>> k_tests = {
|
||||
{ "Hello World", { 1, 10994, 2787, }, },
|
||||
{ " Hello World", { 1, 15043, 2787, }, },
|
||||
{ " Hello World!", { 1, 15043, 2787, 29991, }, },
|
||||
{ " this is 🦙.cpp", { 1, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, },
|
||||
{ "w048 7tuijk dsdfhu", { 1, 29893, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, },
|
||||
{ "нещо на Български", { 1, 821, 4851, 665, 1386, 29713, 1305, }, },
|
||||
static const std::map<std::string, std::vector<llama_token>> & k_tests()
|
||||
{
|
||||
static std::map<std::string, std::vector<llama_token>> _k_tests = {
|
||||
{ "Hello World", { 1, 10994, 2787, }, },
|
||||
{ " Hello World", { 1, 15043, 2787, }, },
|
||||
{ " Hello World!", { 1, 15043, 2787, 29991, }, },
|
||||
{ " this is 🦙.cpp", { 1, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, },
|
||||
{ "w048 7tuijk dsdfhu", { 1, 29893, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, },
|
||||
{ "нещо на Български", { 1, 821, 4851, 665, 1386, 29713, 1305, }, },
|
||||
};
|
||||
return _k_tests;
|
||||
};
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
@@ -47,7 +51,7 @@ int main(int argc, char **argv) {
|
||||
return 2;
|
||||
}
|
||||
|
||||
for (const auto & test_kv : k_tests) {
|
||||
for (const auto & test_kv : k_tests()) {
|
||||
std::vector<llama_token> res(test_kv.first.size());
|
||||
const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), res.size(), true);
|
||||
res.resize(n);
|
||||
|
||||
Reference in New Issue
Block a user