Compare commits

...

13 Commits

Author SHA1 Message Date
Georgi Gerganov
acead654d2 Merge branch 'master' into fix-refact 2023-10-08 11:25:16 +03:00
Georgi Gerganov
94e502dfb7 ci : enable on obj-c changes + fix metal build (#3540) 2023-10-08 11:24:50 +03:00
Luo Tian
7d8b24932f zig : fix build by introducing train.cpp (#3539) 2023-10-08 11:24:01 +03:00
Georgi Gerganov
0f8df395ce metal : assert various kernel requirements 2023-10-08 11:04:20 +03:00
Georgi Gerganov
b0ec5218c3 metal : support MTLGPUFamily < Apple7, formatting, style (#3524)
* metal : improve decoding speed for batches of 2-16

* metal : rename kernels mul_mat_ to mul_mv_

* metal : indentations

* minor

* metal : print more GPU info + disable mul_mm for MTLGPUFamiliy < Apple7
2023-10-08 10:01:53 +03:00
Kerfuffle
63d3b06a43 llama : fix missing break in Persimmon arch case statements (#3535) 2023-10-08 08:22:17 +03:00
Kerfuffle
a16e89cec8 Fix trying to strip newline from empty prompt and cfg prompt file content (#3534) 2023-10-07 15:31:41 -06:00
M. Yusuf Sarıgöz
4d03833211 gguf.py : fix CI for publishing GGUF package (#3532)
* Fix CI for publishing GGUF package

* Bump version

* fix

* bump version

* bump version

* bump version
2023-10-07 22:14:10 +03:00
Tom C
c47066d833 py : change version of numpy requirement to 1.24.4 (#3515)
Co-authored-by: Lyjia <me@lyjia.us>
2023-10-07 12:56:15 +03:00
cebtenzzre
f1782c68de quantize : fail fast on write errors (#3521) 2023-10-07 11:41:52 +03:00
Jhen-Jie Hong
c26765a0a1 metal : support default.metallib load & reuse code for swift package (#3522)
* metal : support load default.metallib & reuse code for swift package

* metal : use SWIFT_PACKAGE def instead of define GGML_SWIFT
2023-10-07 11:40:27 +03:00
Georgi Gerganov
42833bc7a8 ggml : silu(-inf) should never happen 2023-10-07 11:30:36 +03:00
Georgi Gerganov
bdbe11719d refact : fix convert script + zero out KV cache to avoid nans 2023-10-07 11:18:04 +03:00
15 changed files with 274 additions and 261 deletions

View File

@@ -10,10 +10,10 @@ on:
push:
branches:
- master
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift']
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
pull_request:
types: [opened, synchronize, reopened]
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift']
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}

View File

@@ -36,8 +36,9 @@ jobs:
poetry install
- name: Build package
run: poetry build
run: cd gguf-py && poetry build
- name: Publish package
uses: pypa/gh-action-pypi-publish@release/v1
with:
password: ${{ secrets.PYPI_API_TOKEN }}
packages-dir: gguf-py/dist

1
.gitignore vendored
View File

@@ -10,6 +10,7 @@
*.gcno
*.gcda
*.dot
*.metallib
.DS_Store
.build/
.cache/

View File

@@ -10,15 +10,18 @@ let platforms: [SupportedPlatform]? = [
.tvOS(.v14)
]
let exclude: [String] = []
let additionalSources: [String] = ["ggml-metal.m", "ggml-metal.metal"]
let resources: [Resource] = [
.process("ggml-metal.metal")
]
let additionalSources: [String] = ["ggml-metal.m"]
let additionalSettings: [CSetting] = [
.unsafeFlags(["-fno-objc-arc"]),
.define("GGML_SWIFT"),
.define("GGML_USE_METAL")
]
#else
let platforms: [SupportedPlatform]? = nil
let exclude: [String] = ["ggml-metal.metal"]
let resources: [Resource] = []
let additionalSources: [String] = []
let additionalSettings: [CSetting] = []
#endif
@@ -40,6 +43,7 @@ let package = Package(
"ggml-alloc.c",
"k_quants.c",
] + additionalSources,
resources: resources,
publicHeadersPath: "spm-headers",
cSettings: [
.unsafeFlags(["-Wno-shorten-64-to-32"]),

View File

@@ -111,12 +111,14 @@ pub fn build(b: *std.build.Builder) !void {
const common = make.obj("common", "common/common.cpp");
const console = make.obj("common", "common/console.cpp");
const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
const train = make.obj("train", "common/train.cpp");
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, llama, common, console, grammar_parser });
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, llama, common });
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, llama, common });
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, llama, common });
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, llama, common });
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, llama, common, train });
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, llama, common, train });
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, llama, common, grammar_parser });
if (server.target.isWindows()) {

View File

@@ -170,7 +170,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
// store the external file name in params
params.prompt_file = argv[i];
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
if (params.prompt.back() == '\n') {
if (!params.prompt.empty() && params.prompt.back() == '\n') {
params.prompt.pop_back();
}
} else if (arg == "-n" || arg == "--n-predict") {
@@ -295,7 +295,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break;
}
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.cfg_negative_prompt));
if (params.cfg_negative_prompt.back() == '\n') {
if (!params.cfg_negative_prompt.empty() && params.cfg_negative_prompt.back() == '\n') {
params.cfg_negative_prompt.pop_back();
}
} else if (arg == "--cfg-scale") {

View File

@@ -17,33 +17,6 @@ if "NO_LOCAL_GGUF" not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / "gguf-py" / "gguf"))
import gguf
def bytes_to_unicode():
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
"""
Returns list of utf-8 byte and a corresponding list of unicode strings.
The reversible bpe codes work on unicode strings.
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
This is a significant percentage of your normal, say, 32K bpe vocab.
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
And avoids mapping to whitespace/control characters the bpe code barfs on.
"""
bs = (
list(range(ord("!"), ord("~") + 1))
+ list(range(ord("¡"), ord("¬") + 1))
+ list(range(ord("®"), ord("ÿ") + 1))
)
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8 + n)
n += 1
return dict(zip(bs, (chr(n) for n in cs)))
def count_model_parts(dir_model: Path) -> int:
num_parts = 0
for filename in os.listdir(dir_model):
@@ -153,53 +126,25 @@ tokens: list[bytearray] = []
scores: list[float] = []
toktypes: list[int] = []
tokenizer_json_file = dir_model / "tokenizer.json"
if not tokenizer_json_file.is_file():
print(f"Error: Missing {tokenizer_json_file}", file=sys.stderr)
sys.exit(1)
# gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2")
with open(tokenizer_json_file, "r", encoding="utf-8") as f:
tokenizer_json = json.load(f)
print("gguf: get gpt2 tokenizer vocab")
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)
# The number of tokens in tokenizer.json can differ from the expected vocab size.
# This causes downstream issues with mismatched tensor sizes when running the inference
vocab_size = (
hparams["vocab_size"]
if "vocab_size" in hparams
else len(tokenizer_json["model"]["vocab"])
)
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
assert max(tokenizer.vocab.values()) < vocab_size
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
byte_encoder = bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}
for i in range(vocab_size):
if i in reverse_vocab:
text = reverse_vocab[i]
try:
text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
except KeyError:
text = bytearray()
for c in reverse_vocab[i]:
if ord(c) < 256: # single byte character
text.append(byte_decoder[ord(c)])
else: # multibyte special token character
text.extend(c.encode("utf-8"))
else:
print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
pad_token = f"[PAD{i}]".encode("utf8")
text = bytearray(pad_token)
tokens.append(text)
scores.append(0.0) # dymmy
toktypes.append(gguf.TokenType.NORMAL) # dummy
tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
scores.append(0.0) # dummy
toktypes.append(gguf.TokenType.NORMAL)
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)

View File

@@ -167,7 +167,7 @@ int main(int argc, char ** argv) {
// the max batch size is as large as the context to handle cases where we get very long input prompt from multiple
// users. regardless of the size, the main loop will chunk the batch into a maximum of params.n_batch tokens at a time
llama_batch batch = llama_batch_init(params.n_ctx, 0);
llama_batch batch = llama_batch_init(n_ctx, 0);
int32_t n_total_prompt = 0;
int32_t n_total_gen = 0;

View File

@@ -81,18 +81,18 @@ struct ggml_metal_context {
GGML_METAL_DECL_KERNEL(get_rows_q6_K);
GGML_METAL_DECL_KERNEL(rms_norm);
GGML_METAL_DECL_KERNEL(norm);
GGML_METAL_DECL_KERNEL(mul_mat_f32_f32);
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row);
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_l4);
GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32);
GGML_METAL_DECL_KERNEL(mul_mat_q2_K_f32);
GGML_METAL_DECL_KERNEL(mul_mat_q3_K_f32);
GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32);
GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32);
GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32);
GGML_METAL_DECL_KERNEL(mul_mv_f32_f32);
GGML_METAL_DECL_KERNEL(mul_mv_f16_f32);
GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_1row);
GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_l4);
GGML_METAL_DECL_KERNEL(mul_mv_q4_0_f32);
GGML_METAL_DECL_KERNEL(mul_mv_q4_1_f32);
GGML_METAL_DECL_KERNEL(mul_mv_q8_0_f32);
GGML_METAL_DECL_KERNEL(mul_mv_q2_K_f32);
GGML_METAL_DECL_KERNEL(mul_mv_q3_K_f32);
GGML_METAL_DECL_KERNEL(mul_mv_q4_K_f32);
GGML_METAL_DECL_KERNEL(mul_mv_q5_K_f32);
GGML_METAL_DECL_KERNEL(mul_mv_q6_K_f32);
GGML_METAL_DECL_KERNEL(mul_mm_f32_f32);
GGML_METAL_DECL_KERNEL(mul_mm_f16_f32);
GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32);
@@ -185,56 +185,44 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
#ifdef GGML_SWIFT
// load the default.metallib file
// load library
{
NSError * error = nil;
NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
NSString * llamaBundlePath = [bundle pathForResource:@"llama_llama" ofType:@"bundle"];
NSBundle * llamaBundle = [NSBundle bundleWithPath:llamaBundlePath];
NSString * libPath = [llamaBundle pathForResource:@"default" ofType:@"metallib"];
NSURL * libURL = [NSURL fileURLWithPath:libPath];
// Load the metallib file into a Metal library
ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
if (error) {
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
return NULL;
}
}
NSBundle * bundle = nil;
#ifdef SWIFT_PACKAGE
bundle = SWIFTPM_MODULE_BUNDLE;
#else
UNUSED(msl_library_source);
// read the source from "ggml-metal.metal" into a string and use newLibraryWithSource
{
bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
#endif
NSError * error = nil;
NSString * libPath = [bundle pathForResource:@"default" ofType:@"metallib"];
if (libPath != nil) {
NSURL * libURL = [NSURL fileURLWithPath:libPath];
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
} else {
GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
//NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"];
NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path UTF8String]);
NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error];
if (error) {
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
return NULL;
}
NSString * sourcePath = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [sourcePath UTF8String]);
NSString * src = [NSString stringWithContentsOfFile:sourcePath encoding:NSUTF8StringEncoding error:&error];
if (error) {
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
return NULL;
}
MTLCompileOptions* options = nil;
#ifdef GGML_QKK_64
MTLCompileOptions* options = [MTLCompileOptions new];
options.preprocessorMacros = @{ @"QK_K" : @(64) };
ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
#else
ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error];
options = [MTLCompileOptions new];
options.preprocessorMacros = @{ @"QK_K" : @(64) };
#endif
ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
}
if (error) {
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
return NULL;
}
}
#endif
// load kernels
{
@@ -274,28 +262,30 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(get_rows_q6_K);
GGML_METAL_ADD_KERNEL(rms_norm);
GGML_METAL_ADD_KERNEL(norm);
GGML_METAL_ADD_KERNEL(mul_mat_f32_f32);
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row);
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_l4);
GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32);
GGML_METAL_ADD_KERNEL(mul_mat_q2_K_f32);
GGML_METAL_ADD_KERNEL(mul_mat_q3_K_f32);
GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32);
GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32);
GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32);
GGML_METAL_ADD_KERNEL(mul_mm_f32_f32);
GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32);
GGML_METAL_ADD_KERNEL(mul_mv_f32_f32);
GGML_METAL_ADD_KERNEL(mul_mv_f16_f32);
GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_1row);
GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_l4);
GGML_METAL_ADD_KERNEL(mul_mv_q4_0_f32);
GGML_METAL_ADD_KERNEL(mul_mv_q4_1_f32);
GGML_METAL_ADD_KERNEL(mul_mv_q8_0_f32);
GGML_METAL_ADD_KERNEL(mul_mv_q2_K_f32);
GGML_METAL_ADD_KERNEL(mul_mv_q3_K_f32);
GGML_METAL_ADD_KERNEL(mul_mv_q4_K_f32);
GGML_METAL_ADD_KERNEL(mul_mv_q5_K_f32);
GGML_METAL_ADD_KERNEL(mul_mv_q6_K_f32);
if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) {
GGML_METAL_ADD_KERNEL(mul_mm_f32_f32);
GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32);
}
GGML_METAL_ADD_KERNEL(rope_f32);
GGML_METAL_ADD_KERNEL(rope_f16);
GGML_METAL_ADD_KERNEL(alibi_f32);
@@ -308,8 +298,21 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
#undef GGML_METAL_ADD_KERNEL
}
GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
#if TARGET_OS_OSX
// print MTL GPU family:
GGML_METAL_LOG_INFO("%s: GPU name: %s\n", __func__, [[ctx->device name] UTF8String]);
// determine max supported GPU family
// https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf
// https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
for (int i = MTLGPUFamilyApple1 + 20; i >= MTLGPUFamilyApple1; --i) {
if ([ctx->device supportsFamily:i]) {
GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d (%d)\n", __func__, i - MTLGPUFamilyApple1 + 1, i);
break;
}
}
GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
if (ctx->device.maxTransferRate != 0) {
GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
@@ -351,28 +354,30 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
GGML_METAL_DEL_KERNEL(get_rows_q6_K);
GGML_METAL_DEL_KERNEL(rms_norm);
GGML_METAL_DEL_KERNEL(norm);
GGML_METAL_DEL_KERNEL(mul_mat_f32_f32);
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row);
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_l4);
GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32);
GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32);
GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32);
GGML_METAL_DEL_KERNEL(mul_mat_q2_K_f32);
GGML_METAL_DEL_KERNEL(mul_mat_q3_K_f32);
GGML_METAL_DEL_KERNEL(mul_mat_q4_K_f32);
GGML_METAL_DEL_KERNEL(mul_mat_q5_K_f32);
GGML_METAL_DEL_KERNEL(mul_mat_q6_K_f32);
GGML_METAL_DEL_KERNEL(mul_mm_f32_f32);
GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32);
GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32);
GGML_METAL_DEL_KERNEL(mul_mm_q4_1_f32);
GGML_METAL_DEL_KERNEL(mul_mm_q2_K_f32);
GGML_METAL_DEL_KERNEL(mul_mm_q3_K_f32);
GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32);
GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32);
GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32);
GGML_METAL_DEL_KERNEL(mul_mv_f32_f32);
GGML_METAL_DEL_KERNEL(mul_mv_f16_f32);
GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_1row);
GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_l4);
GGML_METAL_DEL_KERNEL(mul_mv_q4_0_f32);
GGML_METAL_DEL_KERNEL(mul_mv_q4_1_f32);
GGML_METAL_DEL_KERNEL(mul_mv_q8_0_f32);
GGML_METAL_DEL_KERNEL(mul_mv_q2_K_f32);
GGML_METAL_DEL_KERNEL(mul_mv_q3_K_f32);
GGML_METAL_DEL_KERNEL(mul_mv_q4_K_f32);
GGML_METAL_DEL_KERNEL(mul_mv_q5_K_f32);
GGML_METAL_DEL_KERNEL(mul_mv_q6_K_f32);
if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) {
GGML_METAL_DEL_KERNEL(mul_mm_f32_f32);
GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32);
GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32);
GGML_METAL_DEL_KERNEL(mul_mm_q4_1_f32);
GGML_METAL_DEL_KERNEL(mul_mm_q2_K_f32);
GGML_METAL_DEL_KERNEL(mul_mm_q3_K_f32);
GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32);
GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32);
GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32);
}
GGML_METAL_DEL_KERNEL(rope_f32);
GGML_METAL_DEL_KERNEL(rope_f16);
GGML_METAL_DEL_KERNEL(alibi_f32);
@@ -437,7 +442,7 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru
for (int i = 0; i < ctx->n_buffers; ++i) {
const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;
//metal_printf("ioffs = %10ld, tsize = %10ld, sum = %10ld, ctx->buffers[%d].size = %10ld, name = %s\n", ioffs, tsize, ioffs + tsize, i, ctx->buffers[i].size, ctx->buffers[i].name);
//GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, ctx->buffers[%d].size = %10ld, name = %s\n", ioffs, tsize, ioffs + tsize, i, ctx->buffers[i].size, ctx->buffers[i].name);
if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
*offs = (size_t) ioffs;
@@ -774,8 +779,8 @@ void ggml_metal_graph_compute(
} break;
case GGML_OP_CONCAT:
{
const int64_t nb = ne00;
int64_t nb = ne00;
[encoder setComputePipelineState:ctx->pipeline_concat];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
@@ -807,6 +812,7 @@ void ggml_metal_graph_compute(
[encoder setBytes:&nb length:sizeof(nb) atIndex:27];
const int nth = MIN(1024, ne0);
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
case GGML_OP_ADD:
@@ -904,9 +910,10 @@ void ggml_metal_graph_compute(
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
[encoder setBytes:&scale length:sizeof(scale) atIndex:2];
const int64_t n = ggml_nelements(dst)/4;
const int64_t n = ggml_nelements(dst);
GGML_ASSERT(n % 4 == 0);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
[encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_OP_UNARY:
switch (ggml_get_unary_op(gf->nodes[i])) {
@@ -916,9 +923,10 @@ void ggml_metal_graph_compute(
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst)/4;
const int64_t n = ggml_nelements(dst);
GGML_ASSERT(n % 4 == 0);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
[encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_UNARY_OP_RELU:
{
@@ -936,9 +944,10 @@ void ggml_metal_graph_compute(
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst)/4;
const int64_t n = ggml_nelements(dst);
GGML_ASSERT(n % 4 == 0);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
[encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
default:
{
@@ -998,21 +1007,46 @@ void ggml_metal_graph_compute(
} break;
case GGML_OP_MUL_MAT:
{
// TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224
GGML_ASSERT(ne00 == ne10);
// GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere
uint gqa = ne12/ne02;
GGML_ASSERT(ne03 == ne13);
const uint gqa = ne12/ne02;
// find the break-even point where the matrix-matrix kernel becomes more efficient compared
// to the matrix-vector kernel
int ne11_mm_min = 1;
#if 0
// the numbers below are measured on M2 Ultra for 7B and 13B models
// these numbers do not translate to other devices or model sizes
// TODO: need to find a better approach
if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) {
switch (src0t) {
case GGML_TYPE_F16: ne11_mm_min = 2; break;
case GGML_TYPE_Q8_0: ne11_mm_min = 7; break;
case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
case GGML_TYPE_Q3_K: ne11_mm_min = 7; break;
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
case GGML_TYPE_Q5_0: // not tested yet
case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
case GGML_TYPE_Q5_K: ne11_mm_min = 7; break;
case GGML_TYPE_Q6_K: ne11_mm_min = 7; break;
default: ne11_mm_min = 1; break;
}
}
#endif
// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
if (!ggml_is_transposed(src0) &&
if ([ctx->device supportsFamily:MTLGPUFamilyApple7] &&
!ggml_is_transposed(src0) &&
!ggml_is_transposed(src1) &&
src1t == GGML_TYPE_F32 &&
[ctx->device supportsFamily:MTLGPUFamilyApple7] &&
ne00%32 == 0 &&
ne11 > 2) {
ne00 % 32 == 0 &&
ne11 > ne11_mm_min) {
//printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
switch (src0->type) {
case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f32_f32]; break;
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32]; break;
@@ -1041,17 +1075,18 @@ void ggml_metal_graph_compute(
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:12];
[encoder setBytes:&gqa length:sizeof(gqa) atIndex:13];
[encoder setThreadgroupMemoryLength:8192 atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake( (ne11+31)/32, (ne01+63) / 64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
[encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne01 + 63)/64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
} else {
int nth0 = 32;
int nth1 = 1;
int nrows = 1;
//printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
// use custom matrix x vector kernel
switch (src0t) {
case GGML_TYPE_F32:
{
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f32_f32];
[encoder setComputePipelineState:ctx->pipeline_mul_mv_f32_f32];
nrows = 4;
} break;
case GGML_TYPE_F16:
@@ -1059,12 +1094,12 @@ void ggml_metal_graph_compute(
nth0 = 32;
nth1 = 1;
if (ne11 * ne12 < 4) {
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row];
[encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_1row];
} else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_l4];
[encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_l4];
nrows = ne11;
} else {
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
[encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32];
nrows = 4;
}
} break;
@@ -1075,7 +1110,7 @@ void ggml_metal_graph_compute(
nth0 = 8;
nth1 = 8;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_0_f32];
[encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_0_f32];
} break;
case GGML_TYPE_Q4_1:
{
@@ -1084,7 +1119,7 @@ void ggml_metal_graph_compute(
nth0 = 8;
nth1 = 8;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_1_f32];
[encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_1_f32];
} break;
case GGML_TYPE_Q8_0:
{
@@ -1093,7 +1128,7 @@ void ggml_metal_graph_compute(
nth0 = 8;
nth1 = 8;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q8_0_f32];
[encoder setComputePipelineState:ctx->pipeline_mul_mv_q8_0_f32];
} break;
case GGML_TYPE_Q2_K:
{
@@ -1102,7 +1137,7 @@ void ggml_metal_graph_compute(
nth0 = 2;
nth1 = 32;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_K_f32];
[encoder setComputePipelineState:ctx->pipeline_mul_mv_q2_K_f32];
} break;
case GGML_TYPE_Q3_K:
{
@@ -1111,7 +1146,7 @@ void ggml_metal_graph_compute(
nth0 = 2;
nth1 = 32;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_K_f32];
[encoder setComputePipelineState:ctx->pipeline_mul_mv_q3_K_f32];
} break;
case GGML_TYPE_Q4_K:
{
@@ -1120,7 +1155,7 @@ void ggml_metal_graph_compute(
nth0 = 4; //1;
nth1 = 8; //32;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
[encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_K_f32];
} break;
case GGML_TYPE_Q5_K:
{
@@ -1129,7 +1164,7 @@ void ggml_metal_graph_compute(
nth0 = 2;
nth1 = 32;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_K_f32];
[encoder setComputePipelineState:ctx->pipeline_mul_mv_q5_K_f32];
} break;
case GGML_TYPE_Q6_K:
{
@@ -1138,7 +1173,7 @@ void ggml_metal_graph_compute(
nth0 = 2;
nth1 = 32;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32];
[encoder setComputePipelineState:ctx->pipeline_mul_mv_q6_K_f32];
} break;
default:
{
@@ -1167,7 +1202,7 @@ void ggml_metal_graph_compute(
[encoder setBytes:&gqa length:sizeof(gqa) atIndex:17];
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q8_0 ||
src0t == GGML_TYPE_Q2_K) {// || src0t == GGML_TYPE_Q4_K) {
src0t == GGML_TYPE_Q2_K) { // || src0t == GGML_TYPE_Q4_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
}
else if (src0t == GGML_TYPE_Q4_K) {
@@ -1220,6 +1255,8 @@ void ggml_metal_graph_compute(
} break;
case GGML_OP_RMS_NORM:
{
GGML_ASSERT(ne00 % 4 == 0);
float eps;
memcpy(&eps, dst->op_params, sizeof(float));

View File

@@ -13,8 +13,8 @@ typedef struct {
#define QK4_1 32
typedef struct {
half d; // delta
half m; // min
half d; // delta
half m; // min
uint8_t qs[QK4_1 / 2]; // nibbles / quants
} block_q4_1;
@@ -345,10 +345,11 @@ kernel void kernel_rms_norm(
uint sgitg[[simdgroup_index_in_threadgroup]],
uint tiisg[[thread_index_in_simdgroup]],
uint ntg[[threads_per_threadgroup]]) {
device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
device const float * x_scalar = (device const float *) x;
float4 sumf=0;
float all_sum=0;
device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
device const float * x_scalar = (device const float *) x;
float4 sumf = 0;
float all_sum = 0;
// parallel sum
for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
@@ -361,6 +362,7 @@ kernel void kernel_rms_norm(
}
threadgroup_barrier(mem_flags::mem_threadgroup);
// broadcast, simd group number is ntg / 32
for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
if (tpitg < i) {
@@ -368,7 +370,9 @@ kernel void kernel_rms_norm(
}
}
if (tpitg == 0) {
for (int i = 4 * (ne00 / 4); i < ne00; i++) {sum[0] += x_scalar[i];}
for (int i = 4 * (ne00 / 4); i < ne00; i++) {
sum[0] += x_scalar[i];
}
sum[0] /= ne00;
}
@@ -383,7 +387,9 @@ kernel void kernel_rms_norm(
y[i00] = x[i00] * scale;
}
if (tpitg == 0) {
for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {y_scalar[i00] = x_scalar[i00] * scale;}
for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {
y_scalar[i00] = x_scalar[i00] * scale;
}
}
}
@@ -423,8 +429,8 @@ inline float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thre
}
// putting them in the kernel cause a significant performance penalty
#define N_DST 4 // each SIMD group works on 4 rows
#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
#define N_DST 4 // each SIMD group works on 4 rows
#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
//Note: This is a template, but strictly speaking it only applies to
// quantizations where the block size is 32. It also does not
@@ -435,18 +441,23 @@ void mul_vec_q_n_f32(device const void * src0, device const float * src1, device
int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne10, int64_t ne12, int64_t ne0, int64_t ne1, uint gqa,
uint3 tgpig, uint tiisg, uint sgitg) {
const int nb = ne00/QK4_0;
const int r0 = tgpig.x;
const int r1 = tgpig.y;
const int im = tgpig.z;
const int first_row = (r0 * nsg + sgitg) * nr;
const uint offset0 = first_row * nb + im/gqa*(nb*ne0);
device const block_q_type * x = (device const block_q_type *) src0 + offset0;
device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
float yl[16]; // src1 vector cache
float sumf[nr]={0.f};
const int ix = tiisg/2;
const int il = 8*(tiisg%2);
float yl[16]; // src1 vector cache
float sumf[nr] = {0.f};
const int ix = (tiisg/2);
const int il = (tiisg%2)*8;
device const float * yb = y + ix * QK4_0 + il;
@@ -457,6 +468,7 @@ void mul_vec_q_n_f32(device const void * src0, device const float * src1, device
sumy += yb[i] + yb[i+1];
yl[i+0] = yb[i+ 0];
yl[i+1] = yb[i+ 1]/256.f;
sumy += yb[i+16] + yb[i+17];
yl[i+8] = yb[i+16]/16.f;
yl[i+9] = yb[i+17]/4096.f;
@@ -472,12 +484,12 @@ void mul_vec_q_n_f32(device const void * src0, device const float * src1, device
for (int row = 0; row < nr; ++row) {
const float tot = simd_sum(sumf[row]);
if (tiisg == 0 && first_row + row < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot;
dst[im*ne0*ne1 + r1*ne0 + first_row + row] = tot;
}
}
}
kernel void kernel_mul_mat_q4_0_f32(
kernel void kernel_mul_mv_q4_0_f32(
device const void * src0,
device const float * src1,
device float * dst,
@@ -490,12 +502,12 @@ kernel void kernel_mul_mat_q4_0_f32(
constant int64_t & ne1[[buffer(16)]],
constant uint & gqa[[buffer(17)]],
uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
mul_vec_q_n_f32<block_q4_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg);
}
kernel void kernel_mul_mat_q4_1_f32(
kernel void kernel_mul_mv_q4_1_f32(
device const void * src0,
device const float * src1,
device float * dst,
@@ -515,7 +527,7 @@ kernel void kernel_mul_mat_q4_1_f32(
#define NB_Q8_0 8
kernel void kernel_mul_mat_q8_0_f32(
kernel void kernel_mul_mv_q8_0_f32(
device const void * src0,
device const float * src1,
device float * dst,
@@ -579,7 +591,7 @@ kernel void kernel_mul_mat_q8_0_f32(
#define N_F32_F32 4
kernel void kernel_mul_mat_f32_f32(
kernel void kernel_mul_mv_f32_f32(
device const char * src0,
device const char * src1,
device float * dst,
@@ -650,7 +662,7 @@ kernel void kernel_mul_mat_f32_f32(
}
}
kernel void kernel_mul_mat_f16_f32_1row(
kernel void kernel_mul_mv_f16_f32_1row(
device const char * src0,
device const char * src1,
device float * dst,
@@ -669,7 +681,7 @@ kernel void kernel_mul_mat_f16_f32_1row(
constant int64_t & ne0,
constant int64_t & ne1,
uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]]) {
uint tiisg[[thread_index_in_simdgroup]]) {
const int64_t r0 = tgpig.x;
const int64_t r1 = tgpig.y;
@@ -704,7 +716,7 @@ kernel void kernel_mul_mat_f16_f32_1row(
#define N_F16_F32 4
kernel void kernel_mul_mat_f16_f32(
kernel void kernel_mul_mv_f16_f32(
device const char * src0,
device const char * src1,
device float * dst,
@@ -776,7 +788,7 @@ kernel void kernel_mul_mat_f16_f32(
}
// Assumes row size (ne00) is a multiple of 4
kernel void kernel_mul_mat_f16_f32_l4(
kernel void kernel_mul_mv_f16_f32_l4(
device const char * src0,
device const char * src1,
device float * dst,
@@ -1253,7 +1265,7 @@ static inline uchar4 get_scale_min_k4(int j, device const uint8_t * q) {
//====================================== dot products =========================
kernel void kernel_mul_mat_q2_K_f32(
kernel void kernel_mul_mv_q2_K_f32(
device const void * src0,
device const float * src1,
device float * dst,
@@ -1397,7 +1409,7 @@ kernel void kernel_mul_mat_q2_K_f32(
}
#if QK_K == 256
kernel void kernel_mul_mat_q3_K_f32(
kernel void kernel_mul_mv_q3_K_f32(
device const void * src0,
device const float * src1,
device float * dst,
@@ -1549,7 +1561,7 @@ kernel void kernel_mul_mat_q3_K_f32(
}
}
#else
kernel void kernel_mul_mat_q3_K_f32(
kernel void kernel_mul_mv_q3_K_f32(
device const void * src0,
device const float * src1,
device float * dst,
@@ -1620,7 +1632,7 @@ kernel void kernel_mul_mat_q3_K_f32(
#endif
#if QK_K == 256
kernel void kernel_mul_mat_q4_K_f32(
kernel void kernel_mul_mv_q4_K_f32(
device const void * src0,
device const float * src1,
device float * dst,
@@ -1726,7 +1738,7 @@ kernel void kernel_mul_mat_q4_K_f32(
}
}
#else
kernel void kernel_mul_mat_q4_K_f32(
kernel void kernel_mul_mv_q4_K_f32(
device const void * src0,
device const float * src1,
device float * dst,
@@ -1815,7 +1827,7 @@ kernel void kernel_mul_mat_q4_K_f32(
}
#endif
kernel void kernel_mul_mat_q5_K_f32(
kernel void kernel_mul_mv_q5_K_f32(
device const void * src0,
device const float * src1,
device float * dst,
@@ -1988,7 +2000,7 @@ kernel void kernel_mul_mat_q5_K_f32(
}
kernel void kernel_mul_mat_q6_K_f32(
kernel void kernel_mul_mv_q6_K_f32(
device const void * src0,
device const float * src1,
device float * dst,
@@ -2326,7 +2338,7 @@ kernel void kernel_get_rows(
}
#define BLOCK_SIZE_M 64 // 8 simdgroup matrices from matrix A
#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix A
#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix B
#define BLOCK_SIZE_K 32
#define THREAD_MAT_M 4 // each thread take 4 simdgroup matrices from matrix A
#define THREAD_MAT_N 2 // each thread take 2 simdgroup matrices from matrix B
@@ -2363,9 +2375,11 @@ kernel void kernel_mul_mm(device const uchar * src0,
const uint r0 = tgpig.y;
const uint r1 = tgpig.x;
const uint im = tgpig.z;
// if this block is of 64x32 shape or smaller
short n_rows = (ne0 - r0 * BLOCK_SIZE_M < BLOCK_SIZE_M) ? (ne0 - r0 * BLOCK_SIZE_M) : BLOCK_SIZE_M;
short n_cols = (ne1 - r1 * BLOCK_SIZE_N < BLOCK_SIZE_N) ? (ne1 - r1 * BLOCK_SIZE_N) : BLOCK_SIZE_N;
// a thread shouldn't load data outside of the matrix
short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;
@@ -2389,26 +2403,30 @@ kernel void kernel_mul_mm(device const uchar * src0,
+ nb10 * (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL)));
for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) {
//load data and store to threadgroup memory
// load data and store to threadgroup memory
half4x4 temp_a;
dequantize_func(x, il, temp_a);
threadgroup_barrier(mem_flags::mem_threadgroup);
#pragma unroll(16)
for (int i = 0; i < 16; i++) {
*(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
+ 16 * (tiitg % THREAD_PER_ROW) + 8 * (i / 8)) \
+ (tiitg / THREAD_PER_ROW) % 8 + (i & 7) * 8) = temp_a[i/4][i%4];
+ (tiitg % THREAD_PER_ROW) * 16 + (i / 8) * 8) \
+ (tiitg / THREAD_PER_ROW) % 8 + (i & 7) * 8) = temp_a[i/4][i%4];
}
*(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) \
= *((device float2x4 *)y);
*(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) = *((device float2x4 *)y);
il = (il + 2 < nl) ? il + 2 : il % 2;
x = (il < 2) ? x + (2+nl-1)/nl : x;
y += BLOCK_SIZE_K;
threadgroup_barrier(mem_flags::mem_threadgroup);
//load matrices from threadgroup memory and conduct outer products
// load matrices from threadgroup memory and conduct outer products
threadgroup half * lsma = (sa + THREAD_MAT_M * SG_MAT_SIZE * (sgitg % 2));
threadgroup float * lsmb = (sb + THREAD_MAT_N * SG_MAT_SIZE * (sgitg / 2));
#pragma unroll(4)
for (int ik = 0; ik < BLOCK_SIZE_K / 8; ik++) {
#pragma unroll(4)
@@ -2423,6 +2441,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
lsma += BLOCK_SIZE_M / SG_MAT_ROW * SG_MAT_SIZE;
lsmb += BLOCK_SIZE_N / SG_MAT_ROW * SG_MAT_SIZE;
#pragma unroll(8)
for (int i = 0; i < 8; i++){
simdgroup_multiply_accumulate(c_res[i], mb[i/4], ma[i%4], c_res[i]);
@@ -2431,25 +2450,26 @@ kernel void kernel_mul_mm(device const uchar * src0,
}
if ((r0 + 1) * BLOCK_SIZE_M <= ne0 && (r1 + 1) * BLOCK_SIZE_N <= ne1) {
device float *C = dst + BLOCK_SIZE_M * r0 + 32 * (sgitg&1) \
+ (BLOCK_SIZE_N * r1 + 16 * (sgitg>>1)) * ne0 + im*ne1*ne0;
device float * C = dst + (BLOCK_SIZE_M * r0 + 32 * (sgitg & 1)) \
+ (BLOCK_SIZE_N * r1 + 16 * (sgitg >> 1)) * ne0 + im*ne1*ne0;
for (int i = 0; i < 8; i++) {
simdgroup_store(c_res[i], C + 8 * (i%4) + 8 * ne0 * (i/4), ne0);
}
} else {
// block is smaller than 64x32, we should avoid writing data outside of the matrix
threadgroup_barrier(mem_flags::mem_threadgroup);
threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
threadgroup float * temp_str = ((threadgroup float *)shared_memory) \
+ 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
for (int i = 0; i < 8; i++) {
simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
}
threadgroup_barrier(mem_flags::mem_threadgroup);
device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
if (sgitg==0) {
device float * C = dst + (BLOCK_SIZE_M * r0) + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
if (sgitg == 0) {
for (int i = 0; i < n_rows; i++) {
for (int j = tiitg; j< n_cols; j += BLOCK_SIZE_N) {
for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) {
*(C + i + j * ne0) = *(temp_str + i + j * BLOCK_SIZE_M);
}
}

27
ggml.c
View File

@@ -11256,7 +11256,7 @@ static void ggml_compute_forward_silu_f32(
#ifndef NDEBUG
for (int k = 0; k < nc; k++) {
const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
const float x = ((float *) ((char *) dst->data + i1*(dst->nb[1])))[k];
UNUSED(x);
assert(!isnan(x));
assert(!isinf(x));
@@ -13089,17 +13089,17 @@ static void ggml_compute_forward_alibi_f32(
assert(n_past >= 0);
const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
const int ne1 = src0->ne[1]; // seq_len_without_past
const int ne2 = src0->ne[2]; // n_head -> this is k
//const int ne3 = src0->ne[3]; // 1 -> bsz
const int64_t ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
const int64_t ne1 = src0->ne[1]; // seq_len_without_past
const int64_t ne2 = src0->ne[2]; // n_head -> this is k
//const int64_t ne3 = src0->ne[3]; // 1 -> bsz
const int n = ggml_nrows(src0);
const int ne2_ne3 = n/ne1; // ne2*ne3
const int64_t n = ggml_nrows(src0);
const int64_t ne2_ne3 = n/ne1; // ne2*ne3
const int nb0 = src0->nb[0];
const int nb1 = src0->nb[1];
const int nb2 = src0->nb[2];
const size_t nb0 = src0->nb[0];
const size_t nb1 = src0->nb[1];
const size_t nb2 = src0->nb[2];
//const int nb3 = src0->nb[3];
GGML_ASSERT(nb0 == sizeof(float));
@@ -13111,9 +13111,9 @@ static void ggml_compute_forward_alibi_f32(
const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
for (int i = 0; i < ne0; i++) {
for (int j = 0; j < ne1; j++) {
for (int k = 0; k < ne2_ne3; k++) {
for (int64_t i = 0; i < ne0; i++) {
for (int64_t j = 0; j < ne1; j++) {
for (int64_t k = 0; k < ne2_ne3; k++) {
float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
@@ -13128,7 +13128,6 @@ static void ggml_compute_forward_alibi_f32(
}
pdst[0] = i * m_k + src[0];
}
}
}

View File

@@ -69,4 +69,3 @@ python -m twine upload dist/*
## TODO
- [ ] Add tests
- [ ] Include conversion scripts as command line entry points in this package.
- Add CI workflow for releasing the package.

View File

@@ -1,6 +1,6 @@
[tool.poetry]
name = "gguf"
version = "0.4.0"
version = "0.4.4"
description = "Write ML models in GGUF for GGML"
authors = ["GGML <ggml@ggml.ai>"]
packages = [

View File

@@ -1325,7 +1325,11 @@ static bool llama_kv_cache_init(
cache.cells.clear();
cache.cells.resize(n_ctx);
// TODO: this should be:
// cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
// change it and test that it works
cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
memset(cache.buf.data, 0, cache.buf.size);
struct ggml_init_params params;
params.mem_size = cache.buf.size;
@@ -2047,7 +2051,7 @@ static void llm_load_hparams(
case 36: model.type = e_model::MODEL_8B; break;
default: model.type = e_model::MODEL_UNKNOWN;
}
}
} break;
case LLM_ARCH_REFACT:
{
GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
@@ -4926,7 +4930,7 @@ static struct ggml_cgraph * llama_build_graph(
case LLM_ARCH_PERSIMMON:
{
result = llm_build_persimmon(lctx, batch);
}
} break;
case LLM_ARCH_REFACT:
{
result = llm_build_refact(lctx, batch);
@@ -7194,6 +7198,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
}
std::ofstream fout(fname_out, std::ios::binary);
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
const size_t meta_size = gguf_get_meta_size(ctx_out);

View File

@@ -1,3 +1,3 @@
numpy==1.24
numpy==1.24.4
sentencepiece==0.1.98
gguf>=0.1.0