mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-02-05 13:53:23 +02:00
Compare commits
12 Commits
b1345
...
alloc-asse
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ee7456926e | ||
|
|
fcca0a7004 | ||
|
|
dcc09d2596 | ||
|
|
db3abcc114 | ||
|
|
eee42c670e | ||
|
|
8e6716a102 | ||
|
|
9c38d181d4 | ||
|
|
a1202a31ed | ||
|
|
94e502dfb7 | ||
|
|
7d8b24932f | ||
|
|
b0ec5218c3 | ||
|
|
63d3b06a43 |
4
.github/workflows/build.yml
vendored
4
.github/workflows/build.yml
vendored
@@ -10,10 +10,10 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift']
|
||||
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift']
|
||||
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
|
||||
|
||||
env:
|
||||
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||
|
||||
25
.github/workflows/zig-build.yml
vendored
Normal file
25
.github/workflows/zig-build.yml
vendored
Normal file
@@ -0,0 +1,25 @@
|
||||
name: Zig CI
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
|
||||
jobs:
|
||||
build:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
runs-on: [ubuntu-latest, macos-latest, windows-latest]
|
||||
runs-on: ${{ matrix.runs-on }}
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: recursive
|
||||
fetch-depth: 0
|
||||
- uses: goto-bus-stop/setup-zig@v2
|
||||
with:
|
||||
version: 0.11.0
|
||||
- name: Build Summary
|
||||
run: zig build --summary all -freference-trace
|
||||
@@ -663,6 +663,8 @@ add_library(ggml OBJECT
|
||||
ggml.h
|
||||
ggml-alloc.c
|
||||
ggml-alloc.h
|
||||
ggml-backend.c
|
||||
ggml-backend.h
|
||||
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
|
||||
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
|
||||
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
|
||||
|
||||
7
Makefile
7
Makefile
@@ -512,9 +512,12 @@ ggml.o: ggml.c ggml.h ggml-cuda.h
|
||||
ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
OBJS += ggml-alloc.o
|
||||
ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h
|
||||
OBJS += ggml-alloc.o ggml-backend.o
|
||||
|
||||
llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
common.o: common/common.cpp common/common.h build-info.h common/log.h
|
||||
|
||||
46
build.zig
46
build.zig
@@ -36,14 +36,17 @@ const Maker = struct {
|
||||
}
|
||||
|
||||
fn init(builder: *std.build.Builder) !Maker {
|
||||
// const commit_hash = @embedFile(".git/refs/heads/master");
|
||||
const target = builder.standardTargetOptions(.{});
|
||||
const zig_version = @import("builtin").zig_version_string;
|
||||
const commit_hash = try std.ChildProcess.exec(
|
||||
.{ .allocator = builder.allocator, .argv = &.{ "git", "rev-parse", "HEAD" } },
|
||||
);
|
||||
const config_header = builder.addConfigHeader(
|
||||
.{ .style = .blank, .include_path = "build-info.h" },
|
||||
.{
|
||||
.BUILD_NUMBER = 0,
|
||||
.BUILD_COMMIT = "12345", // omit newline
|
||||
.BUILD_COMPILER = "Zig 0.11.0",
|
||||
.BUILD_COMMIT = commit_hash.stdout[0 .. commit_hash.stdout.len - 1], // omit newline
|
||||
.BUILD_COMPILER = builder.fmt("Zig {s}", .{zig_version}),
|
||||
.BUILD_TARGET = try target.allocDescription(builder.allocator),
|
||||
},
|
||||
);
|
||||
@@ -67,12 +70,20 @@ const Maker = struct {
|
||||
|
||||
fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile {
|
||||
const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
|
||||
if (o.target.getAbi() != .msvc)
|
||||
o.defineCMacro("_GNU_SOURCE", null);
|
||||
o.addConfigHeader(m.config_header);
|
||||
if (std.mem.endsWith(u8, src, ".c")) {
|
||||
o.addCSourceFiles(&.{src}, m.cflags.items);
|
||||
o.linkLibC();
|
||||
} else {
|
||||
o.addCSourceFiles(&.{src}, m.cxxflags.items);
|
||||
o.linkLibCpp();
|
||||
if (o.target.getAbi() == .msvc) {
|
||||
o.linkLibC(); // need winsdk + crt
|
||||
} else {
|
||||
// linkLibCpp already add (libc++ + libunwind + libc)
|
||||
o.linkLibCpp();
|
||||
}
|
||||
}
|
||||
o.addConfigHeader(m.config_header);
|
||||
for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
|
||||
@@ -86,8 +97,14 @@ const Maker = struct {
|
||||
for (deps) |d| e.addObject(d);
|
||||
for (m.objs.items) |o| e.addObject(o);
|
||||
for (m.include_dirs.items) |i| e.addIncludePath(.{ .path = i });
|
||||
e.linkLibC();
|
||||
e.linkLibCpp();
|
||||
|
||||
// https://github.com/ziglang/zig/issues/15448
|
||||
if (e.target.getAbi() == .msvc) {
|
||||
e.linkLibC(); // need winsdk + crt
|
||||
} else {
|
||||
// linkLibCpp already add (libc++ + libunwind + libc)
|
||||
e.linkLibCpp();
|
||||
}
|
||||
e.addConfigHeader(m.config_header);
|
||||
m.builder.installArtifact(e);
|
||||
e.want_lto = m.enable_lto;
|
||||
@@ -107,18 +124,21 @@ pub fn build(b: *std.build.Builder) !void {
|
||||
|
||||
const ggml = make.obj("ggml", "ggml.c");
|
||||
const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
|
||||
const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
|
||||
const llama = make.obj("llama", "llama.cpp");
|
||||
const common = make.obj("common", "common/common.cpp");
|
||||
const console = make.obj("common", "common/console.cpp");
|
||||
const console = make.obj("console", "common/console.cpp");
|
||||
const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
|
||||
const train = make.obj("train", "common/train.cpp");
|
||||
|
||||
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, llama, common, console, grammar_parser });
|
||||
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, llama, common });
|
||||
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, llama, common });
|
||||
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, llama, common });
|
||||
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, llama, common });
|
||||
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, console, grammar_parser });
|
||||
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
|
||||
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
|
||||
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
|
||||
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
|
||||
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
|
||||
|
||||
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, llama, common, grammar_parser });
|
||||
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, grammar_parser });
|
||||
if (server.target.isWindows()) {
|
||||
server.linkSystemLibrary("ws2_32");
|
||||
}
|
||||
|
||||
@@ -17,33 +17,6 @@ if "NO_LOCAL_GGUF" not in os.environ:
|
||||
sys.path.insert(1, str(Path(__file__).parent / "gguf-py" / "gguf"))
|
||||
import gguf
|
||||
|
||||
|
||||
def bytes_to_unicode():
|
||||
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
|
||||
"""
|
||||
Returns list of utf-8 byte and a corresponding list of unicode strings.
|
||||
The reversible bpe codes work on unicode strings.
|
||||
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
|
||||
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
|
||||
This is a significant percentage of your normal, say, 32K bpe vocab.
|
||||
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
|
||||
And avoids mapping to whitespace/control characters the bpe code barfs on.
|
||||
"""
|
||||
bs = (
|
||||
list(range(ord("!"), ord("~") + 1))
|
||||
+ list(range(ord("¡"), ord("¬") + 1))
|
||||
+ list(range(ord("®"), ord("ÿ") + 1))
|
||||
)
|
||||
cs = bs[:]
|
||||
n = 0
|
||||
for b in range(2**8):
|
||||
if b not in bs:
|
||||
bs.append(b)
|
||||
cs.append(2**8 + n)
|
||||
n += 1
|
||||
return dict(zip(bs, (chr(n) for n in cs)))
|
||||
|
||||
|
||||
def count_model_parts(dir_model: Path) -> int:
|
||||
num_parts = 0
|
||||
for filename in os.listdir(dir_model):
|
||||
@@ -153,53 +126,25 @@ tokens: list[bytearray] = []
|
||||
scores: list[float] = []
|
||||
toktypes: list[int] = []
|
||||
|
||||
tokenizer_json_file = dir_model / "tokenizer.json"
|
||||
if not tokenizer_json_file.is_file():
|
||||
print(f"Error: Missing {tokenizer_json_file}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# gpt2 tokenizer
|
||||
gguf_writer.add_tokenizer_model("gpt2")
|
||||
|
||||
with open(tokenizer_json_file, "r", encoding="utf-8") as f:
|
||||
tokenizer_json = json.load(f)
|
||||
|
||||
print("gguf: get gpt2 tokenizer vocab")
|
||||
|
||||
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
|
||||
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
||||
|
||||
# The number of tokens in tokenizer.json can differ from the expected vocab size.
|
||||
# This causes downstream issues with mismatched tensor sizes when running the inference
|
||||
vocab_size = (
|
||||
hparams["vocab_size"]
|
||||
if "vocab_size" in hparams
|
||||
else len(tokenizer_json["model"]["vocab"])
|
||||
)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
|
||||
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
|
||||
assert max(tokenizer.vocab.values()) < vocab_size
|
||||
|
||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
||||
byte_encoder = bytes_to_unicode()
|
||||
byte_decoder = {v: k for k, v in byte_encoder.items()}
|
||||
|
||||
for i in range(vocab_size):
|
||||
if i in reverse_vocab:
|
||||
text = reverse_vocab[i]
|
||||
try:
|
||||
text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
|
||||
except KeyError:
|
||||
text = bytearray()
|
||||
for c in reverse_vocab[i]:
|
||||
if ord(c) < 256: # single byte character
|
||||
text.append(byte_decoder[ord(c)])
|
||||
else: # multibyte special token character
|
||||
text.extend(c.encode("utf-8"))
|
||||
else:
|
||||
print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
|
||||
pad_token = f"[PAD{i}]".encode("utf8")
|
||||
text = bytearray(pad_token)
|
||||
|
||||
tokens.append(text)
|
||||
scores.append(0.0) # dymmy
|
||||
toktypes.append(gguf.TokenType.NORMAL) # dummy
|
||||
tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
|
||||
scores.append(0.0) # dummy
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
|
||||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
|
||||
@@ -167,7 +167,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// the max batch size is as large as the context to handle cases where we get very long input prompt from multiple
|
||||
// users. regardless of the size, the main loop will chunk the batch into a maximum of params.n_batch tokens at a time
|
||||
llama_batch batch = llama_batch_init(params.n_ctx, 0);
|
||||
llama_batch batch = llama_batch_init(n_ctx, 0);
|
||||
|
||||
int32_t n_total_prompt = 0;
|
||||
int32_t n_total_gen = 0;
|
||||
|
||||
@@ -27,10 +27,10 @@ def is_present(json, key):
|
||||
buf = json[key]
|
||||
except KeyError:
|
||||
return False
|
||||
if json[key] == None:
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
|
||||
#convert chat to prompt
|
||||
def convert_chat(messages):
|
||||
prompt = "" + args.chat_prompt.replace("\\n", "\n")
|
||||
|
||||
169
ggml-alloc.c
169
ggml-alloc.c
@@ -1,4 +1,5 @@
|
||||
#include "ggml-alloc.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml.h"
|
||||
#include <assert.h>
|
||||
#include <stdarg.h>
|
||||
@@ -6,25 +7,6 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#ifdef __has_include
|
||||
#if __has_include(<unistd.h>)
|
||||
#include <unistd.h>
|
||||
#if defined(_POSIX_MAPPED_FILES)
|
||||
#include <sys/types.h>
|
||||
#include <sys/mman.h>
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(_WIN32)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#ifndef NOMINMAX
|
||||
#define NOMINMAX
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#include <memoryapi.h>
|
||||
#endif
|
||||
|
||||
|
||||
#define UNUSED(x) (void)(x)
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
@@ -80,8 +62,9 @@ struct free_block {
|
||||
#define MAX_FREE_BLOCKS 256
|
||||
|
||||
struct ggml_allocr {
|
||||
struct ggml_backend_buffer * buffer;
|
||||
bool buffer_owned;
|
||||
void * data;
|
||||
size_t size;
|
||||
size_t alignment;
|
||||
int n_free_blocks;
|
||||
struct free_block free_blocks[MAX_FREE_BLOCKS];
|
||||
@@ -119,16 +102,9 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens
|
||||
}
|
||||
#endif
|
||||
|
||||
static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
||||
return ggml_nbytes(tensor);
|
||||
|
||||
UNUSED(alloc);
|
||||
}
|
||||
|
||||
// check if a tensor is allocated by this buffer
|
||||
static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
|
||||
void * ptr = tensor->data;
|
||||
return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
|
||||
return tensor->buffer == alloc->buffer;
|
||||
}
|
||||
|
||||
static bool ggml_is_view(struct ggml_tensor * t) {
|
||||
@@ -136,11 +112,10 @@ static bool ggml_is_view(struct ggml_tensor * t) {
|
||||
}
|
||||
|
||||
void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
||||
#ifdef GGML_ALLOCATOR_DEBUG
|
||||
GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
|
||||
GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
|
||||
#endif
|
||||
size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
|
||||
|
||||
size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
|
||||
size = aligned_offset(NULL, size, alloc->alignment);
|
||||
|
||||
AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
|
||||
@@ -188,6 +163,8 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
|
||||
|
||||
tensor->data = addr;
|
||||
AT_PRINTF("%s: allocated data at %p\n", __func__, tensor->data);
|
||||
tensor->buffer = alloc->buffer;
|
||||
ggml_backend_buffer_init_tensor(alloc->buffer, tensor);
|
||||
|
||||
#ifdef GGML_ALLOCATOR_DEBUG
|
||||
add_allocated_tensor(alloc, tensor);
|
||||
@@ -208,19 +185,21 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
|
||||
|
||||
// this is a very naive implementation, but for our case the number of free blocks should be very small
|
||||
static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
||||
void * ptr = tensor->data;
|
||||
|
||||
if (ggml_allocr_is_own(alloc, tensor) == false) {
|
||||
// the tensor was not allocated in this buffer
|
||||
// this can happen because the graph allocator will try to free weights and other tensors from different buffers
|
||||
// the easiest way to deal with this is just to ignore it
|
||||
AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer);
|
||||
return;
|
||||
}
|
||||
|
||||
size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
|
||||
void * ptr = tensor->data;
|
||||
|
||||
size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
|
||||
size = aligned_offset(NULL, size, alloc->alignment);
|
||||
AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
|
||||
AT_PRINTF("%s: alloc->data = %p alloc->data+alloc->size = %p alloc->data+alloc->max_size = %p\n", __func__, alloc->data, (char*)alloc->data + alloc->size, (char*)alloc->data + alloc->max_size);
|
||||
|
||||
ggml_backend_buffer_free_tensor(alloc->buffer, tensor);
|
||||
|
||||
#ifdef GGML_ALLOCATOR_DEBUG
|
||||
remove_allocated_tensor(alloc, tensor);
|
||||
@@ -285,15 +264,18 @@ void ggml_allocr_reset(struct ggml_allocr * alloc) {
|
||||
alloc->n_free_blocks = 1;
|
||||
size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
|
||||
alloc->free_blocks[0].addr = (char *)alloc->data + align_offset;
|
||||
alloc->free_blocks[0].size = alloc->size - align_offset;
|
||||
alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
|
||||
}
|
||||
|
||||
struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) {
|
||||
struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
|
||||
struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(NULL, data, size);
|
||||
|
||||
struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr));
|
||||
|
||||
*alloc = (struct ggml_allocr){
|
||||
/*.data = */ data,
|
||||
/*.size = */ size,
|
||||
/*.buffer = */ buffer,
|
||||
/*.buffer_owned = */ true,
|
||||
/*.base = */ ggml_backend_buffer_get_base(buffer),
|
||||
/*.alignment = */ alignment,
|
||||
/*.n_free_blocks = */ 0,
|
||||
/*.free_blocks = */ {{0}},
|
||||
@@ -312,74 +294,26 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
|
||||
return alloc;
|
||||
}
|
||||
|
||||
// OS specific functions to allocate and free uncommitted virtual memory
|
||||
static void * alloc_vmem(size_t size) {
|
||||
#if defined(_WIN32)
|
||||
return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
|
||||
#elif defined(_POSIX_MAPPED_FILES)
|
||||
void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
|
||||
if (ptr == MAP_FAILED) {
|
||||
return NULL;
|
||||
}
|
||||
return ptr;
|
||||
#else
|
||||
// use a fixed address for other platforms
|
||||
uintptr_t base_addr = (uintptr_t)-size - 0x100;
|
||||
return (void *)base_addr;
|
||||
#endif
|
||||
}
|
||||
|
||||
static void free_vmem(void * base_addr, size_t size) {
|
||||
#if defined(_WIN32)
|
||||
VirtualFree(base_addr, 0, MEM_RELEASE);
|
||||
UNUSED(size);
|
||||
#elif defined(_POSIX_MAPPED_FILES)
|
||||
munmap(base_addr, size);
|
||||
#else
|
||||
// nothing to do
|
||||
UNUSED(base_addr);
|
||||
UNUSED(size);
|
||||
#endif
|
||||
}
|
||||
|
||||
// allocate uncommitted virtual memory to measure the size of the graph
|
||||
static void alloc_measure_vmem(void ** base_addr, size_t * size) {
|
||||
// 128GB for 64-bit, 1GB for 32-bit
|
||||
*size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<37;
|
||||
do {
|
||||
*base_addr = alloc_vmem(*size);
|
||||
if (*base_addr != NULL) {
|
||||
AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr);
|
||||
return;
|
||||
}
|
||||
// try again with half the size
|
||||
*size /= 2;
|
||||
} while (*size > 0);
|
||||
|
||||
GGML_ASSERT(!"failed to allocate virtual memory for measure buffer");
|
||||
}
|
||||
|
||||
static void free_measure_vmem(void * base_addr, size_t size) {
|
||||
free_vmem(base_addr, size);
|
||||
}
|
||||
|
||||
struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
|
||||
struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
|
||||
struct ggml_allocr * alloc = ggml_allocr_new((void *)0x1000, (size_t)-0x1001, alignment);
|
||||
alloc->measure = true;
|
||||
|
||||
void * base_addr;
|
||||
size_t size;
|
||||
return alloc;
|
||||
}
|
||||
|
||||
alloc_measure_vmem(&base_addr, &size);
|
||||
struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
|
||||
struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr));
|
||||
|
||||
*alloc = (struct ggml_allocr){
|
||||
/*.data = */ base_addr,
|
||||
/*.size = */ size,
|
||||
/*.alignment = */ alignment,
|
||||
/*.buffer = */ buffer,
|
||||
/*.buffer_owned = */ false,
|
||||
/*.base = */ ggml_backend_buffer_get_base(buffer),
|
||||
/*.alignment = */ ggml_backend_buffer_get_alignment(buffer),
|
||||
/*.n_free_blocks = */ 0,
|
||||
/*.free_blocks = */ {{0}},
|
||||
/*.hash_table = */ {{0}},
|
||||
/*.max_size = */ 0,
|
||||
/*.measure = */ true,
|
||||
/*.measure = */ false,
|
||||
/*.parse_seq = */ {0},
|
||||
/*.parse_seq_len = */ 0,
|
||||
#ifdef GGML_ALLOCATOR_DEBUG
|
||||
@@ -393,8 +327,8 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
|
||||
}
|
||||
|
||||
void ggml_allocr_free(struct ggml_allocr * alloc) {
|
||||
if (alloc->measure) {
|
||||
free_measure_vmem(alloc->data, alloc->size);
|
||||
if (alloc->buffer_owned) {
|
||||
ggml_backend_buffer_free(alloc->buffer);
|
||||
}
|
||||
free(alloc);
|
||||
}
|
||||
@@ -437,7 +371,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
|
||||
case GGML_OP_ROPE:
|
||||
case GGML_OP_RMS_NORM:
|
||||
case GGML_OP_SOFT_MAX:
|
||||
case GGML_OP_CONT:
|
||||
return true;
|
||||
|
||||
default:
|
||||
@@ -445,12 +378,23 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
|
||||
}
|
||||
}
|
||||
|
||||
static void init_view(struct ggml_allocr * alloc, struct ggml_tensor * view) {
|
||||
assert(view->view_src != NULL && view->view_src->data != NULL);
|
||||
view->backend = view->view_src->backend;
|
||||
view->buffer = view->view_src->buffer;
|
||||
view->data = (char *)view->view_src->data + view->view_offs;
|
||||
|
||||
// FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
|
||||
// due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
|
||||
assert(ggml_allocr_is_measure(alloc) || !view->buffer || view->buffer->backend == alloc->buffer->backend);
|
||||
ggml_backend_buffer_init_tensor(alloc->buffer, view);
|
||||
}
|
||||
|
||||
static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) {
|
||||
struct hash_node * ht = alloc->hash_table;
|
||||
if (node->data == NULL) {
|
||||
if (ggml_is_view(node)) {
|
||||
assert(node->view_src->data != NULL);
|
||||
node->data = (char *)node->view_src->data + node->view_offs;
|
||||
init_view(alloc, node);
|
||||
} else {
|
||||
// see if we can reuse a parent's buffer (inplace)
|
||||
if (ggml_op_can_inplace(node->op)) {
|
||||
@@ -478,13 +422,17 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
|
||||
// adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
|
||||
// for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
|
||||
AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
|
||||
node->data = parent->data;
|
||||
node->view_src = view_src;
|
||||
view_src_hn->n_views += 1;
|
||||
init_view(alloc, node);
|
||||
return;
|
||||
}
|
||||
}
|
||||
else {
|
||||
AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
|
||||
node->data = parent->data;
|
||||
node->view_src = parent;
|
||||
p_hn->n_views += 1;
|
||||
init_view(alloc, node);
|
||||
return;
|
||||
}
|
||||
}
|
||||
@@ -495,7 +443,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
|
||||
}
|
||||
}
|
||||
|
||||
static size_t ggml_allocr_alloc_graph_tensors_n(
|
||||
size_t ggml_allocr_alloc_graph_n(
|
||||
struct ggml_allocr * alloc,
|
||||
struct ggml_cgraph ** graphs, int n_graphs,
|
||||
struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
|
||||
@@ -513,6 +461,10 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
|
||||
if (ggml_is_view(node)) {
|
||||
struct ggml_tensor * view_src = node->view_src;
|
||||
hash_get(ht, view_src)->n_views += 1;
|
||||
if (node->buffer == NULL && node->data != NULL) {
|
||||
// view of a pre-allocated tensor, didn't call init_view() yet
|
||||
init_view(alloc, node);
|
||||
}
|
||||
}
|
||||
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
@@ -521,6 +473,9 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
|
||||
break;
|
||||
}
|
||||
hash_get(ht, parent)->n_children += 1;
|
||||
if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
|
||||
init_view(alloc, parent);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -631,7 +586,7 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
|
||||
}
|
||||
|
||||
size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
|
||||
return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
|
||||
return ggml_allocr_alloc_graph_n(alloc, &graph, 1, NULL, NULL);
|
||||
}
|
||||
|
||||
size_t ggml_allocr_max_size(struct ggml_allocr * alloc) {
|
||||
|
||||
16
ggml-alloc.h
16
ggml-alloc.h
@@ -6,21 +6,27 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
struct ggml_backend_buffer;
|
||||
|
||||
GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
|
||||
GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
|
||||
GGML_API struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer);
|
||||
|
||||
// tell the allocator to parse nodes following the order described in the list
|
||||
// you should call this if your graph are optimized to execute out-of-order
|
||||
GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n);
|
||||
|
||||
GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
|
||||
GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
|
||||
GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);
|
||||
GGML_API void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_allocr_free (struct ggml_allocr * alloc);
|
||||
GGML_API bool ggml_allocr_is_measure (struct ggml_allocr * alloc);
|
||||
GGML_API void ggml_allocr_reset (struct ggml_allocr * alloc);
|
||||
GGML_API void ggml_allocr_alloc (struct ggml_allocr * alloc, struct ggml_tensor * tensor);
|
||||
GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
|
||||
GGML_API size_t ggml_allocr_max_size(struct ggml_allocr * alloc);
|
||||
GGML_API size_t ggml_allocr_max_size (struct ggml_allocr * alloc);
|
||||
|
||||
GGML_API size_t ggml_allocr_alloc_graph_n(
|
||||
struct ggml_allocr * alloc,
|
||||
struct ggml_cgraph ** graphs, int n_graphs,
|
||||
struct ggml_tensor *** inputs, struct ggml_tensor *** outputs);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
||||
385
ggml-backend.c
Normal file
385
ggml-backend.c
Normal file
@@ -0,0 +1,385 @@
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml-alloc.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdarg.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#define UNUSED GGML_UNUSED
|
||||
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
|
||||
// backend buffer
|
||||
|
||||
ggml_backend_buffer_t ggml_backend_buffer_init(
|
||||
struct ggml_backend * backend,
|
||||
struct ggml_backend_buffer_i iface,
|
||||
ggml_backend_buffer_context_t context,
|
||||
size_t size) {
|
||||
ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));
|
||||
|
||||
GGML_ASSERT(iface.get_base != NULL);
|
||||
|
||||
(*buffer) = (struct ggml_backend_buffer) {
|
||||
/* .interface = */ iface,
|
||||
/* .backend = */ backend,
|
||||
/* .context = */ context,
|
||||
/* .size = */ size,
|
||||
};
|
||||
|
||||
return buffer;
|
||||
}
|
||||
|
||||
void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
|
||||
if (buffer->iface.free_buffer != NULL) {
|
||||
buffer->iface.free_buffer(buffer);
|
||||
}
|
||||
free(buffer);
|
||||
}
|
||||
|
||||
size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) {
|
||||
return ggml_backend_get_alignment(buffer->backend);
|
||||
}
|
||||
|
||||
void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
return buffer->iface.get_base(buffer);
|
||||
}
|
||||
|
||||
size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
|
||||
return buffer->size;
|
||||
}
|
||||
|
||||
size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
||||
if (buffer->iface.get_alloc_size) {
|
||||
return buffer->iface.get_alloc_size(buffer, tensor);
|
||||
}
|
||||
return ggml_nbytes(tensor);
|
||||
}
|
||||
|
||||
void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
||||
if (buffer->iface.init_tensor) {
|
||||
buffer->iface.init_tensor(buffer, tensor);
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_backend_buffer_free_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
||||
if (buffer->iface.free_tensor) {
|
||||
buffer->iface.free_tensor(buffer, tensor);
|
||||
}
|
||||
}
|
||||
|
||||
// backend
|
||||
|
||||
ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor) {
|
||||
return tensor->buffer->backend;
|
||||
}
|
||||
|
||||
const char * ggml_backend_name(ggml_backend_t backend) {
|
||||
return backend->iface.get_name(backend);
|
||||
}
|
||||
|
||||
void ggml_backend_free(ggml_backend_t backend) {
|
||||
backend->iface.free(backend);
|
||||
}
|
||||
|
||||
ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
|
||||
return backend->iface.alloc_buffer(backend, size);
|
||||
}
|
||||
|
||||
size_t ggml_backend_get_alignment(ggml_backend_t backend) {
|
||||
return backend->iface.get_alignment(backend);
|
||||
}
|
||||
|
||||
void ggml_backend_tensor_set_async(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
ggml_get_backend(tensor)->iface.set_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
|
||||
}
|
||||
|
||||
void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
ggml_get_backend(tensor)->iface.get_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
|
||||
}
|
||||
|
||||
void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
ggml_get_backend(tensor)->iface.set_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
|
||||
ggml_get_backend(tensor)->iface.synchronize(ggml_get_backend(tensor));
|
||||
}
|
||||
|
||||
void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
ggml_get_backend(tensor)->iface.get_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
|
||||
ggml_get_backend(tensor)->iface.synchronize(ggml_get_backend(tensor));
|
||||
}
|
||||
|
||||
void ggml_backend_synchronize(ggml_backend_t backend) {
|
||||
backend->iface.synchronize(backend);
|
||||
}
|
||||
|
||||
ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||
return backend->iface.graph_plan_create(backend, cgraph);
|
||||
}
|
||||
|
||||
void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
||||
backend->iface.graph_plan_free(backend, plan);
|
||||
}
|
||||
|
||||
void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
||||
backend->iface.graph_plan_compute(backend, plan);
|
||||
}
|
||||
|
||||
void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||
backend->iface.graph_compute(backend, cgraph);
|
||||
}
|
||||
|
||||
bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
||||
return backend->iface.supports_op(backend, op);
|
||||
}
|
||||
|
||||
// backend copy
|
||||
|
||||
static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
|
||||
if (a->type != b->type) {
|
||||
return false;
|
||||
}
|
||||
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
||||
if (a->ne[i] != b->ne[i]) {
|
||||
return false;
|
||||
}
|
||||
if (a->nb[i] != b->nb[i]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
|
||||
//printf("src: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", src->name, (int)src->ne[0], (int)src->ne[1], (int)src->ne[2], (int)src->ne[3], (int)src->nb[0], (int)src->nb[1], (int)src->nb[2], (int)src->nb[3]);
|
||||
//printf("dst: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", dst->name, (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], (int)dst->nb[0], (int)dst->nb[1], (int)dst->nb[2], (int)dst->nb[3]);
|
||||
GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
|
||||
|
||||
// printf("cpy tensor %s from %s to %s (%lu bytes)\n", src->name, ggml_backend_name(src->backend), ggml_backend_name(dst->backend), ggml_nbytes(src));
|
||||
|
||||
if (src == dst) {
|
||||
return;
|
||||
}
|
||||
|
||||
// TODO: allow backends to support copy to/from same backend
|
||||
|
||||
if (ggml_get_backend(dst)->iface.cpy_tensor_from != NULL) {
|
||||
ggml_get_backend(dst)->iface.cpy_tensor_from(ggml_get_backend(dst)->context, src, dst);
|
||||
} else if (ggml_get_backend(src)->iface.cpy_tensor_to != NULL) {
|
||||
ggml_get_backend(src)->iface.cpy_tensor_to(ggml_get_backend(src)->context, src, dst);
|
||||
} else {
|
||||
// shouldn't be hit when copying from/to CPU
|
||||
#ifndef NDEBUG
|
||||
fprintf(stderr, "ggml_backend_tensor_copy: neither cpy_tensor_from nor cpy_tensor_to are implemented for backends %s and %s, falling back to get/set\n", ggml_backend_name(src->buffer->backend), ggml_backend_name(dst->buffer->backend));
|
||||
#endif
|
||||
size_t nbytes = ggml_nbytes(src);
|
||||
void * data = malloc(nbytes);
|
||||
ggml_backend_tensor_get(src, data, 0, nbytes);
|
||||
ggml_backend_tensor_set(dst, data, 0, nbytes);
|
||||
free(data);
|
||||
}
|
||||
}
|
||||
|
||||
// backend CPU
|
||||
|
||||
struct ggml_backend_cpu_context {
|
||||
int n_threads;
|
||||
void * work_data;
|
||||
size_t work_size;
|
||||
};
|
||||
|
||||
static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
|
||||
return "CPU";
|
||||
|
||||
UNUSED(backend);
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_free(ggml_backend_t backend) {
|
||||
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
||||
free(cpu_ctx->work_data);
|
||||
free(cpu_ctx);
|
||||
free(backend);
|
||||
}
|
||||
|
||||
static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
return (void *)buffer->context;
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
free(buffer->context);
|
||||
UNUSED(buffer);
|
||||
}
|
||||
|
||||
static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
|
||||
/* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
|
||||
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
|
||||
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
||||
/* .init_tensor = */ NULL, // no initialization required
|
||||
/* .free_tensor = */ NULL, // no cleanup required
|
||||
};
|
||||
|
||||
// for buffers from ptr, free is not called
|
||||
static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
|
||||
/* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
|
||||
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
|
||||
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
||||
/* .init_tensor = */ NULL,
|
||||
/* .free_tensor = */ NULL,
|
||||
};
|
||||
|
||||
static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_cpu_alloc_buffer(ggml_backend_t backend, size_t size) {
|
||||
size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
|
||||
void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
|
||||
|
||||
return ggml_backend_buffer_init(backend, cpu_backend_buffer_i, data, size);
|
||||
}
|
||||
|
||||
static size_t ggml_backend_cpu_get_alignment(ggml_backend_t backend) {
|
||||
return TENSOR_ALIGNMENT;
|
||||
UNUSED(backend);
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
||||
|
||||
memcpy((char *)tensor->data + offset, data, size);
|
||||
|
||||
UNUSED(backend);
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_get_tensor_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
|
||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
||||
|
||||
memcpy(data, (const char *)tensor->data + offset, size);
|
||||
|
||||
UNUSED(backend);
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_synchronize(ggml_backend_t backend) {
|
||||
UNUSED(backend);
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_cpy_tensor_from(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
|
||||
ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
|
||||
|
||||
UNUSED(backend);
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_cpy_tensor_to(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
|
||||
// for a backend such as CUDA that can queue async calls, it is ok to do this asynchronously, but it may not be the case for other backends
|
||||
ggml_backend_tensor_set_async(dst, src->data, 0, ggml_nbytes(src));
|
||||
|
||||
UNUSED(backend);
|
||||
}
|
||||
|
||||
struct ggml_backend_plan_cpu {
|
||||
struct ggml_cplan cplan;
|
||||
struct ggml_cgraph cgraph;
|
||||
};
|
||||
|
||||
static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
||||
|
||||
struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
|
||||
|
||||
cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
|
||||
cpu_plan->cgraph = *cgraph;
|
||||
|
||||
if (cpu_plan->cplan.work_size > 0) {
|
||||
cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
|
||||
}
|
||||
|
||||
return cpu_plan;
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
||||
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
|
||||
|
||||
free(cpu_plan->cplan.work_data);
|
||||
free(cpu_plan);
|
||||
|
||||
UNUSED(backend);
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
||||
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
|
||||
|
||||
ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
|
||||
|
||||
UNUSED(backend);
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
||||
|
||||
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
|
||||
|
||||
if (cpu_ctx->work_size < cplan.work_size) {
|
||||
// TODO: may be faster to free and use malloc to avoid the copy
|
||||
cpu_ctx->work_data = realloc(cpu_ctx->work_data, cplan.work_size);
|
||||
cpu_ctx->work_size = cplan.work_size;
|
||||
}
|
||||
|
||||
cplan.work_data = cpu_ctx->work_data;
|
||||
|
||||
ggml_graph_compute(cgraph, &cplan);
|
||||
}
|
||||
|
||||
static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
||||
return true;
|
||||
UNUSED(backend);
|
||||
UNUSED(op);
|
||||
}
|
||||
|
||||
static struct ggml_backend_i cpu_backend_i = {
|
||||
/* .get_name = */ ggml_backend_cpu_name,
|
||||
/* .free = */ ggml_backend_cpu_free,
|
||||
/* .alloc_buffer = */ ggml_backend_cpu_alloc_buffer,
|
||||
/* .get_alignment = */ ggml_backend_cpu_get_alignment,
|
||||
/* .set_tensor_async = */ ggml_backend_cpu_set_tensor_async,
|
||||
/* .get_tensor_async = */ ggml_backend_cpu_get_tensor_async,
|
||||
/* .synchronize = */ ggml_backend_cpu_synchronize,
|
||||
/* .cpy_tensor_from = */ ggml_backend_cpu_cpy_tensor_from,
|
||||
/* .cpy_tensor_to = */ ggml_backend_cpu_cpy_tensor_to,
|
||||
/* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
|
||||
/* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
|
||||
/* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
|
||||
/* .graph_compute = */ ggml_backend_cpu_graph_compute,
|
||||
/* .supports_op = */ ggml_backend_cpu_supports_op,
|
||||
};
|
||||
|
||||
ggml_backend_t ggml_backend_cpu_init(void) {
|
||||
struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
|
||||
|
||||
ctx->n_threads = GGML_DEFAULT_N_THREADS;
|
||||
ctx->work_data = NULL;
|
||||
ctx->work_size = 0;
|
||||
|
||||
ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
|
||||
|
||||
*cpu_backend = (struct ggml_backend) {
|
||||
/* .interface = */ cpu_backend_i,
|
||||
/* .context = */ ctx
|
||||
};
|
||||
return cpu_backend;
|
||||
}
|
||||
|
||||
bool ggml_backend_is_cpu(ggml_backend_t backend) {
|
||||
return backend->iface.get_name == ggml_backend_cpu_name;
|
||||
}
|
||||
|
||||
void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
|
||||
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
|
||||
|
||||
struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
|
||||
ctx->n_threads = n_threads;
|
||||
}
|
||||
|
||||
ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size) {
|
||||
return ggml_backend_buffer_init(backend_cpu, cpu_backend_buffer_i_from_ptr, ptr, size);
|
||||
}
|
||||
143
ggml-backend.h
Normal file
143
ggml-backend.h
Normal file
@@ -0,0 +1,143 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
struct ggml_backend;
|
||||
struct ggml_backend_buffer;
|
||||
|
||||
// type-erased backend-specific types / wrappers
|
||||
typedef void * ggml_backend_context_t;
|
||||
typedef void * ggml_backend_graph_plan_t;
|
||||
typedef void * ggml_backend_buffer_context_t;
|
||||
|
||||
// avoid accessing internals of these types
|
||||
typedef struct ggml_backend * ggml_backend_t;
|
||||
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
|
||||
|
||||
//
|
||||
// backend buffer
|
||||
//
|
||||
|
||||
struct ggml_backend_buffer_i {
|
||||
void (*free_buffer) (ggml_backend_buffer_t buffer);
|
||||
void * (*get_base) (ggml_backend_buffer_t buffer); // get base pointer
|
||||
size_t (*get_alloc_size)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-allocation callback
|
||||
void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // post-allocation callback
|
||||
void (*free_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-free callback
|
||||
};
|
||||
|
||||
// TODO: hide behind API
|
||||
struct ggml_backend_buffer {
|
||||
struct ggml_backend_buffer_i iface;
|
||||
|
||||
ggml_backend_t backend;
|
||||
ggml_backend_buffer_context_t context;
|
||||
|
||||
size_t size;
|
||||
};
|
||||
|
||||
// backend buffer functions
|
||||
GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
|
||||
struct ggml_backend * backend,
|
||||
struct ggml_backend_buffer_i iface,
|
||||
ggml_backend_buffer_context_t context,
|
||||
size_t size);
|
||||
|
||||
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
|
||||
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
|
||||
GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
|
||||
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
|
||||
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_backend_buffer_free_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||
|
||||
//
|
||||
// backend
|
||||
//
|
||||
|
||||
struct ggml_backend_i {
|
||||
const char * (*get_name)(ggml_backend_t backend);
|
||||
|
||||
void (*free)(ggml_backend_t backend);
|
||||
|
||||
// buffer allocation
|
||||
ggml_backend_buffer_t (*alloc_buffer)(ggml_backend_t backend, size_t size);
|
||||
|
||||
// get buffer alignment
|
||||
size_t (*get_alignment)(ggml_backend_t backend);
|
||||
|
||||
// tensor data access
|
||||
// these functions can be asynchronous, helper functions are provided for synchronous access that automatically call synchronize
|
||||
void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||
void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||
void (*synchronize) (ggml_backend_t backend);
|
||||
|
||||
// (optional) copy tensor between different backends, allow for single-copy tranfers
|
||||
void (*cpy_tensor_from)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||
void (*cpy_tensor_to) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||
|
||||
// compute graph with a plan
|
||||
ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
||||
void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
||||
void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
||||
|
||||
// compute graph without a plan
|
||||
void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
||||
|
||||
// check if the backend supports an operation
|
||||
bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
|
||||
};
|
||||
|
||||
// TODO: hide behind API
|
||||
struct ggml_backend {
|
||||
struct ggml_backend_i iface;
|
||||
|
||||
ggml_backend_context_t context;
|
||||
};
|
||||
|
||||
// backend helper functions
|
||||
GGML_API ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API const char * ggml_backend_name(ggml_backend_t backend);
|
||||
GGML_API void ggml_backend_free(ggml_backend_t backend);
|
||||
|
||||
GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
|
||||
|
||||
GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);
|
||||
|
||||
GGML_API void ggml_backend_tensor_set_async( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||
GGML_API void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||
|
||||
GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||
GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||
|
||||
GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
|
||||
|
||||
GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
||||
|
||||
GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
||||
GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
||||
GGML_API void ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
||||
GGML_API bool ggml_backend_supports_op (ggml_backend_t backend, const struct ggml_tensor * op);
|
||||
|
||||
// tensor copy between different backends
|
||||
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||
|
||||
//
|
||||
// CPU backend
|
||||
//
|
||||
|
||||
GGML_API ggml_backend_t ggml_backend_cpu_init(void);
|
||||
|
||||
GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend);
|
||||
|
||||
GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
|
||||
|
||||
GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
535
ggml-cuda.cu
535
ggml-cuda.cu
@@ -62,6 +62,7 @@
|
||||
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
|
||||
#define cudaMemcpyKind hipMemcpyKind
|
||||
#define cudaMemset hipMemset
|
||||
#define cudaMemsetAsync hipMemsetAsync
|
||||
#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
|
||||
#define cudaSetDevice hipSetDevice
|
||||
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
|
||||
@@ -419,6 +420,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
|
||||
#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
|
||||
#define CUDA_QUANTIZE_BLOCK_SIZE 256
|
||||
#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
|
||||
#define CUDA_GET_ROWS_BLOCK_SIZE 256
|
||||
|
||||
// dmmv = dequantize_mul_mat_vec
|
||||
#ifndef GGML_CUDA_DMMV_X
|
||||
@@ -1574,6 +1576,34 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
|
||||
reinterpret_cast<half&>(y[ib].ds.y) = sum;
|
||||
}
|
||||
|
||||
template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
||||
static __global__ void k_get_rows(const void * x, const int32_t * y, dst_t * dst, const int ncols) {
|
||||
const int col = (blockIdx.x*blockDim.x + threadIdx.x)*2;
|
||||
const int row = blockDim.y*blockIdx.y + threadIdx.y;
|
||||
|
||||
if (col >= ncols) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int r = y[row];
|
||||
|
||||
// copy x[r*ncols + col] to dst[row*ncols + col]
|
||||
const int xi = r*ncols + col;
|
||||
const int di = row*ncols + col;
|
||||
|
||||
const int ib = xi/qk; // block index
|
||||
const int iqs = (xi%qk)/qr; // quant index
|
||||
const int iybs = di - di%qk; // y block start index
|
||||
const int y_offset = qr == 1 ? 1 : qk/2;
|
||||
|
||||
// dequantize
|
||||
dfloat2 v;
|
||||
dequantize_kernel(x, ib, iqs, v);
|
||||
|
||||
dst[iybs + iqs + 0] = v.x;
|
||||
dst[iybs + iqs + y_offset] = v.y;
|
||||
}
|
||||
|
||||
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
||||
static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
|
||||
const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
|
||||
@@ -4555,6 +4585,15 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale
|
||||
dst[i] = scale * x[i];
|
||||
}
|
||||
|
||||
|
||||
template<int qk, int qr, dequantize_kernel_t dq>
|
||||
static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) {
|
||||
const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
|
||||
const int block_num_x = (ncols + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
|
||||
const dim3 block_nums(block_num_x, nrows, 1);
|
||||
k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols);
|
||||
}
|
||||
|
||||
static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
|
||||
const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
|
||||
add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
|
||||
@@ -5703,7 +5742,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
|
||||
} else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
|
||||
GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
|
||||
kind = cudaMemcpyDeviceToDevice;
|
||||
struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
|
||||
int id;
|
||||
CUDA_CHECK(cudaGetDevice(&id));
|
||||
src_ptr = (char *) extra->data_device[id];
|
||||
@@ -5739,6 +5778,107 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_cuda_op_repeat(
|
||||
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
||||
const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
|
||||
// guaranteed to be an integer due to the check in ggml_can_repeat
|
||||
const int64_t ne0 = dst->ne[0];
|
||||
const int64_t ne1 = dst->ne[1];
|
||||
const int64_t ne2 = dst->ne[2];
|
||||
const int64_t ne3 = dst->ne[3];
|
||||
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t ne01 = src0->ne[1];
|
||||
const int64_t ne02 = src0->ne[2];
|
||||
const int64_t ne03 = src0->ne[3];
|
||||
|
||||
const size_t nb0 = dst->nb[0];
|
||||
const size_t nb1 = dst->nb[1];
|
||||
const size_t nb2 = dst->nb[2];
|
||||
const size_t nb3 = dst->nb[3];
|
||||
|
||||
const size_t nb00 = src0->nb[0];
|
||||
const size_t nb01 = src0->nb[1];
|
||||
const size_t nb02 = src0->nb[2];
|
||||
const size_t nb03 = src0->nb[3];
|
||||
|
||||
const int nr0 = (int)(ne0/ne00);
|
||||
const int nr1 = (int)(ne1/ne01);
|
||||
const int nr2 = (int)(ne2/ne02);
|
||||
const int nr3 = (int)(ne3/ne03);
|
||||
|
||||
// TODO: support for transposed / permuted tensors
|
||||
GGML_ASSERT(nb0 == sizeof(float));
|
||||
GGML_ASSERT(nb00 == sizeof(float));
|
||||
|
||||
// TODO: very inefficient, implement in a kernel, or fewer cudaMemcpyAsync calls for contiguous tensors
|
||||
for (int i3 = 0; i3 < nr3; i3++) {
|
||||
for (int k3 = 0; k3 < ne03; k3++) {
|
||||
for (int i2 = 0; i2 < nr2; i2++) {
|
||||
for (int k2 = 0; k2 < ne02; k2++) {
|
||||
for (int i1 = 0; i1 < nr1; i1++) {
|
||||
for (int k1 = 0; k1 < ne01; k1++) {
|
||||
for (int i0 = 0; i0 < nr0; i0++) {
|
||||
CUDA_CHECK(cudaMemcpyAsync(
|
||||
(char *) dst_d + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0,
|
||||
(const char *) src0_d + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01,
|
||||
ne00*nb0, cudaMemcpyDeviceToDevice, stream));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
(void) src1;
|
||||
(void) src1_d;
|
||||
}
|
||||
|
||||
static void ggml_cuda_op_get_rows(
|
||||
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
||||
const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
|
||||
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_I32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
GGML_ASSERT(ggml_is_contiguous(src1));
|
||||
GGML_ASSERT(ggml_is_contiguous(dst));
|
||||
|
||||
const int ncols = src0->ne[0];
|
||||
const int nrows = ggml_nelements(src1);
|
||||
|
||||
const int32_t * src1_i32 = (const int32_t *) src1_d;
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F16:
|
||||
get_rows_cuda<1, 1, convert_f16>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
|
||||
break;
|
||||
case GGML_TYPE_F32:
|
||||
get_rows_cuda<1, 1, convert_f32>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q4_0:
|
||||
get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q4_1:
|
||||
get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q5_0:
|
||||
get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q5_1:
|
||||
get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
|
||||
break;
|
||||
default:
|
||||
// TODO: k-quants
|
||||
GGML_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
inline void ggml_cuda_op_add(
|
||||
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
||||
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
||||
@@ -6343,7 +6483,14 @@ inline void ggml_cuda_op_scale(
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
const float scale = ((float *) src1->data)[0];
|
||||
float scale;
|
||||
// HACK: support for ggml backend interface
|
||||
if (src1->backend == GGML_BACKEND_CPU) {
|
||||
scale = ((float *) src1->data)[0];
|
||||
} else {
|
||||
// TODO: pass pointer to kernel instead of copying to host
|
||||
CUDA_CHECK(cudaMemcpy(&scale, src1->data, sizeof(float), cudaMemcpyDeviceToHost));
|
||||
}
|
||||
|
||||
scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
@@ -6362,9 +6509,9 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
|
||||
GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
|
||||
GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT);
|
||||
|
||||
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
|
||||
struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
|
||||
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
|
||||
const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
|
||||
const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
|
||||
@@ -6505,9 +6652,9 @@ static void ggml_cuda_op_mul_mat(
|
||||
const size_t q8_1_ts = sizeof(block_q8_1);
|
||||
const size_t q8_1_bs = QK8_1;
|
||||
|
||||
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
||||
struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
||||
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
|
||||
const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
|
||||
const bool src0_is_contiguous = ggml_is_contiguous(src0);
|
||||
@@ -6585,7 +6732,7 @@ static void ggml_cuda_op_mul_mat(
|
||||
if (convert_src1_to_q8_1) {
|
||||
src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
|
||||
|
||||
if (split && src1_on_device && src1_is_contiguous) {
|
||||
if (src1_on_device && src1_is_contiguous) {
|
||||
quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
}
|
||||
@@ -6667,7 +6814,7 @@ static void ggml_cuda_op_mul_mat(
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
if (convert_src1_to_q8_1 && src1->backend == GGML_BACKEND_CPU) {
|
||||
if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) {
|
||||
quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
}
|
||||
@@ -6758,6 +6905,14 @@ static void ggml_cuda_op_mul_mat(
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_repeat);
|
||||
}
|
||||
|
||||
static void ggml_cuda_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_get_rows);
|
||||
}
|
||||
|
||||
static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
|
||||
}
|
||||
@@ -6812,13 +6967,13 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tens
|
||||
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
||||
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
||||
|
||||
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
void * src0_ddq = src0_extra->data_device[g_main_device];
|
||||
|
||||
struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
||||
ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
||||
float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
|
||||
|
||||
struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
|
||||
|
||||
ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
|
||||
@@ -6843,13 +6998,13 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
|
||||
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
||||
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
||||
|
||||
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
void * src0_ddq = src0_extra->data_device[g_main_device];
|
||||
|
||||
struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
||||
ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
||||
float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
|
||||
|
||||
struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
|
||||
|
||||
const int64_t row_stride_x = nb01 / sizeof(half);
|
||||
@@ -6870,11 +7025,11 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
||||
}
|
||||
}
|
||||
|
||||
if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
|
||||
if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
|
||||
ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
|
||||
} else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
|
||||
ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
|
||||
}else if (src0->type == GGML_TYPE_F32) {
|
||||
} else if (src0->type == GGML_TYPE_F32) {
|
||||
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
|
||||
} else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
|
||||
if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
|
||||
@@ -6935,8 +7090,8 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
|
||||
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
||||
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
||||
|
||||
const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
||||
const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
||||
|
||||
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
|
||||
char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
|
||||
@@ -6991,8 +7146,8 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
|
||||
|
||||
const size_t nb1 = tensor->nb[1];
|
||||
|
||||
ggml_backend backend = tensor->backend;
|
||||
struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
|
||||
ggml_backend_type backend = tensor->backend;
|
||||
ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
|
||||
memset(extra, 0, sizeof(*extra));
|
||||
|
||||
for (int64_t id = 0; id < g_device_count; ++id) {
|
||||
@@ -7046,7 +7201,6 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
|
||||
CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
|
||||
}
|
||||
|
||||
|
||||
CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));
|
||||
|
||||
extra->data_device[id] = buf;
|
||||
@@ -7085,17 +7239,17 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
|
||||
delete extra;
|
||||
}
|
||||
|
||||
static struct ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
|
||||
static ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
|
||||
static size_t g_temp_tensor_extra_index = 0;
|
||||
|
||||
static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
|
||||
static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
|
||||
if (g_temp_tensor_extras == nullptr) {
|
||||
g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
|
||||
}
|
||||
|
||||
size_t alloc_index = g_temp_tensor_extra_index;
|
||||
g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
|
||||
struct ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
|
||||
ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
|
||||
memset(extra, 0, sizeof(*extra));
|
||||
|
||||
return extra;
|
||||
@@ -7123,7 +7277,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
|
||||
return;
|
||||
}
|
||||
|
||||
struct ggml_tensor_extra_gpu * extra;
|
||||
ggml_tensor_extra_gpu * extra;
|
||||
|
||||
const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
|
||||
tensor->op == GGML_OP_VIEW ||
|
||||
@@ -7132,7 +7286,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
|
||||
|
||||
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
||||
if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
|
||||
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
|
||||
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
|
||||
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
|
||||
size_t offset = 0;
|
||||
if (tensor->op == GGML_OP_VIEW) {
|
||||
@@ -7141,7 +7295,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
|
||||
extra = ggml_cuda_alloc_temp_tensor_extra();
|
||||
extra->data_device[g_main_device] = src0_ddc + offset;
|
||||
} else if (tensor->op == GGML_OP_CPY) {
|
||||
struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
|
||||
ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
|
||||
void * src1_ddv = src1_extra->data_device[g_main_device];
|
||||
extra = ggml_cuda_alloc_temp_tensor_extra();
|
||||
extra->data_device[g_main_device] = src1_ddv;
|
||||
@@ -7183,13 +7337,13 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
|
||||
CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
|
||||
}
|
||||
|
||||
struct ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
|
||||
ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
|
||||
|
||||
const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
|
||||
tensor->op == GGML_OP_VIEW;
|
||||
|
||||
if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
|
||||
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
|
||||
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
|
||||
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
|
||||
size_t view_offset = 0;
|
||||
if (tensor->op == GGML_OP_VIEW) {
|
||||
@@ -7207,7 +7361,7 @@ void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) {
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(ggml_is_contiguous(tensor));
|
||||
|
||||
struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
||||
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
||||
CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
|
||||
}
|
||||
@@ -7264,58 +7418,47 @@ void ggml_cuda_free_scratch() {
|
||||
g_scratch_buffer = nullptr;
|
||||
}
|
||||
|
||||
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
|
||||
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
|
||||
ggml_cuda_func_t func;
|
||||
const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
|
||||
|| (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
|
||||
|| (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
|
||||
|
||||
if (!any_on_device && tensor->op != GGML_OP_MUL_MAT) {
|
||||
return false;
|
||||
}
|
||||
|
||||
switch (tensor->op) {
|
||||
case GGML_OP_REPEAT:
|
||||
func = ggml_cuda_repeat;
|
||||
break;
|
||||
case GGML_OP_GET_ROWS:
|
||||
func = ggml_cuda_get_rows;
|
||||
break;
|
||||
case GGML_OP_DUP:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
}
|
||||
func = ggml_cuda_dup;
|
||||
break;
|
||||
case GGML_OP_ADD:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
}
|
||||
func = ggml_cuda_add;
|
||||
break;
|
||||
case GGML_OP_MUL:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
}
|
||||
func = ggml_cuda_mul;
|
||||
break;
|
||||
case GGML_OP_UNARY:
|
||||
switch (ggml_get_unary_op(tensor)) {
|
||||
case GGML_UNARY_OP_GELU:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
}
|
||||
func = ggml_cuda_gelu;
|
||||
break;
|
||||
case GGML_UNARY_OP_SILU:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
}
|
||||
func = ggml_cuda_silu;
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
} break;
|
||||
case GGML_OP_NORM:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
}
|
||||
func = ggml_cuda_norm;
|
||||
break;
|
||||
case GGML_OP_RMS_NORM:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
}
|
||||
func = ggml_cuda_rms_norm;
|
||||
break;
|
||||
case GGML_OP_MUL_MAT:
|
||||
@@ -7325,54 +7468,30 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
||||
func = ggml_cuda_mul_mat;
|
||||
break;
|
||||
case GGML_OP_SCALE:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
}
|
||||
func = ggml_cuda_scale;
|
||||
break;
|
||||
case GGML_OP_CPY:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
}
|
||||
func = ggml_cuda_cpy;
|
||||
break;
|
||||
case GGML_OP_CONT:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
}
|
||||
func = ggml_cuda_dup;
|
||||
break;
|
||||
case GGML_OP_RESHAPE:
|
||||
case GGML_OP_VIEW:
|
||||
case GGML_OP_PERMUTE:
|
||||
case GGML_OP_TRANSPOSE:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
}
|
||||
func = ggml_cuda_nop;
|
||||
break;
|
||||
case GGML_OP_DIAG_MASK_INF:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
}
|
||||
func = ggml_cuda_diag_mask_inf;
|
||||
break;
|
||||
case GGML_OP_SOFT_MAX:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
}
|
||||
func = ggml_cuda_soft_max;
|
||||
break;
|
||||
case GGML_OP_ROPE:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
}
|
||||
func = ggml_cuda_rope;
|
||||
break;
|
||||
case GGML_OP_ALIBI:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
}
|
||||
func = ggml_cuda_alibi;
|
||||
break;
|
||||
default:
|
||||
@@ -7400,3 +7519,263 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des
|
||||
CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
|
||||
snprintf(description, description_size, "%s", prop.name);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// backend interface
|
||||
|
||||
#define UNUSED GGML_UNUSED
|
||||
|
||||
struct ggml_backend_context_cuda {
|
||||
};
|
||||
|
||||
static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
|
||||
return GGML_CUDA_NAME;
|
||||
|
||||
UNUSED(backend);
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_free(ggml_backend_t backend) {
|
||||
ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
|
||||
delete cuda_ctx;
|
||||
delete backend;
|
||||
}
|
||||
|
||||
struct ggml_backend_buffer_context_cuda {
|
||||
void * device;
|
||||
|
||||
ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
|
||||
size_t temp_tensor_extra_index = 0;
|
||||
|
||||
~ggml_backend_buffer_context_cuda() {
|
||||
delete[] temp_tensor_extras;
|
||||
}
|
||||
|
||||
ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
|
||||
if (temp_tensor_extras == nullptr) {
|
||||
temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
|
||||
}
|
||||
|
||||
size_t alloc_index = temp_tensor_extra_index;
|
||||
temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_MAX_NODES;
|
||||
ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
|
||||
memset(extra, 0, sizeof(*extra));
|
||||
|
||||
return extra;
|
||||
}
|
||||
};
|
||||
|
||||
static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
|
||||
CUDA_CHECK(cudaFree(ctx->device));
|
||||
delete ctx;
|
||||
}
|
||||
|
||||
static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
|
||||
return ctx->device;
|
||||
}
|
||||
|
||||
static size_t ggml_backend_cuda_buffer_get_alloc_size(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
||||
int64_t row_low = 0;
|
||||
int64_t row_high = ggml_nrows(tensor);
|
||||
int64_t nrows_split = row_high - row_low;
|
||||
|
||||
size_t size = ggml_nbytes_split(tensor, nrows_split);
|
||||
|
||||
int64_t ne0 = tensor->ne[0];
|
||||
|
||||
if (ggml_is_quantized(tensor->type)) {
|
||||
if (ne0 % MATRIX_ROW_PADDING != 0) {
|
||||
size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
|
||||
* ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
|
||||
}
|
||||
}
|
||||
|
||||
return size;
|
||||
|
||||
UNUSED(buffer);
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
||||
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
|
||||
|
||||
if (tensor->view_src != NULL && tensor->view_offs == 0) {
|
||||
assert(tensor->view_src->buffer->backend == buffer->backend);
|
||||
tensor->backend = tensor->view_src->backend;
|
||||
tensor->extra = tensor->view_src->extra;
|
||||
return;
|
||||
}
|
||||
|
||||
ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra();
|
||||
|
||||
extra->data_device[g_main_device] = tensor->data;
|
||||
|
||||
tensor->backend = GGML_BACKEND_GPU;
|
||||
tensor->extra = extra;
|
||||
|
||||
if (ggml_is_quantized(tensor->type)) {
|
||||
// initialize padding to 0 to avoid possible NaN values
|
||||
int64_t row_low = 0;
|
||||
int64_t row_high = ggml_nrows(tensor);
|
||||
int64_t nrows_split = row_high - row_low;
|
||||
|
||||
size_t original_size = ggml_nbytes_split(tensor, nrows_split);
|
||||
size_t padded_size = ggml_backend_cuda_buffer_get_alloc_size(tensor->buffer, tensor);
|
||||
|
||||
if (padded_size > original_size && tensor->view_src == nullptr) {
|
||||
CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[g_main_device][0]));
|
||||
}
|
||||
}
|
||||
|
||||
UNUSED(buffer);
|
||||
}
|
||||
|
||||
static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
|
||||
/* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
|
||||
/* .get_base = */ ggml_backend_cuda_buffer_get_base,
|
||||
/* .get_alloc_size = */ ggml_backend_cuda_buffer_get_alloc_size,
|
||||
/* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
|
||||
/* .free_tensor = */ NULL,
|
||||
};
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backend, size_t size) {
|
||||
ggml_cuda_set_device(g_main_device);
|
||||
|
||||
ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda;
|
||||
CUDA_CHECK(cudaMalloc(&ctx->device, size));
|
||||
return ggml_backend_buffer_init(backend, cuda_backend_buffer_interface, ctx, size);
|
||||
}
|
||||
|
||||
static size_t ggml_backend_cuda_get_alignment(ggml_backend_t backend) {
|
||||
return 128;
|
||||
UNUSED(backend);
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||
|
||||
CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[g_main_device][0]));
|
||||
|
||||
UNUSED(backend);
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
|
||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||
|
||||
CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
|
||||
|
||||
UNUSED(backend);
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
|
||||
CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
|
||||
|
||||
UNUSED(backend);
|
||||
}
|
||||
|
||||
static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
||||
GGML_ASSERT(!"not implemented");
|
||||
|
||||
return nullptr;
|
||||
|
||||
UNUSED(backend);
|
||||
UNUSED(cgraph);
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
||||
GGML_ASSERT(!"not implemented");
|
||||
|
||||
UNUSED(backend);
|
||||
UNUSED(plan);
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
||||
GGML_ASSERT(!"not implemented");
|
||||
|
||||
UNUSED(backend);
|
||||
UNUSED(plan);
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
||||
ggml_cuda_set_device(g_main_device);
|
||||
|
||||
ggml_compute_params params = {};
|
||||
params.type = GGML_TASK_COMPUTE;
|
||||
params.ith = 0;
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
ggml_tensor * node = cgraph->nodes[i];
|
||||
|
||||
assert(node->backend == GGML_BACKEND_GPU);
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
if (node->src[j] != nullptr) {
|
||||
assert(node->src[j]->backend == GGML_BACKEND_GPU);
|
||||
}
|
||||
}
|
||||
|
||||
bool ok = ggml_cuda_compute_forward(¶ms, node);
|
||||
if (!ok) {
|
||||
fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
|
||||
}
|
||||
GGML_ASSERT(ok);
|
||||
|
||||
#if 0
|
||||
if (node->type == GGML_TYPE_F32) {
|
||||
cudaDeviceSynchronize();
|
||||
std::vector<float> tmp(ggml_nelements(node), 0.0f);
|
||||
cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost);
|
||||
printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op),
|
||||
ggml_type_name(node->src[0]->type),
|
||||
node->src[1] ? ggml_type_name(node->src[1]->type) : "none",
|
||||
node->src[0]->name,
|
||||
node->src[1] ? node->src[1]->name : "none");
|
||||
double sum = 0.0;
|
||||
double sq_sum = 0.0;
|
||||
for (int i = 0; i < ggml_nelements(node); i++) {
|
||||
printf("%f ", tmp[i]);
|
||||
sum += tmp[i];
|
||||
sq_sum += tmp[i]*tmp[i];
|
||||
}
|
||||
printf("\n");
|
||||
printf("sum: %f, ", sum);
|
||||
printf("sq_sum: %f\n", sq_sum);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
UNUSED(backend);
|
||||
}
|
||||
|
||||
static ggml_backend_i cuda_backend_i = {
|
||||
/* .get_name = */ ggml_backend_cuda_name,
|
||||
/* .free = */ ggml_backend_cuda_free,
|
||||
/* .alloc_buffer = */ ggml_backend_cuda_alloc_buffer,
|
||||
/* .get_alignment = */ ggml_backend_cuda_get_alignment,
|
||||
/* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
|
||||
/* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
|
||||
/* .synchronize = */ ggml_backend_cuda_synchronize,
|
||||
/* .cpy_tensor_from = */ nullptr,
|
||||
/* .cpy_tensor_to = */ nullptr,
|
||||
/* .graph_plan_create = */ ggml_backend_cuda_graph_plan_create,
|
||||
/* .graph_plan_free = */ ggml_backend_cuda_graph_plan_free,
|
||||
/* .graph_plan_compute = */ ggml_backend_cuda_graph_plan_compute,
|
||||
/* .graph_compute = */ ggml_backend_cuda_graph_compute,
|
||||
/* .supports_op = */ nullptr,
|
||||
};
|
||||
|
||||
ggml_backend_t ggml_backend_cuda_init() {
|
||||
ggml_init_cublas(); // TODO: remove from ggml.c
|
||||
|
||||
ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda;
|
||||
|
||||
ggml_backend_t cuda_backend = new ggml_backend {
|
||||
/* .interface = */ cuda_backend_i,
|
||||
/* .context = */ ctx
|
||||
};
|
||||
|
||||
return cuda_backend;
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-backend.h"
|
||||
|
||||
#ifdef GGML_USE_HIPBLAS
|
||||
#define GGML_CUDA_NAME "ROCm"
|
||||
@@ -42,6 +43,9 @@ GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, s
|
||||
GGML_API int ggml_cuda_get_device_count(void);
|
||||
GGML_API void ggml_cuda_get_device_description(int device, char * description, size_t description_size);
|
||||
|
||||
// backend API
|
||||
GGML_API ggml_backend_t ggml_backend_cuda_init(void); // TODO: take a list of devices to use
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
19
ggml-metal.h
19
ggml-metal.h
@@ -20,6 +20,7 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-backend.h"
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdbool.h>
|
||||
@@ -35,10 +36,15 @@ struct ggml_cgraph;
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
|
||||
//
|
||||
// internal API
|
||||
// temporary exposed to user-code
|
||||
//
|
||||
|
||||
struct ggml_metal_context;
|
||||
|
||||
void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
|
||||
|
||||
// number of command buffers to use
|
||||
struct ggml_metal_context * ggml_metal_init(int n_cb);
|
||||
void ggml_metal_free(struct ggml_metal_context * ctx);
|
||||
@@ -83,6 +89,17 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
|
||||
// creates gf->n_threads command buffers in parallel
|
||||
void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
|
||||
|
||||
//
|
||||
// backend API
|
||||
// user-code should use only these functions
|
||||
//
|
||||
|
||||
GGML_API ggml_backend_t ggml_backend_metal_init(void);
|
||||
|
||||
GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
|
||||
|
||||
GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
358
ggml-metal.m
358
ggml-metal.m
@@ -81,18 +81,18 @@ struct ggml_metal_context {
|
||||
GGML_METAL_DECL_KERNEL(get_rows_q6_K);
|
||||
GGML_METAL_DECL_KERNEL(rms_norm);
|
||||
GGML_METAL_DECL_KERNEL(norm);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_f32_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_l4);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q2_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q3_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_f32_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_f16_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_1row);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_l4);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_q4_0_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_q4_1_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_q8_0_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_q2_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_q3_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_q4_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_q5_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_q6_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_f32_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_f16_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32);
|
||||
@@ -262,28 +262,30 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(get_rows_q6_K);
|
||||
GGML_METAL_ADD_KERNEL(rms_norm);
|
||||
GGML_METAL_ADD_KERNEL(norm);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_f32_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_l4);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q2_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q3_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_f32_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_f32_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_f16_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_1row);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_l4);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_q4_0_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_q4_1_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_q8_0_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_q2_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_q3_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_q4_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_q5_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_q6_K_f32);
|
||||
if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) {
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_f32_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32);
|
||||
}
|
||||
GGML_METAL_ADD_KERNEL(rope_f32);
|
||||
GGML_METAL_ADD_KERNEL(rope_f16);
|
||||
GGML_METAL_ADD_KERNEL(alibi_f32);
|
||||
@@ -296,8 +298,21 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
#undef GGML_METAL_ADD_KERNEL
|
||||
}
|
||||
|
||||
GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
|
||||
#if TARGET_OS_OSX
|
||||
// print MTL GPU family:
|
||||
GGML_METAL_LOG_INFO("%s: GPU name: %s\n", __func__, [[ctx->device name] UTF8String]);
|
||||
|
||||
// determine max supported GPU family
|
||||
// https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf
|
||||
// https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
|
||||
for (int i = MTLGPUFamilyApple1 + 20; i >= MTLGPUFamilyApple1; --i) {
|
||||
if ([ctx->device supportsFamily:i]) {
|
||||
GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d (%d)\n", __func__, i - MTLGPUFamilyApple1 + 1, i);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
|
||||
GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
||||
if (ctx->device.maxTransferRate != 0) {
|
||||
GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
|
||||
@@ -339,28 +354,30 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
||||
GGML_METAL_DEL_KERNEL(get_rows_q6_K);
|
||||
GGML_METAL_DEL_KERNEL(rms_norm);
|
||||
GGML_METAL_DEL_KERNEL(norm);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_f32_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_l4);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q2_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q3_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q4_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q5_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q6_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_f32_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q4_1_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q2_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q3_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_f32_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_f16_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_1row);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_l4);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_q4_0_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_q4_1_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_q8_0_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_q2_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_q3_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_q4_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_q5_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_q6_K_f32);
|
||||
if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) {
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_f32_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q4_1_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q2_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q3_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32);
|
||||
}
|
||||
GGML_METAL_DEL_KERNEL(rope_f32);
|
||||
GGML_METAL_DEL_KERNEL(rope_f16);
|
||||
GGML_METAL_DEL_KERNEL(alibi_f32);
|
||||
@@ -762,8 +779,8 @@ void ggml_metal_graph_compute(
|
||||
} break;
|
||||
case GGML_OP_CONCAT:
|
||||
{
|
||||
const int64_t nb = ne00;
|
||||
|
||||
int64_t nb = ne00;
|
||||
[encoder setComputePipelineState:ctx->pipeline_concat];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||
@@ -795,6 +812,7 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBytes:&nb length:sizeof(nb) atIndex:27];
|
||||
|
||||
const int nth = MIN(1024, ne0);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||
} break;
|
||||
case GGML_OP_ADD:
|
||||
@@ -892,9 +910,10 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&scale length:sizeof(scale) atIndex:2];
|
||||
|
||||
const int64_t n = ggml_nelements(dst)/4;
|
||||
const int64_t n = ggml_nelements(dst);
|
||||
GGML_ASSERT(n % 4 == 0);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
case GGML_OP_UNARY:
|
||||
switch (ggml_get_unary_op(gf->nodes[i])) {
|
||||
@@ -904,9 +923,10 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
|
||||
const int64_t n = ggml_nelements(dst)/4;
|
||||
const int64_t n = ggml_nelements(dst);
|
||||
GGML_ASSERT(n % 4 == 0);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
case GGML_UNARY_OP_RELU:
|
||||
{
|
||||
@@ -924,9 +944,10 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
|
||||
const int64_t n = ggml_nelements(dst)/4;
|
||||
const int64_t n = ggml_nelements(dst);
|
||||
GGML_ASSERT(n % 4 == 0);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
@@ -986,21 +1007,46 @@ void ggml_metal_graph_compute(
|
||||
} break;
|
||||
case GGML_OP_MUL_MAT:
|
||||
{
|
||||
// TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224
|
||||
|
||||
GGML_ASSERT(ne00 == ne10);
|
||||
// GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere
|
||||
uint gqa = ne12/ne02;
|
||||
GGML_ASSERT(ne03 == ne13);
|
||||
|
||||
const uint gqa = ne12/ne02;
|
||||
|
||||
// find the break-even point where the matrix-matrix kernel becomes more efficient compared
|
||||
// to the matrix-vector kernel
|
||||
int ne11_mm_min = 1;
|
||||
|
||||
#if 0
|
||||
// the numbers below are measured on M2 Ultra for 7B and 13B models
|
||||
// these numbers do not translate to other devices or model sizes
|
||||
// TODO: need to find a better approach
|
||||
if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) {
|
||||
switch (src0t) {
|
||||
case GGML_TYPE_F16: ne11_mm_min = 2; break;
|
||||
case GGML_TYPE_Q8_0: ne11_mm_min = 7; break;
|
||||
case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
|
||||
case GGML_TYPE_Q3_K: ne11_mm_min = 7; break;
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
|
||||
case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
|
||||
case GGML_TYPE_Q5_0: // not tested yet
|
||||
case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
|
||||
case GGML_TYPE_Q5_K: ne11_mm_min = 7; break;
|
||||
case GGML_TYPE_Q6_K: ne11_mm_min = 7; break;
|
||||
default: ne11_mm_min = 1; break;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
|
||||
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
|
||||
if (!ggml_is_transposed(src0) &&
|
||||
if ([ctx->device supportsFamily:MTLGPUFamilyApple7] &&
|
||||
!ggml_is_transposed(src0) &&
|
||||
!ggml_is_transposed(src1) &&
|
||||
src1t == GGML_TYPE_F32 &&
|
||||
[ctx->device supportsFamily:MTLGPUFamilyApple7] &&
|
||||
ne00%32 == 0 &&
|
||||
ne11 > 2) {
|
||||
ne00 % 32 == 0 && ne00 >= 64 &&
|
||||
ne11 > ne11_mm_min) {
|
||||
//printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f32_f32]; break;
|
||||
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32]; break;
|
||||
@@ -1029,17 +1075,18 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:12];
|
||||
[encoder setBytes:&gqa length:sizeof(gqa) atIndex:13];
|
||||
[encoder setThreadgroupMemoryLength:8192 atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake( (ne11+31)/32, (ne01+63) / 64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne01 + 63)/64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
|
||||
} else {
|
||||
int nth0 = 32;
|
||||
int nth1 = 1;
|
||||
int nrows = 1;
|
||||
//printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
|
||||
|
||||
// use custom matrix x vector kernel
|
||||
switch (src0t) {
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f32_f32];
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_f32_f32];
|
||||
nrows = 4;
|
||||
} break;
|
||||
case GGML_TYPE_F16:
|
||||
@@ -1047,12 +1094,12 @@ void ggml_metal_graph_compute(
|
||||
nth0 = 32;
|
||||
nth1 = 1;
|
||||
if (ne11 * ne12 < 4) {
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row];
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_1row];
|
||||
} else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_l4];
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_l4];
|
||||
nrows = ne11;
|
||||
} else {
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32];
|
||||
nrows = 4;
|
||||
}
|
||||
} break;
|
||||
@@ -1063,7 +1110,7 @@ void ggml_metal_graph_compute(
|
||||
|
||||
nth0 = 8;
|
||||
nth1 = 8;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_0_f32];
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_0_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q4_1:
|
||||
{
|
||||
@@ -1072,7 +1119,7 @@ void ggml_metal_graph_compute(
|
||||
|
||||
nth0 = 8;
|
||||
nth1 = 8;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_1_f32];
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_1_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
{
|
||||
@@ -1081,7 +1128,7 @@ void ggml_metal_graph_compute(
|
||||
|
||||
nth0 = 8;
|
||||
nth1 = 8;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q8_0_f32];
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_q8_0_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q2_K:
|
||||
{
|
||||
@@ -1090,7 +1137,7 @@ void ggml_metal_graph_compute(
|
||||
|
||||
nth0 = 2;
|
||||
nth1 = 32;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_K_f32];
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_q2_K_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q3_K:
|
||||
{
|
||||
@@ -1099,7 +1146,7 @@ void ggml_metal_graph_compute(
|
||||
|
||||
nth0 = 2;
|
||||
nth1 = 32;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_K_f32];
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_q3_K_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q4_K:
|
||||
{
|
||||
@@ -1108,7 +1155,7 @@ void ggml_metal_graph_compute(
|
||||
|
||||
nth0 = 4; //1;
|
||||
nth1 = 8; //32;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_K_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q5_K:
|
||||
{
|
||||
@@ -1117,7 +1164,7 @@ void ggml_metal_graph_compute(
|
||||
|
||||
nth0 = 2;
|
||||
nth1 = 32;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_K_f32];
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_q5_K_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q6_K:
|
||||
{
|
||||
@@ -1126,7 +1173,7 @@ void ggml_metal_graph_compute(
|
||||
|
||||
nth0 = 2;
|
||||
nth1 = 32;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32];
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_q6_K_f32];
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
@@ -1155,7 +1202,7 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBytes:&gqa length:sizeof(gqa) atIndex:17];
|
||||
|
||||
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q8_0 ||
|
||||
src0t == GGML_TYPE_Q2_K) {// || src0t == GGML_TYPE_Q4_K) {
|
||||
src0t == GGML_TYPE_Q2_K) { // || src0t == GGML_TYPE_Q4_K) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_Q4_K) {
|
||||
@@ -1208,6 +1255,8 @@ void ggml_metal_graph_compute(
|
||||
} break;
|
||||
case GGML_OP_RMS_NORM:
|
||||
{
|
||||
GGML_ASSERT(ne00 % 4 == 0);
|
||||
|
||||
float eps;
|
||||
memcpy(&eps, dst->op_params, sizeof(float));
|
||||
|
||||
@@ -1413,3 +1462,140 @@ void ggml_metal_graph_compute(
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// backend interface
|
||||
|
||||
static const char * ggml_backend_metal_name(ggml_backend_t backend) {
|
||||
return "Metal";
|
||||
|
||||
UNUSED(backend);
|
||||
}
|
||||
|
||||
static void ggml_backend_metal_free(ggml_backend_t backend) {
|
||||
struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
|
||||
ggml_metal_free(ctx);
|
||||
free(backend);
|
||||
}
|
||||
|
||||
static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
return (void *)buffer->context;
|
||||
}
|
||||
|
||||
static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
free(buffer->context);
|
||||
UNUSED(buffer);
|
||||
}
|
||||
|
||||
static struct ggml_backend_buffer_i metal_backend_buffer_i = {
|
||||
/* .free_buffer = */ ggml_backend_metal_buffer_free_buffer,
|
||||
/* .get_base = */ ggml_backend_metal_buffer_get_base,
|
||||
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
||||
/* .init_tensor = */ NULL, // no initialization required
|
||||
/* .free_tensor = */ NULL, // no cleanup required
|
||||
};
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_metal_alloc_buffer(ggml_backend_t backend, size_t size) {
|
||||
struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
|
||||
|
||||
void * data = ggml_metal_host_malloc(size);
|
||||
|
||||
// TODO: set proper name of the buffers
|
||||
ggml_metal_add_buffer(ctx, "backend", data, size, 0);
|
||||
|
||||
return ggml_backend_buffer_init(backend, metal_backend_buffer_i, data, size);
|
||||
}
|
||||
|
||||
static size_t ggml_backend_metal_get_alignment(ggml_backend_t backend) {
|
||||
return 32;
|
||||
UNUSED(backend);
|
||||
}
|
||||
|
||||
static void ggml_backend_metal_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
||||
|
||||
memcpy((char *)tensor->data + offset, data, size);
|
||||
|
||||
UNUSED(backend);
|
||||
}
|
||||
|
||||
static void ggml_backend_metal_get_tensor_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
|
||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
||||
|
||||
memcpy(data, (const char *)tensor->data + offset, size);
|
||||
|
||||
UNUSED(backend);
|
||||
}
|
||||
|
||||
static void ggml_backend_metal_synchronize(ggml_backend_t backend) {
|
||||
UNUSED(backend);
|
||||
}
|
||||
|
||||
static void ggml_backend_metal_cpy_tensor_from(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
|
||||
ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
|
||||
|
||||
UNUSED(backend);
|
||||
}
|
||||
|
||||
static void ggml_backend_metal_cpy_tensor_to(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
|
||||
ggml_backend_tensor_set_async(dst, src->data, 0, ggml_nbytes(src));
|
||||
|
||||
UNUSED(backend);
|
||||
}
|
||||
|
||||
static void ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||
struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
|
||||
|
||||
ggml_metal_graph_compute(metal_ctx, cgraph);
|
||||
}
|
||||
|
||||
static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
||||
return true;
|
||||
UNUSED(backend);
|
||||
UNUSED(op);
|
||||
}
|
||||
|
||||
static struct ggml_backend_i metal_backend_i = {
|
||||
/* .get_name = */ ggml_backend_metal_name,
|
||||
/* .free = */ ggml_backend_metal_free,
|
||||
/* .alloc_buffer = */ ggml_backend_metal_alloc_buffer,
|
||||
/* .get_alignment = */ ggml_backend_metal_get_alignment,
|
||||
/* .set_tensor_async = */ ggml_backend_metal_set_tensor_async,
|
||||
/* .get_tensor_async = */ ggml_backend_metal_get_tensor_async,
|
||||
/* .synchronize = */ ggml_backend_metal_synchronize,
|
||||
/* .cpy_tensor_from = */ ggml_backend_metal_cpy_tensor_from,
|
||||
/* .cpy_tensor_to = */ ggml_backend_metal_cpy_tensor_to,
|
||||
/* .graph_plan_create = */ NULL, // the metal implementation does not require creating graph plans atm
|
||||
/* .graph_plan_free = */ NULL,
|
||||
/* .graph_plan_compute = */ NULL,
|
||||
/* .graph_compute = */ ggml_backend_metal_graph_compute,
|
||||
/* .supports_op = */ ggml_backend_metal_supports_op,
|
||||
};
|
||||
|
||||
ggml_backend_t ggml_backend_metal_init(void) {
|
||||
struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
|
||||
|
||||
ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
|
||||
|
||||
ggml_backend_t metal_backend = malloc(sizeof(struct ggml_backend));
|
||||
|
||||
*metal_backend = (struct ggml_backend) {
|
||||
/* .interface = */ metal_backend_i,
|
||||
/* .context = */ ctx,
|
||||
};
|
||||
|
||||
return metal_backend;
|
||||
}
|
||||
|
||||
bool ggml_backend_is_metal(ggml_backend_t backend) {
|
||||
return backend->iface.get_name == ggml_backend_metal_name;
|
||||
}
|
||||
|
||||
void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
|
||||
struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
|
||||
|
||||
ggml_metal_set_n_cb(ctx, n_cb);
|
||||
}
|
||||
|
||||
110
ggml-metal.metal
110
ggml-metal.metal
@@ -13,8 +13,8 @@ typedef struct {
|
||||
|
||||
#define QK4_1 32
|
||||
typedef struct {
|
||||
half d; // delta
|
||||
half m; // min
|
||||
half d; // delta
|
||||
half m; // min
|
||||
uint8_t qs[QK4_1 / 2]; // nibbles / quants
|
||||
} block_q4_1;
|
||||
|
||||
@@ -345,10 +345,11 @@ kernel void kernel_rms_norm(
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint ntg[[threads_per_threadgroup]]) {
|
||||
device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
|
||||
device const float * x_scalar = (device const float *) x;
|
||||
float4 sumf=0;
|
||||
float all_sum=0;
|
||||
device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
|
||||
device const float * x_scalar = (device const float *) x;
|
||||
|
||||
float4 sumf = 0;
|
||||
float all_sum = 0;
|
||||
|
||||
// parallel sum
|
||||
for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
|
||||
@@ -361,6 +362,7 @@ kernel void kernel_rms_norm(
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
// broadcast, simd group number is ntg / 32
|
||||
for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
|
||||
if (tpitg < i) {
|
||||
@@ -368,7 +370,9 @@ kernel void kernel_rms_norm(
|
||||
}
|
||||
}
|
||||
if (tpitg == 0) {
|
||||
for (int i = 4 * (ne00 / 4); i < ne00; i++) {sum[0] += x_scalar[i];}
|
||||
for (int i = 4 * (ne00 / 4); i < ne00; i++) {
|
||||
sum[0] += x_scalar[i];
|
||||
}
|
||||
sum[0] /= ne00;
|
||||
}
|
||||
|
||||
@@ -383,7 +387,9 @@ kernel void kernel_rms_norm(
|
||||
y[i00] = x[i00] * scale;
|
||||
}
|
||||
if (tpitg == 0) {
|
||||
for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {y_scalar[i00] = x_scalar[i00] * scale;}
|
||||
for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {
|
||||
y_scalar[i00] = x_scalar[i00] * scale;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -423,8 +429,8 @@ inline float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thre
|
||||
}
|
||||
|
||||
// putting them in the kernel cause a significant performance penalty
|
||||
#define N_DST 4 // each SIMD group works on 4 rows
|
||||
#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
|
||||
#define N_DST 4 // each SIMD group works on 4 rows
|
||||
#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
|
||||
#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
|
||||
//Note: This is a template, but strictly speaking it only applies to
|
||||
// quantizations where the block size is 32. It also does not
|
||||
@@ -435,18 +441,23 @@ void mul_vec_q_n_f32(device const void * src0, device const float * src1, device
|
||||
int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne10, int64_t ne12, int64_t ne0, int64_t ne1, uint gqa,
|
||||
uint3 tgpig, uint tiisg, uint sgitg) {
|
||||
const int nb = ne00/QK4_0;
|
||||
|
||||
const int r0 = tgpig.x;
|
||||
const int r1 = tgpig.y;
|
||||
const int im = tgpig.z;
|
||||
|
||||
const int first_row = (r0 * nsg + sgitg) * nr;
|
||||
|
||||
const uint offset0 = first_row * nb + im/gqa*(nb*ne0);
|
||||
|
||||
device const block_q_type * x = (device const block_q_type *) src0 + offset0;
|
||||
device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
|
||||
float yl[16]; // src1 vector cache
|
||||
float sumf[nr]={0.f};
|
||||
|
||||
const int ix = tiisg/2;
|
||||
const int il = 8*(tiisg%2);
|
||||
float yl[16]; // src1 vector cache
|
||||
float sumf[nr] = {0.f};
|
||||
|
||||
const int ix = (tiisg/2);
|
||||
const int il = (tiisg%2)*8;
|
||||
|
||||
device const float * yb = y + ix * QK4_0 + il;
|
||||
|
||||
@@ -457,6 +468,7 @@ void mul_vec_q_n_f32(device const void * src0, device const float * src1, device
|
||||
sumy += yb[i] + yb[i+1];
|
||||
yl[i+0] = yb[i+ 0];
|
||||
yl[i+1] = yb[i+ 1]/256.f;
|
||||
|
||||
sumy += yb[i+16] + yb[i+17];
|
||||
yl[i+8] = yb[i+16]/16.f;
|
||||
yl[i+9] = yb[i+17]/4096.f;
|
||||
@@ -472,12 +484,12 @@ void mul_vec_q_n_f32(device const void * src0, device const float * src1, device
|
||||
for (int row = 0; row < nr; ++row) {
|
||||
const float tot = simd_sum(sumf[row]);
|
||||
if (tiisg == 0 && first_row + row < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot;
|
||||
dst[im*ne0*ne1 + r1*ne0 + first_row + row] = tot;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_mul_mat_q4_0_f32(
|
||||
kernel void kernel_mul_mv_q4_0_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
@@ -490,12 +502,12 @@ kernel void kernel_mul_mat_q4_0_f32(
|
||||
constant int64_t & ne1[[buffer(16)]],
|
||||
constant uint & gqa[[buffer(17)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
mul_vec_q_n_f32<block_q4_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg);
|
||||
}
|
||||
|
||||
kernel void kernel_mul_mat_q4_1_f32(
|
||||
kernel void kernel_mul_mv_q4_1_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
@@ -515,7 +527,7 @@ kernel void kernel_mul_mat_q4_1_f32(
|
||||
|
||||
#define NB_Q8_0 8
|
||||
|
||||
kernel void kernel_mul_mat_q8_0_f32(
|
||||
kernel void kernel_mul_mv_q8_0_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
@@ -579,7 +591,7 @@ kernel void kernel_mul_mat_q8_0_f32(
|
||||
|
||||
#define N_F32_F32 4
|
||||
|
||||
kernel void kernel_mul_mat_f32_f32(
|
||||
kernel void kernel_mul_mv_f32_f32(
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device float * dst,
|
||||
@@ -650,7 +662,7 @@ kernel void kernel_mul_mat_f32_f32(
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_mul_mat_f16_f32_1row(
|
||||
kernel void kernel_mul_mv_f16_f32_1row(
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device float * dst,
|
||||
@@ -669,7 +681,7 @@ kernel void kernel_mul_mat_f16_f32_1row(
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]]) {
|
||||
uint tiisg[[thread_index_in_simdgroup]]) {
|
||||
|
||||
const int64_t r0 = tgpig.x;
|
||||
const int64_t r1 = tgpig.y;
|
||||
@@ -704,7 +716,7 @@ kernel void kernel_mul_mat_f16_f32_1row(
|
||||
|
||||
#define N_F16_F32 4
|
||||
|
||||
kernel void kernel_mul_mat_f16_f32(
|
||||
kernel void kernel_mul_mv_f16_f32(
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device float * dst,
|
||||
@@ -776,7 +788,7 @@ kernel void kernel_mul_mat_f16_f32(
|
||||
}
|
||||
|
||||
// Assumes row size (ne00) is a multiple of 4
|
||||
kernel void kernel_mul_mat_f16_f32_l4(
|
||||
kernel void kernel_mul_mv_f16_f32_l4(
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device float * dst,
|
||||
@@ -1253,7 +1265,7 @@ static inline uchar4 get_scale_min_k4(int j, device const uint8_t * q) {
|
||||
|
||||
//====================================== dot products =========================
|
||||
|
||||
kernel void kernel_mul_mat_q2_K_f32(
|
||||
kernel void kernel_mul_mv_q2_K_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
@@ -1397,7 +1409,7 @@ kernel void kernel_mul_mat_q2_K_f32(
|
||||
}
|
||||
|
||||
#if QK_K == 256
|
||||
kernel void kernel_mul_mat_q3_K_f32(
|
||||
kernel void kernel_mul_mv_q3_K_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
@@ -1549,7 +1561,7 @@ kernel void kernel_mul_mat_q3_K_f32(
|
||||
}
|
||||
}
|
||||
#else
|
||||
kernel void kernel_mul_mat_q3_K_f32(
|
||||
kernel void kernel_mul_mv_q3_K_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
@@ -1620,7 +1632,7 @@ kernel void kernel_mul_mat_q3_K_f32(
|
||||
#endif
|
||||
|
||||
#if QK_K == 256
|
||||
kernel void kernel_mul_mat_q4_K_f32(
|
||||
kernel void kernel_mul_mv_q4_K_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
@@ -1726,7 +1738,7 @@ kernel void kernel_mul_mat_q4_K_f32(
|
||||
}
|
||||
}
|
||||
#else
|
||||
kernel void kernel_mul_mat_q4_K_f32(
|
||||
kernel void kernel_mul_mv_q4_K_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
@@ -1815,7 +1827,7 @@ kernel void kernel_mul_mat_q4_K_f32(
|
||||
}
|
||||
#endif
|
||||
|
||||
kernel void kernel_mul_mat_q5_K_f32(
|
||||
kernel void kernel_mul_mv_q5_K_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
@@ -1988,7 +2000,7 @@ kernel void kernel_mul_mat_q5_K_f32(
|
||||
|
||||
}
|
||||
|
||||
kernel void kernel_mul_mat_q6_K_f32(
|
||||
kernel void kernel_mul_mv_q6_K_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
@@ -2326,7 +2338,7 @@ kernel void kernel_get_rows(
|
||||
}
|
||||
|
||||
#define BLOCK_SIZE_M 64 // 8 simdgroup matrices from matrix A
|
||||
#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix A
|
||||
#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix B
|
||||
#define BLOCK_SIZE_K 32
|
||||
#define THREAD_MAT_M 4 // each thread take 4 simdgroup matrices from matrix A
|
||||
#define THREAD_MAT_N 2 // each thread take 2 simdgroup matrices from matrix B
|
||||
@@ -2363,9 +2375,11 @@ kernel void kernel_mul_mm(device const uchar * src0,
|
||||
const uint r0 = tgpig.y;
|
||||
const uint r1 = tgpig.x;
|
||||
const uint im = tgpig.z;
|
||||
|
||||
// if this block is of 64x32 shape or smaller
|
||||
short n_rows = (ne0 - r0 * BLOCK_SIZE_M < BLOCK_SIZE_M) ? (ne0 - r0 * BLOCK_SIZE_M) : BLOCK_SIZE_M;
|
||||
short n_cols = (ne1 - r1 * BLOCK_SIZE_N < BLOCK_SIZE_N) ? (ne1 - r1 * BLOCK_SIZE_N) : BLOCK_SIZE_N;
|
||||
|
||||
// a thread shouldn't load data outside of the matrix
|
||||
short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
|
||||
short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;
|
||||
@@ -2389,26 +2403,30 @@ kernel void kernel_mul_mm(device const uchar * src0,
|
||||
+ nb10 * (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL)));
|
||||
|
||||
for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) {
|
||||
//load data and store to threadgroup memory
|
||||
// load data and store to threadgroup memory
|
||||
half4x4 temp_a;
|
||||
dequantize_func(x, il, temp_a);
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
#pragma unroll(16)
|
||||
for (int i = 0; i < 16; i++) {
|
||||
*(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
|
||||
+ 16 * (tiitg % THREAD_PER_ROW) + 8 * (i / 8)) \
|
||||
+ (tiitg / THREAD_PER_ROW) % 8 + (i & 7) * 8) = temp_a[i/4][i%4];
|
||||
+ (tiitg % THREAD_PER_ROW) * 16 + (i / 8) * 8) \
|
||||
+ (tiitg / THREAD_PER_ROW) % 8 + (i & 7) * 8) = temp_a[i/4][i%4];
|
||||
}
|
||||
*(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) \
|
||||
= *((device float2x4 *)y);
|
||||
|
||||
*(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) = *((device float2x4 *)y);
|
||||
|
||||
il = (il + 2 < nl) ? il + 2 : il % 2;
|
||||
x = (il < 2) ? x + (2+nl-1)/nl : x;
|
||||
y += BLOCK_SIZE_K;
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
//load matrices from threadgroup memory and conduct outer products
|
||||
|
||||
// load matrices from threadgroup memory and conduct outer products
|
||||
threadgroup half * lsma = (sa + THREAD_MAT_M * SG_MAT_SIZE * (sgitg % 2));
|
||||
threadgroup float * lsmb = (sb + THREAD_MAT_N * SG_MAT_SIZE * (sgitg / 2));
|
||||
|
||||
#pragma unroll(4)
|
||||
for (int ik = 0; ik < BLOCK_SIZE_K / 8; ik++) {
|
||||
#pragma unroll(4)
|
||||
@@ -2423,6 +2441,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
|
||||
|
||||
lsma += BLOCK_SIZE_M / SG_MAT_ROW * SG_MAT_SIZE;
|
||||
lsmb += BLOCK_SIZE_N / SG_MAT_ROW * SG_MAT_SIZE;
|
||||
|
||||
#pragma unroll(8)
|
||||
for (int i = 0; i < 8; i++){
|
||||
simdgroup_multiply_accumulate(c_res[i], mb[i/4], ma[i%4], c_res[i]);
|
||||
@@ -2431,25 +2450,26 @@ kernel void kernel_mul_mm(device const uchar * src0,
|
||||
}
|
||||
|
||||
if ((r0 + 1) * BLOCK_SIZE_M <= ne0 && (r1 + 1) * BLOCK_SIZE_N <= ne1) {
|
||||
device float *C = dst + BLOCK_SIZE_M * r0 + 32 * (sgitg&1) \
|
||||
+ (BLOCK_SIZE_N * r1 + 16 * (sgitg>>1)) * ne0 + im*ne1*ne0;
|
||||
device float * C = dst + (BLOCK_SIZE_M * r0 + 32 * (sgitg & 1)) \
|
||||
+ (BLOCK_SIZE_N * r1 + 16 * (sgitg >> 1)) * ne0 + im*ne1*ne0;
|
||||
for (int i = 0; i < 8; i++) {
|
||||
simdgroup_store(c_res[i], C + 8 * (i%4) + 8 * ne0 * (i/4), ne0);
|
||||
}
|
||||
} else {
|
||||
// block is smaller than 64x32, we should avoid writing data outside of the matrix
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
|
||||
threadgroup float * temp_str = ((threadgroup float *)shared_memory) \
|
||||
+ 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
|
||||
for (int i = 0; i < 8; i++) {
|
||||
simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
|
||||
if (sgitg==0) {
|
||||
|
||||
device float * C = dst + (BLOCK_SIZE_M * r0) + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
|
||||
if (sgitg == 0) {
|
||||
for (int i = 0; i < n_rows; i++) {
|
||||
for (int j = tiitg; j< n_cols; j += BLOCK_SIZE_N) {
|
||||
for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) {
|
||||
*(C + i + j * ne0) = *(temp_str + i + j * BLOCK_SIZE_M);
|
||||
}
|
||||
}
|
||||
|
||||
64
ggml.c
64
ggml.c
@@ -162,40 +162,16 @@ typedef void * thread_ret_t;
|
||||
|
||||
#define GGML_PRINT(...) printf(__VA_ARGS__)
|
||||
|
||||
//
|
||||
// end of logging block
|
||||
//
|
||||
|
||||
#ifdef GGML_USE_ACCELERATE
|
||||
// uncomment to use vDSP for soft max computation
|
||||
// note: not sure if it is actually faster
|
||||
//#define GGML_SOFT_MAX_ACCELERATE
|
||||
#endif
|
||||
|
||||
//
|
||||
// logging
|
||||
//
|
||||
|
||||
#if (GGML_DEBUG >= 1)
|
||||
#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
|
||||
#else
|
||||
#define GGML_PRINT_DEBUG(...)
|
||||
#endif
|
||||
|
||||
#if (GGML_DEBUG >= 5)
|
||||
#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
|
||||
#else
|
||||
#define GGML_PRINT_DEBUG_5(...)
|
||||
#endif
|
||||
|
||||
#if (GGML_DEBUG >= 10)
|
||||
#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
|
||||
#else
|
||||
#define GGML_PRINT_DEBUG_10(...)
|
||||
#endif
|
||||
|
||||
#define GGML_PRINT(...) printf(__VA_ARGS__)
|
||||
|
||||
//
|
||||
// end of logging block
|
||||
//
|
||||
|
||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||
#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
|
||||
#define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
|
||||
@@ -4951,6 +4927,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
|
||||
*result = (struct ggml_tensor) {
|
||||
/*.type =*/ type,
|
||||
/*.backend =*/ GGML_BACKEND_CPU,
|
||||
/*.buffer =*/ NULL,
|
||||
/*.n_dims =*/ n_dims,
|
||||
/*.ne =*/ { 1, 1, 1, 1 },
|
||||
/*.nb =*/ { 0, 0, 0, 0 },
|
||||
@@ -11256,7 +11233,7 @@ static void ggml_compute_forward_silu_f32(
|
||||
|
||||
#ifndef NDEBUG
|
||||
for (int k = 0; k < nc; k++) {
|
||||
const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
|
||||
const float x = ((float *) ((char *) dst->data + i1*(dst->nb[1])))[k];
|
||||
UNUSED(x);
|
||||
assert(!isnan(x));
|
||||
assert(!isinf(x));
|
||||
@@ -13089,17 +13066,17 @@ static void ggml_compute_forward_alibi_f32(
|
||||
|
||||
assert(n_past >= 0);
|
||||
|
||||
const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
|
||||
const int ne1 = src0->ne[1]; // seq_len_without_past
|
||||
const int ne2 = src0->ne[2]; // n_head -> this is k
|
||||
//const int ne3 = src0->ne[3]; // 1 -> bsz
|
||||
const int64_t ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
|
||||
const int64_t ne1 = src0->ne[1]; // seq_len_without_past
|
||||
const int64_t ne2 = src0->ne[2]; // n_head -> this is k
|
||||
//const int64_t ne3 = src0->ne[3]; // 1 -> bsz
|
||||
|
||||
const int n = ggml_nrows(src0);
|
||||
const int ne2_ne3 = n/ne1; // ne2*ne3
|
||||
const int64_t n = ggml_nrows(src0);
|
||||
const int64_t ne2_ne3 = n/ne1; // ne2*ne3
|
||||
|
||||
const int nb0 = src0->nb[0];
|
||||
const int nb1 = src0->nb[1];
|
||||
const int nb2 = src0->nb[2];
|
||||
const size_t nb0 = src0->nb[0];
|
||||
const size_t nb1 = src0->nb[1];
|
||||
const size_t nb2 = src0->nb[2];
|
||||
//const int nb3 = src0->nb[3];
|
||||
|
||||
GGML_ASSERT(nb0 == sizeof(float));
|
||||
@@ -13111,9 +13088,9 @@ static void ggml_compute_forward_alibi_f32(
|
||||
const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
|
||||
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
|
||||
|
||||
for (int i = 0; i < ne0; i++) {
|
||||
for (int j = 0; j < ne1; j++) {
|
||||
for (int k = 0; k < ne2_ne3; k++) {
|
||||
for (int64_t i = 0; i < ne0; i++) {
|
||||
for (int64_t j = 0; j < ne1; j++) {
|
||||
for (int64_t k = 0; k < ne2_ne3; k++) {
|
||||
float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
|
||||
float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
|
||||
|
||||
@@ -13128,7 +13105,6 @@ static void ggml_compute_forward_alibi_f32(
|
||||
}
|
||||
|
||||
pdst[0] = i * m_k + src[0];
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -20203,6 +20179,10 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
||||
ggml_vec_cpy_f32(nx, xp, x);
|
||||
ggml_vec_cpy_f32(nx, gp, g);
|
||||
|
||||
// TODO: instead of passing &cancel here, use the return code of the linesearch
|
||||
// to determine if the optimization should be cancelled
|
||||
// this is a simple change, but not doing this atm, since I don't have a nice
|
||||
// way to test and don't want to break something with so many changes lined up
|
||||
ls = linesearch_backtracking(¶ms, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data);
|
||||
if (cancel) {
|
||||
return GGML_OPT_CANCEL;
|
||||
|
||||
16
ggml.h
16
ggml.h
@@ -326,7 +326,7 @@ extern "C" {
|
||||
GGML_TYPE_COUNT,
|
||||
};
|
||||
|
||||
enum ggml_backend {
|
||||
enum ggml_backend_type {
|
||||
GGML_BACKEND_CPU = 0,
|
||||
GGML_BACKEND_GPU = 10,
|
||||
GGML_BACKEND_GPU_SPLIT = 20,
|
||||
@@ -479,8 +479,10 @@ extern "C" {
|
||||
|
||||
// n-dimensional tensor
|
||||
struct ggml_tensor {
|
||||
enum ggml_type type;
|
||||
enum ggml_backend backend;
|
||||
enum ggml_type type;
|
||||
enum ggml_backend_type backend;
|
||||
|
||||
struct ggml_backend_buffer * buffer;
|
||||
|
||||
int n_dims;
|
||||
int64_t ne[GGML_MAX_DIMS]; // number of elements
|
||||
@@ -514,7 +516,7 @@ extern "C" {
|
||||
|
||||
void * extra; // extra things e.g. for ggml-cuda.cu
|
||||
|
||||
char padding[4];
|
||||
char padding[12];
|
||||
};
|
||||
|
||||
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
|
||||
@@ -1358,7 +1360,7 @@ extern "C" {
|
||||
|
||||
// alibi position embedding
|
||||
// in-place, returns view(a)
|
||||
struct ggml_tensor * ggml_alibi(
|
||||
GGML_API struct ggml_tensor * ggml_alibi(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int n_past,
|
||||
@@ -1367,7 +1369,7 @@ extern "C" {
|
||||
|
||||
// clamp
|
||||
// in-place, returns view(a)
|
||||
struct ggml_tensor * ggml_clamp(
|
||||
GGML_API struct ggml_tensor * ggml_clamp(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
float min,
|
||||
@@ -2102,7 +2104,7 @@ extern "C" {
|
||||
enum ggml_type vec_dot_type;
|
||||
} ggml_type_traits_t;
|
||||
|
||||
ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
|
||||
GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
||||
10
k_quants.h
10
k_quants.h
@@ -29,7 +29,7 @@
|
||||
|
||||
// 2-bit quantization
|
||||
// weight is represented as x = a * q + b
|
||||
// 16 blocks of 16 elemenets each
|
||||
// 16 blocks of 16 elements each
|
||||
// Effectively 2.5625 bits per weight
|
||||
typedef struct {
|
||||
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
|
||||
@@ -41,7 +41,7 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "w
|
||||
|
||||
// 3-bit quantization
|
||||
// weight is represented as x = a * q
|
||||
// 16 blocks of 16 elemenets each
|
||||
// 16 blocks of 16 elements each
|
||||
// Effectively 3.4375 bits per weight
|
||||
#ifdef GGML_QKK_64
|
||||
typedef struct {
|
||||
@@ -62,7 +62,7 @@ static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 +
|
||||
#endif
|
||||
|
||||
// 4-bit quantization
|
||||
// 16 blocks of 32 elements each
|
||||
// 8 blocks of 32 elements each
|
||||
// weight is represented as x = a * q + b
|
||||
// Effectively 4.5 bits per weight
|
||||
#ifdef GGML_QKK_64
|
||||
@@ -83,7 +83,7 @@ static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/
|
||||
#endif
|
||||
|
||||
// 5-bit quantization
|
||||
// 16 blocks of 32 elements each
|
||||
// 8 blocks of 32 elements each
|
||||
// weight is represented as x = a * q + b
|
||||
// Effectively 5.5 bits per weight
|
||||
#ifdef GGML_QKK_64
|
||||
@@ -107,7 +107,7 @@ static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/
|
||||
|
||||
// 6-bit quantization
|
||||
// weight is represented as x = a * q
|
||||
// 16 blocks of 16 elemenets each
|
||||
// 16 blocks of 16 elements each
|
||||
// Effectively 6.5625 bits per weight
|
||||
typedef struct {
|
||||
uint8_t ql[QK_K/2]; // quants, lower 4 bits
|
||||
|
||||
52
llama.cpp
52
llama.cpp
@@ -1325,7 +1325,11 @@ static bool llama_kv_cache_init(
|
||||
cache.cells.clear();
|
||||
cache.cells.resize(n_ctx);
|
||||
|
||||
// TODO: this should be:
|
||||
// cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
|
||||
// change it and test that it works
|
||||
cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
|
||||
memset(cache.buf.data, 0, cache.buf.size);
|
||||
|
||||
struct ggml_init_params params;
|
||||
params.mem_size = cache.buf.size;
|
||||
@@ -1730,7 +1734,7 @@ struct llama_model_loader {
|
||||
}
|
||||
}
|
||||
|
||||
struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend backend) {
|
||||
struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) {
|
||||
if (backend != GGML_BACKEND_CPU) {
|
||||
ggml_set_no_alloc(ctx, true);
|
||||
}
|
||||
@@ -1748,7 +1752,7 @@ struct llama_model_loader {
|
||||
return tensor;
|
||||
}
|
||||
|
||||
struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend backend) {
|
||||
struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend) {
|
||||
struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
|
||||
|
||||
if (cur == NULL) {
|
||||
@@ -2047,7 +2051,7 @@ static void llm_load_hparams(
|
||||
case 36: model.type = e_model::MODEL_8B; break;
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_REFACT:
|
||||
{
|
||||
GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
|
||||
@@ -2299,8 +2303,8 @@ static void llm_load_tensors(
|
||||
|
||||
// output
|
||||
{
|
||||
ggml_backend backend_norm;
|
||||
ggml_backend backend_output;
|
||||
ggml_backend_type backend_norm;
|
||||
ggml_backend_type backend_output;
|
||||
|
||||
if (n_gpu_layers > int(n_layer)) {
|
||||
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
|
||||
@@ -2335,8 +2339,8 @@ static void llm_load_tensors(
|
||||
model.layers.resize(n_layer);
|
||||
|
||||
for (uint32_t i = 0; i < n_layer; ++i) {
|
||||
const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
|
||||
const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
|
||||
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
|
||||
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
|
||||
|
||||
auto & layer = model.layers[i];
|
||||
|
||||
@@ -2365,8 +2369,8 @@ static void llm_load_tensors(
|
||||
{
|
||||
model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
|
||||
{
|
||||
ggml_backend backend_norm;
|
||||
ggml_backend backend_output;
|
||||
ggml_backend_type backend_norm;
|
||||
ggml_backend_type backend_output;
|
||||
|
||||
if (n_gpu_layers > int(n_layer)) {
|
||||
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
|
||||
@@ -2401,8 +2405,8 @@ static void llm_load_tensors(
|
||||
model.layers.resize(n_layer);
|
||||
|
||||
for (uint32_t i = 0; i < n_layer; ++i) {
|
||||
const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
|
||||
const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
|
||||
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
|
||||
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
|
||||
|
||||
auto & layer = model.layers[i];
|
||||
|
||||
@@ -2435,8 +2439,8 @@ static void llm_load_tensors(
|
||||
|
||||
// output
|
||||
{
|
||||
ggml_backend backend_norm;
|
||||
ggml_backend backend_output;
|
||||
ggml_backend_type backend_norm;
|
||||
ggml_backend_type backend_output;
|
||||
|
||||
if (n_gpu_layers > int(n_layer)) {
|
||||
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
|
||||
@@ -2473,8 +2477,8 @@ static void llm_load_tensors(
|
||||
model.layers.resize(n_layer);
|
||||
|
||||
for (uint32_t i = 0; i < n_layer; ++i) {
|
||||
const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
|
||||
const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
|
||||
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
|
||||
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
|
||||
|
||||
auto & layer = model.layers[i];
|
||||
|
||||
@@ -2512,8 +2516,8 @@ static void llm_load_tensors(
|
||||
|
||||
// output
|
||||
{
|
||||
ggml_backend backend_norm;
|
||||
ggml_backend backend_output;
|
||||
ggml_backend_type backend_norm;
|
||||
ggml_backend_type backend_output;
|
||||
|
||||
if (n_gpu_layers > int(n_layer)) {
|
||||
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
|
||||
@@ -2550,8 +2554,8 @@ static void llm_load_tensors(
|
||||
model.layers.resize(n_layer);
|
||||
|
||||
for (uint32_t i = 0; i < n_layer; ++i) {
|
||||
const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
|
||||
const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
|
||||
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
|
||||
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
|
||||
|
||||
auto & layer = model.layers[i];
|
||||
|
||||
@@ -2589,8 +2593,8 @@ static void llm_load_tensors(
|
||||
model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
|
||||
|
||||
{
|
||||
ggml_backend backend_norm;
|
||||
ggml_backend backend_output;
|
||||
ggml_backend_type backend_norm;
|
||||
ggml_backend_type backend_output;
|
||||
|
||||
if (n_gpu_layers > int(n_layer)) {
|
||||
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
|
||||
@@ -2624,8 +2628,8 @@ static void llm_load_tensors(
|
||||
const int i_gpu_start = n_layer - n_gpu_layers;
|
||||
model.layers.resize(n_layer);
|
||||
for (uint32_t i = 0; i < n_layer; ++i) {
|
||||
const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
|
||||
const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT;
|
||||
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
|
||||
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT;
|
||||
auto & layer = model.layers[i];
|
||||
layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
|
||||
layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
|
||||
@@ -4926,7 +4930,7 @@ static struct ggml_cgraph * llama_build_graph(
|
||||
case LLM_ARCH_PERSIMMON:
|
||||
{
|
||||
result = llm_build_persimmon(lctx, batch);
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_REFACT:
|
||||
{
|
||||
result = llm_build_refact(lctx, batch);
|
||||
|
||||
@@ -1,16 +1,18 @@
|
||||
#!/bin/bash
|
||||
|
||||
cp -rpv ../ggml/src/ggml.c ./ggml.c
|
||||
cp -rpv ../ggml/src/ggml-alloc.c ./ggml-alloc.c
|
||||
cp -rpv ../ggml/src/ggml-cuda.h ./ggml-cuda.h
|
||||
cp -rpv ../ggml/src/ggml-cuda.cu ./ggml-cuda.cu
|
||||
cp -rpv ../ggml/src/ggml-opencl.h ./ggml-opencl.h
|
||||
cp -rpv ../ggml/src/ggml-opencl.cpp ./ggml-opencl.cpp
|
||||
cp -rpv ../ggml/src/ggml-metal.h ./ggml-metal.h
|
||||
cp -rpv ../ggml/src/ggml-metal.m ./ggml-metal.m
|
||||
cp -rpv ../ggml/src/ggml-metal.metal ./ggml-metal.metal
|
||||
cp -rpv ../ggml/include/ggml/ggml.h ./ggml.h
|
||||
cp -rpv ../ggml/include/ggml/ggml-alloc.h ./ggml-alloc.h
|
||||
cp -rpv ../ggml/src/ggml.c ./ggml.c
|
||||
cp -rpv ../ggml/src/ggml-alloc.c ./ggml-alloc.c
|
||||
cp -rpv ../ggml/src/ggml-backend.c ./ggml-backend.c
|
||||
cp -rpv ../ggml/src/ggml-cuda.h ./ggml-cuda.h
|
||||
cp -rpv ../ggml/src/ggml-cuda.cu ./ggml-cuda.cu
|
||||
cp -rpv ../ggml/src/ggml-opencl.h ./ggml-opencl.h
|
||||
cp -rpv ../ggml/src/ggml-opencl.cpp ./ggml-opencl.cpp
|
||||
cp -rpv ../ggml/src/ggml-metal.h ./ggml-metal.h
|
||||
cp -rpv ../ggml/src/ggml-metal.m ./ggml-metal.m
|
||||
cp -rpv ../ggml/src/ggml-metal.metal ./ggml-metal.metal
|
||||
cp -rpv ../ggml/include/ggml/ggml.h ./ggml.h
|
||||
cp -rpv ../ggml/include/ggml/ggml-alloc.h ./ggml-alloc.h
|
||||
cp -rpv ../ggml/include/ggml/ggml-backend.h ./ggml-backend.h
|
||||
|
||||
cp -rpv ../ggml/tests/test-opt.cpp ./tests/test-opt.cpp
|
||||
cp -rpv ../ggml/tests/test-grad0.cpp ./tests/test-grad0.cpp
|
||||
|
||||
Reference in New Issue
Block a user